Example #1
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model
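As a point of reference, below is a minimal, hypothetical driver for a train() function of this shape, assuming the usual maskrcnn-benchmark setup (a yacs cfg object and a torch.distributed launcher that exports WORLD_SIZE and passes --local_rank); the argument names and config handling are illustrative, not taken from any of the examples.

import argparse
import os

import torch
from maskrcnn_benchmark.config import cfg


def main():
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file")
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument("opts", default=None, nargs=argparse.REMAINDER)
    args = parser.parse_args()

    # one process per GPU; the launcher exports WORLD_SIZE
    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1
    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    model = train(cfg, args.local_rank, distributed)


if __name__ == "__main__":
    main()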
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.deprecated.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model
Example #3
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank
            # this should be removed if we update BatchNorm stats
            #broadcast_buffers=False,find_unused_parameters=True
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, local_rank)

    return model
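The amp.initialize call above implies that do_train runs the backward pass through apex's loss scaling; a minimal sketch of that step, assuming the apex.amp API (the loss_dict and helper name are illustrative):

from apex import amp


def backward_and_step(loss_dict, optimizer, scheduler):
    """One optimization step with apex loss scaling (illustrative sketch)."""
    optimizer.zero_grad()
    losses = sum(loss for loss in loss_dict.values())
    # backward through the scaled loss so FP16 gradients do not underflow
    with amp.scale_loss(losses, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
    scheduler.step()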
Example #4
def train(cfg, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    warmup_layers = tuple(x for x in cfg.SOLVER.WARMUP_LAYERS if len(x) != 0)
    warmup_iters = cfg.SOLVER.WARMUP_ITERS

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        warmup_layers,
        warmup_iters
    )
    return model
    def fit(self, train_data, test_data, classes):
        self.classes = classes
        optimizer = make_optimizer(self.cfg, self.model)
        scheduler = make_lr_scheduler(self.cfg, optimizer)

        arguments = {}
        arguments["iteration"] = 0

        self.checkpointer.classes = classes
        self.checkpointer.optimizer = optimizer
        self.checkpointer.scheduler = scheduler

        train_data_loader = make_data_loader(
            self.cfg,
            train_data,
            classes,
            is_train=True,
            is_distributed=False,
            start_iter=arguments["iteration"],
        )

        test_data_loader = make_data_loader(
            self.cfg,
            test_data,
            classes,
            is_train=True,
            is_distributed=False,
            start_iter=arguments["iteration"],
        )

        self.train_meter, self.test_meter = do_train(
            self.model,
            train_data_loader,
            test_data_loader,
            optimizer,
            scheduler,
            self.device,
            arguments,
        )
def train(cfg, local_rank, distributed, random_number_generator=None):
    if hasattr(torch._C, '_jit_set_profiling_executor'):
        torch._C._jit_set_profiling_executor(False)
    if hasattr(torch._C, '_jit_set_profiling_mode'):
        torch._C._jit_set_profiling_mode(False)

    # Model logging
    log_event(key=constants.GLOBAL_BATCH_SIZE, value=cfg.SOLVER.IMS_PER_BATCH)
    log_event(key=constants.NUM_IMAGE_CANDIDATES, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # Initialize mixed-precision training
    is_fp16 = (cfg.DTYPE == "float16")
    if is_fp16:
        # convert model to FP16
        model.half()

    optimizer = make_optimizer(cfg, model)
    # Optimizer logging
    log_event(key=constants.OPT_NAME, value="sgd_with_momentum")
    log_event(key=constants.OPT_BASE_LR, value=cfg.SOLVER.BASE_LR)
    log_event(key=constants.OPT_LR_WARMUP_STEPS, value=cfg.SOLVER.WARMUP_ITERS)
    log_event(key=constants.OPT_LR_WARMUP_FACTOR, value=cfg.SOLVER.WARMUP_FACTOR)
    log_event(key=constants.OPT_LR_DECAY_FACTOR, value=cfg.SOLVER.GAMMA)
    log_event(key=constants.OPT_LR_DECAY_STEPS, value=cfg.SOLVER.STEPS)
    log_event(key=constants.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN[0])
    log_event(key=constants.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN)

    scheduler = make_lr_scheduler(cfg, optimizer)

    # disable the garbage collection
    gc.disable()

    if distributed:
        model = DDP(model, delay_allreduce=True)

    arguments = {}
    arguments["iteration"] = 0
    arguments["nhwc"] = cfg.NHWC
    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS
    
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT, cfg.NHWC)
    arguments.update(extra_checkpoint_data)
    
    if is_fp16:
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

    log_end(key=constants.INIT_STOP)
    barrier()
    log_start(key=constants.RUN_START)
    barrier()

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
        random_number_generator=random_number_generator,
    )
    log_event(key=constants.TRAIN_SAMPLES, value=len(data_loader))

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if cfg.PER_EPOCH_EVAL:
        per_iter_callback_fn = functools.partial(
                mlperf_test_early_exit,
                iters_per_epoch=iters_per_epoch,
                tester=functools.partial(test, cfg=cfg),
                model=model,
                distributed=distributed,
                min_bbox_map=cfg.MLPERF.MIN_BBOX_MAP,
                min_segm_map=cfg.MLPERF.MIN_SEGM_MAP)
    else:
        per_iter_callback_fn = None

    start_train_time = time.time()

    success = do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        cfg.DISABLE_REDUCED_LOGGING,
        per_iter_start_callback_fn=functools.partial(mlperf_log_epoch_start, iters_per_epoch=iters_per_epoch),
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    end_train_time = time.time()
    total_training_time = end_train_time - start_train_time
    print(
            "&&&& MLPERF METRIC THROUGHPUT={:.4f} iterations / s".format((arguments["iteration"] * cfg.SOLVER.IMS_PER_BATCH) / total_training_time)
    )

    return model, success
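Since the model is cast with model.half() and the optimizer is wrapped in FP16_Optimizer above, the backward pass goes through the wrapper rather than the raw loss; a hedged sketch of that step, assuming an apex-style FP16_Optimizer interface (the example does not show where the wrapper is imported from):

def fp16_step(loss, optimizer):
    """One update through an FP16_Optimizer-style wrapper (illustrative sketch)."""
    optimizer.zero_grad()
    # the wrapper owns loss scaling: with dynamic_loss_scale=True it rescales the
    # loss before backward and skips the step when an overflow is detected
    optimizer.backward(loss)
    optimizer.step()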
Example #7
def train(cfg, local_rank, distributed, ft_flag):
    # Build the model
    model = build_detection_model(cfg)

    # Get the device the experiment runs on
    device = torch.device(cfg.MODEL.DEVICE)
    # print('device', device)
    model.to(device)

    # Build the optimizer and learning-rate scheduler
    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0

    #if ft_flag:
    #    scheduler = None

    if ft_flag:
        checkpointer = DetectronCheckpointer(
            cfg, model, optimizer, None, output_dir, save_to_disk
        )
    else:
        checkpointer = DetectronCheckpointer(
            cfg, model, optimizer, scheduler, output_dir, save_to_disk
        )

    # print('cfg.MODEL.WEIGHTP:', cfg.MODEL.WEIGHT)

    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)
    # print('extra_checkpoint_data:', extra_checkpoint_data.state_dict())
    # arguments["iteration"] = 50000

    arguments["iteration"] = 0 if ft_flag else arguments["iteration"]

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model
Example #8
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, None, None, output_dir,
                                         save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    # arguments.update(extra_checkpoint_data)

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    if cfg.MODEL.META_ARCHITECTURE == 'AdaptionRCNN':
        logger.info('AdaptionRCNN trainer is adapted!')
        cross_do_train(
            cfg,
            model,
            optimizer,
            scheduler,
            checkpointer,
            device,
            checkpoint_period,
            arguments,
            distributed,
        )
    elif cfg.MODEL.META_ARCHITECTURE == 'GeneralizedRCNN':
        logger.info('GeneralizedRCNN trainer is adapted!')
        data_loader = make_data_loader(
            cfg,
            is_train=True,
            is_distributed=distributed,
            start_iter=arguments["iteration"],
        )
        do_train(
            cfg,
            model,
            data_loader,
            optimizer,
            scheduler,
            checkpointer,
            device,
            checkpoint_period,
            arguments,
            distributed,
        )

    return model
def train(total_cfg, local_rank, distributed):
    total_model = []
    for i in reversed(range(len(total_cfg))):
        model = build_detection_model(total_cfg[i])
        device = torch.device(total_cfg[i].MODEL.DEVICE)
        model.to(device)
        if total_cfg[i].MODEL.USE_SYNCBN:
            assert is_pytorch_1_1_0_or_later(), \
                "SyncBatchNorm is only available in pytorch >= 1.1.0"
            model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

        optimizer = make_optimizer(total_cfg[i], model)
        scheduler = make_lr_scheduler(total_cfg[i], optimizer)

        if distributed:                                                     
            model = torch.nn.parallel.DistributedDataParallel(              
                model, device_ids=[local_rank], output_device=local_rank,   
                # this should be removed if we update BatchNorm stats       
                broadcast_buffers=False, )

        arguments = {}
        arguments["iteration"] = 0

        output_dir = total_cfg[i].OUTPUT_DIR

        save_to_disk = get_rank() == 0
        checkpointer = DetectronCheckpointer(
            total_cfg[i], model, optimizer, scheduler, output_dir, save_to_disk
        )
        extra_checkpoint_data = checkpointer.load(total_cfg[i].MODEL.WEIGHT)
        if i == 0:
            arguments.update(extra_checkpoint_data)
        total_model.append(model)

    data_loader = make_data_loader(
        total_cfg[0],  # `cfg` was undefined in this scope; use the first config, consistent with the val loader below
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    test_period = total_cfg[0].SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(total_cfg[0], is_train=False, is_distributed=distributed, is_for_period=True)
    else:
        data_loader_val = None

    checkpoint_period = total_cfg[0].SOLVER.CHECKPOINT_PERIOD
    if len(total_model)>1:
        params = sum([np.prod(p.size()) for p in total_model[1].parameters()])
        print('Number of Parameters:{:5f}M'.format(params / 1e6))
        params = sum([np.prod(p.size()) for p in total_model[0].parameters()])
        print('teacher_model Number of Parameters:{:5f}M'.format(params / 1e6))
    else:
        params = sum([np.prod(p.size()) for p in total_model[0].parameters()])
        print('Number of Parameters:{:5f}M'.format(params / 1e6))

    do_train(
        total_cfg,
        total_model,
        data_loader,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        test_period,
        arguments,
        args,
    )

    return total_model[1]
Example #10
def train(cfg, local_rank, distributed):
    # Build the model
    model = build_detection_model(cfg)
    # _C.MODEL.DEVICE = "cuda"
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    # Adjust the learning rate; warmup is used
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    """
        opt_level有四个值选择输入:"O0", "O1", "O2", "03"
        00相当于原始的单精度训练。
        01在大部分计算时采用半精度,但是所有的模型参数依然保持单精度,对于少数单精度较好的计算(如softmax)依然保持单精度。
        02相比于01,将模型参数也变为半精度。
        03基本等于最开始实验的全半精度的运算。
        值得一提的是,不论在优化过程中,模型是否采用半精度,保存下来的模型均为单精度模型,能够保证模型在其他应用中的正常使用。
        """
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=amp_opt_level)

    # Multi-GPU training
    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    # Load the checkpoint file
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    # Do some processing on the datasets and return the data loaders
    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    test_period = cfg.SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(cfg,
                                           is_train=False,
                                           is_distributed=distributed,
                                           is_for_period=True)
    else:
        data_loader_val = None
    # _C.SOLVER.CHECKPOINT_PERIOD = 2500
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        cfg,
        model,
        data_loader,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        test_period,
        arguments,
    )

    return model
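The example above passes a data_loader_val and test_period into do_train; a hedged sketch of the periodic-evaluation hook this implies (how predictions are scored is left open, and the batch layout assumes a maskrcnn-benchmark-style collator yielding images, targets and image ids):

import torch


def maybe_validate(model, data_loader_val, device, iteration, test_period):
    """Hypothetical periodic-eval hook: run the held-out loader every
    test_period iterations; scoring of the predictions is left to the caller."""
    if data_loader_val is None or test_period <= 0 or iteration % test_period != 0:
        return None
    model.eval()
    predictions = []
    with torch.no_grad():
        for images, targets, image_ids in data_loader_val:
            predictions.append(model(images.to(device)))
    model.train()
    return predictions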
def train(cfg, cfg_origial, local_rank, distributed):
    ## The one with modified number of classes
    model = build_detection_model(cfg)

    # cfg_origial = cfg.clone()
    # cfg_origial.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81
    # original_model = build_detection_model(cfg_origial)     ## Original model with 81 classes

    # ## Let's load weights for old class!
    # save_dir = cfg.OUTPUT_DIR
    # checkpointer = DetectronCheckpointer(cfg_origial, original_model, save_dir=save_dir)
    # checkpointer.load(cfg_origial.MODEL.WEIGHT)

    # # pretrained_model_pth = "/network/home/bhattdha/.torch/models/_detectron_35861795_12_2017_baselines_e2e_mask_rcnn_R-101-FPN_1x.yaml.02_31_37.KqyEK4tT_output_train_coco_2014_train%3Acoco_2014_valminusminival_generalized_rcnn_model_final.pkl"
    # # These keys are to be removed which forms final layers of the network
    # removal_keys = ['roi_heads.box.predictor.cls_score.weight', 'roi_heads.box.predictor.cls_score.bias', 'roi_heads.box.predictor.bbox_pred.weight', 'roi_heads.box.predictor.bbox_pred.bias', 'roi_heads.mask.predictor.mask_fcn_logits.weight', 'roi_heads.mask.predictor.mask_fcn_logits.bias']

    # model = _transfer_pretrained_weights(new_model, original_model, removal_keys)

    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # # Initialize mixed-precision training
    # use_mixed_precision = cfg.DTYPE == "float16"
    # amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    # model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0
    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)

    # cfg.MODEL.WEIGHT = '/network/home/bhattdha/exp.pth' ## Model stored through surgery
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model
Example #12
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0
    arguments['phase'] = 1
    arguments['plot_median'], arguments['plot_global_avg'] = defaultdict(
        list), defaultdict(list)

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    test_period = cfg.SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(cfg,
                                           is_train=False,
                                           is_distributed=distributed,
                                           is_for_period=True)
    else:
        data_loader_val = None

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    if arguments['phase'] == 1:
        data_loader = make_data_loader(
            cfg,
            is_train=True,
            is_distributed=distributed,
            start_iter=arguments["iteration"],
            phase=1,
        )
        do_train(
            cfg,
            model,
            data_loader,
            data_loader_val,
            optimizer,
            scheduler,
            checkpointer,
            device,
            checkpoint_period,
            test_period,
            arguments,
            training_phase=1,
        )
        arguments["iteration"] = 0
        arguments["phase"] = 2

    data_loader_phase2 = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
        phase=2,
    )

    do_train(
        cfg,
        model,
        data_loader_phase2,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        test_period,
        arguments,
        training_phase=2,
    )

    return model
def train(cfg, local_rank, distributed, resume, config_file):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    out_split = output_dir.split("/")
    tensorboard_dir = os.path.join("tensorboard", out_split[-1] if len(out_split[-1]) > 0 else out_split[-2])
    print("tensorboard_dir:", tensorboard_dir)
    if not os.path.isdir(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    else:
        if len(os.listdir(tensorboard_dir)) > 0:
            print("Remove previous tensorboard events...")
            os.system("rm " + tensorboard_dir + "/*")
    result_writer = SummaryWriter(tensorboard_dir)

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    # print("resume:", resume)
    extra_checkpoint_data = checkpointer.load(resume == "True", cfg.MODEL.WEIGHT)

    arguments.update(extra_checkpoint_data)
    if resume == "False":
        arguments["iteration"] = 0
    # print("arguments:", arguments.keys())

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        results_recorder=result_writer,
        config_file=config_file
    )

    return model
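The SummaryWriter created above is handed to do_train as results_recorder; a hedged sketch of how per-iteration losses might be written with it (the tag names and loss_dict are illustrative):

def log_losses(writer, loss_dict, iteration):
    """Write each loss component and their sum to TensorBoard (illustrative sketch)."""
    total = sum(loss.item() for loss in loss_dict.values())
    writer.add_scalar("train/loss_total", total, iteration)
    for name, loss in loss_dict.items():
        writer.add_scalar("train/" + name, loss.item(), iteration)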
Example #14
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    if cfg.MODEL.USE_SYNCBN:
        assert is_pytorch_1_1_0_or_later(), \
            "SyncBatchNorm is only available in pytorch >= 1.1.0"
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    test_period = cfg.SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(cfg,
                                           is_train=False,
                                           is_distributed=distributed,
                                           is_for_period=True)
    else:
        data_loader_val = None

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        cfg,
        model,
        data_loader,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        test_period,
        arguments,
    )

    return model
Example #15
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    logger = logging.getLogger("maskrcnn_benchmark.train")
    logger.info("The train model: \n {}".format(model))
    device = torch.device(cfg.MODEL.DEVICE)
    if cfg.SOLVER.USE_SYNC_BN:
        model = apex.parallel.convert_syncbn_model(model)
    model.to(device)
    optimizer = make_optimizer(cfg, model)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O0")
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        # model = torch.nn.parallel.DistributedDataParallel(
        #      model, device_ids=[local_rank], output_device=local_rank,
        #      # this should be removed if we update BatchNorm stats
        #      #broadcast_buffers=False,
        # )
        model = DDP(model, delay_allreduce=True)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        mode=0,
        resolution=None,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    data_loader.collate_fn.special_deal = False

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    writer, arch_writer = setup_writer(output_dir, get_rank())

    if arch_writer is not None:
        arch_writer.write('Genotype: {}\n'.format(cfg.SEARCH.DECODER.CONFIG))
        arch_writer.close()

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        writer,
    )

    return model
def train(cfg, local_rank, distributed):
    # Model logging
    print_mlperf(key=mlperf_log.INPUT_BATCH_SIZE,
                 value=cfg.SOLVER.IMS_PER_BATCH)
    print_mlperf(key=mlperf_log.BATCH_SIZE_TEST, value=cfg.TEST.IMS_PER_BATCH)

    print_mlperf(key=mlperf_log.INPUT_MEAN_SUBTRACTION,
                 value=cfg.INPUT.PIXEL_MEAN)
    print_mlperf(key=mlperf_log.INPUT_NORMALIZATION_STD,
                 value=cfg.INPUT.PIXEL_STD)
    print_mlperf(key=mlperf_log.INPUT_RESIZE)
    print_mlperf(key=mlperf_log.INPUT_RESIZE_ASPECT_PRESERVING)
    print_mlperf(key=mlperf_log.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.INPUT_RANDOM_FLIP)
    print_mlperf(key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
    print_mlperf(key=mlperf_log.FG_IOU_THRESHOLD,
                 value=cfg.MODEL.RPN.FG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.BG_IOU_THRESHOLD,
                 value=cfg.MODEL.RPN.BG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TRAIN,
                 value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TEST,
                 value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TRAIN,
                 value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TEST,
                 value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.ASPECT_RATIOS,
                 value=cfg.MODEL.RPN.ASPECT_RATIOS)
    print_mlperf(key=mlperf_log.BACKBONE, value=cfg.MODEL.BACKBONE.CONV_BODY)
    print_mlperf(key=mlperf_log.NMS_THRESHOLD, value=cfg.MODEL.RPN.NMS_THRESH)

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    # Optimizer logging
    print_mlperf(key=mlperf_log.OPT_NAME, value=mlperf_log.SGD_WITH_MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_LR, value=cfg.SOLVER.BASE_LR)
    print_mlperf(key=mlperf_log.OPT_MOMENTUM, value=cfg.SOLVER.MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_WEIGHT_DECAY,
                 value=cfg.SOLVER.WEIGHT_DECAY)

    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS

    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if cfg.PER_EPOCH_EVAL:
        per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            tester=functools.partial(test, cfg=cfg),
            model=model,
            distributed=distributed,
            min_bbox_map=cfg.MLPERF.MIN_BBOX_MAP,
            min_segm_map=cfg.MLPERF.MIN_SEGM_MAP)
    else:
        per_iter_callback_fn = None

    start_train_time = time.time()

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        per_iter_start_callback_fn=functools.partial(
            mlperf_log_epoch_start, iters_per_epoch=iters_per_epoch),
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    end_train_time = time.time()
    total_training_time = end_train_time - start_train_time
    print("&&&& MLPERF METRIC THROUGHPUT per GPU={:.4f} iterations / s".format(
        (arguments["iteration"] * 1.0) / total_training_time))

    return model
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)  # where the dream begins
    device = torch.device(cfg.MODEL.DEVICE)  # !!!!!
    model.to(device)

    # Freeze part of the backbone parameters
    for name, value in model.backbone.body.network.named_children():
        if int(name) > 60:
            for param in value.parameters():
                param.requires_grad = False

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"  # can be switched to float16 here to speed things up
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    test_period = cfg.SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(cfg,
                                           is_train=False,
                                           is_distributed=distributed,
                                           is_for_period=True)
    else:
        data_loader_val = None

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        cfg,
        model,
        data_loader,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        test_period,
        arguments,
    )

    return model
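Freezing part of the backbone, as above, only has an effect if the optimizer is built from parameters that still require gradients; a minimal sketch of that filtering, assuming a plain SGD setup (make_optimizer's actual implementation may differ):

import torch


def make_sgd_over_trainable(model, base_lr=0.0025, momentum=0.9, weight_decay=0.0001):
    """Build SGD only over parameters left trainable after freezing (illustrative sketch)."""
    trainable = [p for p in model.parameters() if p.requires_grad]
    return torch.optim.SGD(trainable, lr=base_lr, momentum=momentum, weight_decay=weight_decay)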
Example #18
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

#     if use_amp:
#         # Initialize mixed-precision training
#         use_mixed_precision = cfg.DTYPE == "float16"
#         amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE)

#         # wrap the optimizer for mixed precision
#         if cfg.SOLVER.ACCUMULATE_GRAD:
#             # also specify number of steps to accumulate over
#             optimizer = amp_handle.wrap_optimizer(optimizer, num_loss=cfg.SOLVER.ACCUMULATE_STEPS)
#         else:
#             optimizer = amp_handle.wrap_optimizer(optimizer)


    model, optimizer = amp.initialize(model, optimizer,opt_level='O1')
    if distributed:
        if use_apex_ddp:
            model = DDP(model, delay_allreduce=True)
        else:
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank,
                # this should be removed if we update BatchNorm stats
                broadcast_buffers=False,
            )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if True:  # the per-epoch evaluation callback is always enabled here
        
        per_iter_callback_fn = functools.partial(
                mlperf_test_early_exit,
                iters_per_epoch=iters_per_epoch,
                tester=functools.partial(test, cfg=cfg),
                model=model,
                distributed=distributed,
                min_bbox_map=cfg.MIN_BBOX_MAP,
                min_segm_map=cfg.MIN_MASK_MAP)
    else:
        per_iter_callback_fn = None

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        use_amp,
        cfg,
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    return model
def train(cfg, local_rank, distributed, use_tensorboard=False, logger=None):
    arguments = {"iteration": 0}
    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    if cfg.SOLVER.UNFREEZE_CONV_BODY:
        for p in model.backbone.parameters():
            p.requires_grad = True

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg,
                                         model,
                                         optimizer,
                                         scheduler,
                                         output_dir,
                                         save_to_disk,
                                         logger=logger)
    print(cfg.TRAIN.IGNORE_LIST)
    extra_checkpoint_data = checkpointer.load(
        cfg.MODEL.WEIGHT, ignore_list=cfg.TRAIN.IGNORE_LIST)
    arguments.update(extra_checkpoint_data)

    if cfg.SOLVER.KEEP_LR:
        optimizer = make_optimizer(cfg, model)
        scheduler = make_lr_scheduler(cfg, optimizer)

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    tensorboard_logdir = cfg.OUTPUT_DIR
    tensorboard_exp_name = cfg.TENSORBOARD_EXP_NAME
    snapshot = cfg.SOLVER.SNAPSHOT_ITERS

    do_train(model,
             data_loader,
             optimizer,
             scheduler,
             checkpointer,
             device,
             checkpoint_period,
             arguments,
             snapshot,
             tensorboard_logdir,
             tensorboard_exp_name,
             use_tensorboard=use_tensorboard)

    return model
Example #20
def train(cfg,
          random_number_generator,
          local_rank,
          distributed,
          args,
          fp16=False):

    data_loader = make_data_loader(cfg,
                                   is_train=True,
                                   is_distributed=distributed)

    # TODO (sharath): uncomment the log below after the package is updated
    # print_mlperf(key=mlperf_log.INPUT_SIZE, value=len(data_loader.dataset))

    print_mlperf(key=mlperf_log.INPUT_BATCH_SIZE,
                 value=cfg.DATALOADER.IMAGES_PER_BATCH_TRAIN)
    print_mlperf(key=mlperf_log.BATCH_SIZE_TEST,
                 value=cfg.DATALOADER.IMAGES_PER_BATCH_TEST)

    print_mlperf(key=mlperf_log.INPUT_MEAN_SUBTRACTION,
                 value=cfg.INPUT.PIXEL_MEAN)
    print_mlperf(key=mlperf_log.INPUT_NORMALIZATION_STD,
                 value=cfg.INPUT.PIXEL_STD)
    print_mlperf(key=mlperf_log.INPUT_RESIZE)
    print_mlperf(key=mlperf_log.INPUT_RESIZE_ASPECT_PRESERVING)
    print_mlperf(key=mlperf_log.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.INPUT_RANDOM_FLIP)
    print_mlperf(key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
    print_mlperf(key=mlperf_log.FG_IOU_THRESHOLD,
                 value=cfg.MODEL.RPN.FG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.BG_IOU_THRESHOLD,
                 value=cfg.MODEL.RPN.BG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TRAIN,
                 value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TEST,
                 value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TRAIN,
                 value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TEST,
                 value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.ASPECT_RATIOS,
                 value=cfg.MODEL.RPN.ASPECT_RATIOS)

    print_mlperf(key=mlperf_log.BACKBONE, value=cfg.MODEL.BACKBONE.CONV_BODY)

    print_mlperf(key=mlperf_log.NMS_THRESHOLD, value=cfg.MODEL.RPN.NMS_THRESH)

    model = build_detection_model(cfg)
    load_from_pretrained_checkpoint(cfg, model)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    print_mlperf(key=mlperf_log.OPT_NAME, value=mlperf_log.SGD_WITH_MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_LR, value=cfg.SOLVER.BASE_LR)
    print_mlperf(key=mlperf_log.OPT_MOMENTUM, value=cfg.SOLVER.MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_WEIGHT_DECAY,
                 value=cfg.SOLVER.WEIGHT_DECAY)

    scheduler = make_lr_scheduler(cfg, optimizer)
    max_iter = cfg.SOLVER.MAX_ITER

    if use_apex_amp:
        amp_handle = amp.init(enabled=fp16, verbose=False)
        if cfg.SOLVER.ACCUMULATE_GRAD:
            # also specify number of steps to accumulate over
            optimizer = amp_handle.wrap_optimizer(
                optimizer, num_loss=cfg.SOLVER.ACCUMULATE_STEPS)
        else:
            optimizer = amp_handle.wrap_optimizer(optimizer)

    if distributed:
        if use_apex_ddp:
            model = DDP(model, delay_allreduce=True)
        else:
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank)

    arguments = {}
    arguments["iteration"] = 0

    arguments["use_amp"] = use_apex_amp

    output_dir = cfg.OUTPUT_DIR

    if cfg.SAVE_CHECKPOINTS:
        checkpoint_file = cfg.CHECKPOINT
        checkpointer = Checkpoint(model, optimizer, scheduler, output_dir,
                                  local_rank)
        if checkpoint_file:
            extra_checkpoint_data = checkpointer.load(checkpoint_file)
            arguments.update(extra_checkpoint_data)
    else:
        checkpointer = None

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        max_iter,
        device,
        distributed,
        arguments,
        cfg,
        args,
        random_number_generator,
    )

    return model
Example #21
def train(cfg,
          local_rank,
          distributed,
          model_config=None,
          use_tensorboard=True):
    model = build_detection_model(cfg, model_config)
    if get_rank() == 0:
        if 'search' in cfg.MODEL.BACKBONE.CONV_BODY:
            print('backbone search space:', blocks_key)
        else:
            print('backbone:', cfg.MODEL.BACKBONE)
        if 'search' in cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR or 'search' in cfg.MODEL.SEG_BRANCH.SEGMENT_BRANCH:
            print('head search space:', head_ss_keys)
        else:
            print('head:', cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR,
                  cfg.MODEL.SEG_BRANCH.SEGMENT_BRANCH)
        if 'search' in cfg.MODEL.INTER_MODULE.NAME:
            print('inter search space:', inter_ss_keys)
        else:
            print('inter:', cfg.MODEL.INTER_MODULE.NAME)
        print(model)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer, lr_dict = make_optimizer(cfg, model)
    if get_rank() == 0:
        for item in lr_dict:
            print(item)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    if not ('search' in cfg.MODEL.BACKBONE.CONV_BODY
            or 'search' in cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR
            or 'search' in cfg.MODEL.SEG_BRANCH.SEGMENT_BRANCH):
        use_mixed_precision = cfg.DTYPE == "float16"
        amp_opt_level = 'O1' if use_mixed_precision else 'O0'
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
            find_unused_parameters=True)

    # if 'search' in cfg.MODEL.BACKBONE.CONV_BODY:
    #     def forward_hook(module: Module, inp: (Tensor,)):
    #         if module.weight is not None:
    #             module.weight.requires_grad = True
    #         if module.bias is not None:
    #             module.bias.requires_grad = True

    #     all_modules = (nn.Conv2d, nn.Linear, nn.BatchNorm2d, nn.GroupNorm, ) # update group norm as well!!
    #     for m in model.modules():
    #         if isinstance(m, all_modules):
    #             m.register_forward_pre_hook(forward_hook)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    test_period = cfg.SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(cfg,
                                           is_train=False,
                                           is_distributed=distributed,
                                           is_for_period=True)
    else:
        data_loader_val = None

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    if use_tensorboard:
        meters = TensorboardLogger(cfg=cfg,
                                   log_dir=cfg.TENSORBOARD_EXPERIMENT,
                                   start_iter=arguments['iteration'],
                                   delimiter="  ")
    else:
        meters = MetricLogger(delimiter="  ")

    do_train(
        cfg,
        model,
        data_loader,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        test_period,
        arguments,
        meters,
    )

    return model
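Both meter types above are assumed to expose the same keyword-update interface to do_train; a hedged sketch of how the loop might feed them (the timing values and loss_dict are illustrative):

def record_iteration(meters, loss_dict, batch_time, data_time):
    """Push scalar metrics for one iteration into the meters (illustrative sketch)."""
    losses = sum(loss for loss in loss_dict.values())
    meters.update(loss=losses, **loss_dict)
    meters.update(time=batch_time, data=data_time)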
def train(cfg, local_rank, distributed, use_tensorboard=False):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    test_period = cfg.SOLVER.TEST_PERIOD
    if test_period > 0:
        data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, is_for_period=True)
    else:
        data_loader_val = None

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    
    if use_tensorboard:
        meters = TensorboardLogger(
            log_dir=cfg.TENSORBOARD_EXPERIMENT,
            stage = 'train',
            start_iter=arguments['iteration'],
            delimiter="  ")
        meters_val = TensorboardLogger(
            log_dir=cfg.TENSORBOARD_EXPERIMENT,
            stage = 'val',
            start_iter=arguments['iteration'],
            delimiter="  ")
    else:
        meters = MetricLogger(delimiter="  ")
        meters_val = MetricLogger(delimiter="  ")

    do_train(
        cfg,
        model,
        data_loader,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        test_period,
        arguments,
        meters,
        meters_val,
    )

    return model
Example #23
def train(cfg, local_rank, distributed):
    # Model logging
    print_mlperf(key=mlperf_log.INPUT_BATCH_SIZE, value=cfg.SOLVER.IMS_PER_BATCH)
    print_mlperf(key=mlperf_log.BATCH_SIZE_TEST, value=cfg.TEST.IMS_PER_BATCH)

    print_mlperf(key=mlperf_log.INPUT_MEAN_SUBTRACTION, value = cfg.INPUT.PIXEL_MEAN)
    print_mlperf(key=mlperf_log.INPUT_NORMALIZATION_STD, value=cfg.INPUT.PIXEL_STD)
    print_mlperf(key=mlperf_log.INPUT_RESIZE)
    print_mlperf(key=mlperf_log.INPUT_RESIZE_ASPECT_PRESERVING)
    print_mlperf(key=mlperf_log.MIN_IMAGE_SIZE, value=cfg.INPUT.MIN_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.MAX_IMAGE_SIZE, value=cfg.INPUT.MAX_SIZE_TRAIN)
    print_mlperf(key=mlperf_log.INPUT_RANDOM_FLIP)
    print_mlperf(key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
    print_mlperf(key=mlperf_log.FG_IOU_THRESHOLD, value=cfg.MODEL.RPN.FG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.BG_IOU_THRESHOLD, value=cfg.MODEL.RPN.BG_IOU_THRESHOLD)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_PRE_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TRAIN, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)
    print_mlperf(key=mlperf_log.RPN_POST_NMS_TOP_N_TEST, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST)
    print_mlperf(key=mlperf_log.ASPECT_RATIOS, value=cfg.MODEL.RPN.ASPECT_RATIOS)
    print_mlperf(key=mlperf_log.BACKBONE, value=cfg.MODEL.BACKBONE.CONV_BODY)
    print_mlperf(key=mlperf_log.NMS_THRESHOLD, value=cfg.MODEL.RPN.NMS_THRESH)
    # /root/ssy/ssynew/maskrcnn-benchmark/maskrcnn_benchmark/modeling/detector/detectors.py
    # building the bare model without doing anything
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    # Optimizer logging
    print_mlperf(key=mlperf_log.OPT_NAME, value=mlperf_log.SGD_WITH_MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_LR, value=cfg.SOLVER.BASE_LR)
    print_mlperf(key=mlperf_log.OPT_MOMENTUM, value=cfg.SOLVER.MOMENTUM)
    print_mlperf(key=mlperf_log.OPT_WEIGHT_DECAY, value=cfg.SOLVER.WEIGHT_DECAY)


    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    print("output_dir "+str(output_dir))

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    # no such SAVE_CHECKPOINTS
    #arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS
    arguments["save_checkpoints"] = False

    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"]
    )
    print("SSY iters_per_epoch "+str(iters_per_epoch))
    #print("SSY iters_per_epoch change to 100 ")
    #iters_per_epoch = 100

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    # SSY
    # I already added PER_EPOCH_EVAL, MIN_BBOX_MAP and MIN_SEGM_MAP to ./configs/e2e_mask_rcnn_R_50_FPN_1x.yaml,
    # but they still cannot be found, so I set them manually here
    #if cfg.PER_EPOCH_EVAL:
    #    per_iter_callback_fn = functools.partial(
    #            mlperf_test_early_exit,
    #            iters_per_epoch=iters_per_epoch,
    #            tester=functools.partial(test, cfg=cfg),
    #            model=model,
    #            distributed=distributed,
    #            min_bbox_map=cfg.MLPERF.MIN_BBOX_MAP,
    #            min_segm_map=cfg.MLPERF.MIN_SEGM_MAP)
    #else:
    #    per_iter_callback_fn = None
    per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            # /root/ssy/ssynew/maskrcnn-benchmark/maskrcnn_benchmark/engine/tester.py
            tester=functools.partial(test, cfg=cfg),
            model=model,
            distributed=distributed,
            min_bbox_map=0.377,
            min_segm_map=0.339)

    start_train_time = time.time()
    # /root/ssy/ssynew/maskrcnn-benchmark/maskrcnn_benchmark/engine/trainer.py
    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        per_iter_start_callback_fn=functools.partial(mlperf_log_epoch_start, iters_per_epoch=iters_per_epoch),
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    end_train_time = time.time()
    total_training_time = end_train_time - start_train_time
    print(
            "&&&& MLPERF METRIC THROUGHPUT per GPU={:.4f} iterations / s".format((arguments["iteration"] * 1.0) / total_training_time)
    )

    return model
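The example above hard-codes the target mAPs (0.377 bbox, 0.339 segm) and builds the per-iteration callback with functools.partial. The sketch below illustrates the shape such a callback can take; the signature, the assumed (bbox_map, segm_map) return value of tester, and the epoch-boundary check are illustrative assumptions, not the actual mlperf_test_early_exit implementation.

def early_exit_callback(iteration, iters_per_epoch, tester, model,
                        distributed, min_bbox_map, min_segm_map):
    """Hypothetical sketch: evaluate at each epoch boundary and report
    whether both mAP targets have been reached, so training can stop early."""
    if iteration == 0 or iteration % iters_per_epoch != 0:
        return False  # not an epoch boundary, keep training
    # Assumed contract: tester(model=..., distributed=...) returns a
    # (bbox_map, segm_map) pair on rank 0 and None on the other ranks.
    results = tester(model=model, distributed=distributed)
    if results is None:
        return False
    bbox_map, segm_map = results
    return bbox_map >= min_bbox_map and segm_map >= min_segm_map

# Wiring mirrors the example above:
# per_iter_callback_fn = functools.partial(
#     early_exit_callback, iters_per_epoch=iters_per_epoch,
#     tester=functools.partial(test, cfg=cfg), model=model,
#     distributed=distributed, min_bbox_map=0.377, min_segm_map=0.339)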
Exemple #24
0
def train(cfg, local_rank, distributed, fp16, dllogger):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # enable AMP when either the fp16 flag or cfg.DTYPE requests it
    use_amp = bool(fp16) or cfg.DTYPE == "float16"

    if distributed:
        if use_apex_ddp:
            model = DDP(model, delay_allreduce=True)
        else:
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[local_rank],
                output_device=local_rank,
                # this should be removed if we update BatchNorm stats
                broadcast_buffers=False,
            )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if cfg.PER_EPOCH_EVAL:
        per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            tester=functools.partial(test, cfg=cfg, dllogger=dllogger),
            model=model,
            distributed=distributed,
            min_bbox_map=cfg.MIN_BBOX_MAP,
            min_segm_map=cfg.MIN_MASK_MAP)
    else:
        per_iter_callback_fn = None

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        use_amp,
        cfg,
        dllogger,
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    return model, iters_per_epoch
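This variant only derives a use_amp flag and hands it to do_train together with cfg and dllogger. As a rough sketch of what such a flag usually controls inside the training step, here is a generic mixed-precision update built on torch.cuda.amp; it is not the do_train used above, and the dict-of-losses model output is an assumption carried over from the detection examples.

import torch

def amp_train_step(model, images, targets, optimizer, scaler, use_amp):
    """Generic mixed-precision step (sketch): autocast the forward pass and
    scale the loss when use_amp is True; both are no-ops when it is False."""
    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=use_amp):
        loss_dict = model(images, targets)  # assumed: the model returns a dict of losses
        losses = sum(loss_dict.values())
    scaler.scale(losses).backward()
    scaler.step(optimizer)
    scaler.update()
    return losses.detach()

# scaler = torch.cuda.amp.GradScaler(enabled=use_amp)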
Exemple #25
0
def train(cfg, local_rank, distributed):
    # original = torch.load('/home/zoey/nas/zoey/github/maskrcnn-benchmark/checkpoints/renderpy150000/model_0025000.pth')
    #
    # new = {"model": original["model"]}
    # torch.save(new, '/home/zoey/nas/zoey/github/maskrcnn-benchmark/checkpoints/finetune/model_0000000.pth')

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=amp_opt_level)

    # if cfg.MODEL.DEPTH_ON == True:
    #     model_depth = build_detection_model(cfg)
    #     device = torch.device(cfg.MODEL.DEVICE)
    #     model_depth.to(device)
    #     optimizer_depth = make_optimizer(cfg, model_depth)
    #     scheduler_depth = make_lr_scheduler(cfg, optimizer_depth)
    #     model_depth, optimizer_depth = amp.initialize(model_depth, optimizer_depth, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpointer = DetectronCheckpointer(cfg,
                                         model,
                                         optimizer,
                                         scheduler,
                                         output_dir,
                                         save_to_disk,
                                         logger=None,
                                         isrgb=True,
                                         isdepth=True)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    # extra_checkpoint_data = checkpointer.load('/home/zoey/nas/zoey/github/maskrcnn-benchmark/checkpoints/renderpy150000/model_0025000.pth')
    arguments.update(extra_checkpoint_data)
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments)

    return model
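The commented-out lines at the top of this example strip a full training checkpoint down to its model weights so that fine-tuning does not also restore the old optimizer, scheduler and iteration state. A minimal sketch of that re-packaging step, with placeholder paths:

import torch

def strip_checkpoint(src_path, dst_path):
    """Keep only the 'model' entry of a checkpoint so a later load resumes
    weights but not optimizer/scheduler/iteration state (placeholder paths)."""
    original = torch.load(src_path, map_location="cpu")
    torch.save({"model": original["model"]}, dst_path)

# strip_checkpoint("checkpoints/pretrained/model_0025000.pth",
#                  "checkpoints/finetune/model_0000000.pth")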
Exemple #26
0
def train(cfg, local_rank, distributed, test_while_training):

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    # ipdb.set_trace()

    scheduler = make_lr_scheduler(cfg, optimizer)

    logger = logging.getLogger("train_main_script")

    arguments = {}
    arguments["iteration"] = 0
    arguments['start_save_ckpt'] = cfg.SOLVER.START_SAVE_CHECKPOINT

    ## define the output dir
    output_dir = cfg.OUTPUT_DIR
    checkpoint_output_dir = os.path.join(output_dir, 'checkpoints')
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         checkpoint_output_dir, save_to_disk)

    arguments['instance_id'] = output_dir.split('/')[-1]

    if len(cfg.MODEL.USE_DET_PRETRAIN) > 0:
        checkpointer.load_weight_partially(cfg.MODEL.USE_DET_PRETRAIN)
    elif len(cfg.MODEL.WEIGHT) > 0:
        extra_checkpoint_data, ckpt_name = checkpointer.load(cfg.MODEL.WEIGHT)
        arguments.update(extra_checkpoint_data)

    # logger.info(str(model))

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
            find_unused_parameters=True,
        )

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )
    if test_while_training:
        logger.info("test_while_training on ")
        val_data_loader = make_data_loader(cfg,
                                           is_train=False,
                                           is_distributed=distributed)
    else:
        logger.info("test_while_training off ")
        val_data_loader = None

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        distributed,
        val_data_loader,
    )

    return model
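This example calls checkpointer.load_weight_partially when a detection pre-training checkpoint is configured. The helper below is a hypothetical sketch of that idea, not the project's implementation: it copies only the entries whose name and shape match the current model and loads them with strict=False.

import torch

def load_weights_partially(model, weight_path):
    """Sketch of partial weight loading: keep only checkpoint entries that
    match the current model by name and shape, then load non-strictly."""
    checkpoint = torch.load(weight_path, map_location="cpu")
    state_dict = checkpoint.get("model", checkpoint)
    own_state = model.state_dict()
    matched = {k: v for k, v in state_dict.items()
               if k in own_state and v.shape == own_state[k].shape}
    model.load_state_dict(matched, strict=False)
    # return the keys that kept their randomly initialized values
    return sorted(set(own_state) - set(matched))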
Exemple #27
0
def train(cfg, args, DatasetCatalog=None):
    if len(cfg.DATASETS.TRAIN) == 0 or not args.train:
        return None

    local_rank = args.local_rank
    distributed = args.distributed

    model = build_detection_model(cfg)

    # for key, value in model.named_parameters():
    #     print(key, value.requires_grad)

    if hasattr(args, 'train_last_layer'):
        if args.train_last_layer:
            listofkeys = [
                'cls_score.bias', 'cls_score.weight', 'bbox_pred.bias',
                'bbox_pred.weight', 'mask_fcn_logits.bias',
                'mask_fcn_logits.weight'
            ]
            for key, value in model.named_parameters():
                value.requires_grad = False
                for k in listofkeys:
                    if k in key:
                        value.requires_grad = True
            # for key, value in model.named_parameters():
            #     print(key, value.requires_grad)

    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    if cfg.MODEL.DEVICE == 'cuda':
        use_mixed_precision = cfg.DTYPE == "float16"
        amp_opt_level = 'O1' if use_mixed_precision else 'O0'
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    save_to_disk = get_rank() == 0

    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         cfg.OUTPUT_DIR, save_to_disk)

    extra_checkpoint_data = checkpointer.load(
        cfg.MODEL.WEIGHT,
        force_load_external_checkpoint=False,
        copy_weight_from_head_box=args.copy_weight_from_head_box)

    arguments = {}
    arguments["iteration"] = 0  # default when the checkpoint provides no iteration counter
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        args,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
        DatasetCatalog=DatasetCatalog,
    )

    if cfg.SOLVER.TEST_PERIOD > 0:
        data_loader_val = make_data_loader(
            cfg,
            args,
            is_train=False,
            is_distributed=distributed,
            is_for_period=True,
            start_iter=arguments["iteration"],
            DatasetCatalog=DatasetCatalog,
        )
    else:
        data_loader_val = None

    do_train(
        model,
        cfg,
        data_loader,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        cfg.SOLVER.CHECKPOINT_PERIOD,
        cfg.SOLVER.TEST_PERIOD,
        arguments,
        cfg.OUTPUT_DIR,
        args.visualize_loss,
        args.vis_title,
        args.iters_per_epoch,
    )

    return model
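When args.train_last_layer is set, the example above freezes every parameter and then re-enables only the class, box and mask heads by substring match. The same idea as a small helper; the substrings are taken from the example, while the SGD hyper-parameters in the comment are placeholders:

import torch

HEAD_KEYS = ("cls_score", "bbox_pred", "mask_fcn_logits")

def freeze_all_but(model, trainable_substrings=HEAD_KEYS):
    """Freeze every parameter whose name does not contain one of the given
    substrings (covers both the .weight and .bias of each head layer)."""
    for name, param in model.named_parameters():
        param.requires_grad = any(key in name for key in trainable_substrings)

# Building the optimizer only over trainable parameters keeps the step cheap;
# the hyper-parameters below are placeholders, make_optimizer(cfg, model) is
# what the example actually uses.
# optimizer = torch.optim.SGD(
#     [p for p in model.parameters() if p.requires_grad],
#     lr=0.0025, momentum=0.9, weight_decay=0.0001)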
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # load a reid model
    reid_model = build_reid_model(cfg)
    reid_model.to(device)
    print('#######loading from {}#######'.format(cfg.REID.TEST.WEIGHT))
    f = torch.load(
        cfg.REID.TEST.WEIGHT,
        map_location=torch.device("cpu"),
    )
    if 'model' in f:
        load_state_dict(reid_model, f['model'])
    else:
        reid_model.load_state_dict(f, strict=False)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # model, optimizer = amp.initialize(model, optimizer,
    #                                   opt_level="O0"
    #                                   )

    if distributed:
        model = DDP(model, delay_allreduce=True)
        # model = torch.nn.parallel.DistributedDataParallel(
        #     model, device_ids=[local_rank], output_device=local_rank,
        #     # this should be removed if we update BatchNorm stats
        #     broadcast_buffers=False,
        # )

    arguments = {}
    arguments["iteration"] = 0

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    output_dir = os.path.join(
        cfg.OUTPUT_DIR, cfg.SUBDIR,
        'GPU' + str(num_gpus) + '_LR' + str(cfg.SOLVER.BASE_LR))

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        reid_model,
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model
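Every example gates checkpoint writing on get_rank() == 0, and this one also derives the GPU count from the WORLD_SIZE environment variable set by the torch.distributed launcher. Below is a sketch of such helpers with safe single-process fallbacks; the actual maskrcnn_benchmark.utils.comm helpers may differ.

import os
import torch.distributed as dist

def get_rank():
    """Rank of the current process, or 0 when not running distributed."""
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank()
    return 0

def get_world_size():
    """Number of processes, falling back to the WORLD_SIZE env var, then 1."""
    if dist.is_available() and dist.is_initialized():
        return dist.get_world_size()
    return int(os.environ.get("WORLD_SIZE", 1))

# save_to_disk = get_rank() == 0
# num_gpus = get_world_size()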
Exemple #29
0
def train(cfg, local_rank, distributed, random_number_generator=None):
    # Model logging
    mlperf_print(key=constants.GLOBAL_BATCH_SIZE,
                 value=cfg.SOLVER.IMS_PER_BATCH)
    mlperf_print(key=constants.NUM_IMAGE_CANDIDATES,
                 value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN)

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    # Optimizer logging
    # mlperf_print(key=constants.OPT_NAME, value="sgd_with_momentum")
    mlperf_print(key=constants.OPT_BASE_LR, value=cfg.SOLVER.BASE_LR)
    mlperf_print(key=constants.OPT_LR_WARMUP_STEPS,
                 value=cfg.SOLVER.WARMUP_ITERS)
    mlperf_print(key=constants.OPT_LR_WARMUP_FACTOR,
                 value=cfg.SOLVER.WARMUP_FACTOR)

    scheduler = make_lr_scheduler(cfg, optimizer)

    # disable the garbage collection
    gc.disable()

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    # amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE)
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(
        model, optimizer,
        opt_level=amp_opt_level)  # , verbose=cfg.AMP_VERBOSE)

    if distributed:
        model = DDP(model, delay_allreduce=True)

    arguments = {}
    arguments["iteration"] = 0
    arguments["nhwc"] = cfg.NHWC
    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS

    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT, cfg.NHWC)
    arguments.update(extra_checkpoint_data)

    # At this point we've loaded relevant checkpoint(s) and can now cast
    # FrozenBatchNorm2d layers to half() if necessary.
    # This allows us to move parameter casting logic out of the BN code itself,
    # which was preventing us from annotating bn.forward with @script_method, which
    # was preventing the cross-module fusion of BN with ReLU / Add-ReLU.
    if use_mixed_precision:
        model = cast_frozen_bn_to_half(model)

    mlperf_print(key=constants.INIT_STOP, sync=True)
    mlperf_print(key=constants.RUN_START, sync=True)
    barrier()

    data_loader, iters_per_epoch = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
        random_number_generator=random_number_generator,
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # set the callback function to evaluate and potentially
    # early exit each epoch
    if cfg.PER_EPOCH_EVAL:
        per_iter_callback_fn = functools.partial(
            mlperf_test_early_exit,
            iters_per_epoch=iters_per_epoch,
            tester=functools.partial(test, cfg=cfg),
            model=model,
            distributed=distributed,
            min_bbox_map=cfg.MLPERF.MIN_BBOX_MAP,
            min_segm_map=cfg.MLPERF.MIN_SEGM_MAP)
    else:
        per_iter_callback_fn = None

    start_train_time = time.time()

    success = do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        cfg.DISABLE_REDUCED_LOGGING,
        per_iter_start_callback_fn=functools.partial(
            mlperf_log_epoch_start, iters_per_epoch=iters_per_epoch),
        per_iter_end_callback_fn=per_iter_callback_fn,
    )

    end_train_time = time.time()
    total_training_time = end_train_time - start_train_time
    print("&&&& MLPERF METRIC THROUGHPUT={:.4f} iterations / s".format(
        (arguments["iteration"] * cfg.SOLVER.IMS_PER_BATCH) /
        total_training_time))

    return model, success
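The two MLPerf throughput prints in this section use different units: the earlier example divides iterations by wall time (iterations per second per process), while this one also multiplies by cfg.SOLVER.IMS_PER_BATCH, the global batch size, so the value is effectively images per second even though the message still says "iterations / s". A small helper that makes both units explicit (names are illustrative):

def report_throughput(iterations, ims_per_batch, total_training_time):
    """Print both throughput views: iterations/s per process and global
    images/s (iterations times the global batch size over wall time)."""
    iters_per_sec = iterations / total_training_time
    images_per_sec = iterations * ims_per_batch / total_training_time
    print("&&&& MLPERF METRIC THROUGHPUT per GPU={:.4f} iterations / s".format(iters_per_sec))
    print("&&&& MLPERF METRIC THROUGHPUT={:.4f} images / s".format(images_per_sec))

# report_throughput(arguments["iteration"], cfg.SOLVER.IMS_PER_BATCH,
#                   total_training_time)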
Exemple #30
0
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.deprecated.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )
    logger = logging.getLogger("Training")
    with tools.TimerBlock("Loading Experimental setups", logger) as block:
        exp_name = cfg.EXP.NAME
        output_dir = tools.get_exp_output_dir(exp_name, cfg.OUTPUT_DIR)
        checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
        validation_period = cfg.SOLVER.VALIDATION_PERIOD

    with tools.TimerBlock("Loading Checkpoints...", logger) as block:
        arguments = {}
        save_to_disk = local_rank == 0
        checkpointer = Checkpointer(
            model,
            save_dir=output_dir,
            save_to_disk=save_to_disk,
            num_class=cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES,
        )
        extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
        arguments.update(extra_checkpoint_data)
        arguments["iteration"] = 0

    with tools.TimerBlock("Initializing DAVIS Datasets", logger) as block:
        logger.info("Loading training set...")
        data_loader_train = make_data_loader(
            cfg,
            is_train=True,
            is_distributed=distributed,
            start_iter=arguments["iteration"],
        )
        logger.info("Loading valid set...")
        data_loaders_valid = make_data_loader(
            cfg,
            is_train=False,
            is_distributed=distributed,
        )

    do_train(
        model,
        data_loader_train,
        data_loaders_valid[0],
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        validation_period,
        arguments,
        exp_name,
    )

    return model
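tools.TimerBlock is a project-specific context manager; a minimal stand-in that logs a title on entry and the elapsed time on exit looks roughly like this (the real helper may expose more, e.g. the block variable used above):

import logging
import time
from contextlib import contextmanager

@contextmanager
def timer_block(title, logger):
    """Minimal TimerBlock-like sketch: log the title, yield, log elapsed time."""
    logger.info(title)
    start = time.time()
    try:
        yield
    finally:
        logger.info("%s finished in %.2fs", title, time.time() - start)

# logger = logging.getLogger("Training")
# with timer_block("Loading Checkpoints...", logger):
#     ...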
Exemple #31
0
def train(cfg, local_rank, distributed):
    model_det = build_detection_model(cfg)
    model_G = model_det.backbone
    # model_G = copy.deepcopy(model_det).backbone
    # model_D = define_D(256, 64, which_model_netD='det', n_layers_D=5)
    model_D = DEBUG_DESC(256, 64, n_layers=5, use_sigmoid=True)
    models = [model_det, model_G, model_D]

    device = torch.device(cfg.MODEL.DEVICE)
    for model in models:
        model.to(device)

    optimizer_det = make_optimizer(cfg, model_det)
    scheduler_det = make_lr_scheduler(cfg, optimizer_det)

    # optimizer_G = make_optimizer_Adam(cfg, model_det.backbone)
    optimizer_G = make_optimizer(cfg, model_det.backbone)
    scheduler_G = None

    optimizer_D = make_optimizer_Adam(cfg, model_D)
    # optimizer_D = make_optimizer(cfg, model_D)
    scheduler_D = None

    optimizers = [optimizer_det, optimizer_D, optimizer_G]
    schedulers = [scheduler_det, scheduler_D, scheduler_G]

    if distributed:
        for i, model in enumerate(models):
            models[i] = torch.nn.parallel.deprecated.DistributedDataParallel(
                models[i], device_ids=[local_rank], output_device=local_rank,
                # this should be removed if we update BatchNorm stats
                broadcast_buffers=False,
            )

    arguments = {}
    manual_iter = 0
    print("WARNING! MANUAL ITERATION IS", manual_iter)
    arguments["iteration"] = manual_iter

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer_det = DetectronCheckpointer(
        cfg, model_det, optimizer_det, scheduler_det, output_dir, save_to_disk
    )
    checkpointer_D = Checkpointer(
        model_D, optimizer_D, None, output_dir, save_to_disk
    )
    checkpointers = [checkpointer_det, checkpointer_D]
    print('WARNING! REMOVED "iteration" from train_net.py')
    extra_checkpoint_data = checkpointer_det.load(cfg.MODEL.WEIGHT)
    # discard the iteration counter stored in the checkpoint and restart from 0
    extra_checkpoint_data = {"iteration": 0}
    arguments.update(extra_checkpoint_data)

    data_loaders = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
    arguments["need_adapt"] = False
    arguments["need_train_A"] = False
    arguments["need_train_B"] = True
    do_train(
        models,
        data_loaders,
        data_loaders_val,
        optimizers,
        schedulers,
        checkpointers,
        device,
        checkpoint_period,
        arguments,
    )

    # NOTE: `model` here is the last element of `models` (the discriminator)
    # left over from the loops above; callers that expect the detection model
    # should use models[0] instead
    return model
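The last example wires a detector, its backbone as a generator and a small discriminator with separate optimizers, which suggests an adversarial feature-alignment scheme. The step below is a generic sketch of such an alternating update, not the project's do_train; the BCE losses assume the discriminator ends in a sigmoid, as DEBUG_DESC(..., use_sigmoid=True) above suggests.

import torch
import torch.nn.functional as F

def adversarial_feature_step(features_a, features_b, model_D,
                             optimizer_G, optimizer_D):
    """Generic alternating update: train the discriminator to separate the
    two feature domains, then train the generator (backbone) to fool it.
    features_b must still be attached to the generator's graph."""
    # 1) Discriminator step on detached features.
    optimizer_D.zero_grad()
    pred_a = model_D(features_a.detach())
    pred_b = model_D(features_b.detach())
    loss_D = (F.binary_cross_entropy(pred_a, torch.ones_like(pred_a)) +
              F.binary_cross_entropy(pred_b, torch.zeros_like(pred_b)))
    loss_D.backward()
    optimizer_D.step()

    # 2) Generator step: push domain-B features toward the "domain A" label.
    optimizer_G.zero_grad()
    pred_b_attached = model_D(features_b)
    loss_G = F.binary_cross_entropy(pred_b_attached,
                                    torch.ones_like(pred_b_attached))
    loss_G.backward()
    optimizer_G.step()
    return loss_D.detach(), loss_G.detach()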