Example #1
def train(cfg, local_rank, distributed, use_tensorboard=False, logger=None, start_iter=0):
    arguments = {"iteration": start_iter}
    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    if cfg.SOLVER.UNFREEZE_CONV_BODY:
        for p in model.backbone.parameters():
            p.requires_grad = True

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer, start_iter)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler, output_dir, save_to_disk, logger=logger)
    print(cfg.TRAIN.IGNORE_LIST)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT, ignore_list=cfg.TRAIN.IGNORE_LIST)
    arguments.update(extra_checkpoint_data)

    if cfg.SOLVER.KEEP_LR:
        optimizer = make_optimizer(cfg, model)
        scheduler = make_lr_scheduler(cfg, optimizer, start_iter)

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    tensorboard_logdir = cfg.OUTPUT_DIR
    tensorboard_exp_name = cfg.TENSORBOARD_EXP_NAME
    snapshot = cfg.SOLVER.SNAPSHOT_ITERS

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        snapshot,
        tensorboard_logdir,
        tensorboard_exp_name,
        use_tensorboard=use_tensorboard
    )

    return model
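For context, these train() entry points are normally driven by a small launcher script that initializes distributed state and the config before calling them. A minimal sketch of that pattern, condensed from the main() in Example #27 below (argument handling and logging are trimmed; anything not shown in the examples is an assumption):

import argparse
import os

import torch
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.utils.comm import synchronize


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config-file", default="", metavar="FILE")
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()

    # torch.distributed.launch starts one process per GPU and sets WORLD_SIZE
    num_gpus = int(os.environ.get("WORLD_SIZE", "1"))
    distributed = num_gpus > 1
    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.freeze()

    # train() as defined in Example #1 (its extra keyword arguments have defaults)
    model = train(cfg, args.local_rank, distributed)


if __name__ == "__main__":
    main()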
Example #2
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    print(model)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = os.path.join(cfg.OUTPUT_DIR, cfg.FILE)

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader_train = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )
    data_loader_val = make_data_loader(
        cfg,
        is_train=False,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    val_period = cfg.SOLVER.VAL_PERIOD

    do_train(
        model,
        data_loader_train,
        data_loader_val,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        val_period,
        arguments,
        distributed,
    )

    return model
Example #3
def train(cfg, local_rank, distributed, tb_logger):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT,
                                              resume=cfg.SOLVER.RESUME)
    if cfg.SOLVER.RESUME:
        arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        tb_logger,
        cfg,
        local_rank,
    )

    return model
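Example #3 only shows the amp.initialize() call; with apex, the training loop that consumes the returned model and optimizer has to run the backward pass through amp.scale_loss so that loss scaling is applied under float16. A minimal sketch of that inner step (the loop structure mirrors Example #27; the function itself is illustrative, not the library's do_train):

from apex import amp


def training_step(model, optimizer, scheduler, images, targets):
    # maskrcnn-benchmark models return a dict of losses in training mode
    loss_dict = model(images, targets)
    losses = sum(loss for loss in loss_dict.values())

    optimizer.zero_grad()
    # Scale the loss so float16 gradients do not underflow, then backprop
    with amp.scale_loss(losses, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
    scheduler.step()
    return loss_dict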
Example #4
    def test(self, output_dir=None, model_to_test=None):
        if output_dir is not None:
            self.cfg.OUTPUT_DIR = output_dir
        model = build_detection_model(self.cfg)
        device = torch.device(self.cfg.MODEL.DEVICE)
        model.to(device)

        arguments = {}
        arguments["iteration"] = 0

        output_dir = self.cfg.OUTPUT_DIR

        save_to_disk = get_rank() == 0
        checkpointer = DetectronCheckpointer(
            self.cfg, model, None, None, output_dir, save_to_disk
        )

        if model_to_test is not None:
            self.cfg.MODEL.WEIGHT = model_to_test

        if self.cfg.MODEL.WEIGHT.startswith('/') or 'catalog' in self.cfg.MODEL.WEIGHT:
            model_path = self.cfg.MODEL.WEIGHT
        else:
            model_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, os.path.pardir, os.path.pardir, 'Data', 'pretrained_feature_extractors', self.cfg.MODEL.WEIGHT))

        extra_checkpoint_data = checkpointer.load(model_path, use_latest=False)

        checkpointer.optimizer = make_optimizer(self.cfg, checkpointer.model)
        checkpointer.scheduler = make_lr_scheduler(self.cfg, checkpointer.optimizer)

        # Initialize mixed-precision training
        use_mixed_precision = self.cfg.DTYPE == "float16"
        amp_opt_level = 'O1' if use_mixed_precision else 'O0'
        model, optimizer = amp.initialize(checkpointer.model, checkpointer.optimizer, opt_level=amp_opt_level)

        if self.distributed:
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[self.local_rank], output_device=self.local_rank,
                # this should be removed if we update BatchNorm stats
                broadcast_buffers=False,
            )
        synchronize()
        _ = inference(  # The result can be used for additional logging, e.g. for TensorBoard
            model,
            # The method changes the segmentation mask format in the data loader,
            # so a new data loader is created every time:
            make_data_loader(self.cfg, is_train=False, is_distributed=(get_world_size() > 1), is_target_task=self.is_target_task),
            dataset_name="[Test]",
            iou_types=("bbox",),
            box_only=False if self.cfg.MODEL.RETINANET_ON else self.cfg.MODEL.RPN_ONLY,
            device=self.cfg.MODEL.DEVICE,
            expected_results=self.cfg.TEST.EXPECTED_RESULTS,
            expected_results_sigma_tol=self.cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
            output_folder=None,
            is_target_task=self.is_target_task,
        )
        synchronize()

        logger = logging.getLogger("maskrcnn_benchmark")
        logger.handlers = []
Example #5
    def __init__(
        self,
        cfg,
        confidence_threshold=0.7,
        show_mask_heatmaps=False,
        masks_per_dim=2,
        min_image_size=224,
    ):
        self.cfg = cfg.clone()
        self.model = build_detection_model(cfg)
        self.model.eval()
        self.device = torch.device(cfg.MODEL.DEVICE)
        self.model.to(self.device)
        self.min_image_size = min_image_size

        save_dir = cfg.OUTPUT_DIR
        optimizer = make_optimizer(cfg, self.model)
        scheduler = make_lr_scheduler(cfg, optimizer)
        checkpointer = DetectronCheckpointer(cfg, self.model, optimizer=optimizer, scheduler=scheduler, save_dir=save_dir)
#         checkpointer = DetectronCheckpointer(cfg, self.model, save_dir=save_dir)
        _ = checkpointer.load(cfg.MODEL.WEIGHT)

        self.transforms = self.build_transform()

        mask_threshold = -1 if show_mask_heatmaps else 0.5
        self.masker = Masker(threshold=mask_threshold, padding=1)

        # used to make colors for each class
        self.palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])

        self.cpu_device = torch.device("cpu")
        self.confidence_threshold = confidence_threshold
        self.show_mask_heatmaps = show_mask_heatmaps
        self.masks_per_dim = masks_per_dim
Example #6
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    summary_writer = SummaryWriter(log_dir=output_dir)
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)

    if cfg.MODEL.WEIGHT.upper() == 'CONTINUE':
        model_weight = last_checkpoint(output_dir)
    else:
        model_weight = cfg.MODEL.WEIGHT
    extra_checkpoint_data = checkpointer.load(model_weight)

    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    data_loader_val = make_data_loader(cfg,
                                       is_train=False,
                                       is_distributed=distributed)[0]

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(model=model,
             data_loader=data_loader,
             data_loader_val=data_loader_val,
             optimizer=optimizer,
             scheduler=scheduler,
             checkpointer=checkpointer,
             device=device,
             checkpoint_period=checkpoint_period,
             arguments=arguments,
             summary_writer=summary_writer)

    return model
Example #7
    def __init__(self, exp_dict):
        super().__init__()
        cfg_base_path = "./models/configs/"

        self.n_classes = 21
        cfg_path = cfg_base_path + "e2e_mask_rcnn_R_50_FPN_1x.yaml"

        self.cfg = cfg
        self.cfg.merge_from_file(cfg_path)

        self.cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES = self.n_classes

        # ---------------
        # build model
        self.backbone_fpn = backbone.build_backbone(self.cfg)
        self.rpn = rpn.build_rpn(self.cfg, self.backbone_fpn.out_channels)
        self.roi_heads = roi_heads.build_roi_heads(
            self.cfg, self.backbone_fpn.out_channels)

        # ---------------
        # load checkpoint
        checkpoint = _load_file(self.cfg)
        load_state_dict(self, checkpoint.pop("model"))

        #--------
        # Opt stage
        self.cfg.SOLVER.BASE_LR = ((0.0025 * 8) /
                                   (16 / float(exp_dict["batch_size"])))

        optimizer = make_optimizer(self.cfg, self)
        scheduler = make_lr_scheduler(self.cfg, optimizer)

        self.opt = optimizer
        self.scheduler = scheduler
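The BASE_LR line in Example #7 is the usual linear learning-rate scaling rule: the reference setting of 0.02 for 16 images per batch (0.0025 per GPU on 8 GPUs) is rescaled in proportion to the actual batch size. A quick check of the arithmetic, purely for illustration:

def scaled_base_lr(batch_size):
    # 0.0025 * 8 = 0.02 is the reference LR for 16 images per batch;
    # dividing by (16 / batch_size) scales it linearly with the batch size.
    return (0.0025 * 8) / (16 / float(batch_size))


assert abs(scaled_base_lr(16) - 0.02) < 1e-9   # reference setting
assert abs(scaled_base_lr(2) - 0.0025) < 1e-9  # 1/8 of the batch -> 1/8 of the LR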
Example #8
def comput_on_dataset_with_finetune(model, data_loader, device, cfg):
    results_dict = {}
    cpu_device = torch.device("cpu")
    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    for i, batch in tqdm(enumerate(data_loader)):
        # assert len(batch[0]) == 1, "Finetune only Support batchsize = 1"

        dataset = data_loader.dataset
        video_id = dataset.get_annotation_video_id(i)
        img_id = dataset.get_annotation_img_id(i)

        images, targets, image_ids = batch
        images = images.to(device)
        targets = [target.to(device) for target in targets]

        if img_id == "00000":
            logger = logging.getLogger("DAVIS_MaskRCNN_baseline_test")
            logger.info('-' * 50)
            logger.info("Fintune: {}".format(video_id))
            logger.info('-' * 50)
            model = finetune_first_image(model, images, targets, optimizer, scheduler, logger, cfg)

        model.eval()
        with torch.no_grad():
            output = model(images)
            output = [o.to(cpu_device) for o in output]
        results_dict.update(
            {img_id: result for img_id, result in zip(image_ids, output)}
        )
    return results_dict
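finetune_first_image() is not shown in Example #8; the idea is to take a few optimization steps on the first annotated frame of each video with the optimizer and scheduler built above, then switch back to eval mode for the remaining frames. A hypothetical sketch of such a helper (the step count, logging interval, and all other details are assumptions, not the example's actual code):

def finetune_first_image(model, images, targets, optimizer, scheduler, logger, cfg, num_steps=100):
    model.train()
    for step in range(num_steps):
        # Forward pass in training mode returns a dict of losses
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        scheduler.step()

        if step % 20 == 0:
            logger.info("finetune step {}: loss = {:.4f}".format(step, losses.item()))
    return model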
Example #9
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    if cfg.USE_TENSORBOARD_LOGS:
        meters = TensorboardLogger(
            log_dir=os.path.join(output_dir, 'tensorboard_logs'),
            start_iter=arguments['iteration'],
            delimiter="  ",
        )
    else:
        meters = MetricLogger(delimiter="  ")

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        meters,
    )

    return model
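Example #9 passes either a TensorboardLogger or a plain MetricLogger into do_train as meters; the point is that both expose the same update() interface, so the training loop does not care which one it received. A minimal sketch of how such a meters object is consumed, mirroring the inline loop in Example #27 (the step function itself is illustrative):

import time


def train_step(model, optimizer, images, targets, meters, end):
    data_time = time.time() - end

    loss_dict = model(images, targets)
    losses = sum(loss for loss in loss_dict.values())

    optimizer.zero_grad()
    losses.backward()
    optimizer.step()

    # MetricLogger accepts tensors or floats as keyword metrics; the
    # Tensorboard variant additionally writes them to the event file.
    meters.update(loss=losses, **loss_dict)
    meters.update(time=time.time() - end, data=data_time)
    return time.time()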
Example #10
def train(cfg, local_rank, distributed):
    # Build the GeneralizedRCNN() object
    # detectors.py --> generalized_rcnn.py
    model = build_detection_model(cfg)
    # print(model)

    # 'cpu' or 'cuda'
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # Wraps torch.optim.SGD(), building the list of parameters to update from each tensor's requires_grad attribute
    optimizer = make_optimizer(cfg, model)

    # Set the optimizer's learning-rate update policy according to the config
    scheduler = make_lr_scheduler(cfg, optimizer)

    # In the distributed case, wrap the model for data-parallel training
    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    # Get the output directory path, '.' by default
    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)  # dict.update() merges the checkpoint's keys and values into arguments

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model
Example #11
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    if checkpointer.classes is None:
        for ds in data_loader.dataset.datasets:
            ds.find_classes()
        checkpointer.classes = data_loader.dataset.datasets[0].class_to_ind
    else:
        print("Loading classes from file")
        print(checkpointer.classes)
        for ds in data_loader.dataset.datasets:
            ds.class_to_ind = checkpointer.classes

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model
Example #12
def train(cfg, local_rank, distributed):
    # uncomment the following line to avoid the shared-memory file limit
    # torch.multiprocessing.set_sharing_strategy('file_system')
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Convert Model for SyncBN
    if cfg.SYNCBN:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            # broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model
Example #13
def train(cfg, local_rank, distributed):
    # ################################################################### fusion_factors # add by G
    if cfg.MODEL.FPN.STATISTICS_ALPHA_ON:
        sta_module = StaAlphaModule(cfg)
        fusion_factors = sta_module.process()
    else:
        fusion_factors = cfg.MODEL.FPN.FUSION_FACTORS
    # ################################################################### fusion_factors # add by G

    model = build_detection_model(cfg, fusion_factors)

    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model
Example #14
def train(cfg, local_rank, distributed, use_tensorboard=False):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    if use_tensorboard:
        arguments["tb_log_dir"] = cfg.TENSORBOARD_LOGDIR
        arguments["tb_exp_name"] = cfg.TENSORBOARD_EXP_NAME

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    print(data_loader.dataset)

    for iteration, (images, targets, _) in enumerate(data_loader, 0):
        print(">>>>> train iteration:", iteration)

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model
Example #15
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)
    # HACK: force the milestone steps; the LR loaded from the checkpoint cannot be changed yet.
    scheduler.milestones = cfg.SOLVER.STEPS
    # change lr
    #lr_ratio = cfg.SOLVER.BASE_LR / scheduler.base_lrs[-1]
    #scheduler.base_lrs = [ base_lr * lr_ratio for base_lr in self.base_lrs ]
    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    tbwriter = SummaryWriter(cfg.OUTPUT_DIR)
    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        tbwriter,
    )

    return model
Example #16
def train(cfg, local_rank, distributed, experiment=None):
    if cfg.TASK.KIND not in kind_builder:
        raise Exception('unknown task: {0}'.format(cfg.TASK.KIND))

    model_builder = kind_builder[cfg.TASK.KIND]
    model = model_builder(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)

    # todo, make this relative?
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    do_train(cfg,
             model,
             data_loader,
             optimizer,
             scheduler,
             checkpointer,
             device,
             checkpoint_period,
             arguments,
             experiment=experiment)

    return model
Example #17
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    if cfg.MODEL.USE_SYNCBN:
        assert is_pytorch_1_1_0_or_later(), \
            "SyncBatchNorm is only available in pytorch >= 1.1.0"
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model
Example #18
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    print('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')
    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR
    print('bbbbbbbbbbbbbbbbbbbbb')
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT,
                                              resume=cfg.SOLVER.RESUME)
    if cfg.SOLVER.RESUME:
        arguments.update(extra_checkpoint_data)
    print('ccccccccccccccccccccccccccc')
    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )
    print('dddddddddddddddddddddddddddddd')
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    tb_logger = Logger(cfg.OUTPUT_DIR)
    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        tb_logger,
        cfg,
    )
    print('eeeeeeeeeeeeeeeeeeeeeeeeeee')
    return model
Example #19
def train(cfg, train_dir, local_rank, distributed, logger):

    # build model
    model = build_siammot(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            broadcast_buffers=False,
            find_unused_parameters=True)

    arguments = {}
    arguments["iteration"] = 0

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         train_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = build_train_data_loader(
        cfg,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    tensorboard_writer = TensorboardWriter(cfg, train_dir)

    do_train(model, data_loader, optimizer, scheduler, checkpointer, device,
             checkpoint_period, arguments, logger, tensorboard_writer)

    return model
Example #20
def train(cfg, local_rank, distributed, save_path='.', writer=None):
    # cfg.SOLVER.IMS_PER_BATCH =3# force it to 3
    model_s = build_detection_model(cfg, is_student=True)
    model_t = build_detection_model(cfg, is_teacher=True)
    device_t = torch.device('cuda:0')
    device_s = torch.device('cuda:0')
    model_s.to(device_s)
    model_t.to(device_t)
    optimizer = make_optimizer(cfg, model_s)
    scheduler = make_lr_scheduler(cfg, optimizer)
    output_dir = save_path
    save_to_disk = get_rank() == 0
    checkpointer_s = DetectronCheckpointer(cfg, model_s, optimizer, scheduler,
                                           output_dir, save_to_disk)
    checkpointer_t = DetectronCheckpointer(cfg,
                                           model_t,
                                           optimizer=None,
                                           scheduler=scheduler,
                                           save_dir=output_dir,
                                           save_to_disk=save_to_disk)
    _init_weight = 'e2e_mask_rcnn_R_50_FPN_1x.pth'
    _ = checkpointer_s.load(_init_weight, True)
    _ = checkpointer_t.load(_init_weight, True)
    sourceDataLoader = make_mt_data_loader(cfg,
                                           is_train=True,
                                           is_distributed=distributed,
                                           start_iter=0,
                                           mode='source',
                                           img_ratio=1 / 2)
    data_loader_dict = {
        'source': sourceDataLoader,
    }
    if cfg.DATASETS.NO_LABEL:
        noLabelDataLoader = make_mt_data_loader(cfg,
                                                is_train=True,
                                                is_distributed=distributed,
                                                start_iter=0,
                                                mode='no_label',
                                                img_ratio=1 / 2)

        data_loader_dict.update({'no_label': noLabelDataLoader})
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    trainer = MTtrainer(model_s, model_t, data_loader_dict, optimizer,
                        scheduler, checkpointer_s, checkpointer_t,
                        checkpoint_period, cfg)

    trainer.train()
    return model_s
Example #21
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.deprecated.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
    )

    return model
Example #22
    def load(self, f=None):
        # if self.has_checkpoint():
        #     # override argument with existing checkpoint
        #     f = self.get_checkpoint_file()
        if not f:
            # no checkpoint could be found
            self.logger.info(
                "No checkpoint found. Initializing model from scratch")
            log_optimizer_scheduler_info(self.logger, self.optimizer,
                                         self.scheduler)
            return {}
        self.logger.info("Loading checkpoint from {}".format(f))
        checkpoint = self._load_file(f)
        self._load_model(checkpoint)
        if self.cfg.PRIORITY_CONFIG:
            temp_optimizer = make_optimizer(self.cfg, self.model)
            self.optimizer.load_state_dict(temp_optimizer.state_dict())

            for group in self.optimizer.param_groups:
                group.setdefault('initial_lr', group['lr'])

            iteration = checkpoint.get('iteration', 0)
            last_epoch = iteration - 1
            temp_scheduler = make_lr_scheduler(self.cfg,
                                               self.optimizer,
                                               last_epoch=last_epoch)
            self.scheduler.load_state_dict(temp_scheduler.state_dict())

            # remove processed stat data
            for stat_name in ["optimizer", "scheduler"]:
                if stat_name in checkpoint:
                    checkpoint.pop(stat_name)
        else:
            if "optimizer" in checkpoint and self.optimizer:
                self.logger.info("Loading optimizer from {}".format(f))
                self.optimizer.load_state_dict(checkpoint.pop("optimizer"))
            if "scheduler" in checkpoint and self.scheduler:
                self.logger.info("Loading scheduler from {}".format(f))
                self.scheduler.load_state_dict(checkpoint.pop("scheduler"))

        if self.optimizer and self.scheduler:
            log_optimizer_scheduler_info(self.logger, self.optimizer,
                                         self.scheduler)

        # return any further checkpoint data
        return checkpoint
Example #23
def train(cfg, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    warmup_layers = tuple(x for x in cfg.SOLVER.WARMUP_LAYERS if len(x) != 0)
    warmup_iters = cfg.SOLVER.WARMUP_ITERS

    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        warmup_layers,
        warmup_iters
    )
    return model
Example #24
    def fit(self, train_data, test_data, classes):
        self.classes = classes
        optimizer = make_optimizer(self.cfg, self.model)
        scheduler = make_lr_scheduler(self.cfg, optimizer)

        arguments = {}
        arguments["iteration"] = 0

        self.checkpointer.classes = classes
        self.checkpointer.optimizer = optimizer
        self.checkpointer.scheduler = scheduler

        train_data_loader = make_data_loader(
            self.cfg,
            train_data,
            classes,
            is_train=True,
            is_distributed=False,
            start_iter=arguments["iteration"],
        )

        test_data_loader = make_data_loader(
            self.cfg,
            test_data,
            classes,
            is_train=True,
            is_distributed=False,
            start_iter=arguments["iteration"],
        )

        self.train_meter, self.test_meter = do_train(
            self.model,
            train_data_loader,
            test_data_loader,
            optimizer,
            scheduler,
            self.device,
            arguments,
        )
Example #25
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    # model_D = build_discriminator(cfg)
    model_D = model.get_discriminator()

    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    # model_D.to(device)
    models = [model, model_D]

    optimizers = []
    for p in model_D.parameters():
        p.requires_grad = False
    optimizers.append(make_optimizer(cfg, model))
    for p in model_D.parameters():
        p.requires_grad = True
    optimizers.append(make_optimizer(cfg, model_D))
    schedulers = [
        make_lr_scheduler(cfg, optimizer) for optimizer in optimizers
    ]

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    models, optimizers = zip(*[
        amp.initialize(model, optimizer, opt_level=amp_opt_level)
        for model, optimizer in zip(models, optimizers)
    ])

    if distributed:
        models = [
            torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[local_rank],
                output_device=local_rank,
                # this should be removed if we update BatchNorm stats
                broadcast_buffers=False,
            ) for model in models
        ]

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointers = [
        DetectronCheckpointer(cfg, model, optimizers[0], schedulers[0],
                              output_dir, save_to_disk),
        DetectronCheckpointer(cfg, model_D, optimizers[1], schedulers[1],
                              output_dir, False),
    ]
    extra_checkpoint_data = [
        checkpointer.load(cfg.MODEL.WEIGHT) for checkpointer in checkpointers
    ]
    arguments.update(extra_checkpoint_data[0])

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    tflogger = SummaryWriter(log_dir=os.path.join(output_dir, "logs"))

    do_train(
        models,
        data_loader,
        optimizers,
        schedulers,
        checkpointers,
        device,
        checkpoint_period,
        arguments,
        tflogger,
    )

    return model
Example #26
    def train(self,
              output_dir=None,
              fine_tune_last_layers=False,
              fine_tune_rpn=False):
        if output_dir is not None:
            self.cfg.OUTPUT_DIR = output_dir
        model = build_detection_model(self.cfg)
        device = torch.device(self.cfg.MODEL.DEVICE)
        model.to(device)

        arguments = {}
        arguments["iteration"] = 0

        output_dir = self.cfg.OUTPUT_DIR

        save_to_disk = get_rank() == 0
        checkpointer = DetectronCheckpointer(self.cfg, model, None, None,
                                             output_dir, save_to_disk)

        if self.cfg.MODEL.WEIGHT.startswith(
                '/') or 'catalog' in self.cfg.MODEL.WEIGHT:
            model_path = self.cfg.MODEL.WEIGHT
        else:
            model_path = os.path.abspath(
                os.path.join(os.path.dirname(__file__), os.path.pardir,
                             os.path.pardir, os.path.pardir, os.path.pardir,
                             'Data', 'pretrained_feature_extractors',
                             self.cfg.MODEL.WEIGHT))

        extra_checkpoint_data = checkpointer.load(model_path)

        if self.cfg.MINIBOOTSTRAP.DETECTOR.NUM_CLASSES + 1 != self.cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES:
            checkpointer.model.roi_heads.box.predictor.cls_score = torch.nn.Linear(
                in_features=checkpointer.model.roi_heads.box.predictor.
                cls_score.in_features,
                out_features=self.cfg.MINIBOOTSTRAP.DETECTOR.NUM_CLASSES + 1,
                bias=True)
            checkpointer.model.roi_heads.box.predictor.bbox_pred = torch.nn.Linear(
                in_features=checkpointer.model.roi_heads.box.predictor.
                cls_score.in_features,
                out_features=(self.cfg.MINIBOOTSTRAP.DETECTOR.NUM_CLASSES + 1)
                * 4,
                bias=True)
            if hasattr(checkpointer.model.roi_heads, 'mask'):
                checkpointer.model.roi_heads.mask.predictor.mask_fcn_logits = torch.nn.Conv2d(
                    in_channels=checkpointer.model.roi_heads.mask.predictor.
                    mask_fcn_logits.in_channels,
                    out_channels=self.cfg.MINIBOOTSTRAP.DETECTOR.NUM_CLASSES +
                    1,
                    kernel_size=(1, 1),
                    stride=(1, 1))
            checkpointer.model.to(device)

        if fine_tune_last_layers:
            checkpointer.model.roi_heads.box.predictor.cls_score = torch.nn.Linear(
                in_features=checkpointer.model.roi_heads.box.predictor.
                cls_score.in_features,
                out_features=self.cfg.MINIBOOTSTRAP.DETECTOR.NUM_CLASSES + 1,
                bias=True)
            checkpointer.model.roi_heads.box.predictor.bbox_pred = torch.nn.Linear(
                in_features=checkpointer.model.roi_heads.box.predictor.
                cls_score.in_features,
                out_features=(self.cfg.MINIBOOTSTRAP.DETECTOR.NUM_CLASSES + 1)
                * 4,
                bias=True)
            if hasattr(checkpointer.model.roi_heads, 'mask'):
                checkpointer.model.roi_heads.mask.predictor.mask_fcn_logits = torch.nn.Conv2d(
                    in_channels=checkpointer.model.roi_heads.mask.predictor.
                    mask_fcn_logits.in_channels,
                    out_channels=self.cfg.MINIBOOTSTRAP.DETECTOR.NUM_CLASSES +
                    1,
                    kernel_size=(1, 1),
                    stride=(1, 1))
            # Freeze backbone layers
            for elem in checkpointer.model.backbone.parameters():
                elem.requires_grad = False
            if not fine_tune_rpn:
                # Freeze RPN layers
                for elem in checkpointer.model.rpn.parameters():
                    elem.requires_grad = False
            else:
                for elem in checkpointer.model.rpn.head.conv.parameters():
                    elem.requires_grad = False
                checkpointer.model.rpn.head.cls_logits = torch.nn.Conv2d(
                    in_channels=checkpointer.model.rpn.head.cls_logits.
                    in_channels,
                    out_channels=checkpointer.model.rpn.head.cls_logits.
                    out_channels,
                    kernel_size=(1, 1),
                    stride=(1, 1))
                checkpointer.model.rpn.head.bbox_pred = torch.nn.Conv2d(
                    in_channels=checkpointer.model.rpn.head.bbox_pred.
                    in_channels,
                    out_channels=checkpointer.model.rpn.head.bbox_pred.
                    out_channels,
                    kernel_size=(1, 1),
                    stride=(1, 1))
            # Freeze roi_heads layers with the exception of the predictor ones
            for elem in checkpointer.model.roi_heads.box.feature_extractor.parameters(
            ):
                elem.requires_grad = False
            for elem in checkpointer.model.roi_heads.box.predictor.parameters(
            ):
                elem.requires_grad = True
            if hasattr(checkpointer.model.roi_heads, 'mask'):
                for elem in checkpointer.model.roi_heads.mask.predictor.parameters(
                ):
                    elem.requires_grad = False
                for elem in checkpointer.model.roi_heads.mask.predictor.mask_fcn_logits.parameters(
                ):
                    elem.requires_grad = True
            checkpointer.model.to(device)

        checkpointer.optimizer = make_optimizer(self.cfg, checkpointer.model)
        checkpointer.scheduler = make_lr_scheduler(self.cfg,
                                                   checkpointer.optimizer)

        # Initialize mixed-precision training
        use_mixed_precision = self.cfg.DTYPE == "float16"
        amp_opt_level = 'O1' if use_mixed_precision else 'O0'
        model, optimizer = amp.initialize(checkpointer.model,
                                          checkpointer.optimizer,
                                          opt_level=amp_opt_level)

        if self.distributed:
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[self.local_rank],
                output_device=self.local_rank,
                # this should be removed if we update BatchNorm stats
                broadcast_buffers=False,
            )

        data_loader = make_data_loader(self.cfg,
                                       is_train=True,
                                       is_distributed=self.distributed,
                                       start_iter=arguments["iteration"],
                                       is_target_task=self.is_target_task)

        test_period = self.cfg.SOLVER.TEST_PERIOD
        if test_period > 0:
            data_loader_val = make_data_loader(
                self.cfg,
                is_train=False,
                is_distributed=self.distributed,
                is_target_task=self.is_target_task)
        else:
            data_loader_val = None

        checkpoint_period = self.cfg.SOLVER.CHECKPOINT_PERIOD
        do_train(self.cfg,
                 model,
                 data_loader,
                 data_loader_val,
                 checkpointer.optimizer,
                 checkpointer.scheduler,
                 checkpointer,
                 device,
                 checkpoint_period,
                 test_period,
                 arguments,
                 is_target_task=self.is_target_task)

        logger = logging.getLogger("maskrcnn_benchmark")
        logger.handlers = []
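Example #26 combines two standard fine-tuning moves: replace the class-dependent output layers with freshly initialized ones sized for the new number of classes, and freeze everything except those layers (optionally also retraining the RPN heads). A condensed sketch of that pattern for the box head of a generic maskrcnn-benchmark model (attribute names follow the example; the helper itself is illustrative):

import torch


def prepare_for_finetuning(model, num_classes_incl_background):
    # New classification / box-regression heads sized for the target classes
    in_feats = model.roi_heads.box.predictor.cls_score.in_features
    model.roi_heads.box.predictor.cls_score = torch.nn.Linear(in_feats, num_classes_incl_background)
    model.roi_heads.box.predictor.bbox_pred = torch.nn.Linear(in_feats, num_classes_incl_background * 4)

    # Freeze the backbone, RPN, and box feature extractor; train only the new heads
    for p in model.backbone.parameters():
        p.requires_grad = False
    for p in model.rpn.parameters():
        p.requires_grad = False
    for p in model.roi_heads.box.feature_extractor.parameters():
        p.requires_grad = False
    for p in model.roi_heads.box.predictor.parameters():
        p.requires_grad = True
    return model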
Example #27
def main():

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    # model = train(cfg, args.local_rank, args.distributed)
    model = build_detection_model(cfg)
    # add
    print(model)
    all_index = []
    for index, item in enumerate(model.named_parameters()):
        all_index.append(index)
        print(index)
        print(item[0])
        print(item[1].size())
    print("All index of the model: ", all_index)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=args.distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

    # run_test(cfg, model, args.distributed)
    # pruning
    m = Mask(model)
    m.init_length()
    m.init_length()
    print("-" * 10 + "one epoch begin" + "-" * 10)
    print("remaining ratio of pruning : Norm is %f" % args.rate_norm)
    print("reducing ratio of pruning : Distance is %f" % args.rate_dist)
    print("total remaining ratio is %f" % (args.rate_norm - args.rate_dist))

    m.modelM = model
    m.init_mask(args.rate_norm, args.rate_dist)

    m.do_mask()
    m.do_similar_mask()
    model = m.modelM
    m.if_zero()
    # run_test(cfg, model, args.distributed)

    # changed to an inline training loop (instead of do_train) so the pruning Mask can be applied each iteration
    # do_train(
    #     model,
    #     data_loader,
    #     optimizer,
    #     scheduler,
    #     checkpointer,
    #     device,
    #     checkpoint_period,
    #     arguments,
    # )
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)
        # print("Loss dict",loss_dict)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()

        # prun
        # Mask grad for iteration
        m.do_grad_mask()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        # prun
        # 7375 iterations correspond to one epoch with batch size 16 on the ~118K-image COCO train set
        if iteration % args.iter_pruned == 0 or iteration == cfg.SOLVER.MAX_ITER - 5000:
            m.modelM = model
            m.if_zero()
            m.init_mask(args.rate_norm, args.rate_dist)
            m.do_mask()
            m.do_similar_mask()
            m.if_zero()
            model = m.modelM
            if args.use_cuda:
                model = model.cuda()
            #run_test(cfg, model, args.distributed)

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))

        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / (max_iter)))

    if not args.skip_test:
        run_test(cfg, model, args.distributed)
Example #28
def train(cfg, local_rank, distributed):
    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.deprecated.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )
    logger = logging.getLogger("Training")
    with tools.TimerBlock("Loading Experimental setups", logger) as block:
        exp_name = cfg.EXP.NAME
        output_dir = tools.get_exp_output_dir(exp_name, cfg.OUTPUT_DIR)
        checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
        validation_period = cfg.SOLVER.VALIDATION_PERIOD

    with tools.TimerBlock("Loading Checkpoints...", logger) as block:
        arguments = {}
        save_to_disk = local_rank == 0
        checkpointer = Checkpointer(
            model,
            save_dir=output_dir,
            save_to_disk=save_to_disk,
            num_class=cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES,
        )
        extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
        arguments.update(extra_checkpoint_data)
        arguments["iteration"] = 0

    with tools.TimerBlock("Initializing DAVIS Datasets", logger) as block:
        logger.info("Loading training set...")
        data_loader_train = make_data_loader(
            cfg,
            is_train=True,
            is_distributed=distributed,
            start_iter=arguments["iteration"],
        )
        logger.info("Loading valid set...")
        data_loaders_valid = make_data_loader(
            cfg,
            is_train=False,
            is_distributed=distributed,
        )

    do_train(
        model,
        data_loader_train,
        data_loaders_valid[0],
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        validation_period,
        arguments,
        exp_name,
    )

    synchronize()
    return model
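This example still relies on the deprecated `torch.nn.parallel.deprecated.DistributedDataParallel` wrapper. For reference, a minimal sketch of the same wrapping with the current `torch.nn.parallel.DistributedDataParallel`, assuming the process group has already been initialized elsewhere (the helper name `wrap_distributed` is illustrative):

import torch


def wrap_distributed(model, local_rank):
    # Modern replacement for torch.nn.parallel.deprecated.DistributedDataParallel.
    # Assumes the process group was initialized elsewhere, e.g.
    #   torch.distributed.init_process_group(backend="nccl", init_method="env://")
    torch.cuda.set_device(local_rank)
    model = model.cuda(local_rank)
    return torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[local_rank],
        output_device=local_rank,
        # kept False here, as in the snippets above, so per-GPU BatchNorm buffers
        # are not broadcast from rank 0 every iteration
        broadcast_buffers=False,
    )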

cfg.merge_from_file(args.config_file)
cfg.merge_from_list(args.opts)

cfg.MODEL.RPN.RNN.COMBINATION = 'attention_norm'
cfg.OUTPUT_DIR = 'stage3_attention_norm'
cfg.DATASETS.USE_ANNO_CACHE = False

cfg.freeze()

model = build_detection_model(cfg)
device = torch.device(cfg.MODEL.DEVICE)
model.to(device)

optimizer = make_optimizer(cfg, model)
scheduler = make_lr_scheduler(cfg, optimizer)

if args.distributed:
    model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[args.local_rank],
        output_device=args.local_rank,
        # this should be removed if we update BatchNorm stats
        broadcast_buffers=False,
    )

arguments = {}
arguments["iteration"] = 0

output_dir = cfg.OUTPUT_DIR
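The fragment above follows the usual yacs workflow: start from in-code defaults, merge a YAML file, merge command-line overrides, apply a few hard-coded overrides, then freeze the config. A minimal self-contained sketch of that pattern; the keys and the YAML path below are placeholders, not the project's actual defaults:

from yacs.config import CfgNode as CN

# Minimal sketch of the yacs pattern used above; keys and paths are illustrative.
_C = CN()
_C.OUTPUT_DIR = "."
_C.MODEL = CN()
_C.MODEL.DEVICE = "cuda"
_C.DATASETS = CN()
_C.DATASETS.USE_ANNO_CACHE = True

cfg = _C.clone()
cfg.merge_from_file("configs/my_experiment.yaml")   # YAML overrides the defaults
cfg.merge_from_list(["OUTPUT_DIR", "runs/exp1"])    # command-line opts override the YAML
cfg.DATASETS.USE_ANNO_CACHE = False                 # in-code overrides, as above
cfg.freeze()                                        # any later assignment raises an error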
def train(cfg, local_rank, distributed):
    model, head = build_dist_face_trainer(cfg, local_rank)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if cfg.MODEL.USE_SYNCBN:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if True:  # always wrap in FaceDistributedDataParallel, even for single-process runs
        model = FaceDistributedDataParallel(
            model,
            device_ids=local_rank,
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
            chunk_sizes=None,  #[32,56,56,56]
        )
        head_local_rank = None
        if len(local_rank) == 1:
            head_local_rank = local_rank
        head = FaceDistributedDataParallel(
            head,
            device_ids=head_local_rank,
            output_device=head_local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )
    model = torch.nn.Sequential(model, head)
    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)
    # head_optimizer = make_optimizer(cfg, head)
    # head_scheduler = make_lr_scheduler(cfg, head_optimizer)
    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level=amp_opt_level)
    # head, head_optimizer = amp.initialize(head, head_optimizer, opt_level=amp_opt_level)

    arguments = {}
    arguments["iteration"] = 0
    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg, model, optimizer, scheduler,
                                         output_dir, save_to_disk)
    # head_checkpointer = DetectronCheckpointer(
    #     cfg, head, head_optimizer, head_scheduler, output_dir, save_to_disk
    # )
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    #### init transforms #####
    transforms = T.Compose([
        T.RandomCrop((cfg.INPUT.SIZE_TRAIN[0], cfg.INPUT.SIZE_TRAIN[1])),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean=cfg.INPUT.RGB_MEAN, std=cfg.INPUT.RGB_STD),
    ])
    data_loader = make_face_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
        transforms=transforms,
    )
    test_period = cfg.SOLVER.TEST_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    divs_nums = cfg.SOLVER.DIVS_NUMS_PER_BATCH
    do_face_train_dist_DIV_FC(
        cfg,
        model,  #[model,head],
        data_loader,
        None,
        optimizer,  #[optimizer,head_optimizer],
        scheduler,  #[scheduler,head_scheduler],
        checkpointer,  #[checkpointer,head_checkpointer],
        device,
        checkpoint_period,
        test_period,
        arguments,
        divs_nums,
    )
    return model
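This example builds its float16 path with apex's `amp.initialize(..., opt_level="O1")`. For comparison, a minimal sketch of an equivalent training step using PyTorch's native `torch.cuda.amp` (GradScaler plus autocast); the function name `train_step_amp` and its arguments are illustrative stand-ins for the objects created above, not part of the original code:

import torch


def train_step_amp(model, optimizer, scaler, images, targets, use_fp16=True):
    # One optimizer step with native mixed precision (torch.cuda.amp), an
    # alternative to the apex amp.initialize(..., opt_level="O1") path above.
    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=use_fp16):
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
    scaler.scale(losses).backward()   # scale the loss, then backpropagate
    scaler.step(optimizer)            # unscale grads; skip the step on inf/NaN
    scaler.update()
    return losses.detach()

# The scaler is created once, outside the training loop:
# scaler = torch.cuda.amp.GradScaler(enabled=use_fp16)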
def train(cfg, local_rank, distributed, loop, only_test, min_loss):
    ay = cfg.TEST.EVAL_AUG_THICKNESS_Y_TAR_ANC
    az = cfg.TEST.EVAL_AUG_THICKNESS_Z_TAR_ANC
    EVAL_AUG_THICKNESS = {
        'target_Y': ay[0],
        'anchor_Y': ay[1],
        'target_Z': az[0],
        'anchor_Z': az[1],
    }

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    roi_only = cfg.MODEL.ROI__ONLY
    if roi_only:
        freeze_rpn_layers(model)

    optimizer = make_optimizer(cfg, model)

    arguments = {}
    arguments["iteration"] = 0
    data_loader = make_data_loader(cfg,
                                   is_train=True,
                                   is_distributed=distributed,
                                   start_iter=arguments["iteration"])

    scheduler = make_lr_scheduler(cfg, optimizer)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    output_dir = cfg.OUTPUT_DIR

    save_to_disk = get_rank() == 0
    checkpointer = DetectronCheckpointer(cfg,
                                         model,
                                         optimizer,
                                         scheduler,
                                         output_dir,
                                         save_to_disk,
                                         roi_only=roi_only)
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    if only_test:
        return model, min_loss

    # convert the checkpoint period from epochs to iterations
    checkpoint_period = int(cfg.SOLVER.CHECKPOINT_PERIOD_EPOCHS *
                            cfg.INPUT.Example_num / cfg.SOLVER.IMS_PER_BATCH)

    epochs_between_test = cfg.SOLVER.EPOCHS_BETWEEN_TEST
    loss_weights = cfg.MODEL.LOSS.WEIGHTS
    for e in range(epochs_between_test):
        min_loss = do_train(model,
                            data_loader,
                            optimizer,
                            scheduler,
                            checkpointer,
                            device,
                            checkpoint_period,
                            arguments,
                            e + loop * epochs_between_test,
                            cfg.DEBUG.eval_in_train,
                            output_dir,
                            cfg.DEBUG.eval_in_train_per_iter,
                            cfg.TEST.IOU_THRESHOLD,
                            min_loss,
                            eval_aug_thickness=EVAL_AUG_THICKNESS,
                            loss_weights=loss_weights)

    return model, min_loss
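The last example configures the checkpoint period in epochs and converts it to iterations as period_epochs * num_examples / ims_per_batch. A small worked instance of that conversion; the dataset size and batch size below are assumed for illustration, not taken from the snippet:

def checkpoint_period_iters(period_epochs, num_examples, ims_per_batch):
    # iterations between checkpoints, as computed in the snippet above
    return int(period_epochs * num_examples / ims_per_batch)

# e.g. checkpointing every 2 epochs on a 118,000-image set with batch size 16:
# 2 * 118000 / 16 = 14750 iterations between checkpoints
print(checkpoint_period_iters(2, 118000, 16))   # -> 14750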