Ejemplo n.º 1
0
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   logger=None):
    """Build data loaders, optimizer, an EMA model copy and a trainer, then train.

    Args:
        model: Detector to train. It is moved to GPU with ``.cuda()`` and, in
            distributed mode, converted with ``apex.parallel.convert_syncbn_model``
            and wrapped in ``DistributedDataParallel``.
        dataset: A single dataset or a list/tuple of datasets. When
            ``cfg.my_paras.enable_ssl`` is truthy, the entries at indices 0, 1
            and 2 are used.
        cfg: Config object. Fields read here: ``log_level``, ``my_paras``,
            ``data.samples_per_gpu``, ``data.workers_per_gpu``, ``total_epochs``,
            ``lr_config``, ``optimizer``, ``optimizer_config``,
            ``checkpoint_config``, ``log_config``, ``work_dir``, ``local_rank``,
            ``resume_from``, ``load_from`` and ``workflow``.
            NOTE: ``cfg.lr_config`` is overwritten with ``None`` when its type
            is ``"one_cycle"`` or ``"multi_phase"``.
        distributed: If True, use the distributed data loaders, model wrapper
            and hooks.
        validate: Accepted for interface compatibility; not used in this
            function.
        logger: Logger to use. Defaults to ``get_root_logger(cfg.log_level)``.

    Raises:
        IndexError: If SSL is enabled and ``dataset`` is a list/tuple with
            fewer than three entries.
    """
    # build logger
    if logger is None:
        logger = get_root_logger(cfg.log_level)

    # Read the SSL switch once, with a default, and reuse it below. The
    # original looked it up with ``.get(..., False)`` here but via attribute
    # access at checkpoint-loading time, so a config without the key took the
    # non-SSL path first and could then fail on the attribute lookup.
    enable_ssl = cfg.my_paras.get("enable_ssl", False)

    # build dataloaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if enable_ssl:
        # The SSL path uses a fixed per-GPU batch size instead of
        # cfg.data.samples_per_gpu and expects exactly three datasets.
        ssl_samples_per_gpu = 4
        data_loaders = [
            build_dataloader(dataset[idx],
                             ssl_samples_per_gpu,
                             cfg.data.workers_per_gpu,
                             dist=distributed) for idx in range(3)
        ]
    else:
        data_loaders = [
            build_dataloader(ds,
                             cfg.data.samples_per_gpu,
                             cfg.data.workers_per_gpu,
                             dist=distributed) for ds in dataset
        ]

    # build optimizer and lr_scheduler
    # The step count is derived from the first loader only.
    total_steps = cfg.total_epochs * len(data_loaders[0])
    if cfg.lr_config.type in ["one_cycle", "multi_phase"]:
        optimizer = build_one_cycle_optimizer(model, cfg.optimizer)
        lr_scheduler = _create_learning_rate_scheduler(
            optimizer, cfg.lr_config,
            total_steps)  # todo: will not register lr_hook in trainer
        # The scheduler is handed to the trainer directly, so the hook
        # registration below receives lr_config=None.
        cfg.lr_config = None
    else:  # todo: we can add our own optimizer here
        optimizer = build_optimizer(model, cfg.optimizer)
        lr_scheduler = None

    # put model on gpus
    if distributed:
        model = apex.parallel.convert_syncbn_model(model)
        model = DistributedDataParallel(
            model.cuda(cfg.local_rank),
            device_ids=[cfg.local_rank],
            output_device=cfg.local_rank,
            # broadcast_buffers=False,
            find_unused_parameters=True,
        )
    else:
        model = model.cuda()
    logger.info(f"model structure: {model}")

    # Second copy of the (possibly wrapped) model whose parameters are
    # detached in place from the autograd graph.
    model_ema = copy.deepcopy(model)
    for param in model_ema.parameters():
        param.detach_()

    # build trainer
    trainer = Trainer(model, model_ema, batch_processor, optimizer,
                      lr_scheduler, cfg.work_dir, cfg.log_level)

    if distributed:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    trainer.register_training_hooks(cfg.lr_config, optimizer_config,
                                    cfg.checkpoint_config, cfg.log_config)

    if distributed:
        trainer.register_hook(DistSamplerSeedHook())

    # training setting
    # NOTE(review): cfg.load_from is only honoured when SSL is enabled; with
    # SSL disabled no checkpoint is loaded unless cfg.resume_from is set.
    # Confirm this is intended.
    if cfg.resume_from:
        trainer.resume(cfg.resume_from)
    elif cfg.load_from and enable_ssl:
        trainer.load_checkpoint_from_scratch(cfg.load_from)

    trainer.run(data_loaders,
                cfg.workflow,
                cfg.total_epochs,
                local_rank=cfg.local_rank)
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   logger=None):
    """Set up data loaders, optimizer and trainer from ``cfg`` and run training.

    Args:
        model: Detector to train; moved to GPU with ``.cuda()`` and, in
            distributed mode, converted with ``apex.parallel.convert_syncbn_model``
            and wrapped in ``DistributedDataParallel``.
        dataset: A single dataset or a list/tuple of datasets; one loader is
            built per dataset.
        cfg: Config object. ``cfg.lr_config`` is overwritten with ``None``
            when its type is ``"one_cycle"``.
        distributed: Selects the distributed loaders, model wrapper and hooks.
        validate: Not used by this function (no evaluation hook is
            registered here).
        logger: Logger to use; defaults to ``get_root_logger(cfg.log_level)``.
    """
    logger = get_root_logger(cfg.log_level) if logger is None else logger

    # One loader per dataset; a lone dataset is treated as a one-item list.
    datasets = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    loaders = []
    for ds in datasets:
        loaders.append(
            build_dataloader(ds,
                             cfg.data.samples_per_gpu,
                             cfg.data.workers_per_gpu,
                             dist=distributed))

    # The total step count is based on the length of the first loader.
    steps_total = cfg.total_epochs * len(loaders[0])

    scheduler = None
    if cfg.lr_config.type == "one_cycle":
        optim = build_one_cycle_optimizer(model, cfg.optimizer)
        scheduler = _create_learning_rate_scheduler(optim, cfg.lr_config,
                                                    steps_total)
        # The scheduler goes to the trainer directly, so hook registration
        # below is given lr_config=None.
        cfg.lr_config = None
    else:
        optim = build_optimizer(model, cfg.optimizer)

    # Move the model to GPU (wrapping it for distributed runs).
    if not distributed:
        model = model.cuda()
    else:
        model = apex.parallel.convert_syncbn_model(model)
        model = DistributedDataParallel(
            model.cuda(cfg.local_rank),
            device_ids=[cfg.local_rank],
            output_device=cfg.local_rank,
            find_unused_parameters=True,
        )

    logger.info(f"model structure: {model}")

    runner = Trainer(model, batch_processor, optim, scheduler, cfg.work_dir,
                     cfg.log_level)

    # Distributed runs wrap the optimizer config in a DistOptimizerHook.
    hook_cfg = (DistOptimizerHook(**cfg.optimizer_config)
                if distributed else cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config, hook_cfg,
                                   cfg.checkpoint_config, cfg.log_config)
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # Resuming takes precedence over loading a checkpoint.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(loaders,
               cfg.workflow,
               cfg.total_epochs,
               local_rank=cfg.local_rank)
Ejemplo n.º 3
0
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   logger=None):
    """Train ``model`` on ``dataset`` using the settings found in ``cfg``.

    Args:
        model: Detector to train; placed on GPU via ``.cuda()``. In
            distributed mode it is first passed through
            ``apex.parallel.convert_syncbn_model`` and then wrapped in
            ``DistributedDataParallel``.
        dataset: Either one dataset or a list/tuple of them.
        cfg: Config object supplying data, optimizer, schedule, hook and
            checkpoint settings. Side effect: ``cfg.lr_config`` is set to
            ``None`` when its type is ``"one_cycle"``.
        distributed: Whether to use the distributed code paths.
        validate: Unused here; evaluation hooks are not registered.
        logger: Optional logger; ``get_root_logger(cfg.log_level)`` is used
            when omitted.
    """
    # build logger
    if logger is None:
        logger = get_root_logger(cfg.log_level)

    # build dataloaders
    def _loader_for(single_dataset):
        # Every dataset gets the same per-GPU batch size and worker count.
        return build_dataloader(single_dataset,
                                cfg.data.samples_per_gpu,
                                cfg.data.workers_per_gpu,
                                dist=distributed)

    if not isinstance(dataset, (list, tuple)):
        dataset = [dataset]
    train_loaders = [_loader_for(item) for item in dataset]

    # build optimizer and lr_scheduler
    # The number of steps is computed from the first loader only.
    n_steps = cfg.total_epochs * len(train_loaders[0])
    if cfg.lr_config.type != "one_cycle":
        # todo: we can add our own optimizer here
        optimizer = build_optimizer(model, cfg.optimizer)
        lr_scheduler = None
    else:
        optimizer = build_one_cycle_optimizer(model, cfg.optimizer)
        lr_scheduler = _create_learning_rate_scheduler(optimizer,
                                                       cfg.lr_config,
                                                       n_steps)
        # todo: will not register lr_hook in trainer
        cfg.lr_config = None

    # put model on gpus
    if distributed:
        synced = apex.parallel.convert_syncbn_model(model)
        ddp_kwargs = dict(
            device_ids=[cfg.local_rank],
            output_device=cfg.local_rank,
            find_unused_parameters=True,
        )
        model = DistributedDataParallel(synced.cuda(cfg.local_rank),
                                        **ddp_kwargs)
    else:
        model = model.cuda()
    logger.info(f"model structure: {model}")

    # build trainer
    trainer = Trainer(model, batch_processor, optimizer, lr_scheduler,
                      cfg.work_dir, cfg.log_level)

    # register hooks
    optimizer_config = cfg.optimizer_config
    if distributed:
        optimizer_config = DistOptimizerHook(**optimizer_config)
    trainer.register_training_hooks(cfg.lr_config, optimizer_config,
                                    cfg.checkpoint_config, cfg.log_config)
    if distributed:
        trainer.register_hook(DistSamplerSeedHook())

    # training setting: resuming wins over loading a checkpoint
    if cfg.resume_from:
        trainer.resume(cfg.resume_from)
    elif cfg.load_from:
        trainer.load_checkpoint(cfg.load_from)

    trainer.run(train_loaders,
                cfg.workflow,
                cfg.total_epochs,
                local_rank=cfg.local_rank)