def train_detector(model, dataset, cfg, distributed=False, validate=False, logger=None):
    # build logger
    if logger is None:
        logger = get_root_logger(cfg.log_level)

    # build dataloaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    '''
    batch_size = cfg.data.samples_per_gpu
    num_workers = cfg.data.workers_per_gpu
    data_loaders = [DataLoader(ds, batch_size=batch_size, sampler=None, shuffle=True,
                               num_workers=num_workers, collate_fn=collate_kitti,
                               pin_memory=False,) for ds in dataset]  # TODO change pin_memory
    '''
    if cfg.my_paras.get("enable_ssl", False):
        data_loaders = [
            build_dataloader(dataset[0], 4, cfg.data.workers_per_gpu, dist=distributed)
        ]
        data_loaders.append(
            build_dataloader(dataset[1], 4, cfg.data.workers_per_gpu, dist=distributed))
        data_loaders.append(
            build_dataloader(dataset[2], 4, cfg.data.workers_per_gpu, dist=distributed))
    else:
        data_loaders = [
            build_dataloader(ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, dist=distributed)
            for ds in dataset
        ]

    # build optimizer and lr_scheduler
    total_steps = cfg.total_epochs * len(data_loaders[0])
    if cfg.lr_config.type in ["one_cycle", "multi_phase"]:
        optimizer = build_one_cycle_optimizer(model, cfg.optimizer)
        lr_scheduler = _create_learning_rate_scheduler(
            optimizer, cfg.lr_config, total_steps)
        # todo: will not register lr_hook in trainer
        cfg.lr_config = None
    else:
        # todo: we can add our own optimizer here
        optimizer = build_optimizer(model, cfg.optimizer)
        lr_scheduler = None

    # put model on gpus
    if distributed:
        model = apex.parallel.convert_syncbn_model(model)
        model = DistributedDataParallel(
            model.cuda(cfg.local_rank),
            device_ids=[cfg.local_rank],
            output_device=cfg.local_rank,
            # broadcast_buffers=False,
            find_unused_parameters=True,
        )
    else:
        model = model.cuda()
    logger.info(f"model structure: {model}")

    model_ema = copy.deepcopy(model)
    for param in model_ema.parameters():
        param.detach_()

    # build trainer
    trainer = Trainer(model, model_ema, batch_processor, optimizer, lr_scheduler,
                      cfg.work_dir, cfg.log_level)

    if distributed:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    trainer.register_training_hooks(cfg.lr_config, optimizer_config,
                                    cfg.checkpoint_config, cfg.log_config)
    if distributed:
        trainer.register_hook(DistSamplerSeedHook())

    # training setting
    if cfg.resume_from:
        trainer.resume(cfg.resume_from)
    elif cfg.load_from and cfg.my_paras.enable_ssl:
        trainer.load_checkpoint_from_scratch(cfg.load_from)

    trainer.run(data_loaders, cfg.workflow, cfg.total_epochs, local_rank=cfg.local_rank)
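# The EMA copy created above is kept with detached parameters, so the optimizer
# never updates it directly.  How its weights are refreshed is not visible in
# this function (presumably inside Trainer / batch_processor, which are not
# shown here).  The helper below is only a hypothetical sketch of the usual
# per-iteration update, ema = decay * ema + (1 - decay) * model; it is not part
# of the original training code.
def _update_ema_sketch(model, model_ema, decay=0.999):
    """Hypothetical EMA refresh for the detached copy built in train_detector."""
    for p_ema, p in zip(model_ema.parameters(), model.parameters()):
        p_ema.data.mul_(decay).add_(p.data, alpha=1.0 - decay)
    # buffers (e.g. BatchNorm running stats) are copied over directly
    for b_ema, b in zip(model_ema.buffers(), model.buffers()):
        b_ema.data.copy_(b.data)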
def train_detector(model, dataset, cfg, distributed=False, validate=False, logger=None):
    if logger is None:
        logger = get_root_logger(cfg.log_level)

    # start training
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, dist=distributed)
        for ds in dataset
    ]

    total_steps = cfg.total_epochs * len(data_loaders[0])
    # print(f"total_steps: {total_steps}")

    if cfg.lr_config.type == "one_cycle":
        optimizer = build_one_cycle_optimizer(model, cfg.optimizer)
        lr_scheduler = _create_learning_rate_scheduler(optimizer, cfg.lr_config, total_steps)
        cfg.lr_config = None
    else:
        optimizer = build_optimizer(model, cfg.optimizer)
        lr_scheduler = None

    # put model on gpus
    if distributed:
        model = apex.parallel.convert_syncbn_model(model)
        model = DistributedDataParallel(
            model.cuda(cfg.local_rank),
            device_ids=[cfg.local_rank],
            output_device=cfg.local_rank,
            # broadcast_buffers=False,
            find_unused_parameters=True,
        )
    else:
        model = model.cuda()
    logger.info(f"model structure: {model}")

    # build trainer
    trainer = Trainer(model, batch_processor, optimizer, lr_scheduler,
                      cfg.work_dir, cfg.log_level)

    if distributed:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    trainer.register_training_hooks(cfg.lr_config, optimizer_config,
                                    cfg.checkpoint_config, cfg.log_config)
    if distributed:
        trainer.register_hook(DistSamplerSeedHook())

    # # register eval hooks
    # if validate:
    #     val_dataset_cfg = cfg.data.val
    #     eval_cfg = cfg.get('evaluation', {})
    #     dataset_type = DATASETS.get(val_dataset_cfg.type)
    #     trainer.register_hook(
    #         KittiEvalmAPHookV2(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        trainer.resume(cfg.resume_from)
    elif cfg.load_from:
        trainer.load_checkpoint(cfg.load_from)

    trainer.run(data_loaders, cfg.workflow, cfg.total_epochs, local_rank=cfg.local_rank)
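# build_one_cycle_optimizer / _create_learning_rate_scheduler are defined
# elsewhere in the repo and are not shown here.  The function below is only a
# rough, self-contained analogue in plain PyTorch, illustrating why total_steps
# (= total_epochs * iterations per epoch) is passed in and why cfg.lr_config is
# set to None afterwards: the schedule is stepped once per training iteration
# by the trainer rather than by an epoch-based lr hook.  Names and values here
# are illustrative assumptions, not the repo's implementation.
def _one_cycle_lr_sketch(total_steps, max_lr=3e-3):
    """Hypothetical stand-in: per-iteration OneCycleLR over total_steps steps."""
    import torch

    params = [torch.nn.Parameter(torch.zeros(1))]
    opt = torch.optim.SGD(params, lr=max_lr / 10)
    sched = torch.optim.lr_scheduler.OneCycleLR(opt, max_lr=max_lr, total_steps=total_steps)
    lrs = []
    for _ in range(total_steps):
        opt.step()    # one optimizer update per training iteration
        sched.step()  # scheduler advances every iteration, not every epoch
        lrs.append(sched.get_last_lr()[0])
    return lrs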
def train_detector(model, dataset, cfg, distributed=False, validate=False, logger=None):
    # build logger
    if logger is None:
        logger = get_root_logger(cfg.log_level)

    # build dataloaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    '''
    batch_size = cfg.data.samples_per_gpu
    num_workers = cfg.data.workers_per_gpu
    data_loaders = [DataLoader(ds, batch_size=batch_size, sampler=None, shuffle=True,
                               num_workers=num_workers, collate_fn=collate_kitti,
                               pin_memory=False,) for ds in dataset]  # TODO change pin_memory
    '''
    data_loaders = [
        build_dataloader(ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, dist=distributed)
        for ds in dataset
    ]

    # build optimizer and lr_scheduler
    total_steps = cfg.total_epochs * len(data_loaders[0])
    if cfg.lr_config.type == "one_cycle":
        optimizer = build_one_cycle_optimizer(model, cfg.optimizer)
        lr_scheduler = _create_learning_rate_scheduler(
            optimizer, cfg.lr_config, total_steps)
        # todo: will not register lr_hook in trainer
        cfg.lr_config = None
    else:
        # todo: we can add our own optimizer here
        optimizer = build_optimizer(model, cfg.optimizer)
        lr_scheduler = None

    # put model on gpus
    if distributed:
        model = apex.parallel.convert_syncbn_model(model)
        model = DistributedDataParallel(
            model.cuda(cfg.local_rank),
            device_ids=[cfg.local_rank],
            output_device=cfg.local_rank,
            # broadcast_buffers=False,
            find_unused_parameters=True,
        )
    else:
        model = model.cuda()
    logger.info(f"model structure: {model}")

    # build trainer
    trainer = Trainer(model, batch_processor, optimizer, lr_scheduler,
                      cfg.work_dir, cfg.log_level)

    if distributed:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    #import ipdb; ipdb.set_trace()
    trainer.register_training_hooks(cfg.lr_config, optimizer_config,
                                    cfg.checkpoint_config, cfg.log_config)
    if distributed:
        trainer.register_hook(DistSamplerSeedHook())

    # # register eval hooks
    # if validate:
    #     val_dataset_cfg = cfg.data.val
    #     eval_cfg = cfg.get('evaluation', {})
    #     dataset_type = DATASETS.get(val_dataset_cfg.type)
    #     trainer.register_hook(KittiEvalmAPHookV2(val_dataset_cfg, **eval_cfg))

    # training setting
    if cfg.resume_from:
        trainer.resume(cfg.resume_from)
    elif cfg.load_from:
        trainer.load_checkpoint(cfg.load_from)

    trainer.run(data_loaders, cfg.workflow, cfg.total_epochs, local_rank=cfg.local_rank)
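# Hypothetical call site for the train_detector variants above, written as a
# sketch only: the Config / build_detector / build_dataset helpers and their
# import paths are assumptions based on the mmcv/det3d-style registries this
# code appears to use, and are not defined in this file.
#
#   from det3d.torchie import Config              # assumed import path
#   from det3d.models import build_detector       # assumed import path
#   from det3d.datasets import build_dataset      # assumed import path
#
#   cfg = Config.fromfile("configs/<your_config>.py")
#   model = build_detector(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
#   datasets = [build_dataset(cfg.data.train)]
#   train_detector(model, datasets, cfg, distributed=False, validate=False)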