def _non_dist_train(model, dataset, cfg, validate=False, **kwargs):
    logger = get_root_logger(cfg.log_level)
    # prepare data loaders
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
            repeat_samples=cfg.train_cfg.repeat_samples,
            **kwargs)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def test_logger(runner, by_epoch, eval_hook_priority):
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(
        data_loader, interval=1, by_epoch=by_epoch, save_best='acc')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_logger')
        optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
        runner = EpochBasedRunner(
            model=model, optimizer=optimizer, work_dir=tmpdir, logger=logger)
        runner.register_logger_hooks(
            dict(
                interval=1,
                hooks=[dict(type='TextLoggerHook', by_epoch=by_epoch)]))
        runner.register_timer_hook(dict(type='IterTimerHook'))
        runner.register_hook(eval_hook, priority=eval_hook_priority)
        runner.run([loader], [('train', 1)], 1)

        path = osp.join(tmpdir, next(scandir(tmpdir, '.json')))
        with open(path) as fr:
            fr.readline()  # skip the first line, which is the hook_msg
            train_log = json.loads(fr.readline())
            assert train_log['mode'] == 'train' and 'time' in train_log
            val_log = json.loads(fr.readline())
            assert val_log['mode'] == 'val' and 'time' not in val_log
def _build_demo_runner():

    class Model(nn.Module):

        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(2, 1)

        def forward(self, x):
            return self.linear(x)

        def train_step(self, x, optimizer, **kwargs):
            return dict(loss=self(x))

        def val_step(self, x, optimizer, **kwargs):
            return dict(loss=self(x))

    model = Model()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.02, momentum=0.95)
    log_config = dict(
        interval=1, hooks=[
            dict(type='TextLoggerHook'),
        ])
    tmp_dir = tempfile.mkdtemp()
    runner = EpochBasedRunner(
        model=model,
        work_dir=tmp_dir,
        optimizer=optimizer,
        logger=logging.getLogger())
    runner.register_logger_hooks(log_config)
    return runner
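# A minimal usage sketch for `_build_demo_runner` (hypothetical driver, not
# part of the original tests): train the 2->1 linear model above for one
# epoch on random inputs. A tensor works as a map-style dataset, so each
# item the loader yields is a (2,) float tensor.
def _demo_runner_usage():
    import shutil

    import torch
    from torch.utils.data import DataLoader

    runner = _build_demo_runner()
    loader = DataLoader(torch.randn(10, 2))
    runner.run([loader], [('train', 1)], 1)  # one 'train' stage, one epoch
    shutil.rmtree(runner.work_dir)  # clean up the mkdtemp() directory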
def _non_dist_train(model, dataset, cfg, validate=False, logger=None):
    # prepare data loaders
    sampler_cfg = cfg.data.get('sampler_cfg', None)
    sampler = cfg.data.get('sampler', 'Group')
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
            sampler=sampler,
            sampler_cfg=sampler_cfg)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(model, batch_processor, optimizer, cfg.work_dir,
                              logger)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(SamplerSeedHook())
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        load_modules = cfg.get('load_modules', [])
        load_certain_checkpoint(runner.model, runner.logger, cfg.load_from,
                                load_modules)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def test_save_checkpoint():
    model = Model()
    runner = EpochBasedRunner(model=model, logger=logging.getLogger())

    with tempfile.TemporaryDirectory() as root:
        runner.save_checkpoint(root)

        latest_path = osp.join(root, 'latest.pth')
        epoch1_path = osp.join(root, 'epoch_1.pth')

        assert osp.exists(latest_path)
        assert osp.exists(epoch1_path)
        assert osp.realpath(latest_path) == osp.realpath(epoch1_path)

        torch.load(latest_path)
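# A follow-up sketch (hypothetical, mirrors the test above): an mmcv
# checkpoint is a plain dict. With no optimizer attached to the runner it is
# expected to carry 'meta' and 'state_dict'; a runner built with an optimizer
# would also store an 'optimizer' entry.
def _inspect_checkpoint_sketch():
    model = Model()
    runner = EpochBasedRunner(model=model, logger=logging.getLogger())
    with tempfile.TemporaryDirectory() as root:
        runner.save_checkpoint(root)
        ckpt = torch.load(osp.join(root, 'latest.pth'))
        assert 'meta' in ckpt and 'state_dict' in ckpt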
def test_runner_with_parallel():

    def batch_processor():
        pass

    model = MMDataParallel(OldStyleModel())
    _ = EpochBasedRunner(model, batch_processor, logger=logging.getLogger())

    with pytest.raises(RuntimeError):
        # batch_processor and train_step() cannot both be set

        def batch_processor():
            pass

        model = MMDataParallel(Model())
        _ = EpochBasedRunner(
            model, batch_processor, logger=logging.getLogger())
def _build_epoch_runner():
    model = Model()
    tmp_dir = tempfile.mkdtemp()
    runner = EpochBasedRunner(
        model=model, work_dir=tmp_dir, logger=get_logger('demo'))
    return runner
def train_source_detector(model,
                          dataset,
                          cfg,
                          distributed=False,
                          timestamp=None,
                          meta=None):
    # dataset: [src_dataset]
    logger = get_root_logger(cfg.log_level)

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build model optimizer
    optimizer = build_optimizer(model, cfg.optimizer)

    # build runner
    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    runner.timestamp = timestamp

    # register hooks; no momentum_config; optimizer_config is required by
    # EpochBasedRunner
    runner.register_training_hooks(
        cfg.lr_config,
        cfg.optimizer_config,
        checkpoint_config=cfg.checkpoint_config,
        log_config=cfg.log_config)
    # if distributed:
    #     runner.register_hook(DistSamplerSeedHook())
    # if cfg.resume_from:
    #     runner.resume(cfg.resume_from)
    if cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, logger, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.videos_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(model, batch_processor, optimizer, cfg.work_dir,
                              logger)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def test_epoch_based_runner():

    with pytest.warns(UserWarning):
        # batch_processor is deprecated
        model = OldStyleModel()

        def batch_processor():
            pass

        _ = EpochBasedRunner(model, batch_processor)

    with pytest.raises(TypeError):
        # batch_processor must be callable
        model = OldStyleModel()
        _ = EpochBasedRunner(model, batch_processor=0)

    with pytest.raises(AssertionError):
        # model must implement the method train_step()
        model = OldStyleModel()
        _ = EpochBasedRunner(model)

    with pytest.raises(TypeError):
        # work_dir must be a str or None
        model = Model()
        _ = EpochBasedRunner(model, work_dir=1)

    with pytest.raises(RuntimeError):
        # batch_processor and train_step() cannot both be set

        def batch_processor():
            pass

        model = Model()
        _ = EpochBasedRunner(model, batch_processor)

    # test work_dir
    model = Model()
    temp_root = tempfile.gettempdir()
    dir_name = ''.join(
        [random.choice(string.ascii_letters) for _ in range(10)])
    work_dir = osp.join(temp_root, dir_name)
    _ = EpochBasedRunner(model, work_dir=work_dir)
    assert osp.isdir(work_dir)
    _ = EpochBasedRunner(model, work_dir=work_dir)
    assert osp.isdir(work_dir)
    os.removedirs(work_dir)
def _build_demo_runner():

    class Model(nn.Module):

        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(2, 1)

        def forward(self, x):
            return self.linear(x)

        def train_step(self, x, optimizer, **kwargs):
            return dict(loss=self(x))

        def val_step(self, x, optimizer, **kwargs):
            return dict(loss=self(x))

    model = Model()

    tmp_dir = tempfile.mkdtemp()
    runner = EpochBasedRunner(
        model=model, work_dir=tmp_dir, logger=logging.getLogger())
    return runner
def test_ddp_model_precise_bn():
    # test DDP model
    test_bigger_dataset = BiggerDataset()
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    precise_bn_hook = PreciseBNHook(loader, num_iters=5)
    assert precise_bn_hook.num_iters == 5
    assert precise_bn_hook.interval == 1
    model = ExampleModel()
    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=True)
    # `optimizer` and `logger` were undefined in the original snippet; build
    # them the same way the non-DDP test does
    optimizer = build_optimizer(
        model, dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
    logger = get_logger('precise_bn')
    runner = EpochBasedRunner(
        model=model, batch_processor=None, optimizer=optimizer, logger=logger)
    runner.register_hook(precise_bn_hook)
    runner.run([loader], [('train', 1)], 1)
def test_save_checkpoint():
    model = Model()
    runner = EpochBasedRunner(model=model, logger=logging.getLogger())

    with pytest.raises(TypeError):
        # meta should be None or dict
        runner.save_checkpoint('.', meta=list())

    with tempfile.TemporaryDirectory() as root:
        runner.save_checkpoint(root)

        latest_path = osp.join(root, 'latest.pth')
        epoch1_path = osp.join(root, 'epoch_1.pth')

        assert osp.exists(latest_path)
        assert osp.exists(epoch1_path)
        assert osp.realpath(latest_path) == osp.realpath(epoch1_path)

        torch.load(latest_path)
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiment')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiment')
            cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # if only swa training is performed, skip building the runner for the
    # conventional training stage
    if not cfg.get('only_swa_training', False):
        # build runner
        optimizer = build_optimizer(model, cfg.optimizer)
        runner = EpochBasedRunner(
            model,
            optimizer=optimizer,
            work_dir=cfg.work_dir,
            logger=logger,
            meta=meta)
        # an ugly workaround to make .log and .log.json filenames the same
        runner.timestamp = timestamp

        # fp16 setting
        fp16_cfg = cfg.get('fp16', None)
        if fp16_cfg is not None:
            optimizer_config = Fp16OptimizerHook(
                **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
        elif distributed and 'type' not in cfg.optimizer_config:
            optimizer_config = OptimizerHook(**cfg.optimizer_config)
        else:
            optimizer_config = cfg.optimizer_config

        # register hooks
        runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                       cfg.checkpoint_config, cfg.log_config,
                                       cfg.get('momentum_config', None))
        if distributed:
            runner.register_hook(DistSamplerSeedHook())

        # register eval hooks
        if validate:
            # Support batch_size > 1 in validation
            val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
            if val_samples_per_gpu > 1:
                # Replace 'ImageToTensor' with 'DefaultFormatBundle'
                cfg.data.val.pipeline = replace_ImageToTensor(
                    cfg.data.val.pipeline)
            val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
            val_dataloader = build_dataloader(
                val_dataset,
                samples_per_gpu=val_samples_per_gpu,
                workers_per_gpu=cfg.data.workers_per_gpu,
                dist=distributed,
                shuffle=False)
            eval_cfg = cfg.get('evaluation', {})
            eval_hook = DistEvalHook if distributed else EvalHook
            runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

        # user-defined hooks
        if cfg.get('custom_hooks', None):
            custom_hooks = cfg.custom_hooks
            assert isinstance(custom_hooks, list), \
                f'custom_hooks expect list type, but got {type(custom_hooks)}'
            for hook_cfg in cfg.custom_hooks:
                assert isinstance(hook_cfg, dict), \
                    'Each item in custom_hooks expects dict type, but got ' \
                    f'{type(hook_cfg)}'
                hook_cfg = hook_cfg.copy()
                priority = hook_cfg.pop('priority', 'NORMAL')
                hook = build_from_cfg(hook_cfg, HOOKS)
                runner.register_hook(hook, priority=priority)

        if cfg.resume_from:
            runner.resume(cfg.resume_from)
        elif cfg.load_from:
            runner.load_checkpoint(cfg.load_from)
        runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
    else:
        # if only swa training is performed, there must be a starting model
        assert cfg.swa_resume_from is not None or cfg.swa_load_from is not None

    # perform swa training
    # build the swa training runner
    if not cfg.get('swa_training', False):
        return
    from mmdet.core import SWAHook
    logger.info('Start SWA training')
    swa_optimizer = build_optimizer(model, cfg.swa_optimizer)
    swa_runner = EpochBasedRunner(
        model,
        optimizer=swa_optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    swa_runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        swa_optimizer_config = Fp16OptimizerHook(
            **cfg.swa_optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.swa_optimizer_config:
        swa_optimizer_config = OptimizerHook(**cfg.swa_optimizer_config)
    else:
        swa_optimizer_config = cfg.swa_optimizer_config

    # register hooks
    swa_runner.register_training_hooks(cfg.swa_lr_config,
                                       swa_optimizer_config,
                                       cfg.swa_checkpoint_config,
                                       cfg.log_config,
                                       cfg.get('momentum_config', None))
    if distributed:
        swa_runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        # Support batch_size > 1 in validation
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            # Replace 'ImageToTensor' with 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        swa_runner.register_hook(eval_hook(val_dataloader, **eval_cfg))
        swa_eval = True
        swa_eval_hook = eval_hook(val_dataloader, **eval_cfg)
    else:
        swa_eval = False
        swa_eval_hook = None

    # register the swa hook
    swa_hook = SWAHook(swa_eval=swa_eval, eval_hook=swa_eval_hook)
    swa_runner.register_hook(swa_hook, priority='LOW')

    # register user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            swa_runner.register_hook(hook, priority=priority)

    if cfg.swa_resume_from:
        swa_runner.resume(cfg.swa_resume_from)
    elif cfg.swa_load_from:
        # use the best pretrained model as the starting model for swa training
        if cfg.swa_load_from == 'best_bbox_mAP.pth':
            best_model_path = os.path.join(cfg.work_dir, cfg.swa_load_from)
            assert os.path.exists(best_model_path)
            # avoid the best pretrained model being overwritten
            new_best_model_path = os.path.join(cfg.work_dir,
                                               'best_bbox_mAP_pretrained.pth')
            os.rename(best_model_path, new_best_model_path)
            cfg.swa_load_from = new_best_model_path
        swa_runner.load_checkpoint(cfg.swa_load_from)

    swa_runner.run(data_loaders, cfg.workflow, cfg.swa_total_epochs)
def test_eval_hook():
    with pytest.raises(AssertionError):
        # `save_best` should be a str
        test_dataset = Model()
        data_loader = DataLoader(test_dataset)
        EvalHook(data_loader, save_best=True)

    with pytest.raises(TypeError):
        # dataloader must be a pytorch DataLoader
        test_dataset = Model()
        data_loader = [DataLoader(test_dataset)]
        EvalHook(data_loader)

    with pytest.raises(ValueError):
        # key_indicator must be valid when rule_map is None
        test_dataset = ExampleDataset()
        data_loader = DataLoader(test_dataset)
        EvalHook(data_loader, save_best='unsupport')

    with pytest.raises(KeyError):
        # rule must be in keys of rule_map
        test_dataset = ExampleDataset()
        data_loader = DataLoader(test_dataset)
        EvalHook(data_loader, save_best='auto', rule='unsupport')

    # if eval_res is an empty dict, a warning should be printed
    with pytest.warns(UserWarning) as record_warnings:

        class _EvalDataset(ExampleDataset):

            def evaluate(self, results, logger=None):
                return {}

        test_dataset = _EvalDataset()
        data_loader = DataLoader(test_dataset)
        eval_hook = EvalHook(data_loader, save_best='auto')
        runner = _build_epoch_runner()
        runner.register_hook(eval_hook)
        runner.run([data_loader], [('train', 1)], 1)

    # Since many warnings will be thrown, we just need to check whether the
    # expected one is among them
    expected_message = ('Since `eval_res` is an empty dict, the behavior to '
                        'save the best checkpoint will be skipped in this '
                        'evaluation.')
    for warning in record_warnings:
        if str(warning.message) == expected_message:
            break
    else:
        assert False

    test_dataset = ExampleDataset()
    loader = DataLoader(test_dataset)
    model = Model()
    data_loader = DataLoader(test_dataset)
    eval_hook = EvalHook(data_loader, save_best=None)

    with tempfile.TemporaryDirectory() as tmpdir:
        # total_epochs = 1
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 1)
        test_dataset.evaluate.assert_called_with(
            test_dataset, [torch.tensor([1])], logger=runner.logger)
        assert runner.meta is None or 'best_score' not in runner.meta[
            'hook_msgs']
        assert runner.meta is None or 'best_ckpt' not in runner.meta[
            'hook_msgs']

    # when `save_best` is set to 'auto', the first metric will be used
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, interval=1, save_best='auto')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_4.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7

    # total_epochs = 8, return the best acc and corresponding epoch
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, interval=1, save_best='acc')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_4.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7

    # total_epochs = 8, return the best loss_top and corresponding epoch
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, interval=1, save_best='loss_top')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_loss_top_epoch_6.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == -3

    # total_epochs = 8, return the best score and corresponding epoch
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(
        data_loader, interval=1, save_best='score', rule='greater')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_score_epoch_4.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7

    # total_epochs = 8, return the best score using the 'less' compare
    # function and the corresponding epoch
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, save_best='acc', rule='less')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_6.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == -3

    # test the EvalHook after a resume
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, save_best='acc')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 2)

        old_ckpt_path = osp.join(tmpdir, 'best_acc_epoch_2.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == old_ckpt_path
        assert osp.exists(old_ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 4

        resume_from = old_ckpt_path
        loader = DataLoader(ExampleDataset())
        eval_hook = EvalHook(data_loader, save_best='acc')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.resume(resume_from)

        assert runner.meta['hook_msgs']['best_ckpt'] == old_ckpt_path
        assert osp.exists(old_ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 4

        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_4.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7
        assert not osp.exists(old_ckpt_path)

    # test EvalHook with a custom test_fn and greater/less keys
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(
        data_loader,
        save_best='acc',
        test_fn=mock.MagicMock(return_value={}),
        greater_keys=[],
        less_keys=['acc'])

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_6.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == -3

    # test EvalHook with a specified `out_dir`
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    out_dir = 's3://user/data'
    eval_hook = EvalHook(
        data_loader, interval=1, save_best='auto', out_dir=out_dir)

    with patch.object(PetrelBackend, 'put') as mock_put, \
            patch.object(PetrelBackend, 'remove') as mock_remove, \
            patch.object(PetrelBackend, 'isfile') as mock_isfile, \
            tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        basename = osp.basename(runner.work_dir.rstrip(osp.sep))
        ckpt_path = f'{out_dir}/{basename}/best_acc_epoch_4.pth'

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert runner.meta['hook_msgs']['best_score'] == 7
        assert mock_put.call_count == 3
        assert mock_remove.call_count == 2
        assert mock_isfile.call_count == 2
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """Train model entry function.

    Args:
        model (nn.Module): The model to be trained.
        dataset (:obj:`Dataset`): Train dataset.
        cfg (dict): The config dict for training.
        distributed (bool): Whether to use distributed training.
            Default: False.
        validate (bool): Whether to do evaluation. Default: False.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None.
    """
    logger = get_root_logger(log_level=cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    dataloader_setting = dict(
        videos_per_gpu=cfg.data.get('videos_per_gpu', {}),
        workers_per_gpu=cfg.data.get('workers_per_gpu', {}),
        # cfg.gpus will be ignored if distributed
        num_gpus=len(cfg.gpu_ids),
        dist=distributed,
        seed=cfg.seed)
    dataloader_setting = dict(dataloader_setting,
                              **cfg.data.get('train_dataloader', {}))

    data_loaders = [
        build_dataloader(ds, **dataloader_setting) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    if validate:
        eval_cfg = cfg.get('evaluation', {})
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        dataloader_setting = dict(
            videos_per_gpu=cfg.data.get('videos_per_gpu', {}),
            workers_per_gpu=cfg.data.get('workers_per_gpu', {}),
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            shuffle=False)
        dataloader_setting = dict(dataloader_setting,
                                  **cfg.data.get('val_dataloader', {}))
        val_dataloader = build_dataloader(val_dataset, **dataloader_setting)
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
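# Illustrative config fragment (assumed values, mmcv-style) showing the
# pieces `train_model` consumes: `lr_config`, `optimizer_config`,
# `checkpoint_config` and `log_config` are plain dicts that
# `register_training_hooks()` turns into hooks, and `evaluation` feeds the
# EvalHook registered when `validate=True`.
_example_train_cfg = dict(
    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=1e-4),
    optimizer_config=dict(grad_clip=dict(max_norm=40, norm_type=2)),
    lr_config=dict(policy='step', step=[40, 80]),
    checkpoint_config=dict(interval=5),
    log_config=dict(interval=20, hooks=[dict(type='TextLoggerHook')]),
    evaluation=dict(interval=5, save_best='auto'),
)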
def test_eval_hook():
    with pytest.raises(TypeError):
        # dataloader must be a pytorch DataLoader
        test_dataset = ExampleModel()
        data_loader = [
            DataLoader(
                test_dataset,
                batch_size=1,
                sampler=None,
                num_workers=0,
                shuffle=False)
        ]
        EvalHook(data_loader)

    with pytest.raises(ValueError):
        # when `save_best` is True, `key_indicator` should not be None
        test_dataset = ExampleModel()
        data_loader = DataLoader(
            test_dataset,
            batch_size=1,
            sampler=None,
            num_workers=0,
            shuffle=False)
        EvalHook(data_loader, key_indicator=None)

    with pytest.raises(KeyError):
        # rule must be in keys of rule_map
        test_dataset = ExampleModel()
        data_loader = DataLoader(
            test_dataset,
            batch_size=1,
            sampler=None,
            num_workers=0,
            shuffle=False)
        EvalHook(data_loader, save_best=False, rule='unsupport')

    with pytest.raises(ValueError):
        # key_indicator must be valid when rule_map is None
        test_dataset = ExampleModel()
        data_loader = DataLoader(
            test_dataset,
            batch_size=1,
            sampler=None,
            num_workers=0,
            shuffle=False)
        EvalHook(data_loader, key_indicator='unsupport')

    optimizer_cfg = dict(
        type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)

    test_dataset = ExampleDataset()
    loader = DataLoader(test_dataset, batch_size=1)
    model = ExampleModel()
    optimizer = build_optimizer(model, optimizer_cfg)

    data_loader = DataLoader(test_dataset, batch_size=1)
    eval_hook = EvalHook(data_loader, save_best=False)
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=tmpdir,
            logger=logger)
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 1)
        best_json_path = osp.join(tmpdir, 'best.json')
        assert not osp.exists(best_json_path)

    loader = DataLoader(EvalDataset(), batch_size=1)
    model = ExampleModel()
    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHook(
        data_loader, interval=1, save_best=True, key_indicator='acc')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=tmpdir,
            logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)
        best_json_path = osp.join(tmpdir, 'best.json')
        best_json = mmcv.load(best_json_path)
        real_path = osp.join(tmpdir, 'epoch_4.pth')

        assert best_json['best_ckpt'] == osp.realpath(real_path)
        assert best_json['best_score'] == 7
        assert best_json['key_indicator'] == 'acc'

    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHook(
        data_loader,
        interval=1,
        save_best=True,
        key_indicator='score',
        rule='greater')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=tmpdir,
            logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)
        best_json_path = osp.join(tmpdir, 'best.json')
        best_json = mmcv.load(best_json_path)
        real_path = osp.join(tmpdir, 'epoch_4.pth')

        assert best_json['best_ckpt'] == osp.realpath(real_path)
        assert best_json['best_score'] == 7
        assert best_json['key_indicator'] == 'score'

    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHook(data_loader, rule='less', key_indicator='acc')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=tmpdir,
            logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)
        best_json_path = osp.join(tmpdir, 'best.json')
        best_json = mmcv.load(best_json_path)
        real_path = osp.join(tmpdir, 'epoch_6.pth')

        assert best_json['best_ckpt'] == osp.realpath(real_path)
        assert best_json['best_score'] == -3
        assert best_json['key_indicator'] == 'acc'

    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHook(data_loader, key_indicator='acc')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=tmpdir,
            logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 2)
        best_json_path = osp.join(tmpdir, 'best.json')
        best_json = mmcv.load(best_json_path)
        real_path = osp.join(tmpdir, 'epoch_2.pth')

        assert best_json['best_ckpt'] == osp.realpath(real_path)
        assert best_json['best_score'] == 4
        assert best_json['key_indicator'] == 'acc'

        resume_from = osp.join(tmpdir, 'latest.pth')
        loader = DataLoader(ExampleDataset(), batch_size=1)
        eval_hook = EvalHook(data_loader, key_indicator='acc')
        runner = EpochBasedRunner(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=tmpdir,
            logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.resume(resume_from)
        runner.run([loader], [('train', 1)], 8)
        best_json_path = osp.join(tmpdir, 'best.json')
        best_json = mmcv.load(best_json_path)
        real_path = osp.join(tmpdir, 'epoch_4.pth')

        assert best_json['best_ckpt'] == osp.realpath(real_path)
        assert best_json['best_score'] == 7
        assert best_json['key_indicator'] == 'acc'
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            round_up=True,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        logger.info('Start using FP16 mixed precision training.')
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    lr_config = get_lr_hook(cfg.lr_config, HOOKS)

    # register hooks
    runner.register_training_hooks(lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val)
        val_dataset.test_mode = True
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=cfg.data.samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            round_up=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
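# Illustrative `fp16` setting (assumed values): when `cfg.fp16` is present,
# the function above re-wraps the optimizer config into a Fp16OptimizerHook
# for mixed precision training. Either a static or a dynamic loss scale is
# the usual choice in mmcv-style configs.
_example_fp16_cfg = dict(loss_scale=512.0)  # static loss scaling
_example_fp16_cfg_dynamic = dict(loss_scale='dynamic')  # dynamic scaling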
def test_build_lr_momentum_hook():
    model = Model()
    runner = EpochBasedRunner(model=model, logger=logging.getLogger())

    # test a policy that is already title-cased
    lr_config = dict(
        policy='CosineAnnealing',
        by_epoch=False,
        min_lr_ratio=0,
        warmup_iters=2,
        warmup_ratio=0.9)
    runner.register_lr_hook(lr_config)
    assert len(runner.hooks) == 1

    # test a policy that is already title-cased
    lr_config = dict(
        policy='Cyclic',
        by_epoch=False,
        target_ratio=(10, 1),
        cyclic_times=1,
        step_ratio_up=0.4)
    runner.register_lr_hook(lr_config)
    assert len(runner.hooks) == 2

    # test a policy that is not title-cased
    lr_config = dict(
        policy='cyclic',
        by_epoch=False,
        target_ratio=(0.85 / 0.95, 1),
        cyclic_times=1,
        step_ratio_up=0.4)
    runner.register_lr_hook(lr_config)
    assert len(runner.hooks) == 3

    # test a policy that is title-cased
    lr_config = dict(
        policy='Step',
        warmup='linear',
        warmup_iters=500,
        warmup_ratio=1.0 / 3,
        step=[8, 11])
    runner.register_lr_hook(lr_config)
    assert len(runner.hooks) == 4

    # test a policy that is not title-cased
    lr_config = dict(
        policy='step',
        warmup='linear',
        warmup_iters=500,
        warmup_ratio=1.0 / 3,
        step=[8, 11])
    runner.register_lr_hook(lr_config)
    assert len(runner.hooks) == 5

    # test a policy that is already title-cased
    mom_config = dict(
        policy='CosineAnnealing',
        min_momentum_ratio=0.99 / 0.95,
        by_epoch=False,
        warmup_iters=2,
        warmup_ratio=0.9 / 0.95)
    runner.register_momentum_hook(mom_config)
    assert len(runner.hooks) == 6

    # test a policy that is already title-cased
    mom_config = dict(
        policy='Cyclic',
        by_epoch=False,
        target_ratio=(0.85 / 0.95, 1),
        cyclic_times=1,
        step_ratio_up=0.4)
    runner.register_momentum_hook(mom_config)
    assert len(runner.hooks) == 7

    # test a policy that is not title-cased
    mom_config = dict(
        policy='cyclic',
        by_epoch=False,
        target_ratio=(0.85 / 0.95, 1),
        cyclic_times=1,
        step_ratio_up=0.4)
    runner.register_momentum_hook(mom_config)
    assert len(runner.hooks) == 8
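# A sketch of the naming convention the test above relies on (not the exact
# mmcv implementation): `register_lr_hook` pops `policy` from the config,
# title-cases it if it is all lowercase, and resolves '<Policy>LrUpdaterHook'
# in the HOOKS registry, so 'step' and 'Step' both map to StepLrUpdaterHook
# (momentum hooks resolve analogously).
def _resolve_lr_hook_type(lr_config):
    policy = lr_config.pop('policy')
    if policy == policy.lower():
        policy = policy.title()
    return policy + 'LrUpdaterHook'  # e.g. 'step' -> 'StepLrUpdaterHook'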
def test_precise_bn():
    with pytest.raises(TypeError):
        # `data_loader` must be a PyTorch DataLoader
        test_dataset = ExampleModel()
        data_loader = DataLoader(
            test_dataset,
            batch_size=2,
            sampler=None,
            num_workers=0,
            shuffle=False)
        PreciseBNHook('data_loader')

    optimizer_cfg = dict(
        type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)

    test_dataset = ExampleDataset()
    loader = DataLoader(test_dataset, batch_size=2)
    model = ExampleModel()
    optimizer = build_optimizer(model, optimizer_cfg)

    data_loader = DataLoader(test_dataset, batch_size=2)
    precise_bn_loader = copy.deepcopy(data_loader)
    logger = get_logger('precise_bn')
    runner = EpochBasedRunner(
        model=model, batch_processor=None, optimizer=optimizer, logger=logger)

    with pytest.raises(AssertionError):
        # num_iters should be no larger than the total number of iters
        precise_bn_hook = PreciseBNHook(precise_bn_loader, num_iters=5)
        runner.register_hook(precise_bn_hook)
        runner.run([loader], [('train', 1)], 1)

    # test non-DDP model
    test_bigger_dataset = BiggerDataset()
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    precise_bn_hook = PreciseBNHook(loader, num_iters=5)
    assert precise_bn_hook.num_iters == 5
    assert precise_bn_hook.interval == 1
    runner = EpochBasedRunner(
        model=model, batch_processor=None, optimizer=optimizer, logger=logger)
    runner.register_hook(precise_bn_hook)
    runner.run([loader], [('train', 1)], 1)

    # test model with a GN layer
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    precise_bn_hook = PreciseBNHook(loader, num_iters=5)
    assert precise_bn_hook.num_iters == 5
    assert precise_bn_hook.interval == 1
    model = GNExampleModel()
    runner = EpochBasedRunner(
        model=model, batch_processor=None, optimizer=optimizer, logger=logger)
    runner.register_hook(precise_bn_hook)
    runner.run([loader], [('train', 1)], 1)

    # test model without a BN layer
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    precise_bn_hook = PreciseBNHook(loader, num_iters=5)
    assert precise_bn_hook.num_iters == 5
    assert precise_bn_hook.interval == 1
    model = NoBNExampleModel()
    runner = EpochBasedRunner(
        model=model, batch_processor=None, optimizer=optimizer, logger=logger)
    runner.register_hook(precise_bn_hook)
    runner.run([loader], [('train', 1)], 1)

    # test how precise it is
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    precise_bn_hook = PreciseBNHook(loader, num_iters=6)  # run all iters
    assert precise_bn_hook.num_iters == 6
    assert precise_bn_hook.interval == 1
    model = SingleBNModel()
    runner = EpochBasedRunner(
        model=model, batch_processor=None, optimizer=optimizer, logger=logger)
    runner.register_hook(precise_bn_hook)
    runner.run([loader], [('train', 1)], 1)
    imgs_list = list()
    for _, data in enumerate(loader):
        imgs_list.append(np.array(data['imgs']))
    mean = np.mean([np.mean(batch) for batch in imgs_list])
    # Bessel's correction is used in PyTorch, therefore ddof=1
    var = np.mean([np.var(batch, ddof=1) for batch in imgs_list])
    assert np.equal(mean, np.array(model.bn.running_mean))
    assert np.equal(var, np.array(model.bn.running_var))


@pytest.mark.skipif(
    not torch.cuda.is_available(), reason='requires CUDA support')
def test_ddp_model_precise_bn():
    # test DDP model
    test_bigger_dataset = BiggerDataset()
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    precise_bn_hook = PreciseBNHook(loader, num_iters=5)
    assert precise_bn_hook.num_iters == 5
    assert precise_bn_hook.interval == 1
    model = ExampleModel()
    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=True)
    # `optimizer` and `logger` were undefined in this function in the original
    # snippet; build them the same way test_precise_bn() does
    optimizer = build_optimizer(
        model, dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
    logger = get_logger('precise_bn')
    runner = EpochBasedRunner(
        model=model, batch_processor=None, optimizer=optimizer, logger=logger)
    runner.register_hook(precise_bn_hook)
    runner.run([loader], [('train', 1)], 1)
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiment')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiment')
            cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', True)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiment')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiment')
            cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        # Support batch_size > 1 in validation
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            # Replace 'ImageToTensor' with 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_reid(model,
               dataset,
               cfg,
               distributed=False,
               validate=False,
               timestamp=None,
               meta=None):
    logger = get_root_logger()

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            sampler=cfg.data.get('sampler', None),
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Set the `find_unused_parameters` parameter in torch's DDP
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    runner.timestamp = timestamp

    # register hooks
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hook
    if validate:
        test_dataset = build_dataset(cfg.data.test)
        test_dataloader = build_dataloader(
            test_dataset,
            samples_per_gpu=cfg.data.samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(EvalHook(test_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got: {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'each item in custom hooks expects dict type, but got: ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            logger.info(f'Register custom hook: {hook_cfg.type}')
            runner.register_hook(hook, priority=priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
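# Example (assumed hook names and values) of the `custom_hooks` list consumed
# by the loop above: each entry is a hook config dict built from the HOOKS
# registry, with `priority` popped off before construction.
_example_custom_hooks = [
    dict(type='EMAHook', priority='HIGH'),
    dict(type='NumClassCheckHook'),
]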
def _dist_train(model, dataset, cfg, logger, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.videos_per_gpu,
                         cfg.data.workers_per_gpu, dist=True)
    ]
    # put model on gpus
    find_unused_parameters = cfg.get('find_unused_parameters', False)
    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=find_unused_parameters)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(model, batch_processor, optimizer, cfg.work_dir,
                              logger)

    # register hooks
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        if cfg.data.val.type in ['RawFramesDataset', 'VideoDataset']:
            runner.register_hook(
                DistEvalTopKAccuracyHook(cfg.data.val, k=(1, 5)))
        if cfg.data.val.type == 'AVADataset':
            runner.register_hook(AVADistEvalmAPHook(cfg.data.val))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
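# How a `_dist_train`-style entry point is typically launched (a sketch with
# assumed script and config paths, not from the original repo): one process
# per GPU, e.g. via torch.distributed.launch, after which the `dist=True`
# dataloaders shard the dataset with a DistributedSampler:
#
#   python -m torch.distributed.launch --nproc_per_node=8 \
#       tools/train.py configs/example_config.py --launcher pytorch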
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """Train model entry function.

    Args:
        model (nn.Module): The model to be trained.
        dataset (Dataset): Train dataset.
        cfg (dict): The config dict for training.
        distributed (bool): Whether to use distributed training.
            Default: False.
        validate (bool): Whether to do evaluation. Default: False.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None.
    """
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    # step 1: give default values and override (if exist) from cfg.data
    loader_cfg = {
        **dict(
            seed=cfg.get('seed'),
            drop_last=False,
            dist=distributed,
            num_gpus=len(cfg.gpu_ids)),
        **({} if torch.__version__ != 'parrots' else dict(
            prefetch_num=2,
            pin_memory=False,
        )),
        **dict((k, cfg.data[k]) for k in [
            'samples_per_gpu',
            'workers_per_gpu',
            'shuffle',
            'seed',
            'drop_last',
            'prefetch_num',
            'pin_memory',
            'persistent_workers',
        ] if k in cfg.data)
    }

    # step 2: cfg.data.train_dataloader has highest priority
    train_loader_cfg = dict(loader_cfg,
                            **cfg.data.get('train_dataloader', {}))

    data_loaders = [
        build_dataloader(ds, **train_loader_cfg) for ds in dataset
    ]

    # determine whether the adversarial training process is used or not
    use_adversarial_train = cfg.get('use_adversarial_train', False)

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', True)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        if use_adversarial_train:
            # Use DistributedDataParallelWrapper for adversarial training
            model = DistributedDataParallelWrapper(
                model,
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False,
                find_unused_parameters=find_unused_parameters)
        else:
            model = MMDistributedDataParallel(
                model.cuda(),
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False,
                find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizers(model, cfg.optimizer)

    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    if use_adversarial_train:
        # The optimizer step process is included in the train_step function
        # of the model, so the runner should NOT include an optimizer hook.
        optimizer_config = None
    else:
        # fp16 setting
        fp16_cfg = cfg.get('fp16', None)
        if fp16_cfg is not None:
            optimizer_config = Fp16OptimizerHook(
                **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
        elif distributed and 'type' not in cfg.optimizer_config:
            optimizer_config = OptimizerHook(**cfg.optimizer_config)
        else:
            optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        eval_cfg = cfg.get('evaluation', {})
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        dataloader_setting = dict(
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.get('workers_per_gpu', 1),
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            drop_last=False,
            shuffle=False)
        dataloader_setting = dict(dataloader_setting,
                                  **cfg.data.get('val_dataloader', {}))
        val_dataloader = build_dataloader(val_dataset, **dataloader_setting)
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """Train model entry function.

    Args:
        model (nn.Module): The model to be trained.
        dataset (:obj:`Dataset`): Train dataset.
        cfg (dict): The config dict for training.
        distributed (bool): Whether to use distributed training.
            Default: False.
        validate (bool): Whether to do evaluation. Default: False.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None.
    """
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, '
                f'"imgs_per_gpu"={cfg.data.imgs_per_gpu} is used in '
                'this experiment')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiment')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            samples_per_epoch=cfg.data.get('samples_per_epoch', None),
            dist=distributed,
            seed=cfg.seed,
            persistent_workers=cfg.data.get('persistent_workers', False))
        for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        if find_unused_parameters:
            logger.info('set find_unused_parameters = True in DDP')
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model, device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    optimizer_config = cfg.optimizer_config
    if 'type' not in cfg.optimizer_config:
        optimizer_config.type = 'Fp16OptimizerHook' \
            if fp16_cfg else 'OptimizerHook'
    if fp16_cfg:
        optimizer_config.update(fp16_cfg)
    if 'Fp16' in optimizer_config.type:
        optimizer_config.update(distributed=distributed)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            persistent_workers=cfg.data.get('persistent_workers', False))
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
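# A hedged usage sketch of train_model, not part of the original source.
# The config path below is hypothetical; `cfg` must expose the attributes
# read above (cfg.data, cfg.optimizer, cfg.work_dir, cfg.gpu_ids, cfg.seed,
# cfg.workflow, cfg.total_epochs). build_detector and build_dataset are the
# standard MMDetection builders.
#
#   from mmcv import Config
#   from mmdet.datasets import build_dataset
#   from mmdet.models import build_detector
#
#   cfg = Config.fromfile('configs/example_detector.py')  # hypothetical path
#   cfg.gpu_ids = range(1)
#   cfg.seed = None
#   model = build_detector(cfg.model)
#   dataset = build_dataset(cfg.data.train)
#   train_model(model, dataset, cfg, distributed=False, validate=True)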
def test_eval_hook():
    with pytest.raises(AssertionError):
        # `save_best` should be a str
        test_dataset = Model()
        data_loader = DataLoader(test_dataset)
        EvalHook(data_loader, save_best=True)

    with pytest.raises(TypeError):
        # dataloader must be a pytorch DataLoader
        test_dataset = Model()
        data_loader = [DataLoader(test_dataset)]
        EvalHook(data_loader)

    with pytest.raises(ValueError):
        # key_indicator must be valid when rule_map is None
        test_dataset = ExampleDataset()
        data_loader = DataLoader(test_dataset)
        EvalHook(data_loader, save_best='unsupport')

    with pytest.raises(KeyError):
        # rule must be in keys of rule_map
        test_dataset = Model()
        data_loader = DataLoader(test_dataset)
        EvalHook(data_loader, save_best='auto', rule='unsupport')

    test_dataset = ExampleDataset()
    loader = DataLoader(test_dataset)
    model = Model()
    data_loader = DataLoader(test_dataset)
    eval_hook = EvalHook(data_loader, save_best=None)
    with tempfile.TemporaryDirectory() as tmpdir:
        # total_epochs = 1
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 1)
        test_dataset.evaluate.assert_called_with(
            test_dataset, [torch.tensor([1])], logger=runner.logger)
        assert runner.meta is None or 'best_score' not in runner.meta[
            'hook_msgs']
        assert runner.meta is None or 'best_ckpt' not in runner.meta[
            'hook_msgs']

    # when `save_best` is set to 'auto', the first metric will be used
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, interval=1, save_best='auto')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)
        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_4.pth')
        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7

    # total_epochs = 8, return the best acc and the corresponding epoch
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, interval=1, save_best='acc')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)
        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_4.pth')
        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7

    # total_epochs = 8, return the best loss_top and the corresponding epoch
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, interval=1, save_best='loss_top')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)
        ckpt_path = osp.join(tmpdir, 'best_loss_top_epoch_6.pth')
        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == -3

    # total_epochs = 8, return the best score and the corresponding epoch
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(
        data_loader, interval=1, save_best='score', rule='greater')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)
        ckpt_path = osp.join(tmpdir, 'best_score_epoch_4.pth')
        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7

    # total_epochs = 8, return the best score using the `less` compare
    # function and the corresponding epoch
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, save_best='acc', rule='less')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)
        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_6.pth')
        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == -3

    # test the EvalHook when a resume happens
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, save_best='acc')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 2)

        old_ckpt_path = osp.join(tmpdir, 'best_acc_epoch_2.pth')
        assert runner.meta['hook_msgs']['best_ckpt'] == old_ckpt_path
        assert osp.exists(old_ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 4

        resume_from = old_ckpt_path
        loader = DataLoader(ExampleDataset())
        eval_hook = EvalHook(data_loader, save_best='acc')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)

        runner.resume(resume_from)
        assert runner.meta['hook_msgs']['best_ckpt'] == old_ckpt_path
        assert osp.exists(old_ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 4

        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_4.pth')
        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7
        assert not osp.exists(old_ckpt_path)

    # test EvalHook with a custom test_fn and greater/less keys
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(
        data_loader,
        save_best='acc',
        test_fn=mock.MagicMock(return_value={}),
        greater_keys=[],
        less_keys=['acc'])
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)
        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_6.pth')
        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == -3
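# The tests above drive EvalHook through in-memory fixtures. Outside a test
# harness the hook is registered the same way; a minimal sketch, assuming
# `val_loader` wraps a dataset whose evaluate() returns a dict with an 'acc'
# key (both names are illustrative, not from the original source):
#
#   eval_hook = EvalHook(val_loader, interval=1, save_best='acc',
#                        rule='greater')
#   runner.register_hook(eval_hook, priority='LOW')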
])
trainset = CIFAR10(
    root='data', train=True, download=True, transform=transform)
trainloader = DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=2)
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
logger = get_logger('mmcv')
# the runner is a scheduler that manages the training loop
runner = EpochBasedRunner(
    model,
    optimizer=optimizer,
    work_dir='./work_dir',
    logger=logger,
    max_epochs=4)

# learning rate scheduler config
lr_config = dict(policy='step', step=[2, 3])
# configuration of the optimizer
optimizer_config = dict(grad_clip=None)
# configuration for saving checkpoints periodically
checkpoint_config = dict(interval=1)
# save logs periodically; multiple hooks can be used simultaneously
log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')])
# register hooks with the runner; they will be invoked automatically
runner.register_training_hooks(
    lr_config=lr_config,
    optimizer_config=optimizer_config,
    checkpoint_config=checkpoint_config,
    log_config=log_config)
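# With the hooks registered, training would then be launched via runner.run;
# a minimal sketch (the single-stage 'train' workflow is an assumption):
runner.run([trainloader], [('train', 1)])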
def test_eval_hook(EvalHookCls):
    with pytest.raises(TypeError):
        # dataloader must be a pytorch DataLoader
        test_dataset = ExampleDataset()
        data_loader = [
            DataLoader(
                test_dataset,
                batch_size=1,
                sampler=None,
                num_workers=0,
                shuffle=False)
        ]
        EvalHookCls(data_loader)

    with pytest.raises(KeyError):
        # rule must be in keys of rule_map
        test_dataset = ExampleDataset()
        data_loader = DataLoader(
            test_dataset,
            batch_size=1,
            sampler=None,
            num_workers=0,
            shuffle=False)
        EvalHookCls(data_loader, save_best='auto', rule='unsupport')

    with pytest.raises(ValueError):
        # key_indicator must be valid when rule_map is None
        test_dataset = ExampleDataset()
        data_loader = DataLoader(
            test_dataset,
            batch_size=1,
            sampler=None,
            num_workers=0,
            shuffle=False)
        EvalHookCls(data_loader, save_best='unsupport')

    optimizer_cfg = dict(
        type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)

    test_dataset = ExampleDataset()
    loader = DataLoader(test_dataset, batch_size=1)
    model = ExampleModel()
    optimizer = build_optimizer(model, optimizer_cfg)

    data_loader = DataLoader(test_dataset, batch_size=1)
    eval_hook = EvalHookCls(data_loader, save_best=None)
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=tmpdir,
            logger=logger)
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 1)
        assert runner.meta is None or 'best_score' not in runner.meta[
            'hook_msgs']
        assert runner.meta is None or 'best_ckpt' not in runner.meta[
            'hook_msgs']

    # when `save_best` is set to 'auto', the first metric will be used
    loader = DataLoader(EvalDataset(), batch_size=1)
    model = ExampleModel()
    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHookCls(data_loader, interval=1, save_best='auto')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=tmpdir,
            logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)
        real_path = osp.join(tmpdir, 'epoch_4.pth')
        link_path = osp.join(tmpdir, 'best_mAP.pth')
        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
        assert osp.exists(link_path)
        assert runner.meta['hook_msgs']['best_score'] == 0.7

    loader = DataLoader(EvalDataset(), batch_size=1)
    model = ExampleModel()
    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHookCls(data_loader, interval=1, save_best='mAP')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=tmpdir,
            logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)
        real_path = osp.join(tmpdir, 'epoch_4.pth')
        link_path = osp.join(tmpdir, 'best_mAP.pth')
        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
        assert osp.exists(link_path)
        assert runner.meta['hook_msgs']['best_score'] == 0.7

    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHookCls(
        data_loader, interval=1, save_best='score', rule='greater')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=tmpdir,
            logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)
        real_path = osp.join(tmpdir, 'epoch_4.pth')
        link_path = osp.join(tmpdir, 'best_score.pth')
        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
        assert osp.exists(link_path)
        assert runner.meta['hook_msgs']['best_score'] == 0.7

    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHookCls(data_loader, save_best='mAP', rule='less')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=tmpdir,
            logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)
        real_path = osp.join(tmpdir, 'epoch_6.pth')
        link_path = osp.join(tmpdir, 'best_mAP.pth')
        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
        assert osp.exists(link_path)
        assert runner.meta['hook_msgs']['best_score'] == 0.05

    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHookCls(data_loader, save_best='mAP')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=tmpdir,
            logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 2)
        real_path = osp.join(tmpdir, 'epoch_2.pth')
        link_path = osp.join(tmpdir, 'best_mAP.pth')
        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
        assert osp.exists(link_path)
        assert runner.meta['hook_msgs']['best_score'] == 0.4

        resume_from = osp.join(tmpdir, 'latest.pth')
        loader = DataLoader(ExampleDataset(), batch_size=1)
        eval_hook = EvalHookCls(data_loader, save_best='mAP')
        runner = EpochBasedRunner(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            work_dir=tmpdir,
            logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.resume(resume_from)
        runner.run([loader], [('train', 1)], 8)
        real_path = osp.join(tmpdir, 'epoch_4.pth')
        link_path = osp.join(tmpdir, 'best_mAP.pth')
        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
        assert osp.exists(link_path)
        assert runner.meta['hook_msgs']['best_score'] == 0.7
def test_epoch_based_runner():

    with pytest.warns(UserWarning):
        # batch_processor is deprecated
        model = OldStyleModel()

        def batch_processor():
            pass

        _ = EpochBasedRunner(
            model, batch_processor, logger=logging.getLogger())

    with pytest.raises(TypeError):
        # batch_processor must be callable
        model = OldStyleModel()
        _ = EpochBasedRunner(
            model, batch_processor=0, logger=logging.getLogger())

    with pytest.raises(TypeError):
        # optimizer must be an optimizer or a dict of optimizers
        model = Model()
        optimizer = 'NotAOptimizer'
        _ = EpochBasedRunner(
            model, optimizer=optimizer, logger=logging.getLogger())

    with pytest.raises(TypeError):
        # optimizer must be an optimizer or a dict of optimizers.
        # Adam needs the model parameters here; torch.optim.Adam() with no
        # arguments would itself raise TypeError before the runner's own
        # type check is ever reached.
        model = Model()
        optimizers = dict(
            optim1=torch.optim.Adam(model.parameters()),
            optim2='NotAOptimizer')
        _ = EpochBasedRunner(
            model, optimizer=optimizers, logger=logging.getLogger())

    with pytest.raises(TypeError):
        # logger must be a logging.Logger
        model = Model()
        _ = EpochBasedRunner(model, logger=None)

    with pytest.raises(TypeError):
        # meta must be a dict or None
        model = Model()
        _ = EpochBasedRunner(model, logger=logging.getLogger(), meta=['list'])

    with pytest.raises(AssertionError):
        # model must implement the method train_step()
        model = OldStyleModel()
        _ = EpochBasedRunner(model, logger=logging.getLogger())

    with pytest.raises(TypeError):
        # work_dir must be a str or None
        model = Model()
        _ = EpochBasedRunner(model, work_dir=1, logger=logging.getLogger())

    with pytest.raises(RuntimeError):
        # batch_processor and train_step() cannot both be set

        def batch_processor():
            pass

        model = Model()
        _ = EpochBasedRunner(
            model, batch_processor, logger=logging.getLogger())

    # test work_dir: it is created if missing and reused if it already exists
    model = Model()
    temp_root = tempfile.gettempdir()
    dir_name = ''.join(
        [random.choice(string.ascii_letters) for _ in range(10)])
    work_dir = osp.join(temp_root, dir_name)
    _ = EpochBasedRunner(model, work_dir=work_dir, logger=logging.getLogger())
    assert osp.isdir(work_dir)
    _ = EpochBasedRunner(model, work_dir=work_dir, logger=logging.getLogger())
    assert osp.isdir(work_dir)
    os.removedirs(work_dir)