def _build_iter_runner():
    """Create an ``IterBasedRunner`` wrapping a freshly built ``Model``.

    The runner's ``work_dir`` is a new temporary directory created with
    ``tempfile.mkdtemp`` (it is not removed here) and its logger is the one
    returned by ``get_logger('demo')``.

    Returns:
        IterBasedRunner: The constructed runner.
    """
    net = Model()
    scratch_dir = tempfile.mkdtemp()
    return IterBasedRunner(
        model=net, work_dir=scratch_dir, logger=get_logger('demo'))
def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                logger=None,
                timestamp=None,
                meta=None):
    """Train ``model`` with an iteration-based runner in distributed mode.

    Args:
        model (nn.Module): The model to be trained.
        dataset (:obj:`Dataset`): Train dataset, or a list/tuple of datasets.
        cfg (dict): The config dict for training.
        validate (bool): Whether to register the evaluation hook (only done
            when ``cfg`` also has an ``evaluation`` entry). Default: False.
        logger (logging.Logger | None): Logger for training. Default: None.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None.
    """
    # A single dataset is handled as a one-element list.
    if isinstance(dataset, (list, tuple)):
        train_sets = dataset
    else:
        train_sets = [dataset]

    # One distributed data loader per training dataset.
    data_loaders = []
    for train_set in train_sets:
        data_loaders.append(
            build_dataloader(
                train_set,
                cfg.data.samples_per_gpu,
                cfg.data.workers_per_gpu,
                dist=True,
                drop_last=cfg.data.get('drop_last', False),
                seed=cfg.seed))

    # Wrap the model for distributed execution on the current CUDA device.
    model = DistributedDataParallelWrapper(
        model,
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=cfg.get('find_unused_parameters', False))

    # Build the optimizer(s) and the runner.
    runner = IterBasedRunner(
        model,
        optimizer=build_optimizers(model, cfg.optimizers),
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # Workaround so that the .log and .log.json filenames are the same.
    runner.timestamp = timestamp

    # Standard training hooks.
    runner.register_training_hooks(
        cfg.lr_config,
        checkpoint_config=cfg.checkpoint_config,
        log_config=cfg.log_config)

    # Optional visualization hook; its output directory is placed under
    # the work directory (this rewrites cfg.visual_config in place).
    if cfg.get('visual_config', None) is not None:
        cfg.visual_config['output_dir'] = os.path.join(
            cfg.work_dir, cfg.visual_config['output_dir'])
        runner.register_hook(mmcv.build_from_cfg(cfg.visual_config, HOOKS))

    # Optional evaluation hook.
    if validate and cfg.get('evaluation', None) is not None:
        val_set = build_dataset(cfg.data.val)
        # Validation loader sizes fall back to the training values.
        val_samples = cfg.data.get('val_samples_per_gpu',
                                   cfg.data.samples_per_gpu)
        val_workers = cfg.data.get('val_workers_per_gpu',
                                   cfg.data.workers_per_gpu)
        val_loader = build_dataloader(
            val_set,
            samples_per_gpu=val_samples,
            workers_per_gpu=val_workers,
            dist=True,
            shuffle=False)
        visuals_dir = osp.join(cfg.work_dir, 'val_visuals')
        runner.register_hook(
            DistEvalIterHook(
                val_loader, save_path=visuals_dir, **cfg.evaluation))

    # Resuming takes precedence over loading plain weights.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_iters)
def train_segmentor(model,
                    dataset,
                    cfg,
                    distributed=False,
                    validate=False,
                    timestamp=None,
                    meta=None):
    """Launch segmentor training with an ``IterBasedRunner``.

    Args:
        model: Model to train; it is moved to CUDA and wrapped in a
            (distributed) data-parallel wrapper here.
        dataset: Training dataset, or a list/tuple of datasets.
        cfg: Training config; read both by attribute and via ``get``.
        distributed (bool): Use ``MMDistributedDataParallel`` and the
            distributed eval hook instead of the single-process variants.
        validate (bool): Whether to register an evaluation hook.
        timestamp: Value stored on ``runner.timestamp``.
        meta: Passed through to the runner.
    """
    logger = get_root_logger(cfg.log_level)

    # A single dataset is handled as a one-element list.
    if isinstance(dataset, (list, tuple)):
        train_sets = dataset
    else:
        train_sets = [dataset]

    # One loader per training dataset.
    data_loaders = []
    for train_set in train_sets:
        data_loaders.append(
            build_dataloader(
                train_set,
                cfg.data.samples_per_gpu,
                cfg.data.workers_per_gpu,
                # cfg.gpus will be ignored if distributed
                len(cfg.gpu_ids),
                dist=distributed,
                seed=cfg.seed,
                drop_last=True))

    # Put the model on GPU(s).
    if not distributed:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
    else:
        # Forwarded to torch.nn.parallel.DistributedDataParallel's
        # `find_unused_parameters` parameter.
        unused_params = cfg.get('find_unused_parameters', False)
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=unused_params)

    # Build the runner.
    runner = IterBasedRunner(
        model=model,
        batch_processor=None,
        optimizer=build_optimizer(model, cfg.optimizer),
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)

    # Register the standard training hooks.
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))

    # Workaround so that the .log and .log.json filenames are the same.
    runner.timestamp = timestamp

    # Register the evaluation hook.
    if validate:
        val_set = build_dataset(cfg.data.val, dict(test_mode=True))
        val_loader = build_dataloader(
            val_set,
            samples_per_gpu=4,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_kwargs = cfg.get('evaluation', {})
        if distributed:
            hook_cls = DistEvalHook
        else:
            hook_cls = EvalHook
        runner.register_hook(hook_cls(val_loader, **eval_kwargs))

    # Resuming takes precedence over loading plain weights.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_iters)
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    """Launch detector training with an ``IterBasedRunner``.

    Args:
        model: Model to train; it is moved to CUDA and wrapped in a
            (distributed) data-parallel wrapper here.
        dataset: Training dataset, or a list/tuple of datasets.
        cfg: Training config; read both by attribute and via ``get``.
            ``cfg.data.samples_per_gpu`` is overwritten when the deprecated
            ``imgs_per_gpu`` key is present, and ``samples_per_gpu`` is
            popped from ``cfg.data.val`` when ``validate`` is set.
        distributed (bool): Whether to train in distributed mode.
        validate (bool): Whether to register an evaluation hook.
        timestamp: Value stored on ``runner.timestamp``.
        meta: Passed through to the runner.
    """
    logger = get_root_logger(cfg.log_level)

    # A single dataset is handled as a one-element list.
    if isinstance(dataset, (list, tuple)):
        train_sets = dataset
    else:
        train_sets = [dataset]

    # Backward compatibility for the deprecated "imgs_per_gpu" key.
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' not in cfg.data:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiments')
        else:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiments')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    # One loader per training dataset.
    data_loaders = []
    for train_set in train_sets:
        data_loaders.append(
            build_dataloader(
                train_set,
                cfg.data.samples_per_gpu,
                cfg.data.workers_per_gpu,
                # cfg.gpus will be ignored if distributed
                len(cfg.gpu_ids),
                dist=distributed,
                seed=cfg.seed))

    # Put the model on GPU(s).
    if not distributed:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
    else:
        # Forwarded to torch.nn.parallel.DistributedDataParallel's
        # `find_unused_parameters` parameter.
        unused_params = cfg.get('find_unused_parameters', False)
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=unused_params)

    # Build the runner. An iteration-based runner is used here; an
    # EpochBasedRunner with the same arguments was previously used.
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = IterBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # Workaround so that the .log and .log.json filenames are the same.
    runner.timestamp = timestamp

    # Choose the optimizer hook / config (fp16 takes precedence).
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is None:
        if distributed and 'type' not in cfg.optimizer_config:
            optimizer_config = OptimizerHook(**cfg.optimizer_config)
        else:
            optimizer_config = cfg.optimizer_config
    else:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)

    # Register the standard training hooks.
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # Register the evaluation hook.
    if validate:
        # Support batch_size > 1 in validation; note the key is removed
        # from cfg.data.val.
        val_batch = cfg.data.val.pop('samples_per_gpu', 1)
        if val_batch > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_set = build_dataset(cfg.data.val, dict(test_mode=True))
        val_loader = build_dataloader(
            val_set,
            samples_per_gpu=val_batch,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_kwargs = cfg.get('evaluation', {})
        if distributed:
            hook_cls = DistEvalHook
        else:
            hook_cls = EvalHook
        runner.register_hook(hook_cls(val_loader, **eval_kwargs))

    # User-defined hooks.
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            # Copy so that popping 'priority' leaves the config untouched.
            hook_args = hook_cfg.copy()
            hook_priority = hook_args.pop('priority', 'NORMAL')
            runner.register_hook(
                build_from_cfg(hook_args, HOOKS), priority=hook_priority)

    # Resuming takes precedence over loading plain weights.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    # NOTE(review): `cfg` is passed as the second positional argument, so
    # this presumably targets a runner whose `run` accepts it - confirm
    # against the IterBasedRunner in use. The epoch-based call was
    # runner.run(data_loaders, cfg.workflow, cfg.total_epochs).
    runner.run(data_loaders, cfg, cfg.workflow, cfg.total_iters)
def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                logger=None,
                timestamp=None,
                meta=None):
    """Distributed training function.

    Data-loader settings are resolved in increasing priority: built-in
    defaults, matching keys found directly in ``cfg.data``, and finally
    ``cfg.data.train_dataloader`` / ``cfg.data.val_dataloader``.

    Args:
        model (nn.Module): The model to be trained.
        dataset (:obj:`Dataset`): Train dataset, or a list/tuple of datasets.
        cfg (dict): The config dict for training.
        validate (bool): Whether to do evaluation (also requires an
            ``evaluation`` entry in ``cfg``). Default: False.
        logger (logging.Logger | None): Logger for training. Default: None.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None.
    """
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    # step 1: give default values and override (if exist) from cfg.data.
    # The layers are merged with dict unpacking, where later entries win.
    # Passing them as keyword arguments to ``dict(...)`` instead would raise
    # ``TypeError`` (repeated keyword) as soon as cfg.data sets a key that
    # also has a default here, e.g. 'seed' or 'drop_last'.
    loader_cfg = {
        **dict(seed=cfg.get('seed'), drop_last=False, dist=True),
        **({} if torch.__version__ != 'parrots' else dict(
            prefetch_num=2,
            pin_memory=False,
        )),
        **{
            k: cfg.data[k]
            for k in (
                'samples_per_gpu',
                'workers_per_gpu',
                'shuffle',
                'seed',
                'drop_last',
                'prefetch_num',
                'pin_memory',
            ) if k in cfg.data
        },
    }

    # step 2: cfg.data.train_dataloader has highest priority
    train_loader_cfg = {**loader_cfg, **cfg.data.get('train_dataloader', {})}

    data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset]

    # put model on gpus
    find_unused_parameters = cfg.get('find_unused_parameters', False)
    model = DistributedDataParallelWrapper(
        model,
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=find_unused_parameters)

    # build runner
    optimizer = build_optimizers(model, cfg.optimizers)
    runner = IterBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # workaround so that the .log and .log.json filenames are the same
    runner.timestamp = timestamp

    # register hooks
    runner.register_training_hooks(
        cfg.lr_config,
        checkpoint_config=cfg.checkpoint_config,
        log_config=cfg.log_config)

    # visual hook; its output directory is placed under the work directory
    # (this rewrites cfg.visual_config in place)
    if cfg.get('visual_config', None) is not None:
        cfg.visual_config['output_dir'] = os.path.join(
            cfg.work_dir, cfg.visual_config['output_dir'])
        runner.register_hook(mmcv.build_from_cfg(cfg.visual_config, HOOKS))

    # evaluation hook
    if validate and cfg.get('evaluation', None) is not None:
        val_dataset = build_dataset(cfg.data.val)

        if ('val_samples_per_gpu' in cfg.data
                or 'val_workers_per_gpu' in cfg.data):
            warnings.warn('"val_samples_per_gpu/val_workers_per_gpu" have '
                          'been deprecated. Please use '
                          '"val_dataloader=dict(samples_per_gpu=1)" instead. '
                          'Details see '
                          'https://github.com/open-mmlab/mmediting/pull/201')

        # Priority (low -> high): shared loader_cfg, validation defaults,
        # deprecated val_* keys, cfg.data.val_dataloader. Merged by
        # unpacking so that an overlapping key overrides instead of raising.
        val_loader_cfg = {
            **loader_cfg,
            **dict(shuffle=False, drop_last=False),
            **{
                newk: cfg.data[oldk]
                for oldk, newk in (
                    ('val_samples_per_gpu', 'samples_per_gpu'),
                    ('val_workers_per_gpu', 'workers_per_gpu'),
                ) if oldk in cfg.data
            },
            **cfg.data.get('val_dataloader', {}),
        }

        data_loader = build_dataloader(val_dataset, **val_loader_cfg)
        save_path = osp.join(cfg.work_dir, 'val_visuals')
        runner.register_hook(
            DistEvalIterHook(
                data_loader, save_path=save_path, **cfg.evaluation))

    # resuming takes precedence over loading plain weights
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_iters)
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """Build loaders, runner and hooks from ``cfg`` and run training.

    Args:
        model: Model to train; it is moved to CUDA and wrapped in a
            (distributed) data-parallel wrapper here.
        dataset: Training dataset, or a list/tuple of datasets.
        cfg: Training config; read both by attribute and via ``get``.
            ``cfg.checkpoint_config['out_dir']`` is rewritten in place to
            live under ``cfg.work_dir``.
        distributed (bool): Whether to train in distributed mode.
        validate (bool): Whether to register an evaluation hook (only done
            when ``cfg`` also has an ``evaluation`` entry).
        timestamp: Value stored on ``runner.timestamp``.
        meta: Passed through to the runner.

    Raises:
        NotImplementedError: If both ``optimizer_cfg`` and ``fp16`` are set
            in ``cfg``.
    """
    logger = get_root_logger(cfg.log_level)

    # A single dataset is handled as a one-element list.
    if isinstance(dataset, (list, tuple)):
        train_sets = dataset
    else:
        train_sets = [dataset]

    # One loader per training dataset.
    data_loaders = []
    for train_set in train_sets:
        data_loaders.append(
            build_dataloader(
                train_set,
                cfg.data.samples_per_gpu,
                cfg.data.workers_per_gpu,
                # cfg.gpus will be ignored if distributed
                len(cfg.gpu_ids),
                dist=distributed,
                seed=cfg.seed))

    # Put the model on GPU(s).
    if not distributed:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
    else:
        # Forwarded to the wrapper's `find_unused_parameters` parameter
        # (as in torch.nn.parallel.DistributedDataParallel).
        unused_params = cfg.get('find_unused_parameters', False)
        if cfg.get('use_ddp_wrapper', False):
            mmcv.print_log('Use DDP Wrapper.', 'mmgen')
            ddp_cls = DistributedDataParallelWrapper
        else:
            ddp_cls = MMDistributedDataParallel
        model = ddp_cls(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=unused_params)

    # In GANs, the optimizer may instead be built inside the GAN model,
    # in which case no optimizer is created here.
    optimizer = build_optimizers(model, cfg.optimizer) if cfg.optimizer \
        else None

    # Allow users to define the runner; otherwise default to an
    # iteration-based one.
    if not cfg.get('runner', None):
        runner = IterBasedRunner(
            model,
            optimizer=optimizer,
            work_dir=cfg.work_dir,
            logger=logger,
            meta=meta)
    else:
        runner = build_runner(
            cfg.runner,
            dict(
                model=model,
                optimizer=optimizer,
                work_dir=cfg.work_dir,
                logger=logger,
                meta=meta))

    # Workaround so that the .log and .log.json filenames are the same.
    runner.timestamp = timestamp

    # In GANs, parameters can be optimized directly in `train_step`, so an
    # optimizer hook is optional.
    # NOTE(review): the presence check reads the key 'optimizer_cfg' while
    # the branches below read `cfg.optimizer_config` - confirm the intended
    # key; behaviour is kept as-is.
    fp16_cfg = cfg.get('fp16', None)
    if cfg.get('optimizer_cfg', None) is None:
        optimizer_config = None
    elif fp16_cfg is not None:
        raise NotImplementedError('Fp16 has not been supported.')
    elif distributed and 'type' not in cfg.optimizer_config:
        # Default to OptimizerHook.
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # Place the checkpoint `out_dir` under the work directory.
    if cfg.checkpoint_config is not None:
        cfg.checkpoint_config['out_dir'] = os.path.join(
            cfg.work_dir, cfg.checkpoint_config.get('out_dir', 'ckpt'))

    # Register the standard training hooks. DistSamplerSeedHook is not
    # registered here; it is meant for EpochBasedRunner.
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))

    # The standard evaluation hook is not adopted by default in GAN
    # training: an eval hook is only registered when the config defines
    # the 'evaluation' key.
    if validate and cfg.get('evaluation', None) is not None:
        val_set = build_dataset(cfg.data.val, dict(test_mode=True))
        # Defaults, overridable through cfg.data.val_data_loader
        # (supports batch_size > 1 in validation).
        val_loader_cfg = dict(
            samples_per_gpu=1,
            shuffle=False,
            workers_per_gpu=cfg.data.workers_per_gpu)
        val_loader_cfg.update(cfg.data.get('val_data_loader', {}))
        val_loader = build_dataloader(
            val_set, dist=distributed, **val_loader_cfg)

        eval_cfg = deepcopy(cfg.get('evaluation'))
        eval_cfg.update(dict(dist=distributed, dataloader=val_loader))
        # NOTE(review): 'priority' is popped only after the hook has been
        # built, so it is still present in the config handed to
        # build_from_cfg - confirm this is intended.
        eval_hook = build_from_cfg(eval_cfg, HOOKS)
        eval_priority = eval_cfg.pop('priority', 'NORMAL')
        runner.register_hook(eval_hook, priority=eval_priority)

    # User-defined hooks.
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            # Copy so that popping 'priority' leaves the config untouched.
            hook_args = hook_cfg.copy()
            hook_priority = hook_args.pop('priority', 'NORMAL')
            runner.register_hook(
                build_from_cfg(hook_args, HOOKS), priority=hook_priority)

    # Resuming takes precedence over loading plain weights.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_iters)
def test_precise_bn():
    """Exercise ``PreciseBNHook`` argument checks and several model types."""
    optimizer_cfg = dict(
        type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)

    test_dataset = ExampleDataset()
    loader = DataLoader(test_dataset, batch_size=2)
    model = ExampleModel()
    optimizer = build_optimizer(model, optimizer_cfg)
    logger = get_logger('precise_bn')

    def make_epoch_runner(net):
        # Every epoch-based runner in this test shares these settings.
        return EpochBasedRunner(
            model=net,
            batch_processor=None,
            optimizer=optimizer,
            logger=logger,
            max_epochs=1)

    def run_with_precise_bn(net, data_loaders, num_samples):
        # Build a hook, check its stored settings, then train one epoch.
        hook = PreciseBNHook(num_samples=num_samples)
        assert hook.num_samples == num_samples
        assert hook.interval == 1
        epoch_runner = make_epoch_runner(net)
        epoch_runner.register_hook(hook)
        epoch_runner.run(data_loaders, [('train', 1)])

    runner = make_epoch_runner(model)

    with pytest.raises(AssertionError):
        # num_samples must be larger than 0
        precise_bn_hook = PreciseBNHook(num_samples=-1)
        runner.register_hook(precise_bn_hook)
        runner.run([loader], [('train', 1)])

    with pytest.raises(AssertionError):
        # interval must be larger than 0
        precise_bn_hook = PreciseBNHook(interval=0)
        runner.register_hook(precise_bn_hook)
        runner.run([loader], [('train', 1)])

    with pytest.raises(AssertionError):
        # interval must be larger than 0 (fresh runner this time)
        runner = make_epoch_runner(model)
        precise_bn_hook = PreciseBNHook(interval=0)
        runner.register_hook(precise_bn_hook)
        runner.run([loader], [('train', 1)])

    with pytest.raises(AssertionError):
        # only support EpochBaseRunner
        runner = IterBasedRunner(
            model=model,
            batch_processor=None,
            optimizer=optimizer,
            logger=logger,
            max_epochs=1)
        precise_bn_hook = PreciseBNHook(interval=2)
        runner.register_hook(precise_bn_hook)
        print_log(runner)
        runner.run([loader], [('train', 1)])

    # test non-DDP model
    test_bigger_dataset = BiggerDataset()
    loaders = [DataLoader(test_bigger_dataset, batch_size=2)]
    run_with_precise_bn(model, loaders, num_samples=4)

    # test DP model
    test_bigger_dataset = BiggerDataset()
    loaders = [DataLoader(test_bigger_dataset, batch_size=2)]
    model = MMDataParallel(model)
    run_with_precise_bn(model, loaders, num_samples=4)

    # test model w/ gn layer
    loaders = [DataLoader(test_bigger_dataset, batch_size=2)]
    model = GNExampleModel()
    run_with_precise_bn(model, loaders, num_samples=4)

    # test model without bn layer
    loaders = [DataLoader(test_bigger_dataset, batch_size=2)]
    model = NoBNExampleModel()
    run_with_precise_bn(model, loaders, num_samples=4)

    # test how precise it is
    loaders = [DataLoader(test_bigger_dataset, batch_size=2)]
    model = SingleBNModel()
    run_with_precise_bn(model, loaders, num_samples=12)

    # Recompute the statistics directly from the batches and compare them
    # with the BN layer's running statistics.
    imgs_list = [
        np.array(data['imgs']) for loader in loaders for data in loader
    ]
    mean = np.mean([np.mean(batch) for batch in imgs_list])
    # Bessel's correction is used in PyTorch, therefore ddof=1
    var = np.mean([np.var(batch, ddof=1) for batch in imgs_list])
    assert np.equal(mean, np.array(
        model.bn.running_mean)), (mean, np.array(model.bn.running_mean))
    assert np.equal(var, np.array(
        model.bn.running_var)), (var, np.array(model.bn.running_var))


@pytest.mark.skipif(
    not torch.cuda.is_available(), reason='requires CUDA support')
def test_ddp_model_precise_bn():
    """Run ``PreciseBNHook`` for one epoch on a DDP-wrapped model."""
    # test DDP model
    test_bigger_dataset = BiggerDataset()
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    loaders = [loader]
    precise_bn_hook = PreciseBNHook(num_samples=5)
    assert precise_bn_hook.num_samples == 5
    assert precise_bn_hook.interval == 1
    model = ExampleModel()
    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=True)
    # The optimizer and logger are built here: the ones created inside
    # `test_precise_bn` are locals of that function and are not visible in
    # this test.
    optimizer_cfg = dict(
        type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
    optimizer = build_optimizer(model, optimizer_cfg)
    logger = get_logger('precise_bn')
    runner = EpochBasedRunner(
        model=model,
        batch_processor=None,
        optimizer=optimizer,
        logger=logger,
        max_epochs=1)
    runner.register_hook(precise_bn_hook)
    runner.run(loaders, [('train', 1)])
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    meta=None):
    """Train ``model`` without distributed data parallelism.

    Args:
        model: Model to train; wrapped in ``MMDataParallel`` over
            ``range(cfg.gpus)`` and moved to CUDA here.
        dataset: Training dataset, or a list/tuple of datasets.
        cfg: Training config; read both by attribute and via ``get``.
            ``cfg.data.imgs_per_gpu`` is overwritten when
            ``cfg.data.batch_size`` is set.
        validate (bool): Accepted but not used in this function.
        logger: Passed through to the runner.
        timestamp: Value stored on ``runner.timestamp``.
        meta: Passed through to the runner.

    Raises:
        NotImplementedError: If ``cfg.use_fp16`` is enabled.
    """
    # Use the batch size instead of the per-gpu batch size when given.
    if getattr(cfg.data, 'batch_size', False):
        cfg.data.imgs_per_gpu = int(cfg.data.batch_size)
        print_log(
            f"Using {cfg.data.imgs_per_gpu} per gpu for batch size {cfg.data.batch_size}")

    # A single dataset is handled as a one-element list.
    if isinstance(dataset, (list, tuple)):
        train_sets = dataset
    else:
        train_sets = [dataset]

    # One shuffled, non-distributed loader per training dataset.
    data_loaders = []
    for train_set in train_sets:
        data_loaders.append(
            build_dataloader(
                train_set,
                cfg.data.imgs_per_gpu,
                cfg.data.workers_per_gpu,
                cfg.gpus,
                dist=False,
                shuffle=True,
                replace=getattr(cfg.data, 'sampling_replace', False),
                seed=cfg.seed,
                drop_last=getattr(cfg.data, 'drop_last', False),
                prefetch=cfg.prefetch,
                img_norm_cfg=cfg.img_norm_cfg))

    # Mixed precision is rejected on this code path.
    if 'use_fp16' in cfg and cfg.use_fp16 == True:
        raise NotImplementedError('apex do not support non_dist_train!')

    # Put the model on GPUs.
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # Build the optimizer, its hook and the runner; `by_iter` selects the
    # iteration-based runner, otherwise `Runner` is used.
    optimizer = build_optimizer(model, cfg.optimizer)
    optimizer_config = NonDistOptimizerHook(**cfg.optimizer_config)
    runner_cls = IterBasedRunner if cfg.get('by_iter', False) else Runner
    runner = runner_cls(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # Workaround so that the .log and .log.json filenames are the same.
    runner.timestamp = timestamp
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # Register custom hooks; DeepClusterHook additionally receives the
    # training data loaders.
    for hook_cfg in cfg.get('custom_hooks', ()):
        shared_params = dict(dist_mode=False)
        if hook_cfg.type == 'DeepClusterHook':
            shared_params = dict(dist_mode=False, data_loaders=data_loaders)
        runner.register_hook(build_hook(hook_cfg, shared_params))

    # Resuming takes precedence over loading plain weights.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    # The training length is counted in iterations when `by_iter` is set,
    # otherwise in epochs.
    if cfg.get('by_iter', False):
        runner.run(data_loaders, cfg.workflow, cfg.total_iters)
    else:
        runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None):
    """Train ``model`` with ``MMDistributedDataParallel``.

    Args:
        model: Model to train; moved to CUDA if its first parameter is not
            already there, then wrapped for distributed training.
        dataset: Training dataset, or a list/tuple of datasets.
        cfg: Training config; read both by attribute and via ``get``.
            ``cfg.data.imgs_per_gpu`` is overwritten when
            ``cfg.data.batch_size`` is set.
        logger: Passed through to the runner.
        timestamp: Value stored on ``runner.timestamp``.
        meta: Passed through to the runner.
    """
    # A single dataset is handled as a one-element list.
    if isinstance(dataset, (list, tuple)):
        train_sets = dataset
    else:
        train_sets = [dataset]

    # Use the batch size instead of the per-gpu batch size when given: it
    # is split evenly over the CUDA devices visible to this process.
    if getattr(cfg.data, 'batch_size', False):
        num_gpus = torch.cuda.device_count()
        cfg.data.imgs_per_gpu = int(cfg.data.batch_size // num_gpus)
        print_log(
            f"Using {cfg.data.imgs_per_gpu} per gpu for batch size {cfg.data.batch_size}")

    # One shuffled, distributed loader per training dataset.
    data_loaders = []
    for train_set in train_sets:
        data_loaders.append(
            build_dataloader(
                train_set,
                cfg.data.imgs_per_gpu,
                cfg.data.workers_per_gpu,
                dist=True,
                shuffle=True,
                replace=getattr(cfg.data, 'sampling_replace', False),
                seed=cfg.seed,
                drop_last=getattr(cfg.data, 'drop_last', False),
                prefetch=cfg.prefetch,
                img_norm_cfg=cfg.img_norm_cfg))

    optimizer = build_optimizer(model, cfg.optimizer)

    # Optional apex mixed precision (opt level O1); done before the model
    # is wrapped for distributed training.
    if 'use_fp16' in cfg and cfg.use_fp16:
        model, optimizer = apex.amp.initialize(
            model.cuda(), optimizer, opt_level="O1")
        print_log('**** Initializing mixed precision done. ****')

    # Put the model on the GPU unless it is already there.
    if next(model.parameters()).is_cuda:
        cuda_model = model
    else:
        cuda_model = model.cuda()
    model = MMDistributedDataParallel(
        cuda_model,
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False)

    # Build the runner; `by_iter` selects the iteration-based runner,
    # otherwise `Runner` is used.
    runner_cls = IterBasedRunner if cfg.get('by_iter', False) else Runner
    runner = runner_cls(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # Workaround so that the .log and .log.json filenames are the same.
    runner.timestamp = timestamp

    # Register the standard training hooks.
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # The sampler seed hook is only registered when not training by
    # iteration.
    if not cfg.get('by_iter', False):
        runner.register_hook(DistSamplerSeedHook())

    # Register custom hooks; DeepClusterHook additionally receives the
    # training data loaders.
    for hook_cfg in cfg.get('custom_hooks', ()):
        shared_params = dict(dist_mode=True)
        if hook_cfg.type == 'DeepClusterHook':
            shared_params = dict(dist_mode=True, data_loaders=data_loaders)
        runner.register_hook(build_hook(hook_cfg, shared_params))

    # Resuming takes precedence over loading plain weights.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    # The training length is counted in iterations when `by_iter` is set,
    # otherwise in epochs.
    if cfg.get('by_iter', False):
        runner.run(data_loaders, cfg.workflow, cfg.total_iters)
    else:
        runner.run(data_loaders, cfg.workflow, cfg.total_epochs)