def _dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, meta=None):
    """Distributed training function.

    Args:
        model (nn.Module): The model to be trained.
        dataset (:obj:`Dataset`): Train dataset.
        cfg (dict): The config dict for training.
        validate (bool): Whether to do evaluation. Default: False.
        logger (logging.Logger | None): Logger for training. Default: None.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None.
    """
    # Normalize to a list of datasets and build one training loader each.
    if not isinstance(dataset, (list, tuple)):
        dataset = [dataset]
    data_loaders = []
    for train_set in dataset:
        data_loaders.append(
            build_dataloader(
                train_set,
                cfg.data.samples_per_gpu,
                cfg.data.workers_per_gpu,
                dist=True,
                drop_last=cfg.data.get('drop_last', False),
                seed=cfg.seed))

    # Wrap the model for distributed data-parallel on the current device.
    model = DistributedDataParallelWrapper(
        model,
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=cfg.get('find_unused_parameters', False))

    # Build the optimizer(s) and an iteration-based runner.
    optimizer = build_optimizers(model, cfg.optimizers)
    runner = IterBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # Register the standard LR / checkpoint / logging hooks.
    runner.register_training_hooks(
        cfg.lr_config,
        checkpoint_config=cfg.checkpoint_config,
        log_config=cfg.log_config)

    # Optional visualization hook (output dir is made relative to work_dir).
    if cfg.get('visual_config', None) is not None:
        cfg.visual_config['output_dir'] = os.path.join(
            cfg.work_dir, cfg.visual_config['output_dir'])
        runner.register_hook(mmcv.build_from_cfg(cfg.visual_config, HOOKS))

    # Optional distributed evaluation hook; val_* keys in cfg.data override
    # the training loader sizes.
    if validate and cfg.get('evaluation', None) is not None:
        val_set = build_dataset(cfg.data.val)
        val_loader = build_dataloader(
            val_set,
            samples_per_gpu=cfg.data.get('val_samples_per_gpu',
                                         cfg.data.samples_per_gpu),
            workers_per_gpu=cfg.data.get('val_workers_per_gpu',
                                         cfg.data.workers_per_gpu),
            dist=True,
            shuffle=False)
        runner.register_hook(
            DistEvalIterHook(
                val_loader,
                save_path=osp.join(cfg.work_dir, 'val_visuals'),
                **cfg.evaluation))

    # Resuming takes precedence over loading an initial checkpoint.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_iters)
def _dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, meta=None):
    """Distributed training function.

    Args:
        model (nn.Module): The model to be trained.
        dataset (:obj:`Dataset`): Train dataset.
        cfg (dict): The config dict for training.
        validate (bool): Whether to do evaluation. Default: False.
        logger (logging.Logger | None): Logger for training. Default: None.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None.
    """
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    # step 1: give default values and override (if exist) from cfg.data
    # NOTE: a dict *literal* is used deliberately. The previous
    # ``dict(seed=..., drop_last=False, dist=True, **overrides)`` call form
    # raised ``TypeError: dict() got multiple values for keyword argument``
    # whenever cfg.data supplied a key that also had a default ('seed' and
    # 'drop_last' are explicitly listed below). In a literal, later entries
    # simply override earlier ones (PEP 448 semantics).
    loader_cfg = {
        **dict(seed=cfg.get('seed'), drop_last=False, dist=True),
        # parrots-only dataloader options; empty on regular PyTorch
        **({} if torch.__version__ != 'parrots' else dict(
            prefetch_num=2,
            pin_memory=False,
        )),
        # values explicitly configured in cfg.data take precedence
        **dict((k, cfg.data[k]) for k in [
            'samples_per_gpu',
            'workers_per_gpu',
            'shuffle',
            'seed',
            'drop_last',
            'prefetch_num',
            'pin_memory',
        ] if k in cfg.data),
    }

    # step 2: cfg.data.train_dataloader has highest priority
    train_loader_cfg = {**loader_cfg, **cfg.data.get('train_dataloader', {})}

    data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset]

    # put model on gpus
    find_unused_parameters = cfg.get('find_unused_parameters', False)
    model = DistributedDataParallelWrapper(
        model,
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=find_unused_parameters)

    # build runner
    optimizer = build_optimizers(model, cfg.optimizers)
    runner = IterBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)

    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # register hooks
    runner.register_training_hooks(
        cfg.lr_config,
        checkpoint_config=cfg.checkpoint_config,
        log_config=cfg.log_config)

    # visual hook
    if cfg.get('visual_config', None) is not None:
        cfg.visual_config['output_dir'] = os.path.join(
            cfg.work_dir, cfg.visual_config['output_dir'])
        runner.register_hook(mmcv.build_from_cfg(cfg.visual_config, HOOKS))

    # evaluation hook
    if validate and cfg.get('evaluation', None) is not None:
        dataset = build_dataset(cfg.data.val)

        if ('val_samples_per_gpu' in cfg.data
                or 'val_workers_per_gpu' in cfg.data):
            warnings.warn('"val_samples_per_gpu/val_workers_per_gpu" have '
                          'been deprecated. Please use '
                          '"val_dataloader=dict(samples_per_gpu=1)" instead. '
                          'Details see '
                          'https://github.com/open-mmlab/mmediting/pull/201')

        # Same literal-merge fix as above: the deprecated val_* keys and an
        # explicit cfg.data.val_dataloader may both define the same key
        # ('shuffle', 'drop_last', 'samples_per_gpu', ...); the last entry
        # wins instead of raising TypeError.
        val_loader_cfg = {
            **loader_cfg,
            **dict(shuffle=False, drop_last=False),
            **dict((newk, cfg.data[oldk]) for oldk, newk in [
                ('val_samples_per_gpu', 'samples_per_gpu'),
                ('val_workers_per_gpu', 'workers_per_gpu'),
            ] if oldk in cfg.data),
            **cfg.data.get('val_dataloader', {}),
        }

        data_loader = build_dataloader(dataset, **val_loader_cfg)
        save_path = osp.join(cfg.work_dir, 'val_visuals')
        runner.register_hook(
            DistEvalIterHook(
                data_loader, save_path=save_path, **cfg.evaluation))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_iters)
def test_two_stage_inpaintor():
    """Smoke-test ``TwoStageInpaintor`` (deepfillv2-style two-stage model).

    Construction and loss-flag attributes are checked on CPU; all
    ``train_step`` / ``forward_test`` checks run only when CUDA is available.
    """
    # Base config: gated-conv stage1 encoder/decoder + DeepFill refiner
    # stage2, multi-layer discriminator with hinge GAN loss.
    model = dict(
        disc_input_with_mask=True,
        encdec=dict(
            type='DeepFillEncoderDecoder',
            stage1=dict(
                type='GLEncoderDecoder',
                encoder=dict(
                    type='DeepFillEncoder',
                    conv_type='gated_conv',
                    channel_factor=0.75),
                decoder=dict(
                    type='DeepFillDecoder',
                    conv_type='gated_conv',
                    in_channels=96,
                    channel_factor=0.75),
                dilation_neck=dict(
                    type='GLDilationNeck',
                    in_channels=96,
                    conv_type='gated_conv',
                    act_cfg=dict(type='ELU'))),
            stage2=dict(
                type='DeepFillRefiner',
                encoder_attention=dict(
                    type='DeepFillEncoder',
                    encoder_type='stage2_attention',
                    conv_type='gated_conv',
                    channel_factor=0.75),
                encoder_conv=dict(
                    type='DeepFillEncoder',
                    encoder_type='stage2_conv',
                    conv_type='gated_conv',
                    channel_factor=0.75),
                dilation_neck=dict(
                    type='GLDilationNeck',
                    in_channels=96,
                    conv_type='gated_conv',
                    act_cfg=dict(type='ELU')),
                contextual_attention=dict(
                    type='ContextualAttentionNeck',
                    in_channels=96,
                    conv_type='gated_conv'),
                decoder=dict(
                    type='DeepFillDecoder',
                    in_channels=192,
                    conv_type='gated_conv'))),
        disc=dict(
            type='MultiLayerDiscriminator',
            in_channels=4,
            max_channels=256,
            fc_in_channels=256 * 4 * 4,
            fc_out_channels=1,
            num_convs=6,
            norm_cfg=None,
            act_cfg=dict(type='ELU'),
            out_act_cfg=dict(type='LeakyReLU', negative_slope=0.2),
            with_spectral_norm=True,
        ),
        stage1_loss_type=('loss_l1_hole', 'loss_l1_valid'),
        stage2_loss_type=('loss_l1_hole', 'loss_l1_valid', 'loss_gan'),
        loss_gan=dict(
            type='GANLoss',
            gan_type='hinge',
            loss_weight=1,
        ),
        loss_l1_hole=dict(
            type='L1Loss',
            loss_weight=1.0,
        ),
        loss_l1_valid=dict(
            type='L1Loss',
            loss_weight=1.0,
        ),
        pretrained=None)
    train_cfg = Config(dict(disc_step=1))
    test_cfg = Config(dict(metrics=['l1', 'psnr', 'ssim']))
    tsinpaintor = TwoStageInpaintor(
        **model, train_cfg=train_cfg, test_cfg=test_cfg)

    # check architecture
    assert tsinpaintor.stage1_loss_type == ('loss_l1_hole', 'loss_l1_valid')
    assert tsinpaintor.stage2_loss_type == ('loss_l1_hole', 'loss_l1_valid',
                                            'loss_gan')
    assert tsinpaintor.with_l1_hole_loss
    assert tsinpaintor.with_l1_valid_loss
    assert not tsinpaintor.with_composed_percep_loss
    assert not tsinpaintor.with_out_percep_loss
    assert tsinpaintor.with_gan

    # Everything below requires a GPU and is skipped on CPU-only hosts.
    if torch.cuda.is_available():
        # prepare data: batch of 2, with a rectangular hole in the mask
        gt_img = torch.rand((2, 3, 256, 256)).cuda()
        mask = torch.zeros((2, 1, 256, 256)).cuda()
        mask[..., 50:180, 60:170] = 1.
        masked_img = gt_img * (1. - mask)
        data_batch = dict(gt_img=gt_img, mask=mask, masked_img=masked_img)

        # prepare model and optimizer (one Adam each for generator and disc)
        tsinpaintor.cuda()
        optimizers_config = dict(
            generator=dict(type='Adam', lr=0.0001),
            disc=dict(type='Adam', lr=0.0001))
        optims = build_optimizers(tsinpaintor, optimizers_config)

        # check train_step with standard deepfillv2 model
        outputs = tsinpaintor.train_step(data_batch, optims)
        assert outputs['num_samples'] == 2
        log_vars = outputs['log_vars']
        assert 'real_loss' in log_vars
        assert 'stage1_loss_l1_hole' in log_vars
        assert 'stage1_loss_l1_valid' in log_vars
        assert 'stage2_loss_l1_hole' in log_vars
        assert 'stage2_loss_l1_valid' in log_vars
        assert 'stage1_fake_res' in outputs['results']
        assert 'stage2_fake_res' in outputs['results']
        assert outputs['results']['stage1_fake_res'].size() == (2, 3, 256,
                                                                256)

        # check train step w/o disc step (disc_step=0: no disc loss logged)
        tsinpaintor.train_cfg.disc_step = 0
        outputs = tsinpaintor.train_step(data_batch, optims)
        assert outputs['num_samples'] == 2
        log_vars = outputs['log_vars']
        assert 'real_loss' not in log_vars
        assert 'stage1_loss_l1_hole' in log_vars
        assert 'stage1_loss_l1_valid' in log_vars
        assert 'stage2_loss_l1_hole' in log_vars
        assert 'stage2_loss_l1_valid' in log_vars
        assert 'stage1_fake_res' in outputs['results']
        assert 'stage2_fake_res' in outputs['results']
        assert outputs['results']['stage1_fake_res'].size() == (2, 3, 256,
                                                                256)
        tsinpaintor.train_cfg.disc_step = 1

        # check train step w/ multiple disc step
        tsinpaintor.train_cfg.disc_step = 5
        outputs = tsinpaintor.train_step(data_batch, optims)
        assert outputs['num_samples'] == 2
        log_vars = outputs['log_vars']
        assert 'real_loss' in log_vars
        # with disc_step=5 no stage1 generator loss is expected in log_vars
        assert 'stage1_loss_l1_hole' not in log_vars
        assert outputs['results']['fake_res'].size() == (2, 3, 256, 256)
        tsinpaintor.train_cfg.disc_step = 1

        # test forward test w/o save image
        outputs = tsinpaintor.forward_test(
            masked_img[0:1], mask[0:1], gt_img=gt_img[0:1, ...])
        assert 'eval_results' in outputs
        assert outputs['eval_results']['l1'] > 0
        assert outputs['eval_results']['psnr'] > 0
        assert outputs['eval_results']['ssim'] > 0

        # test forward test w/o eval metrics
        tsinpaintor.test_cfg = dict()
        tsinpaintor.eval_with_metrics = False
        outputs = tsinpaintor.forward_test(masked_img[0:1], mask[0:1])
        for key in [
                'stage1_fake_res', 'stage2_fake_res', 'fake_res', 'fake_img'
        ]:
            assert outputs[key].size() == (1, 3, 256, 256)

        # test forward test w/ save image
        with tempfile.TemporaryDirectory() as tmpdir:
            outputs = tsinpaintor.forward_test(
                masked_img[0:1],
                mask[0:1],
                save_image=True,
                save_path=tmpdir,
                iteration=4396,
                meta=[dict(gt_img_path='igccc.png')])
            assert os.path.exists(os.path.join(tmpdir, 'igccc_4396.png'))

        # test forward test w/ save image w/ gt_img
        with tempfile.TemporaryDirectory() as tmpdir:
            outputs = tsinpaintor.forward_test(
                masked_img[0:1],
                mask[0:1],
                save_image=True,
                save_path=tmpdir,
                meta=[dict(gt_img_path='igccc.png')],
                gt_img=gt_img[0:1, ...])
            assert os.path.exists(os.path.join(tmpdir, 'igccc.png'))

            # saving with an iteration number but no meta must fail
            with pytest.raises(AssertionError):
                outputs = tsinpaintor.forward_test(
                    masked_img[0:1],
                    mask[0:1],
                    save_image=True,
                    save_path=tmpdir,
                    iteration=4396,
                    gt_img=gt_img[0:1, ...])
            # saving without a save_path must fail
            with pytest.raises(AssertionError):
                outputs = tsinpaintor.forward_test(
                    masked_img[0:1],
                    mask[0:1],
                    save_image=True,
                    save_path=None,
                    iteration=4396,
                    meta=[dict(gt_img_path='igccc.png')],
                    gt_img=gt_img[0:1, ...])

        # check train_step with not implemented loss type
        with pytest.raises(NotImplementedError):
            model_ = copy.deepcopy(model)
            model_['stage1_loss_type'] = ('igccc', )
            tsinpaintor = TwoStageInpaintor(
                **model_, train_cfg=train_cfg, test_cfg=test_cfg).cuda()
            outputs = tsinpaintor.train_step(data_batch, optims)

        # test input w/o ones and disc input w/o mask
        model_ = dict(
            disc_input_with_mask=False,
            input_with_ones=False,
            encdec=dict(
                type='DeepFillEncoderDecoder',
                stage1=dict(
                    type='GLEncoderDecoder',
                    encoder=dict(
                        type='DeepFillEncoder',
                        in_channels=4,
                        conv_type='gated_conv',
                        channel_factor=0.75),
                    decoder=dict(
                        type='DeepFillDecoder',
                        conv_type='gated_conv',
                        in_channels=96,
                        channel_factor=0.75),
                    dilation_neck=dict(
                        type='GLDilationNeck',
                        in_channels=96,
                        conv_type='gated_conv',
                        act_cfg=dict(type='ELU'))),
                stage2=dict(
                    type='DeepFillRefiner',
                    encoder_attention=dict(
                        type='DeepFillEncoder',
                        in_channels=4,
                        encoder_type='stage2_attention',
                        conv_type='gated_conv',
                        channel_factor=0.75),
                    encoder_conv=dict(
                        type='DeepFillEncoder',
                        in_channels=4,
                        encoder_type='stage2_conv',
                        conv_type='gated_conv',
                        channel_factor=0.75),
                    dilation_neck=dict(
                        type='GLDilationNeck',
                        in_channels=96,
                        conv_type='gated_conv',
                        act_cfg=dict(type='ELU')),
                    contextual_attention=dict(
                        type='ContextualAttentionNeck',
                        in_channels=96,
                        conv_type='gated_conv'),
                    decoder=dict(
                        type='DeepFillDecoder',
                        in_channels=192,
                        conv_type='gated_conv'))),
            disc=dict(
                type='MultiLayerDiscriminator',
                in_channels=3,
                max_channels=256,
                fc_in_channels=256 * 4 * 4,
                fc_out_channels=1,
                num_convs=6,
                norm_cfg=None,
                act_cfg=dict(type='ELU'),
                out_act_cfg=dict(type='LeakyReLU', negative_slope=0.2),
                with_spectral_norm=True,
            ),
            stage1_loss_type=('loss_l1_hole', 'loss_l1_valid'),
            stage2_loss_type=('loss_l1_hole', 'loss_l1_valid', 'loss_gan'),
            loss_gan=dict(
                type='GANLoss',
                gan_type='hinge',
                loss_weight=1,
            ),
            loss_l1_hole=dict(
                type='L1Loss',
                loss_weight=1.0,
            ),
            loss_gp=dict(type='GradientPenaltyLoss', loss_weight=10.),
            loss_tv=dict(
                type='MaskedTVLoss',
                loss_weight=0.1,
            ),
            loss_l1_valid=dict(
                type='L1Loss',
                loss_weight=1.0,
            ),
            pretrained=None)
        tsinpaintor = TwoStageInpaintor(
            **model_, train_cfg=train_cfg, test_cfg=test_cfg).cuda()
        outputs = tsinpaintor.train_step(data_batch, optims)
        assert outputs['num_samples'] == 2
        log_vars = outputs['log_vars']
        assert 'real_loss' in log_vars
        assert 'stage1_loss_l1_hole' in log_vars
        assert 'stage1_loss_l1_valid' in log_vars
        assert 'stage2_loss_l1_hole' in log_vars
        assert 'stage2_loss_l1_valid' in log_vars
        assert 'stage1_fake_res' in outputs['results']
        assert 'stage2_fake_res' in outputs['results']
        assert outputs['results']['stage1_fake_res'].size() == (2, 3, 256,
                                                                256)
        outputs = tsinpaintor.forward_test(
            masked_img[0:1], mask[0:1], gt_img=gt_img[0:1, ...])
        assert 'eval_results' in outputs
        assert outputs['eval_results']['l1'] > 0

        # test w/o stage1 loss
        model_ = copy.deepcopy(model)
        model_['stage1_loss_type'] = None
        tsinpaintor = TwoStageInpaintor(
            **model_, train_cfg=train_cfg, test_cfg=test_cfg).cuda()
        outputs = tsinpaintor.train_step(data_batch, optims)
        assert outputs['num_samples'] == 2
        log_vars = outputs['log_vars']
        assert 'real_loss' in log_vars
        assert 'stage1_loss_l1_hole' not in log_vars
        assert 'stage1_loss_l1_valid' not in log_vars
        assert 'stage2_loss_l1_hole' in log_vars
        assert 'stage2_loss_l1_valid' in log_vars
        assert 'stage1_fake_res' in outputs['results']
        assert 'stage2_fake_res' in outputs['results']
        assert outputs['results']['stage1_fake_res'].size() == (2, 3, 256,
                                                                256)

        # test w/o stage2 loss
        model_ = copy.deepcopy(model)
        model_['stage2_loss_type'] = None
        tsinpaintor = TwoStageInpaintor(
            **model_, train_cfg=train_cfg, test_cfg=test_cfg).cuda()
        outputs = tsinpaintor.train_step(data_batch, optims)
        assert outputs['num_samples'] == 2
        log_vars = outputs['log_vars']
        assert 'real_loss' in log_vars
        assert 'stage1_loss_l1_hole' in log_vars
        assert 'stage1_loss_l1_valid' in log_vars
        assert 'stage2_loss_l1_hole' not in log_vars
        assert 'stage2_loss_l1_valid' not in log_vars
        assert 'stage1_fake_res' in outputs['results']
        assert 'stage2_fake_res' in outputs['results']
        assert outputs['results']['stage1_fake_res'].size() == (2, 3, 256,
                                                                256)
def test_build_optimizers():
    """Test ``build_optimizers`` for multi- and single-optimizer configs.

    Covers four cases: a dict-of-configs (one optimizer per sub-module) and
    a single flat config, each with a plain ``ExampleModel`` and with a
    ``torch.nn.DataParallel``-wrapped one.
    """
    base_lr = 0.0001
    base_wd = 0.0002
    momentum = 0.9

    # basic config with ExampleModel
    optimizer_cfg = dict(
        model1=dict(
            type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum),
        model2=dict(
            type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum))
    model = ExampleModel()
    optimizers = build_optimizers(model, optimizer_cfg)
    param_dict = dict(model.named_parameters())
    assert isinstance(optimizers, dict)
    for i in range(2):
        optimizer = optimizers[f'model{i+1}']
        param_groups = optimizer.param_groups[0]
        assert isinstance(optimizer, torch.optim.SGD)
        assert optimizer.defaults['lr'] == base_lr
        assert optimizer.defaults['momentum'] == momentum
        assert optimizer.defaults['weight_decay'] == base_wd
        assert len(param_groups['params']) == 2
        assert torch.equal(param_groups['params'][0],
                           param_dict[f'model{i+1}.weight'])
        assert torch.equal(param_groups['params'][1],
                           param_dict[f'model{i+1}.bias'])

    # basic config with Parallel model (params gain the 'module.' prefix)
    model = torch.nn.DataParallel(ExampleModel())
    optimizers = build_optimizers(model, optimizer_cfg)
    param_dict = dict(model.named_parameters())
    assert isinstance(optimizers, dict)
    for i in range(2):
        optimizer = optimizers[f'model{i+1}']
        param_groups = optimizer.param_groups[0]
        assert isinstance(optimizer, torch.optim.SGD)
        assert optimizer.defaults['lr'] == base_lr
        assert optimizer.defaults['momentum'] == momentum
        assert optimizer.defaults['weight_decay'] == base_wd
        assert len(param_groups['params']) == 2
        assert torch.equal(param_groups['params'][0],
                           param_dict[f'module.model{i+1}.weight'])
        assert torch.equal(param_groups['params'][1],
                           param_dict[f'module.model{i+1}.bias'])

    # basic config with ExampleModel (one optimizer)
    optimizer_cfg = dict(
        type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum)
    model = ExampleModel()
    optimizer = build_optimizers(model, optimizer_cfg)
    param_dict = dict(model.named_parameters())
    # BUGFIX: the original asserted ``isinstance(optimizers, dict)`` on the
    # stale variable left over from the multi-optimizer case above, which
    # passed vacuously. A flat config must yield a single bare optimizer.
    assert not isinstance(optimizer, dict)
    param_groups = optimizer.param_groups[0]
    assert isinstance(optimizer, torch.optim.SGD)
    assert optimizer.defaults['lr'] == base_lr
    assert optimizer.defaults['momentum'] == momentum
    assert optimizer.defaults['weight_decay'] == base_wd
    assert len(param_groups['params']) == 4
    assert torch.equal(param_groups['params'][0], param_dict['model1.weight'])
    assert torch.equal(param_groups['params'][1], param_dict['model1.bias'])
    assert torch.equal(param_groups['params'][2], param_dict['model2.weight'])
    assert torch.equal(param_groups['params'][3], param_dict['model2.bias'])

    # basic config with Parallel model (one optimizer)
    model = torch.nn.DataParallel(ExampleModel())
    optimizer = build_optimizers(model, optimizer_cfg)
    param_dict = dict(model.named_parameters())
    # same stale-variable fix as above
    assert not isinstance(optimizer, dict)
    param_groups = optimizer.param_groups[0]
    assert isinstance(optimizer, torch.optim.SGD)
    assert optimizer.defaults['lr'] == base_lr
    assert optimizer.defaults['momentum'] == momentum
    assert optimizer.defaults['weight_decay'] == base_wd
    assert len(param_groups['params']) == 4
    assert torch.equal(param_groups['params'][0],
                       param_dict['module.model1.weight'])
    assert torch.equal(param_groups['params'][1],
                       param_dict['module.model1.bias'])
    assert torch.equal(param_groups['params'][2],
                       param_dict['module.model2.weight'])
    assert torch.equal(param_groups['params'][3],
                       param_dict['module.model2.bias'])