def _non_dist_train(model, dataset, cfg, validate=False, **kwargs): logger = get_root_logger(cfg.log_level) # prepare data loaders data_loaders = [ build_dataloader( dataset, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False, repeat_samples=cfg.train_cfg.repeat_samples, **kwargs) ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner( model, #batch_processor=batch_processor, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger) #runner = Runner(model, batch_processor, optimizer, cfg.work_dir, # logger) runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, cfg.checkpoint_config, cfg.log_config) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False, **kwargs): logger = get_root_logger(cfg.log_level) # prepare data loaders data_loaders = [ build_dataloader( dataset, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True, repeat_samples=cfg.train_cfg.repeat_samples, **kwargs) ] # put model on gpus model = MMDistributedDataParallel(model.cuda(), device_ids=[torch.cuda.current_device()]) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner( model, #batch_processor=batch_processor, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger) #runner = Runner(model, batch_processor, optimizer, cfg.work_dir, # logger) # register hooks optimizer_config = DistOptimizerHook(**cfg.optimizer_config) runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset_cfg = cfg.data.val if isinstance(model.module, RPN): # TODO: implement recall hooks for other datasets runner.register_hook(CocoDistEvalRecallHook(val_dataset_cfg)) else: dataset_type = getattr(datasets, val_dataset_cfg.type) if issubclass(dataset_type, datasets.CocoDataset): runner.register_hook(CocoDistEvalmAPHook(val_dataset_cfg)) else: runner.register_hook(DistEvalmAPHook(val_dataset_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_source_detector(model, dataset, cfg, distributed=False, timestamp=None, meta=None): # dataset: [src_dataset] logger = get_root_logger(cfg.log_level) data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build model optimizer optimizer = build_optimizer(model, cfg.optimizer) # build runner runner = EpochBasedRunner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) runner.timestamp = timestamp # register hooks; no momentum_config; optimizer_config is required by EpochBasedRunner runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, checkpoint_config=cfg.checkpoint_config, log_config=cfg.log_config) # if distributed: # runner.register_hook(DistSamplerSeedHook()) # if cfg.resume_from: # runner.resume(cfg.resume_from) if cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, logger, validate=False): # prepare data loaders data_loaders = [ build_dataloader(dataset, cfg.data.videos_per_gpu, cfg.data.workers_per_gpu, dist=True) ] # put model on gpus find_unused_parameters = cfg.get('find_unused_parameters', False) # Start: vj changes find_unused_parameters = True # End: vj chagnes model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner(model, batch_processor, optimizer, cfg.work_dir, logger) # register hooks optimizer_config = DistOptimizerHook(**cfg.optimizer_config) runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: if cfg.data.val.type in ['RawFramesDataset', 'VideoDataset']: runner.register_hook( DistEvalTopKAccuracyHook(cfg.data.val, k=(1, 5))) if cfg.data.val.type == 'AVADataset': runner.register_hook(AVADistEvalmAPHook(cfg.data.val)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, logger, validate=False): # prepare data loaders data_loaders = [ build_dataloader(dataset, cfg.data.videos_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False) ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(cfg.optimizer) runner = EpochBasedRunner(model, batch_processor, optimizer, cfg.work_dir, logger) runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, cfg.checkpoint_config, cfg.log_config) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """Train model entry function. Args: model (nn.Module): The model to be trained. dataset (:obj:`Dataset`): Train dataset. cfg (dict): The config dict for training. distributed (bool): Whether to use distributed training. Default: False. validate (bool): Whether to do evaluation. Default: False. timestamp (str | None): Local time for runner. Default: None. meta (dict | None): Meta dict to record some important information. Default: None """ logger = get_root_logger(log_level=cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', {}), workers_per_gpu=cfg.data.get('workers_per_gpu', {}), # cfg.gpus will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) dataloader_setting = dict(dataloader_setting, **cfg.data.get('train_dataloader', {})) data_loaders = [ build_dataloader(ds, **dataloader_setting) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) if validate: eval_cfg = cfg.get('evaluation', {}) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) dataloader_setting = dict( videos_per_gpu=cfg.data.get('videos_per_gpu', {}), workers_per_gpu=cfg.data.get('workers_per_gpu', {}), # cfg.gpus will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, shuffle=False) dataloader_setting = dict(dataloader_setting, **cfg.data.get('val_dataloader', {})) val_dataloader = build_dataloader(val_dataset, **dataloader_setting) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """Train model entry function. Args: model (nn.Module): The model to be trained. dataset (:obj:`Dataset`): Train dataset. cfg (dict): The config dict for training. distributed (bool): Whether to use distributed training. Default: False. validate (bool): Whether to do evaluation. Default: False. timestamp (str | None): Local time for runner. Default: None. meta (dict | None): Meta dict to record some important information. Default: None """ logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), samples_per_epoch=cfg.data.get('samples_per_epoch', None), dist=distributed, seed=cfg.seed, persistent_workers=cfg.data.get('persistent_workers', False)) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) if find_unused_parameters: logger.info('set find_unused_parameters = True in DDP') # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model, device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) optimizer_config = cfg.optimizer_config if 'type' not in cfg.optimizer_config: optimizer_config.type = 'Fp16OptimizerHook' \ if fp16_cfg else 'OptimizerHook' if fp16_cfg: optimizer_config.update(fp16_cfg) if 'Fp16' in optimizer_config.type: optimizer_config.update(distributed=distributed) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False, persistent_workers=cfg.data.get('persistent_workers', False)) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_reid(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger() # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ingored if distributed len(cfg.gpu_ids), dist=distributed, sampler=cfg.data.get('sampler', None), seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Set the `find_unused_parameters` parameter in torch's DDP model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) runner.timestamp = timestamp # register hooks runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hook if validate: test_dataset = build_dataset(cfg.data.test) test_dataloader = build_dataloader( test_dataset, samples_per_gpu=cfg.data.samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) runner.register_hook(EvalHook(test_dataloader, **eval_cfg)) # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got: {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'each item in custom hooks expects dict type, but got: ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) logger.info(f'Register custom hook: {hook_cfg.type}') runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, round_up=True, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: logger.info("Start using FP16 mixed precision training.") optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = DistOptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config lr_config = get_lr_hook(cfg.lr_config, HOOKS) # register hooks runner.register_training_hooks(lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val) val_dataset.test_mode = True val_dataloader = build_dataloader( val_dataset, samples_per_gpu=cfg.data.samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False, round_up=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False, logger=None, timestamp=None, meta=None): # use batch size instead of per-gpu batch size if getattr(cfg.data, 'batch_size', False): cfg.data.imgs_per_gpu = int(cfg.data.batch_size) print_log( f"Using {cfg.data.imgs_per_gpu} per gpu for batch size {cfg.data.batch_size}") # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus, dist=False, shuffle=True, replace=getattr(cfg.data, 'sampling_replace', False), seed=cfg.seed, drop_last=getattr(cfg.data, 'drop_last', False), prefetch=cfg.prefetch, img_norm_cfg=cfg.img_norm_cfg) for ds in dataset ] if 'use_fp16' in cfg and cfg.use_fp16 == True: raise NotImplementedError('apex do not support non_dist_train!') # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() # build runner optimizer = build_optimizer(model, cfg.optimizer) optimizer_config = NonDistOptimizerHook(**cfg.optimizer_config) if not cfg.get('by_iter', False): runner = EpochBasedRunner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) else: runner = IterBasedRunner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) # register custom hooks for hook in cfg.get('custom_hooks', ()): if hook.type == 'DeepClusterHook': common_params = dict(dist_mode=False, data_loaders=data_loaders) else: common_params = dict(dist_mode=False) runner.register_hook(build_hook(hook, common_params)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) if not cfg.get('by_iter', False): runner.run(data_loaders, cfg.workflow, cfg.total_epochs) else: runner.run(data_loaders, cfg.workflow, cfg.total_iters)
def train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """Train model entry function. Args: model (nn.Module): The model to be trained. dataset (Dataset): Train dataset. cfg (dict): The config dict for training. distributed (bool): Whether to use distributed training. Default: False. validate (bool): Whether to do evaluation. Default: False. timestamp (str | None): Local time for runner. Default: None. meta (dict | None): Meta dict to record some important information. Default: None """ logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] # step 1: give default values and override (if exist) from cfg.data loader_cfg = { **dict( seed=cfg.get('seed'), drop_last=False, dist=distributed, num_gpus=len(cfg.gpu_ids)), **({} if torch.__version__ != 'parrots' else dict( prefetch_num=2, pin_memory=False, )), **dict((k, cfg.data[k]) for k in [ 'samples_per_gpu', 'workers_per_gpu', 'shuffle', 'seed', 'drop_last', 'prefetch_num', 'pin_memory', 'persistent_workers', ] if k in cfg.data) } # step 2: cfg.data.train_dataloader has highest priority train_loader_cfg = dict(loader_cfg, **cfg.data.get('train_dataloader', {})) data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset] # determine whether use adversarial training precess or not use_adverserial_train = cfg.get('use_adversarial_train', False) # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', True) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel if use_adverserial_train: # Use DistributedDataParallelWrapper for adversarial training model = DistributedDataParallelWrapper( model, device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizers(model, cfg.optimizer) runner = EpochBasedRunner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp if use_adverserial_train: # The optimizer step process is included in the train_step function # of the model, so the runner should NOT include optimizer hook. optimizer_config = None else: # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: eval_cfg = cfg.get('evaluation', {}) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) dataloader_setting = dict( samples_per_gpu=1, workers_per_gpu=cfg.data.get('workers_per_gpu', 1), # cfg.gpus will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, drop_last=False, shuffle=False) dataloader_setting = dict(dataloader_setting, **cfg.data.get('val_dataloader', {})) val_dataloader = build_dataloader(val_dataset, **dataloader_setting) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): if type(cfg) != list: logger = get_root_logger(cfg.log_level) # prepare data loaders dataset_L = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning( '"imgs_per_gpu" is deprecated in MMDet V2.0. Please use "samples_per_gpu" instead' ) if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and "samples_per_gpu"={cfg.data.samples_per_gpu},' f' "imgs_per_gpu"={cfg.data.imgs_per_gpu} is used in this experiments' ) else: logger.warning( f'Automatically set "samples_per_gpu"="imgs_per_gpu"={cfg.data.imgs_per_gpu} in this experiments' ) cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders_L = [ build_dataloader(ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset_L ] data_loaders_U = None else: cfg = cfg[0] # config used in this file are the same for cfg and cfg_u logger = get_root_logger(cfg.log_level) # prepare data loaders dataset_U = dataset[1] dataset_L = dataset[0] if 'imgs_per_gpu' in cfg.data: logger.warning( '"imgs_per_gpu" is deprecated in MMDet V2.0. Please use "samples_per_gpu" instead' ) if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and "samples_per_gpu"={cfg.data.samples_per_gpu},' f' "imgs_per_gpu"={cfg.data.imgs_per_gpu} is used in this experiments' ) else: logger.warning( f'Automatically set "samples_per_gpu"="imgs_per_gpu"= {cfg.data.imgs_per_gpu} in this experiments' ) cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders_L = [ build_dataloader(ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset_L ] data_loaders_U = [ build_dataloader(ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset_U ] # put model on gpus if distributed: # find_unused_parameters = True # cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=True) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner params = filter(lambda p: p.requires_grad, model.parameters()) optimizer = torch.optim.SGD(params, lr=cfg.optimizer.lr, momentum=0.9, weight_decay=0.0005) runner = EpochBasedRunner(model, optimizer=optimizer, work_dir=cfg.work_directory, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance( custom_hooks, list ), f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance( hook_cfg, dict ), f'Each item in custom_hooks expects dict type, but got {type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) if data_loaders_U is None: runner.run(data_loaders_L, cfg.workflow, cfg.total_epochs) else: runner.run([data_loaders_L, data_loaders_U], cfg.workflow, cfg.total_epochs)
def train_network(model, dataset, cfg, distributed=False, validate=False, logger=None): if logger is None: logger = get_root_logger(cfg.log_level) multiprocessing_context = None if cfg.get('numpy_seed_hook', True) and cfg.data.workers_per_gpu > 0: # https://github.com/pytorch/pytorch/issues/5059 logger.info("Use spawn method for dataloader.") multiprocessing_context = 'spawn' data_loaders = [ build_dataloader(dataset, cfg.data.videos_per_gpu, cfg.data.workers_per_gpu, num_gpus=cfg.gpus, dist=distributed, multiprocessing_context=multiprocessing_context) ] # start training if distributed: find_unused_parameters = cfg.get('find_unused_parameters', True) # convert to syncbn if cfg.get('syncbn', False): model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) optimizer_config = DistOptimizerHook(**cfg.optimizer_config) else: model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() optimizer_config = cfg.optimizer_config optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, imgs_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False, drop_last=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs) logger.info("Finish training... Exit... ")
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) classes_rearrange = cfg.get('classes_rearrange', False) if classes_rearrange: runner.model = rearrange_classes(runner.model, cfg.classes, cfg.dataset_type) runner.run(data_loaders, cfg.workflow, cfg.total_epochs) import os if os.path.exists( "/workspace/code/configs/dcn/cascade_rcnn_r50_sfpn_dconv_c3-c5_1x_coco.py" ): checkpoint = torch.load( '/workspace/code/configs/dcn/cascade_rcnn_r50_sfpn_dconv_c3-c5_1x_coco.py', map_location='cpu') # remove optimizer for smaller file size if 'optimizer' in checkpoint: del checkpoint['optimizer'] if os.path.exists( "/workspace/code/work_dirs/cas_dcn_r50_temp/epoch_36.pth"): os.remove( "/workspace/code/work_dirs/cas_dcn_r50_temp/epoch_36.pth") torch.save(checkpoint, "/workspace/code/work_dirs/cas_dcn_r50_temp/epoch_36.pth") if os.path.exists( "/workspace/code/configs/dcn/cascade_rcnn_r50_sfpn_dconv_c3-c5_1x_coco.py" ): os.remove( "/workspace/code/configs/dcn/cascade_rcnn_r50_sfpn_dconv_c3-c5_1x_coco.py" )
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # if just swa training is performed, # skip building the runner for the traditional training if not cfg.get('only_swa_training', False): # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: # Support batch_size > 1 in validation val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) if val_samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.val.pipeline = replace_ImageToTensor( cfg.data.val.pipeline) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=val_samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs) else: # if just swa training is performed, there should be a starting model assert cfg.swa_resume_from is not None or cfg.swa_load_from is not None # perform swa training # build swa training runner if not cfg.get('swa_training', False): return from mmdet.core import SWAHook logger.info('Start SWA training') swa_optimizer = build_optimizer(model, cfg.swa_optimizer) swa_runner = EpochBasedRunner(model, optimizer=swa_optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same swa_runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: swa_optimizer_config = Fp16OptimizerHook(**cfg.swa_optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.swa_optimizer_config: swa_optimizer_config = OptimizerHook(**cfg.swa_optimizer_config) else: swa_optimizer_config = cfg.swa_optimizer_config # register hooks swa_runner.register_training_hooks(cfg.swa_lr_config, swa_optimizer_config, cfg.swa_checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: swa_runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: # Support batch_size > 1 in validation val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) if val_samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.val.pipeline = replace_ImageToTensor( cfg.data.val.pipeline) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=val_samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook swa_runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) swa_eval = True swa_eval_hook = eval_hook(val_dataloader, **eval_cfg) else: swa_eval = False swa_eval_hook = None # register swa hook swa_hook = SWAHook(swa_eval=swa_eval, eval_hook=swa_eval_hook) swa_runner.register_hook(swa_hook, priority='LOW') # register user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) swa_runner.register_hook(hook, priority=priority) if cfg.swa_resume_from: swa_runner.resume(cfg.swa_resume_from) elif cfg.swa_load_from: # use the best pretrained model as the starting model for swa training if cfg.swa_load_from == 'best_bbox_mAP.pth': best_model_path = os.path.join(cfg.work_dir, cfg.swa_load_from) assert os.path.exists(best_model_path) # avoid the best pretrained model being overwritten new_best_model_path = os.path.join(cfg.work_dir, 'best_bbox_mAP_pretrained.pth') os.rename(best_model_path, new_best_model_path) cfg.swa_load_from = new_best_model_path swa_runner.load_checkpoint(cfg.swa_load_from) swa_runner.run(data_loaders, cfg.workflow, cfg.swa_total_epochs)
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner print(cfg) print("-"*20) print(cfg.optimizer) optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: # Support batch_size > 1 in validation val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) if val_samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.val.pipeline = replace_ImageToTensor( cfg.data.val.pipeline) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=val_samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_segmentor(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """Launch segmentor training.""" logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed, drop_last=True) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=True) # TODO change else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) #TODO replace iteration based runner with epoch based runner # if cfg.get('runner') is None: # cfg.runner = {'type': 'IterBasedRunner', 'max_iters': cfg.total_iters} if cfg.get('runner') is None: cfg.runner = {'type': 'EpochBasedRunner'} # warnings.warn( # 'config is now expected to have a `runner` section, ' # 'please set `runner` in your config.', UserWarning) # runner = build_runner( # cfg.runner, # default_args=dict( # model=model, # batch_processor=None, # optimizer=optimizer, # work_dir=cfg.work_dir, # logger=logger, # meta=meta)) runner = EpochBasedRunner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # runner = Runner(model, bp, optimizer, cfg.work_dir, # cfg.log_level) # register hooks runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', True) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) runner = EpochBasedRunner(model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None): # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] # use batch size instead of per-gpu batch size if getattr(cfg.data, 'batch_size', False): num_gpus = torch.cuda.device_count() cfg.data.imgs_per_gpu = int(cfg.data.batch_size // num_gpus) print_log( f"Using {cfg.data.imgs_per_gpu} per gpu for batch size {cfg.data.batch_size}") data_loaders = [ build_dataloader( ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True, shuffle=True, replace=getattr(cfg.data, 'sampling_replace', False), seed=cfg.seed, drop_last=getattr(cfg.data, 'drop_last', False), prefetch=cfg.prefetch, img_norm_cfg=cfg.img_norm_cfg) for ds in dataset ] optimizer = build_optimizer(model, cfg.optimizer) if 'use_fp16' in cfg and cfg.use_fp16: model, optimizer = apex.amp.initialize( model.cuda(), optimizer, opt_level="O1") print_log('**** Initializing mixed precision done. ****') # put model on gpus model = MMDistributedDataParallel( model if next(model.parameters()).is_cuda else model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) # build runner if not cfg.get('by_iter', False): runner = EpochBasedRunner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) else: runner = IterBasedRunner( model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp optimizer_config = DistOptimizerHook(**cfg.optimizer_config) # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config) if not cfg.get('by_iter', False): runner.register_hook(DistSamplerSeedHook()) # register custom hooks for hook in cfg.get('custom_hooks', ()): if hook.type == 'DeepClusterHook': common_params = dict(dist_mode=True, data_loaders=data_loaders) else: common_params = dict(dist_mode=True) runner.register_hook(build_hook(hook, common_params)) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) if not cfg.get('by_iter', False): runner.run(data_loaders, cfg.workflow, cfg.total_epochs) else: runner.run(data_loaders, cfg.workflow, cfg.total_iters)