def _non_dist_train(model, dataset_names, cfg, validate=False, **kwargs):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.tasks_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False,
                         customized_sampler=not dataset.test_mode)
        for dataset in dataset_names
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processors[cfg.batch_processor_type],
                    cfg.optimizer, cfg.work_dir, cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # resume
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs, **kwargs)
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
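# Most snippets in this collection pass a `batch_processor` callback to mmcv's
# Runner without showing it. As a hedged sketch (not any particular repo's
# implementation), such a callback usually mirrors the pattern that appears
# later in this collection: run the model, reduce the loss dict to a scalar,
# and return loss/log_vars/num_samples for the hooks. The 'img' key below is
# an illustrative assumption.
def batch_processor(model, data, train_mode):
    # forward pass; detectors built on mmcv typically return a dict of losses
    losses = model(**data)
    # sum tensor / list-of-tensor entries into per-name scalars
    log_vars = {}
    for name, value in losses.items():
        log_vars[name] = (sum(v.mean() for v in value)
                          if isinstance(value, (list, tuple)) else value.mean())
    # total loss is the sum of every entry whose name contains 'loss'
    loss = sum(v for k, v in log_vars.items() if 'loss' in k)
    log_vars['loss'] = loss
    log_vars = {k: v.item() for k, v in log_vars.items()}
    # num_samples assumes the batch is keyed by 'img' (a DataContainer)
    num_samples = len(data['img'].data) if 'img' in data else 1
    return dict(loss=loss, log_vars=log_vars, num_samples=num_samples)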
def _non_dist_train(model, dataset, cfg, validate=False):
    # ipdb.set_trace()
    # prepare data loaders
    # build_dataloader returns an iterator over the dataset, wrapped with
    # PyTorch's DataLoader
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus
    # device_ids is a range rather than a list; for a single GPU it is
    # range(0, 1), so only device 0 is used
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    # this only hands the components to the Runner; nothing is built or run
    # yet, much like building a model only registers modules without wiring
    # up the forward pass
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    # register hooks
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # resume from a checkpoint or load pretrained weights
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train(
    work_dir,
    model_cfg,
    loss_cfg,
    dataset_cfg,
    optimizer_cfg,
    total_epochs,
    training_hooks,
    batch_size=None,
    gpu_batch_size=None,
    workflow=[('train', 1)],
    gpus=-1,
    log_level=0,
    workers=4,
    resume_from=None,
    load_from=None,
):
    # calculate batch size
    if gpus < 0:
        gpus = torch.cuda.device_count()
    if (batch_size is None) and (gpu_batch_size is not None):
        batch_size = gpu_batch_size * gpus
    assert batch_size is not None, 'Please appoint batch_size or gpu_batch_size.'

    # prepare data loaders
    if isinstance(dataset_cfg, dict):
        dataset_cfg = [dataset_cfg]
    data_loaders = [
        torch.utils.data.DataLoader(dataset=call_obj(**d),
                                    batch_size=batch_size,
                                    shuffle=True,
                                    num_workers=workers,
                                    drop_last=True) for d in dataset_cfg
    ]

    # put model on gpus
    if isinstance(model_cfg, list):
        model = [call_obj(**c) for c in model_cfg]
        model = torch.nn.Sequential(*model)
    else:
        model = call_obj(**model_cfg)
    model.apply(weights_init)
    model = MMDataParallel(model, device_ids=range(gpus)).cuda()
    loss = call_obj(**loss_cfg)

    # build runner
    optimizer = call_obj(params=model.parameters(), **optimizer_cfg)
    runner = Runner(model, batch_processor, optimizer, work_dir, log_level)
    runner.register_training_hooks(**training_hooks)

    if resume_from:
        runner.resume(resume_from)
    elif load_from:
        runner.load_checkpoint(load_from)

    # run
    workflow = [tuple(w) for w in workflow]
    runner.run(data_loaders, workflow, total_epochs, loss=loss)
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.01,
                                momentum=0.9,
                                weight_decay=0.0001)
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    get_root_logger(cfg.log_level))
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False, **kwargs):
    # prepare data loaders
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
            # repeat_samples=cfg.train_cfg.repeat_samples,
            **kwargs)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    logger = _init_logger(log_dir=cfg.work_dir,
                          level=getattr(logging, cfg.log_level))
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False):
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in not-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True)
        for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level, cfg=cfg)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
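# Several variants above and below switch to an Fp16OptimizerHook when the
# config contains an `fp16` section. A minimal sketch of the config fragment
# that drives this branch (values are illustrative assumptions, not taken
# from any specific experiment):
fp16 = dict(loss_scale=512.)  # presence of this dict enables the fp16 branch
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))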
def _non_dist_train(model, train_dataset, cfg, validate=False):
    build_fn = lambda dataset, cfg_dict: build_dataloader(
        dataset,
        cfg.data.imgs_per_gpu,
        cfg.data.workers_per_gpu,
        cfg.gpus,
        dist=False,
        balanced=cfg_dict.get('balanced', False))

    datasets = [train_dataset]
    data_loaders = [build_fn(train_dataset, cfg.data.train)]
    if validate:
        val_dataset = get_dataset(cfg.data.val)
        datasets.append(val_dataset)
        data_loaders.append(build_fn(val_dataset, cfg.data.val))

    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # load checkpoint
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    # training
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            len(cfg.gpus.train),
            dist=False)
    ]
    print('dataloader built')
    # put model on gpus
    model = MMDataParallel(model, device_ids=cfg.gpus.train).cuda()
    print('model paralleled')
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    '''
    # register eval hooks
    if validate:
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            runner.register_hook(CocoDistEvalRecallHook(cfg.data.val))
        else:
            if cfg.data.val.type == 'CocoDataset':
                runner.register_hook(CocoDistEvalmAPHook(cfg.data.val))
            else:
                runner.register_hook(DistEvalmAPHook(cfg.data.val))
    '''
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train(
    work_dir,
    model_cfg,
    loss_cfg,
    dataset_cfg,
    optimizer_cfg,
    batch_size,
    total_epochs,
    training_hooks,
    workflow=[('train', 1)],
    gpus=1,
    log_level=0,
    workers=2,
    resume_from=None,
    load_from=None,
):
    # prepare data loaders
    if isinstance(dataset_cfg, dict):
        dataset_cfg = [dataset_cfg]
    data_loaders = [
        torch.utils.data.DataLoader(dataset=call_obj(**d),
                                    batch_size=batch_size,
                                    shuffle=True,
                                    num_workers=workers,
                                    drop_last=True) for d in dataset_cfg
    ]

    # put model on gpus
    if isinstance(model_cfg, list):
        model = [call_obj(**c) for c in model_cfg]
        model = torch.nn.Sequential(*model)
    else:
        model = call_obj(**model_cfg)
    model.apply(weights_init)
    print("Model size: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))
    model = MMDataParallel(model, device_ids=range(gpus)).cuda()
    loss = call_obj(**loss_cfg)

    # build runner
    optimizer = call_obj(params=model.parameters(), **optimizer_cfg)
    runner = Runner(model, batch_processor, optimizer, work_dir, log_level)
    runner.register_training_hooks(**training_hooks)

    if resume_from:
        runner.resume(resume_from)
    elif load_from:
        runner.load_checkpoint(load_from)

    # run
    workflow = [tuple(w) for w in workflow]
    runner.run(data_loaders, workflow, total_epochs, loss=loss)
    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
def train(cfg_path, dataset_class):
    """Train with mmcv's Runner framework, using its hooks for lr updates and
    loss computation.

    1. The dataset outputs img/gt_bbox/label packed in DataContainer objects.
    2. The DataLoader's default_collate is replaced by a custom collate so the
       dataset's mixed data types are supported.
    3. The DataParallel wrapper is replaced by the custom MMDataParallel so
       DataContainer is supported.
    """
    # two default options
    distributed = False
    parallel = True

    # get cfg
    cfg = Config.fromfile(cfg_path)

    # set backends
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # get logger
    logger = get_root_logger(cfg.log_level)
    logger.info('Distributed training: {}'.format(distributed))
    logger.info('DataParallel training: {}'.format(parallel))

    # build model & detector
    # model = M2detDetector(cfg)
    model = M2detDetector(cfg)
    if not parallel:
        model = model.cuda()
    else:
        model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # prepare data & dataloader
    # Runner expects the dataloaders in a list: one dataloader per workflow
    dataset = get_dataset(cfg.data.train, dataset_class)
    batch_size = cfg.gpus * cfg.data.imgs_per_gpu
    num_workers = cfg.gpus * cfg.data.workers_per_gpu
    dataloader = [
        DataLoader(dataset,
                   batch_size=batch_size,
                   sampler=GroupSampler(dataset, cfg.data.imgs_per_gpu),
                   num_workers=num_workers,
                   collate_fn=partial(collate,
                                      samples_per_gpu=cfg.data.imgs_per_gpu),
                   pin_memory=False)
    ]

    # define runner and running type (1. resume, 2. load, 3. train/test)
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        # resume training, e.g. './work_dirs/ssd300_voc/latest.pth'
        runner.resume(cfg.resume_from,
                      map_location=lambda storage, loc: storage)
    elif cfg.load_from:
        # load weights for testing
        runner.load_checkpoint(cfg.load_from)

    # start training: the workflow decides between train and test
    runner.run(dataloader, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False):
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in not-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer,
                                cfg.get('optimizer_exclude_arch'))

    arch_name = None
    optimizer_arch = None
    if 'optimizer_arch' in cfg:
        raise NotImplementedError

    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    optimizer_arch,
                    cfg.work_dir,
                    cfg.log_level,
                    arch_name=arch_name)

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    optimizer_arch_config = cfg.optimizer_config

    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   optimizer_arch_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    if 'optimizer_arch' in cfg:
        raise NotImplementedError
    else:
        data_loaders = [
            build_dataloader(dataset, cfg.data.imgs_per_gpu,
                             cfg.data.workers_per_gpu, cfg.gpus, dist=False)
        ]
        runner.run(data_loaders, None, cfg.workflow, cfg.total_epochs)
def runner(self):

    def parse_losses(losses):
        log_vars = OrderedDict()
        for loss_name, loss_value in losses.items():
            if isinstance(loss_value, torch.Tensor):
                log_vars[loss_name] = loss_value.mean()
            elif isinstance(loss_value, list):
                log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
            else:
                raise TypeError(
                    '{} is not a tensor or list of tensors'.format(loss_name))

        loss = sum(_value for _key, _value in log_vars.items()
                   if 'loss' in _key)
        log_vars['loss'] = loss
        for name in log_vars:
            log_vars[name] = log_vars[name].item()

        return loss, log_vars

    def batch_processor(model, data, train_mode):
        losses = model(**data)
        # losses = model(data)
        loss, log_vars = parse_losses(losses)
        outputs = dict(loss=loss,
                       log_vars=log_vars,
                       num_samples=len(data['batchdata'].data))
        return outputs

    self.runner = Runner(self.model, batch_processor, self.optimizer,
                         self.arg.work_dir)
    optimizer_config = DistOptimizerHook(
        grad_clip=dict(max_norm=20, norm_type=2))
    if "policy" not in self.arg.policy:
        lr_config = dict(policy='step', step=self.arg.step)
    else:
        lr_config = dict(**self.arg.policy)
    checkpoint_config = dict(interval=5)
    log_config = dict(interval=20,
                      hooks=[
                          dict(type='TextLoggerHook'),
                          dict(type='TensorboardLoggerHook')
                      ])
    self.runner.register_training_hooks(lr_config, optimizer_config,
                                        checkpoint_config, log_config)
    self.runner.register_hook(DistSamplerSeedHook())

    Feeder = import_class(self.arg.feeder)
    self.runner.register_hook(
        DistEvalTopKAccuracyHook(Feeder(**self.arg.test_feeder_args),
                                 interval=self.arg.test_interval,
                                 k=(1, 5)))
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            0,  # cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # model.eval()  # not original
    # model = model.cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    # for param in model.parameters():  # not original
    #     param.requires_grad = False
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,  # original
                    cfg.log_level)
    # runner = RunnerNoBackward(model, batch_processor, optimizer, cfg.work_dir,
    #                           cfg.log_level)  # not original

    # Add for LVIS by LiYu
    import logging
    runner.logger.setLevel(logging.INFO)
    # ====================

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _single_train(model, data_loaders, cfg):
    if cfg.gpus > 1:
        raise NotImplementedError
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False, multitask=False,
                    vis=False):
    # prepare data loaders
    data_loaders = [[
        build_dataloader(
            d, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True)
        for d in dataset
    ]] if multitask else [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, dist=True)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    bp = batch_processor_with_vis if vis else batch_processor
    runner = MTLRunner(model, bp, optimizer, cfg.work_dir, cfg.log_level) \
        if multitask else \
        Runner(model, bp, optimizer, cfg.work_dir, cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    if cfg.get('init_asso_head', False):
        ori_key = cfg.init_asso_head[0]
        new_key = cfg.init_asso_head[1]
        for _key in model.module.state_dict().keys():
            if 'asso' in _key:
                exist_key = _key.replace(ori_key, new_key)
                if exist_key in model.module.state_dict().keys():
                    print('Init "{}" with "{}"'.format(_key, exist_key))
                    model.module.state_dict()[_key].copy_(
                        model.module.state_dict()[exist_key])
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False, logger=None,
                    ignores=None):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.videos_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]

    num_steps_per_epoch = len(data_loaders[0])
    if hasattr(model, 'update_state'):
        model.update_state(num_steps_per_epoch)

    if cfg.load_from:
        load_checkpoint(model,
                        cfg.load_from,
                        strict=False,
                        logger=logger,
                        show_converted=True,
                        ignores=ignores)
    if hasattr(cfg, 'model_partial_init') and cfg.model_partial_init:
        model.reset_weights()

    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)

    # fix warm-up bug
    if hasattr(cfg.lr_config, 'warmup_iters'):
        if not hasattr(cfg.lr_config, 'by_epoch') or cfg.lr_config.by_epoch:
            cfg.lr_config.warmup_iters *= len(data_loaders[0])

    # register hooks
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    # import mmcv.runner.hooks.logger as mmcv_logger
    # mmcv_logger.LoggerHook
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # To create a `latest.pth` file to run recursively
    # runner._epoch -= 1
    # runner.save_checkpoint(cfg.work_dir, filename_tmpl='dummy_{}.pth')
    # runner._epoch += 1
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
        ) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)

    # hard fix for incorrect runner logger level due to environment issues
    import logging
    runner.logger.setLevel(logging.INFO)

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False):
    raise ValueError("This function has been henceforth disabled.")
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.videos_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    # make sure dataset is a list or tuple
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    # template_dataset = template_dataset if isinstance(template_dataset, (list, tuple)) else [template_dataset]
    # load the data for each batch
    data_loaders = [
        # each call builds a dataloader that yields one minibatch of images
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    # the Runner drives training; its train() method is called internally
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 (mixed-precision) setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # resume training or load a checkpoint
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    # call runner.run to start training
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders: load the data and pick up the data-related config
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # the optimizer updates the network parameters from their gradients to
    # minimize the loss; the most common first-order method is gradient
    # descent (here SGD)
    # build runner
    # note: this local dict is unused below; the Runner receives cfg.optimizer
    optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus (for multiple GPUs; cf. torch.nn.DataParallel(model))
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # runner: https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/runner.py

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    # print('fp16_cfg:', fp16_cfg)  # None
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:  # default
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_flownet(model, dataset, cfg, distributed=False, validate=False,
                  logger=None):
    if logger is None:
        logger = get_root_logger(cfg.log_level)

    # start training
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # if cfg.resume_from:
    #     runner.resume(cfg.resume_from)
    # elif cfg.load_from:
    #     runner.load_checkpoint(cfg.load_from)

    # freeze everything except the flow head
    model.eval()
    for param in model.parameters():
        param.requires_grad = False
    # model.load_flow()
    model.module.flow_head.train()
    for param in model.module.flow_head.parameters():
        param.requires_grad = True

    # training
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset, cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu, dist=True)
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            runner.register_hook(
                CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg))
        else:
            dataset_type = getattr(datasets, val_dataset_cfg.type)
            if issubclass(dataset_type, datasets.CocoDataset):
                runner.register_hook(
                    CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg))
            else:
                runner.register_hook(
                    DistEvalmAPHook(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train_detector(model, dataset, cfg, distributed=False, validate=False,
                   timestamp=None, meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiment')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiment')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)

    # defrost backbone hook
    when_defrost = cfg.get('when_defrost')
    if when_defrost is not None:
        if when_defrost < 0:
            raise RuntimeError('when_defrost < 0')
        frozen_stages = cfg.get('frozen_stages', -1)
        defrost_backbone = DefrostBackbone(when_defrost, frozen_stages)
        runner.register_hook(defrost_backbone)

    # log hook
    custom_log = CustomLog(cfg.data.samples_per_gpu, when_defrost,
                           os.path.join(cfg.work_dir, 'log.txt'))
    runner.register_hook(custom_log)

    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(
            cfg.resume_from,
            map_location=lambda storage, loc: storage.cuda(cfg.gpu_ids[0]))
    elif cfg.load_from:
        runner.load_checkpoint(
            cfg.load_from,
            map_location=lambda storage, loc: storage.cuda(cfg.gpu_ids[0]))
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
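# train_detector above reads a number of cfg fields. A hedged, minimal sketch
# of the keys it touches (values are placeholders, not taken from any specific
# experiment; `when_defrost`/`frozen_stages` are custom keys and omitted):
data = dict(samples_per_gpu=2, workers_per_gpu=2)
gpu_ids = [0]
seed = None
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
lr_config = dict(policy='step', step=[8, 11])
checkpoint_config = dict(interval=1)
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
total_epochs = 12
workflow = [('train', 1)]
resume_from = None
load_from = None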
def _dist_train(model, train_dataset, cfg, eval_dataset=None,
                vis_dataset=None, validate=False, logger=None):
    # prepare data loaders
    data_loaders = [
        build_data_loader(train_dataset, cfg.data.imgs_per_gpu,
                          cfg.data.workers_per_gpu, dist=True)
    ]

    if cfg.apex.synced_bn:
        # using apex synced BN
        model = apex.parallel.convert_syncbn_model(model)
    model = model.cuda()

    # build optimizer
    optimizer = build_optimizer(model, cfg.optimizer)

    # initialize mixed-precision training
    if cfg.apex.use_mixed_precision:
        amp_opt_level = 'O1' if cfg.apex.type == "float16" else 'O0'
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=amp_opt_level,
                                          loss_scale=cfg.apex.loss_scale)

    # put model on gpus
    model = MMDistributedDataParallel(model)

    # build runner
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level, logger)

    # register optimizer hooks
    if cfg.apex.use_mixed_precision:
        optimizer_config = DistApexOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    logger.info("Register Optimizer Hook...")
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, log_config=None)

    # register self-defined logging hooks
    for info in cfg.log_config['hooks']:
        assert isinstance(info, dict) and 'type' in info
        if info['type'] in ['TensorboardLoggerHook']:
            logger.info("Register Tensorboard Logger Hook...")
            runner.register_hook(
                TensorboardLoggerHook(
                    interval=cfg.log_config.interval,
                    register_logWithIter_keyword=['loss']),
                priority='VERY_LOW')
        if info['type'] in ['TextLoggerHook']:
            logger.info("Register Text Logger Hook...")
            runner.register_hook(
                TextLoggerHook(interval=cfg.log_config.interval),
                priority='VERY_LOW')

    logger.info("Register SamplerSeed Hook...")
    runner.register_hook(DistSamplerSeedHook())
    logger.info("Register EmptyCache Hook...")
    runner.register_hook(EmptyCacheHook(before_epoch=True,
                                        after_iter=False,
                                        after_epoch=True),
                         priority='VERY_LOW')

    # register eval hooks
    if validate:
        interval = cfg.get('validate_interval', 1)
        if eval_dataset is not None:
            logger.info("Register Evaluation Hook...")
            runner.register_hook(
                DistStereoEvalHook(cfg, eval_dataset, interval))
        if vis_dataset is not None:
            logger.info("Register Visualization hook...")
            runner.register_hook(DistStereoVisHook(vis_dataset, cfg, interval))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, train_dataset, cfg, eval_dataset=None,
                    vis_dataset=None, validate=False, logger=None):
    # prepare data loaders
    data_loaders = [
        build_data_loader(train_dataset, cfg.data.imgs_per_gpu,
                          cfg.data.workers_per_gpu, cfg.gpus, dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level, logger)

    logger.info("Register Optimizer Hook...")
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    logger.info("Register EmptyCache Hook...")
    runner.register_hook(EmptyCacheHook(before_epoch=True,
                                        after_iter=False,
                                        after_epoch=True),
                         priority='VERY_LOW')

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)