def train(model, datasets, cfg, rank):
    data_loaders = [get_loader(ds, cfg, 'train') for ds in datasets]
    runner = EpochBasedRunner(model=model,
                              optimizers_cfg=cfg.optimizers,
                              work_dir=cfg.work_dir)
    runner.create_gradmanager_and_optimizers()

    if cfg.resume_from is not None:
        runner.resume(cfg.resume_from, cfg.get('resume_optim', True))
    elif cfg.load_from is not None:
        runner.load_checkpoint(cfg.load_from, load_optim=False)
    else:
        pass

    runner.sync_model_params()

    # register some useful hooks
    runner.register_training_hooks(lr_config=cfg.lr_config,
                                   checkpoint_config=cfg.checkpoint_config,
                                   log_config=cfg.log_config)

    # register evaluation hook
    if cfg.get('evaluation', None) is not None:
        dataset = build_dataset(cfg.data.eval)
        save_path = os.path.join(cfg.work_dir, 'eval_visuals')
        log_path = os.path.join(cfg.work_dir, 'eval.log')
        runner.register_hook(
            EvalIterHook(get_loader(dataset, cfg, 'eval'),
                         save_path=save_path,
                         log_path=log_path,
                         **cfg.evaluation))

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def train(model, datasets, cfg, rank):
    data_loaders = []
    for ds in datasets:
        data_loaders.append(get_loader(ds, cfg, 'train'))

    # build runner for training
    if cfg.get('total_iters', None) is not None:
        runner = IterBasedRunner(model=model,
                                 optimizers_cfg=cfg.optimizers,
                                 work_dir=cfg.work_dir)
        total_iters_or_epochs = cfg.total_iters
    else:
        runner = EpochBasedRunner(model=model,
                                  optimizers_cfg=cfg.optimizers,
                                  work_dir=cfg.work_dir)
        assert cfg.get('total_epochs', None) is not None
        total_iters_or_epochs = cfg.total_epochs

    # resume and create optimizers
    if cfg.resume_from is not None:
        # resume a previous run (restores model params and optimizer state)
        runner.resume(cfg.resume_from, cfg.get('resume_optim', False))
    elif cfg.load_from is not None:
        # load params but pretend to train from scratch: the rank 0 process loads the
        # weights, then every process creates its optimizers; model params are synced
        # automatically when the optimizers are initialized
        runner.load_checkpoint(cfg.load_from, load_optim=False)
        runner.create_optimizers()
    else:
        # load nothing; every process directly creates its optimizers
        runner.create_optimizers()

    # register hooks
    runner.register_training_hooks(lr_config=cfg.lr_config,
                                   checkpoint_config=cfg.checkpoint_config,
                                   log_config=cfg.log_config)

    # visual hook
    if cfg.get('visual_config', None) is not None:
        cfg.visual_config['output_dir'] = os.path.join(
            cfg.work_dir, cfg.visual_config['output_dir'])
        runner.register_hook(build_from_cfg(cfg.visual_config, HOOKS))

    # evaluation hook
    if cfg.get('evaluation', None) is not None:
        dataset = build_dataset(cfg.data.eval)
        save_path = os.path.join(cfg.work_dir, 'eval_visuals')
        log_path = cfg.work_dir
        runner.register_hook(
            EvalIterHook(get_loader(dataset, cfg, 'eval'),
                         save_path=save_path,
                         log_path=log_path,
                         **cfg.evaluation))

    runner.run(data_loaders, cfg.workflow, total_iters_or_epochs)
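For reference, here is a minimal sketch of the config fields that this train() reads. The field names are taken from the code above; every value is a placeholder, and the small AttrDict wrapper is an assumption made only so that both attribute access (cfg.work_dir) and cfg.get(...) work the way the function expects.

class AttrDict(dict):
    """Hypothetical helper: dict with attribute access, mimicking how cfg is used above."""
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

cfg = AttrDict(
    work_dir='./work_dirs/demo',
    optimizers=dict(generator=dict(type='Adam', lr=1e-4)),  # placeholder optimizer cfg
    total_iters=300000,        # if present -> IterBasedRunner; otherwise total_epochs is required
    resume_from=None,          # full checkpoint: params + optimizer state + training progress
    load_from=None,            # weights only; training restarts from scratch
    resume_optim=False,
    lr_config=dict(policy='Step', step=[200000]),            # placeholder
    checkpoint_config=dict(interval=5000),                   # placeholder
    log_config=dict(interval=100),                           # placeholder
    visual_config=None,        # optional hook cfg; output_dir is joined with work_dir
    evaluation=dict(interval=5000),                          # forwarded to EvalIterHook as kwargs
    workflow=[('train', 1)],                                 # placeholder workflow
    data=AttrDict(train=dict(), eval=dict()),                # dataset configs omitted here
)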
def worker(rank, world_size, cfg):
    logger = get_root_logger()  # each process creates its own logger
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(
            rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23333,
            world_size=world_size,
            rank=rank,
            dev=rank % 8,
        )
    model = build_model(cfg.model,
                        train_cfg=cfg.train_cfg,
                        eval_cfg=cfg.eval_cfg)
    datasets = [build_dataset(cfg.data.train)]
    train(model, datasets, cfg, rank)
def worker(rank, world_size, cfg, gpu_id="0", port=23333):
    if cfg.dynamic:
        trace.enabled = False
    if world_size > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=port,
            world_size=world_size,
            rank=rank,
            device=int(gpu_id) % 10,
        )
    log_file = os.path.join(cfg.work_dir, 'rank{}_root.log'.format(rank))
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    # model params are already randomly initialized at this point
    model = build_model(cfg.model,
                        train_cfg=cfg.train_cfg,
                        eval_cfg=cfg.eval_cfg)
    datasets = [build_dataset(cfg.data.train)]
    train(model, datasets, cfg, rank)
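As a rough illustration of how such a worker could be started, the sketch below spawns one process per GPU id with the standard multiprocessing module. This launcher is not part of the code above; it only assumes the worker(rank, world_size, cfg, gpu_id, port) signature shown, and the gpu_ids/port values are placeholders.

import multiprocessing as mp

def launch(cfg, gpu_ids=("0", "1"), port=23333):
    """Hypothetical launcher: one process per GPU id, each calling worker() above."""
    world_size = len(gpu_ids)
    if world_size == 1:
        # single-GPU case: run in the current process
        worker(0, 1, cfg, gpu_id=gpu_ids[0], port=port)
        return
    ctx = mp.get_context('spawn')  # 'spawn' is the safest start method with CUDA
    procs = []
    for rank, gpu_id in enumerate(gpu_ids):
        p = ctx.Process(target=worker,
                        args=(rank, world_size, cfg),
                        kwargs=dict(gpu_id=gpu_id, port=port))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()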
def worker(rank, world_size, cfg, gpu_id="0", port=23333):
    if cfg.dynamic:
        trace.enabled = False
    if world_size > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=port,
            world_size=world_size,
            rank=rank,
            device=int(gpu_id) % 10,
        )
    log_file = os.path.join(cfg.work_dir, 'rank{}_root.log'.format(rank))
    # create a root logger for each process; only rank 0 writes to a file,
    # the others create no file and are set to the ERROR level
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    # eval cfg can provide some useful info, e.g. the padding multiple
    model = build_model(cfg.model, eval_cfg=cfg.eval_cfg)
    datasets = [build_dataset(cfg.data.test)]
    test(model, datasets, cfg, rank)
def train(model, datasets, cfg, rank):
    data_loaders = [get_loader(ds, cfg, 'train') for ds in datasets]
    runner = EpochBasedRunner(model=model,
                              optimizers_cfg=cfg.optimizers,
                              work_dir=cfg.work_dir)
    # every process creates the GradManager and optimizers; both are attributes of the model
    runner.create_gradmanager_and_optimizers()

    if cfg.resume_from is not None:
        # resume a previous run, i.e. the epoch count plus model params and optimizer state.
        # With multi-GPU training only the rank 0 process loads the model params (they are
        # synced later); if resume_optim is set, every process loads the optimizer state.
        runner.resume(cfg.resume_from, cfg.get('resume_optim', True))
    elif cfg.load_from is not None:
        # load params but pretend to train from scratch. With multi-GPU training only the
        # rank 0 process loads the model params (they are synced later).
        runner.load_checkpoint(cfg.load_from, load_optim=False)
    else:
        pass  # load nothing; train from scratch

    # synchronize model params across processes
    runner.sync_model_params()

    # register some useful hooks
    runner.register_training_hooks(lr_config=cfg.lr_config,
                                   checkpoint_config=cfg.checkpoint_config,
                                   log_config=cfg.log_config)

    # register evaluation hook
    if cfg.get('evaluation', None) is not None:
        dataset = build_dataset(cfg.data.eval)
        save_path = os.path.join(cfg.work_dir, 'eval_visuals')
        log_path = os.path.join(cfg.work_dir, 'eval.log')
        runner.register_hook(
            EvalIterHook(get_loader(dataset, cfg, 'eval'),
                         save_path=save_path,
                         log_path=log_path,
                         **cfg.evaluation))

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def worker(rank, world_size, cfg):
    logger = get_root_logger()  # each process creates its own logger
    # set dynamic graph for debug
    if cfg.dynamic:
        trace.enabled = False
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(
            rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23333,
            world_size=world_size,
            rank=rank,
            dev=rank % 8,
        )
    # eval cfg can provide some useful info, e.g. the padding multiple
    model = build_model(cfg.model, eval_cfg=cfg.eval_cfg)
    datasets = [build_dataset(cfg.data.test)]
    test(model, datasets, cfg, rank)