Example 1
def train(model, datasets, cfg, rank):
    data_loaders = [get_loader(ds, cfg, 'train') for ds in datasets]
    runner = EpochBasedRunner(model=model,
                              optimizers_cfg=cfg.optimizers,
                              work_dir=cfg.work_dir)

    runner.create_gradmanager_and_optimizers()

    if cfg.resume_from is not None:
        runner.resume(cfg.resume_from, cfg.get('resume_optim', True))
    elif cfg.load_from is not None:
        runner.load_checkpoint(cfg.load_from, load_optim=False)
    else:
        pass

    runner.sync_model_params()

    # register some useful hooks
    runner.register_training_hooks(lr_config=cfg.lr_config,
                                   checkpoint_config=cfg.checkpoint_config,
                                   log_config=cfg.log_config)

    # register evaluation hook
    if cfg.get('evaluation', None) is not None:
        dataset = build_dataset(cfg.data.eval)
        save_path = os.path.join(cfg.work_dir, 'eval_visuals')
        log_path = os.path.join(cfg.work_dir, 'eval.log')
        runner.register_hook(
            EvalIterHook(get_loader(dataset, cfg, 'eval'),
                         save_path=save_path,
                         log_path=log_path,
                         **cfg.evaluation))

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
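
A note on the config: train() above only reads a handful of keys from cfg. The sketch below lists those key names with placeholder values; every concrete value is an illustrative assumption, not taken from a real project config.

# Hypothetical config sketch: the key names are the ones train() reads above,
# the values are placeholders chosen for illustration only.
work_dir = './work_dirs/example'
optimizers = dict(generator=dict(type='Adam', lr=1e-4))
total_epochs = 100
workflow = [('train', 1)]
resume_from = None   # full checkpoint: restores the epoch counter and, optionally, optimizer state
load_from = None     # weights only: training restarts from epoch 0
lr_config = dict(policy='Step', step=[50, 80])
checkpoint_config = dict(interval=5)
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
evaluation = dict(interval=5000)   # any extra kwargs are forwarded to EvalIterHook
data = dict(train=dict(type='...'), eval=dict(type='...'))
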
Example 2
def train(model, datasets, cfg, rank):
    data_loaders = []
    for ds in datasets:
        data_loaders.append(get_loader(ds, cfg, 'train'))

    # build runner for training
    if cfg.get('total_iters', None) is not None:
        runner = IterBasedRunner(model=model,
                                 optimizers_cfg=cfg.optimizers,
                                 work_dir=cfg.work_dir)
        total_iters_or_epochs = cfg.total_iters
    else:
        runner = EpochBasedRunner(model=model,
                                  optimizers_cfg=cfg.optimizers,
                                  work_dir=cfg.work_dir)
        assert cfg.get('total_epochs', None) is not None
        total_iters_or_epochs = cfg.total_epochs

    # resume and create optimizers
    if cfg.resume_from is not None:
        # Resume the previous training run (model parameters and optimizer state).
        runner.resume(cfg.resume_from, cfg.get('resume_optim', False))
    elif cfg.load_from is not None:
        # Pretend to train from scratch: the rank-0 process loads the parameters, then
        # every process creates its optimizers; the model parameters are synchronized
        # automatically when the optimizer init is called.
        runner.load_checkpoint(cfg.load_from, load_optim=False)
        runner.create_optimizers()
    else:
        # Load no parameters; every process creates its optimizers directly.
        runner.create_optimizers()

    # register hooks
    runner.register_training_hooks(lr_config=cfg.lr_config,
                                   checkpoint_config=cfg.checkpoint_config,
                                   log_config=cfg.log_config)

    # visual hook
    if cfg.get('visual_config', None) is not None:
        cfg.visual_config['output_dir'] = os.path.join(
            cfg.work_dir, cfg.visual_config['output_dir'])
        runner.register_hook(build_from_cfg(cfg.visual_config, HOOKS))

    # evaluation hook
    if cfg.get('evaluation', None) is not None:
        dataset = build_dataset(cfg.data.eval)
        save_path = os.path.join(cfg.work_dir, 'eval_visuals')
        log_path = cfg.work_dir
        runner.register_hook(
            EvalIterHook(get_loader(dataset, cfg, 'eval'),
                         save_path=save_path,
                         log_path=log_path,
                         **cfg.evaluation))

    runner.run(data_loaders, cfg.workflow, total_iters_or_epochs)
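
Unlike the first example, this variant selects the runner type from the config: if total_iters is present an IterBasedRunner is built, otherwise total_epochs must be set and an EpochBasedRunner is used. The fragment below sketches the extra keys it reads; the hook type name and all values are assumptions for illustration.

# Only one of the two duration keys should be set; total_iters takes precedence.
total_iters = 300000
# total_epochs = 100

# Hypothetical visualization hook config: the 'type' value must name a hook registered
# in HOOKS, and 'output_dir' is joined onto work_dir by train() before the hook is built.
visual_config = dict(type='VisualizationHook', output_dir='visuals', interval=200)
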
Example 3
def worker(rank, world_size, cfg):
    logger = get_root_logger()  # each process creates its own logger
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(
            rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23333,
            world_size=world_size,
            rank=rank,
            dev=rank % 8,
        )
    model = build_model(cfg.model,
                        train_cfg=cfg.train_cfg,
                        eval_cfg=cfg.eval_cfg)
    datasets = [build_dataset(cfg.data.train)]
    train(model, datasets, cfg, rank)
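
worker() is meant to be called once per process with its own rank. Below is a minimal launcher sketch that spawns one worker per device using the standard library; the main() wrapper, the spawn start method, and the process count are assumptions, not part of the snippets above.

import multiprocessing as mp

def main(cfg, world_size):
    # Hypothetical launcher: one worker process per device.
    if world_size == 1:
        worker(0, 1, cfg)  # single-process run; no process group is initialized
        return
    mp.set_start_method('spawn', force=True)  # safer than fork with GPU runtimes
    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, world_size, cfg))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
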
Example 4
def worker(rank, world_size, cfg, gpu_id="0", port=23333):
    if cfg.dynamic:
        trace.enabled = False

    if world_size > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=port,
            world_size=world_size,
            rank=rank,
            device=int(gpu_id) % 10,
        )
        log_file = os.path.join(cfg.work_dir, 'rank{}_root.log'.format(rank))
        logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    model = build_model(cfg.model,
                        train_cfg=cfg.train_cfg,
                        eval_cfg=cfg.eval_cfg)  # parameters are already randomly initialized at this point
    datasets = [build_dataset(cfg.data.train)]
    train(model, datasets, cfg, rank)
Example 5
def worker(rank, world_size, cfg, gpu_id="0", port=23333):
    if cfg.dynamic:
        trace.enabled = False

    if world_size > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=port,
            world_size=world_size,
            rank=rank,
            device=int(gpu_id) % 10,
        )
        log_file = os.path.join(cfg.work_dir, 'rank{}_root.log'.format(rank))
        logger = get_root_logger(
            log_file=log_file, log_level=cfg.log_level
        )  # give each process its own root logger; only rank 0 writes a log file, the others create none and log at error level
    model = build_model(
        cfg.model, eval_cfg=cfg.eval_cfg
    )  # eval cfg can provide some useful info, e.g. the padding multi
    datasets = [build_dataset(cfg.data.test)]
    test(model, datasets, cfg, rank)
Example 6
def train(model, datasets, cfg, rank):
    data_loaders = [get_loader(ds, cfg, 'train') for ds in datasets]
    runner = EpochBasedRunner(model=model,
                              optimizers_cfg=cfg.optimizers,
                              work_dir=cfg.work_dir)

    runner.create_gradmanager_and_optimizers(
    )  # every process creates the GradManager and optimizers; both become attributes of the model

    if cfg.resume_from is not None:
        # Resume the previous training run, i.e. the epoch counter, together with model
        # parameters and optimizer state. With multi-GPU training only the rank-0 process
        # loads the model parameters (they are synchronized later); if resume_optim is set,
        # every process loads the optimizer state.
        runner.resume(cfg.resume_from, cfg.get('resume_optim', True))
    elif cfg.load_from is not None:
        # Load the parameters but pretend to train from scratch. With multi-GPU training
        # only the rank-0 process loads the model parameters (they are synchronized later).
        runner.load_checkpoint(cfg.load_from, load_optim=False)
    else:
        pass  # load no parameters; train from scratch

    # Synchronize the model parameters across processes
    runner.sync_model_params()

    # register some useful hooks
    runner.register_training_hooks(lr_config=cfg.lr_config,
                                   checkpoint_config=cfg.checkpoint_config,
                                   log_config=cfg.log_config)

    # register evaluation hook
    if cfg.get('evaluation', None) is not None:
        dataset = build_dataset(cfg.data.eval)
        save_path = os.path.join(cfg.work_dir, 'eval_visuals')
        log_path = os.path.join(cfg.work_dir, 'eval.log')
        runner.register_hook(
            EvalIterHook(get_loader(dataset, cfg, 'eval'),
                         save_path=save_path,
                         log_path=log_path,
                         **cfg.evaluation))

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example 7
def worker(rank, world_size, cfg):
    logger = get_root_logger()  # each process creates its own logger

    # set dynamic graph for debug
    if cfg.dynamic:
        trace.enabled = False

    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(
            rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23333,
            world_size=world_size,
            rank=rank,
            dev=rank % 8,
        )
    model = build_model(
        cfg.model, eval_cfg=cfg.eval_cfg
    )  # eval cfg can provide some useful info, e.g. the padding multi
    datasets = [build_dataset(cfg.data.test)]
    test(model, datasets, cfg, rank)