def _non_dist_train(model, dataset_names, cfg, validate=False, **kwargs):
    """Train on a single machine without distributed data parallel.

    Builds one dataloader per dataset, wraps the model in MMDataParallel,
    and drives training through an mmcv ``Runner``.
    """
    # one loader per dataset; the customized sampler is only enabled for
    # datasets that are not in test mode
    data_loaders = []
    for ds in dataset_names:
        loader = build_dataloader(ds,
                                  cfg.data.tasks_per_gpu,
                                  cfg.data.workers_per_gpu,
                                  cfg.gpus,
                                  dist=False,
                                  customized_sampler=not ds.test_mode)
        data_loaders.append(loader)

    # replicate the model across the configured GPUs
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # the batch processor is looked up by its configured type
    runner = Runner(model, batch_processors[cfg.batch_processor_type],
                    cfg.optimizer, cfg.work_dir, cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # a full resume (with optimizer state) takes precedence over
    # weight-only checkpoint loading
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs, **kwargs)
def _non_dist_train(model, dataset, cfg, validate=False):
    """Single-machine (non-distributed) training entry point.

    Wraps the optimizer hook with fp16 support when the config contains
    a ``fp16`` section.
    """
    # a single loader, boxed in a list since Runner expects one per flow
    loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]
    # replicate the model over the visible GPUs
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # construct optimizer and runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)

    # pick the optimizer hook: fp16-aware when configured, plain otherwise
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is None:
        optimizer_config = cfg.optimizer_config
    else:
        optimizer_config = Fp16OptimizerHook(distributed=False,
                                             **cfg.optimizer_config,
                                             **fp16_cfg)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # resume takes precedence over plain weight loading
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(loaders, cfg.workflow, cfg.total_epochs)
def _non_dist_train(model, dataset, cfg, validate=False):
    """Non-distributed training driven by mmcv's Runner."""
    # ipdb.set_trace()
    # prepare data loaders
    # returns a list of dataloaders; the dataset is wrapped with pytorch's
    # DataLoader mechanism
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]
    # put model on gpus. device_ids is a range rather than a list; for a
    # single GPU it is range(0, 1), i.e. iterating yields only device 0
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner: this only passes the pieces along -- nothing is built
    # or executed yet (much like a model build that only places modules
    # without wiring their order)
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    # register the lr/optimizer/checkpoint/log hooks
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # restore state: resume from a checkpoint, or load weights from a file
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #4
0
def train(
    work_dir,
    model_cfg,
    loss_cfg,
    dataset_cfg,
    optimizer_cfg,
    total_epochs,
    training_hooks,
    batch_size=None,
    gpu_batch_size=None,
    workflow=(('train', 1),),
    gpus=-1,
    log_level=0,
    workers=4,
    resume_from=None,
    load_from=None,
):
    """Train a config-described model with mmcv's Runner.

    Args:
        work_dir: directory for logs and checkpoints.
        model_cfg: dict describing the model (or a list of dicts chained
            into ``nn.Sequential``); instantiated via ``call_obj``.
        loss_cfg: dict describing the loss object.
        dataset_cfg: dict or list of dicts, one dataset per workflow stage.
        optimizer_cfg: dict describing the optimizer.
        total_epochs: number of epochs to run.
        training_hooks: kwargs forwarded to ``register_training_hooks``.
        batch_size: total batch size; derived from ``gpu_batch_size`` when
            omitted.
        gpu_batch_size: per-GPU batch size, used when ``batch_size`` is None.
        workflow: sequence of ``(phase, epochs)`` pairs.
        gpus: GPU count; -1 means use all visible devices.
        log_level: logging level for the runner.
        workers: dataloader worker processes.
        resume_from: checkpoint to resume from (includes optimizer state).
        load_from: checkpoint to load weights from.
    """
    # BUG FIX: the default workflow was a mutable list ([('train', 1)]),
    # which is shared across calls; an immutable tuple avoids the
    # mutable-default-argument pitfall while behaving identically.

    # calculate batch size (fall back to per-GPU size times GPU count)
    if gpus < 0:
        gpus = torch.cuda.device_count()
    if (batch_size is None) and (gpu_batch_size is not None):
        batch_size = gpu_batch_size * gpus
    assert batch_size is not None, 'Please appoint batch_size or gpu_batch_size.'

    # prepare data loaders (one per workflow stage)
    if isinstance(dataset_cfg, dict):
        dataset_cfg = [dataset_cfg]
    data_loaders = [
        torch.utils.data.DataLoader(dataset=call_obj(**d),
                                    batch_size=batch_size,
                                    shuffle=True,
                                    num_workers=workers,
                                    drop_last=True) for d in dataset_cfg
    ]

    # build the model and put it on the gpus
    if isinstance(model_cfg, list):
        model = [call_obj(**c) for c in model_cfg]
        model = torch.nn.Sequential(*model)
    else:
        model = call_obj(**model_cfg)
    model.apply(weights_init)

    model = MMDataParallel(model, device_ids=range(gpus)).cuda()
    loss = call_obj(**loss_cfg)

    # build runner
    optimizer = call_obj(params=model.parameters(), **optimizer_cfg)
    runner = Runner(model, batch_processor, optimizer, work_dir, log_level)
    runner.register_training_hooks(**training_hooks)

    # resume takes precedence over loading initial weights
    if resume_from:
        runner.resume(resume_from)
    elif load_from:
        runner.load_checkpoint(load_from)

    # run, forwarding the loss object to the batch processor
    workflow = [tuple(w) for w in workflow]
    runner.run(data_loaders, workflow, total_epochs, loss=loss)
def _non_dist_train(model, dataset, cfg, validate=False):
    """Non-distributed training with a hard-coded SGD optimizer.

    NOTE: the optimizer is fixed to SGD(lr=0.01, momentum=0.9,
    weight_decay=1e-4) and does not read ``cfg.optimizer``.
    """
    # Runner expects a list of loaders, one per workflow stage
    loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]
    # optimizer over the raw (unwrapped) model parameters
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.01,
                                momentum=0.9,
                                weight_decay=0.0001)
    # parallelize over the configured GPUs
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # this variant hands the runner a logger instance, not a log level
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    get_root_logger(cfg.log_level))
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # resume beats weight-only loading
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(loaders, cfg.workflow, cfg.total_epochs)
Example #6
0
def _non_dist_train(model, dataset, cfg, validate=False, **kwargs):
    """Single-node training; extra kwargs are forwarded to the dataloader."""
    # one training loader; **kwargs pass straight through to build_dataloader
    loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False,
                         **kwargs)
    ]
    # spread the model over the configured GPUs
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # optimizer plus a file logger rooted at the work directory
    optimizer = build_optimizer(model, cfg.optimizer)
    level = getattr(logging, cfg.log_level)
    logger = _init_logger(log_dir=cfg.work_dir, level=level)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir, logger)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # resume beats weight-only loading
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(loaders, cfg.workflow, cfg.total_epochs)
Example #7
0
def _non_dist_train(model, dataset, cfg, validate=False):
    """Non-distributed training over one or more datasets.

    Built-in validation is unsupported here; use distributed training or
    the test/eval scripts instead.

    Raises:
        NotImplementedError: if ``validate`` is True.
    """
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in not-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')
    # prepare data loaders (one per workflow stage)
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    # BUG FIX: this non-distributed path previously passed dist=True, which
    # builds distributed samplers and requires an initialized process
    # group; build plain loaders over cfg.gpus instead, matching the other
    # non-distributed trainers.
    data_loaders = [
        build_dataloader(
            ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus,
            dist=False)
        for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level, cfg=cfg)
    # fp16 setting: use the fp16-aware optimizer hook when configured
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # resume takes precedence over loading weights
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #8
0
def _non_dist_train(model, train_dataset, cfg, validate=False):
    """Non-distributed training, optionally with a validation loader.

    Args:
        model: model to train.
        train_dataset: training dataset.
        cfg: full training config.
        validate: when True, a loader over ``cfg.data.val`` is appended
            and driven by the workflow.
    """

    # PEP 8 (E731): a named inner function replaces the previous lambda
    # bound to a name.
    def build_fn(dataset, cfg_dict):
        # per-dataset loader; 'balanced' sampling is opt-in per data cfg
        return build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
            balanced=cfg_dict.get('balanced', False))

    # NOTE: the previous version also built a `datasets` list that was
    # never read; it has been removed as dead code.
    data_loaders = [build_fn(train_dataset, cfg.data.train)]
    if validate:
        val_dataset = get_dataset(cfg.data.val)
        data_loaders.append(build_fn(val_dataset, cfg.data.val))
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # resume takes precedence over loading weights
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #9
0
def _non_dist_train(model, dataset, cfg, validate=False):
    """Plain single-machine training loop driven by mmcv's Runner."""
    # single training loader, boxed into a list for Runner
    loader = build_dataloader(dataset,
                              cfg.data.imgs_per_gpu,
                              cfg.data.workers_per_gpu,
                              cfg.gpus,
                              dist=False)

    # spread the model over the configured GPUs
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # the runner drives the hooks, checkpointing and logging
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # restore state: a full resume beats weight-only loading
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    # launch training
    runner.run([loader], cfg.workflow, cfg.total_epochs)
Example #10
0
def _non_dist_train(model, dataset, cfg, validate=False):
    """Non-distributed training where ``cfg.gpus.train`` lists device ids."""
    train_gpus = cfg.gpus.train
    # one training loader sized by the number of configured train devices
    loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         len(train_gpus),
                         dist=False)
    ]
    print('dataloader built')

    # device ids come straight from the config rather than range(n)
    model = MMDataParallel(model, device_ids=train_gpus).cuda()
    print('model paralleled')

    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)

    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # resume beats weight-only loading
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(loaders, cfg.workflow, cfg.total_epochs)
Example #11
0
def _non_dist_train(model, dataset, cfg, validate=False):
    """Single-machine training; evaluation hooks are intentionally absent.

    Args:
        model: model to train.
        dataset: training dataset.
        cfg: full training config.
        validate: currently ignored (eval-hook registration was removed).
    """
    # prepare data loaders
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # NOTE: the previous version carried commented-out Coco/Dist eval-hook
    # registration as a no-op triple-quoted string statement; that dead
    # code has been removed.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #12
0
def train(
    work_dir,
    model_cfg,
    loss_cfg,
    dataset_cfg,
    optimizer_cfg,
    batch_size,
    total_epochs,
    training_hooks,
    workflow=(('train', 1),),
    gpus=1,
    log_level=0,
    workers=2,
    resume_from=None,
    load_from=None,
):
    """Train a config-described model and export tensorboard scalars.

    Args:
        work_dir: directory for logs and checkpoints.
        model_cfg: dict describing the model (or a list of dicts chained
            into ``nn.Sequential``); instantiated via ``call_obj``.
        loss_cfg: dict describing the loss object.
        dataset_cfg: dict or list of dicts, one dataset per workflow stage.
        optimizer_cfg: dict describing the optimizer.
        batch_size: total batch size.
        total_epochs: number of epochs to run.
        training_hooks: kwargs forwarded to ``register_training_hooks``.
        workflow: sequence of ``(phase, epochs)`` pairs.
        gpus: number of GPUs to parallelize over.
        log_level: logging level for the runner.
        workers: dataloader worker processes.
        resume_from: checkpoint to resume from (includes optimizer state).
        load_from: checkpoint to load weights from.
    """
    # BUG FIX: the default workflow was a mutable list ([('train', 1)]),
    # shared across calls; an immutable tuple default behaves identically
    # (it is only iterated) without the mutable-default pitfall.

    # prepare data loaders (one per workflow stage)
    if isinstance(dataset_cfg, dict):
        dataset_cfg = [dataset_cfg]
    data_loaders = [
        torch.utils.data.DataLoader(dataset=call_obj(**d),
                                    batch_size=batch_size,
                                    shuffle=True,
                                    num_workers=workers,
                                    drop_last=True) for d in dataset_cfg
    ]

    # build the model (a list of cfgs becomes an nn.Sequential chain)
    if isinstance(model_cfg, list):
        model = [call_obj(**c) for c in model_cfg]
        model = torch.nn.Sequential(*model)
    else:
        model = call_obj(**model_cfg)
    model.apply(weights_init)
    print("Model size: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))
    model = MMDataParallel(model, device_ids=range(gpus)).cuda()
    loss = call_obj(**loss_cfg)

    # build runner
    optimizer = call_obj(params=model.parameters(), **optimizer_cfg)
    runner = Runner(model, batch_processor, optimizer, work_dir, log_level)
    runner.register_training_hooks(**training_hooks)

    # resume takes precedence over loading initial weights
    if resume_from:
        runner.resume(resume_from)
    elif load_from:
        runner.load_checkpoint(load_from)

    # run, forwarding the loss object to the batch processor
    workflow = [tuple(w) for w in workflow]
    runner.run(data_loaders, workflow, total_epochs, loss=loss)

    # dump accumulated scalars from the module-level tensorboard `writer`
    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
Example #13
0
def train(cfg_path, dataset_class):
    """Train with mmcv's Runner framework, using its hooks for lr updates
    and loss computation.
    1. The dataset output packs img/gt_bbox/label, wrapped in DataContainer.
    2. The DataLoader's default_collate is replaced with a custom collate
       so the dataset's mixed data types are supported.
    3. The DataParallel wrapper is replaced with the custom MMDataParallel
       so DataContainer is supported.
    """
    # initialize the two default run-mode options
    distributed = False
    parallel = True

    # get cfg
    cfg = Config.fromfile(cfg_path)

    # set backends
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # get logger
    logger = get_root_logger(cfg.log_level)
    logger.info('Distributed training: {}'.format(distributed))
    logger.info('DataParallel training: {}'.format(parallel))
    # build model & detector
    #    model = M2detDetector(cfg)
    model = M2detDetector(cfg)
    if not parallel:
        model = model.cuda()
    else:
        model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # prepare data & dataloader
    # Runner requires the dataloaders in a list: each flow of the workflow
    # maps to one dataloader
    dataset = get_dataset(cfg.data.train, dataset_class)
    batch_size = cfg.gpus * cfg.data.imgs_per_gpu
    num_workers = cfg.gpus * cfg.data.workers_per_gpu
    dataloader = [
        DataLoader(dataset,
                   batch_size=batch_size,
                   sampler=GroupSampler(dataset, cfg.data.imgs_per_gpu),
                   num_workers=num_workers,
                   collate_fn=partial(collate,
                                      samples_per_gpu=cfg.data.imgs_per_gpu),
                   pin_memory=False)
    ]

    # define runner and running type (1. resume, 2. load, 3. train/test)
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:  # resume training, e.g. './work_dirs/ssd300_voc/latest.pth'
        runner.resume(cfg.resume_from,
                      map_location=lambda storage, loc: storage)
    elif cfg.load_from:  # load weights for testing
        runner.load_checkpoint(cfg.load_from)
    # start running: the workflow distinguishes train from test phases
    runner.run(dataloader, cfg.workflow, cfg.total_epochs)
Example #14
0
def _non_dist_train(model, dataset, cfg, validate=False):
    """Non-distributed training with an (unimplemented) architecture
    optimizer slot.

    Built-in validation and the ``optimizer_arch`` config branch raise
    ``NotImplementedError`` immediately.
    """
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in not-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer,
                                cfg.get('optimizer_exclude_arch'))

    arch_name = None
    optimizer_arch = None
    if 'optimizer_arch' in cfg:
        raise NotImplementedError

    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    optimizer_arch,
                    cfg.work_dir,
                    cfg.log_level,
                    arch_name=arch_name)

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    # BUG FIX: optimizer_arch_config was previously assigned only in the
    # non-fp16 branch, so enabling fp16 raised NameError on the call
    # below; assign it unconditionally.
    optimizer_arch_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   optimizer_arch_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # resume takes precedence over loading weights
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    if 'optimizer_arch' in cfg:
        raise NotImplementedError
    else:
        data_loaders = [
            build_dataloader(dataset,
                             cfg.data.imgs_per_gpu,
                             cfg.data.workers_per_gpu,
                             cfg.gpus,
                             dist=False)
        ]
        runner.run(data_loaders, None, cfg.workflow, cfg.total_epochs)
Example #15
0
    def runner(self):
        """Build ``self.runner`` (an mmcv Runner) and register training hooks.

        Defines local ``parse_losses``/``batch_processor`` helpers, wires
        optimizer/lr/checkpoint/log hooks from ``self.arg``, and registers
        a distributed sampler-seed hook plus a top-k accuracy eval hook.
        """
        def parse_losses(losses):
            # Reduce a dict of loss tensors (or lists of tensors) to a
            # single scalar plus per-term logging values.
            log_vars = OrderedDict()
            for loss_name, loss_value in losses.items():
                if isinstance(loss_value, torch.Tensor):
                    log_vars[loss_name] = loss_value.mean()
                elif isinstance(loss_value, list):
                    log_vars[loss_name] = sum(_loss.mean()
                                              for _loss in loss_value)
                else:
                    raise TypeError(
                        '{} is not a tensor or list of tensors'.format(
                            loss_name))

            # the total loss sums every entry whose name contains 'loss'
            loss = sum(_value for _key, _value in log_vars.items()
                       if 'loss' in _key)

            log_vars['loss'] = loss
            for name in log_vars:
                log_vars[name] = log_vars[name].item()

            return loss, log_vars

        def batch_processor(model, data, train_mode):
            # forward pass on keyword-unpacked batch, then fold the losses
            losses = model(**data)
            loss, log_vars = parse_losses(losses)
            outputs = dict(loss=loss,
                           log_vars=log_vars,
                           num_samples=len(data['batchdata'].data))
            return outputs

        # NOTE: rebinding self.runner replaces this bound method on the
        # instance after the first call.
        self.runner = Runner(self.model, batch_processor, self.optimizer,
                             self.arg.work_dir)
        optimizer_config = DistOptimizerHook(
            grad_clip=dict(max_norm=20, norm_type=2))
        # PEP 8 (E713): use "not in" instead of "not ... in"
        if "policy" not in self.arg.policy:
            lr_config = dict(policy='step', step=self.arg.step)
        else:
            lr_config = dict(**self.arg.policy)
        checkpoint_config = dict(interval=5)
        log_config = dict(interval=20,
                          hooks=[
                              dict(type='TextLoggerHook'),
                              dict(type='TensorboardLoggerHook')
                          ])
        self.runner.register_training_hooks(lr_config, optimizer_config,
                                            checkpoint_config, log_config)
        self.runner.register_hook(DistSamplerSeedHook())
        Feeder = import_class(self.arg.feeder)
        self.runner.register_hook(
            DistEvalTopKAccuracyHook(Feeder(**self.arg.test_feeder_args),
                                     interval=self.arg.test_interval,
                                     k=(1, 5)))
Example #16
0
def _non_dist_train(model, dataset, cfg, validate=False):
    """Non-distributed training (experimental variant with debug toggles).

    Several experiment switches are kept as commented-out code (frozen
    parameters, eval-mode model, a no-backward runner); the active path is
    the standard mmcv Runner loop with optional fp16.
    """
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            0, #cfg.data.workers_per_gpu,  NOTE: worker count hard-coded to 0 (in-process loading)
            cfg.gpus,
            dist=False) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # model.eval()  # not original
    # model = model.cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    # for param in model.parameters():  # not original
    #     param.requires_grad = False

    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,  #  original
                    cfg.log_level)

    # runner = RunnerNoBackward(model, batch_processor, optimizer, cfg.work_dir,
    #                 cfg.log_level)  # not original

    # Add for LVIS by LiYu: force the runner logger to INFO level
    import logging
    runner.logger.setLevel(logging.INFO)
    # ====================

    # fp16 setting: use the fp16-aware optimizer hook when configured
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # resume takes precedence over loading weights
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _single_train(model, data_loaders, cfg):
    """Train on a single GPU; multi-GPU configs are rejected.

    Args:
        model: model to train.
        data_loaders: list of dataloaders, one per workflow stage.
        cfg: training config (optimizer, hooks, workflow, epochs).

    Raises:
        NotImplementedError: if ``cfg.gpus`` is greater than 1.
    """
    if cfg.gpus > 1:
        # BUG FIX: ``raise NotImplemented`` raised a TypeError because
        # NotImplemented is a sentinel value, not an exception class.
        raise NotImplementedError
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # resume takes precedence over loading weights
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #18
0
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    multitask=False,
                    vis=False):
    """Non-distributed (optionally multi-task) training.

    Args:
        model: model to train.
        dataset: a single dataset, or a list of per-task datasets when
            ``multitask`` is True.
        cfg: training config.
        validate: unused in this path.
        multitask: use MTLRunner over a nested list of per-task loaders.
        vis: use the visualization-enabled batch processor.
    """
    # prepare data loaders
    # NOTE(review): dist=True inside a non-distributed trainer looks
    # suspicious -- distributed samplers normally need an initialized
    # process group; confirm against this project's build_dataloader.
    data_loaders = [[
        build_dataloader(
            d, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True)
        for d in dataset
    ]] if multitask else [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner: MTLRunner for multitask, plain Runner otherwise
    optimizer = build_optimizer(model, cfg.optimizer)
    bp = batch_processor_with_vis if vis else batch_processor
    runner = MTLRunner(model, bp, optimizer, cfg.work_dir,
                    cfg.log_level) if multitask else \
             Runner(model, bp, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting: fp16-aware optimizer hook when configured
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # resume takes precedence over loading weights
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    # optionally seed the association head: copy state-dict entries whose
    # key contains 'asso' from the corresponding renamed existing key
    if cfg.get('init_asso_head', False):
        ori_key = cfg.init_asso_head[0]
        new_key = cfg.init_asso_head[1]
        for _key in model.module.state_dict().keys():
            if 'asso' in _key:
                exist_key = _key.replace(ori_key, new_key)
                if exist_key in model.module.state_dict().keys():
                    print('Init "{}" with "{}"'.format(_key, exist_key))
                    model.module.state_dict()[_key].copy_(
                        model.module.state_dict()[exist_key])
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #19
0
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    ignores=None):
    """Non-distributed training for video models.

    Loads initial weights (non-strict) BEFORE wrapping the model in
    MMDataParallel, rescales warm-up from epochs to iterations, then
    drives the mmcv Runner loop.

    Args:
        model: model to train; may expose ``update_state``/``reset_weights``.
        dataset: training dataset.
        cfg: training config.
        validate: unused in this path.
        logger: logger forwarded to ``load_checkpoint``.
        ignores: parameter names to skip when loading the checkpoint.
    """
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.videos_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]
    num_steps_per_epoch = len(data_loaders[0])

    # let the model know the schedule length if it tracks training state
    if hasattr(model, 'update_state'):
        model.update_state(num_steps_per_epoch)

    # weights are loaded before MMDataParallel wrapping, so state-dict
    # keys are matched without the 'module.' prefix
    if cfg.load_from:
        load_checkpoint(model,
                        cfg.load_from,
                        strict=False,
                        logger=logger,
                        show_converted=True,
                        ignores=ignores)

        if hasattr(cfg, 'model_partial_init') and cfg.model_partial_init:
            model.reset_weights()

    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)

    # fix warm-up bug: when warm-up is configured by epoch, convert
    # warmup_iters from epochs to iterations (mutates cfg in place)
    if hasattr(cfg.lr_config, 'warmup_iters'):
        if not hasattr(cfg.lr_config, 'by_epoch') or cfg.lr_config.by_epoch:
            cfg.lr_config.warmup_iters *= len(data_loaders[0])

    # register hooks
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # NOTE: unlike other variants there is no elif here -- resume (if set)
    # happens after load_from and overrides the loaded weights
    if cfg.resume_from:
        runner.resume(cfg.resume_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #20
0
def _non_dist_train(model, dataset, cfg, validate=False):
    """Single-machine training with optional fp16 optimizer hook."""
    # one training loader, boxed for Runner
    loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]
    # parallelize across the configured GPUs
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # optimizer and runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)

    # choose the optimizer hook: fp16-aware when the config asks for it
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is None:
        optimizer_config = cfg.optimizer_config
    else:
        optimizer_config = Fp16OptimizerHook(distributed=False,
                                             **cfg.optimizer_config,
                                             **fp16_cfg)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # restore state: resume beats weight-only loading
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(loaders, cfg.workflow, cfg.total_epochs)
Example #21
0
def _non_dist_train(model, dataset, cfg, validate=False):
    """Non-distributed training over one or more datasets."""
    # normalize to a list of datasets; one loader per dataset
    datasets = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = []
    for ds in datasets:
        data_loaders.append(
            build_dataloader(ds,
                             cfg.data.imgs_per_gpu,
                             cfg.data.workers_per_gpu,
                             cfg.gpus,
                             dist=False))
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # hard fix for incorrect runner logger level caused by environment
    # quirks: force INFO
    import logging
    runner.logger.setLevel(logging.INFO)
    # optimizer hook, fp16-aware when configured
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # resume beats weight-only loading
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #22
0
def _non_dist_train(model, dataset, cfg, validate=False):
    """Deliberately disabled non-distributed training path.

    Raises:
        ValueError: always; everything below the raise is unreachable and
        retained for reference only.
    """
    raise ValueError("This function has been henceforth disabled.")
    # --- all code below is dead (never executed) ---
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.videos_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #23
0
def _non_dist_train(model, dataset, cfg, validate=False):
    """Train ``model`` on a single machine without distributed parallelism.

    Args:
        model: The model to train; wrapped in ``MMDataParallel``.
        dataset: A dataset or a list/tuple of datasets to train on.
        cfg: Training config (data, optimizer, hook configs, workflow, ...).
        validate (bool): Accepted for interface parity; unused here.
    """
    # Normalize to a list so single and multiple datasets are handled
    # uniformly below.
    if not isinstance(dataset, (list, tuple)):
        dataset = [dataset]

    # One loader per dataset, with non-distributed sampling.
    data_loaders = []
    for ds in dataset:
        data_loaders.append(
            build_dataloader(ds,
                             cfg.data.imgs_per_gpu,
                             cfg.data.workers_per_gpu,
                             cfg.gpus,
                             dist=False))

    # Replicate the model across the configured GPUs.
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # Build the optimizer and the runner that drives the training loop.
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)

    # fp16 support: wrap the optimizer hook when a 'fp16' config is present.
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is None:
        optimizer_config = cfg.optimizer_config
    else:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # Resuming full training state takes precedence over a weight-only load.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
# ---- Example #24 ----
def _non_dist_train(model, dataset, cfg, validate=False):
    """Single-machine (non-distributed) training helper.

    Builds one dataloader for ``dataset``, wraps ``model`` in
    ``MMDataParallel`` and drives training through a ``Runner``.
    The optimizer config in ``cfg.optimizer`` is passed to the Runner
    directly (no explicit ``build_optimizer`` call in this variant).
    """
    # A single, non-distributed training loader.
    loader = build_dataloader(dataset,
                              cfg.data.imgs_per_gpu,
                              cfg.data.workers_per_gpu,
                              cfg.gpus,
                              dist=False)
    data_loaders = [loader]

    # Spread the model over the configured GPUs.
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # The Runner owns the training loop; the registered hooks implement
    # LR scheduling, the optimizer step, checkpointing and logging.
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # Resume (full training state) wins over a plain checkpoint load.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
# ---- Example #25 ----
def _non_dist_train(model, dataset, cfg, validate=False):
    """Non-distributed training on one node across ``cfg.gpus`` GPUs."""
    # Single dataloader over the training dataset.
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]

    # DataParallel-style replication across GPUs
    # (cf. torch.nn.DataParallel).
    gpu_ids = range(cfg.gpus)
    model = MMDataParallel(model, device_ids=gpu_ids).cuda()

    # Build the optimizer explicitly, then the Runner that drives training
    # (see mmcv.runner.Runner).
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)

    # Optional fp16: wrap the optimizer hook; otherwise use the plain config.
    fp16_cfg = cfg.get('fp16', None)
    optimizer_config = (cfg.optimizer_config if fp16_cfg is None else
                        Fp16OptimizerHook(**cfg.optimizer_config,
                                          **fp16_cfg,
                                          distributed=False))
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # Resume full state, or just load weights.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
# ---- Example #26 ----
def train_flownet(model,
                  dataset,
                  cfg,
                  distributed=False,
                  validate=False,
                  logger=None):
    """Train only the flow head of ``model``; everything else stays frozen.

    All parameters are first frozen (and the model put in eval mode),
    then ``model.module.flow_head`` alone is switched back to train mode
    with gradients enabled, so the runner optimizes just the flow head.

    Note: ``distributed`` and ``validate`` are accepted for interface
    parity but not used; the dataloader is always built with dist=False.
    """
    if logger is None:
        logger = get_root_logger(cfg.log_level)

    # Single, non-distributed training loader.
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]

    # Replicate across the configured GPUs.
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # Runner with the optimizer config passed straight through.
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # Freeze every parameter ...
    model.eval()
    for p in model.parameters():
        p.requires_grad = False
    # ... then re-enable training for the flow head only.
    model.module.flow_head.train()
    for p in model.module.flow_head.parameters():
        p.requires_grad = True

    # training
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
def _dist_train(model, dataset, cfg, validate=False):
    """Distributed training entry point (one process per GPU via DDP)."""
    # Distributed-sampler-backed dataloader.
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True)
    ]

    # Wrap for gradient synchronization across processes.
    model = MMDistributedDataParallel(model.cuda())

    # Optimizer + runner driving the training loop.
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)

    # fp16 uses its own optimizer hook; otherwise the distributed one.
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is None:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)

    # Standard training hooks plus per-epoch sampler reseeding.
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())

    # Optional evaluation hook, chosen by model / validation-dataset type.
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            eval_hook = CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg)
        elif issubclass(getattr(datasets, val_dataset_cfg.type),
                        datasets.CocoDataset):
            eval_hook = CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg)
        else:
            eval_hook = DistEvalmAPHook(val_dataset_cfg, **eval_cfg)
        runner.register_hook(eval_hook)

    # Resume full state, or just load weights.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
# ---- Example #28 ----
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    """Top-level detector training entry point (MMDet-style).

    Builds seeded dataloaders, wraps the model for (non-)distributed
    execution, constructs a Runner with custom defrost/logging hooks,
    optionally registers fp16 and evaluation hooks, then runs
    ``cfg.workflow`` for ``cfg.total_epochs``.

    Args:
        model: Detector to train.
        dataset: One dataset or a list/tuple of datasets (one loader each).
        cfg: Full config; uses ``data``, ``optimizer``, hook configs,
            ``gpu_ids``, ``seed``, ``work_dir``, ``workflow``,
            ``total_epochs`` and optional ``fp16`` / ``when_defrost``.
        distributed (bool): If True, wrap in MMDistributedDataParallel
            (one process per GPU); otherwise MMDataParallel.
        validate (bool): If True, register an eval hook over ``cfg.data.val``.
        timestamp: Propagated to the runner so .log/.log.json names match.
        meta: Extra metadata dict handed to the Runner.
    """
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    # Back-compat shim: MMDet V2.0 renamed "imgs_per_gpu" to
    # "samples_per_gpu"; when the legacy key is present it always wins
    # (cfg is mutated in place below).
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiments')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiments')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    # One training loader per dataset; seeded for reproducibility.
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)

    # defrost backbone hook
    # NOTE(review): `when_defrost` presumably marks the point at which the
    # backbone is unfrozen -- confirm against DefrostBackbone's implementation.
    when_defrost = cfg.get('when_defrost')
    if when_defrost is not None:
        if when_defrost < 0:
            raise RuntimeError('when_defrost < 0')
        frozen_stages = cfg.get('frozen_stages', -1)
        defrost_backbone = DefrostBackbone(when_defrost, frozen_stages)
        runner.register_hook(defrost_backbone)
    # log hook writing to <work_dir>/log.txt
    custom_log = CustomLog(cfg.data.samples_per_gpu, when_defrost,
                           os.path.join(cfg.work_dir, 'log.txt'))
    runner.register_hook(custom_log)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting: fp16 gets its own hook; a plain distributed run without
    # an explicit hook 'type' gets the distributed optimizer hook.
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks (batch size 1, no shuffling, test-mode dataset)
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # Resume full training state, or just load weights; checkpoint tensors
    # are mapped onto the first configured GPU.
    if cfg.resume_from:
        runner.resume(
            cfg.resume_from,
            map_location=lambda storage, loc: storage.cuda(cfg.gpu_ids[0]))
    elif cfg.load_from:
        runner.load_checkpoint(
            cfg.load_from,
            map_location=lambda storage, loc: storage.cuda(cfg.gpu_ids[0]))
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
# ---- Example #29 ----
def _dist_train(model,
                train_dataset,
                cfg,
                eval_dataset=None,
                vis_dataset=None,
                validate=False,
                logger=None):
    """Distributed training with optional apex synced-BN / mixed precision.

    Args:
        model: Network to train; wrapped in MMDistributedDataParallel.
        train_dataset: Dataset used to build the distributed dataloader.
        cfg: Config; ``cfg.apex`` controls synced BN and mixed precision,
            ``cfg.log_config['hooks']`` selects which logger hooks to add.
        eval_dataset: If given (and ``validate``), a stereo eval hook is
            registered.
        vis_dataset: If given (and ``validate``), a visualization hook is
            registered.
        validate (bool): Enable the eval/vis hooks above.
        logger: Progress logger; must not be None (used unconditionally).
    """
    # prepare data loaders
    data_loaders = [
        build_data_loader(train_dataset,
                          cfg.data.imgs_per_gpu,
                          cfg.data.workers_per_gpu,
                          dist=True)
    ]
    if cfg.apex.synced_bn:
        # using apex synced BN
        model = apex.parallel.convert_syncbn_model(model)

    model = model.cuda()
    # build optimizer
    optimizer = build_optimizer(model, cfg.optimizer)

    # Initialize mixed-precision training.
    # NOTE: amp.initialize is applied before the DDP wrap below, matching
    # the order apex's documentation prescribes.
    if cfg.apex.use_mixed_precision:
        amp_opt_level = 'O1' if cfg.apex.type == "float16" else 'O0'
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=amp_opt_level,
                                          loss_scale=cfg.apex.loss_scale)

    # put model on gpus
    model = MMDistributedDataParallel(model)
    # build runner
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level, logger)

    # register optimizer hooks
    # NOTE(review): DistApexOptimizerHook presumably routes the backward
    # pass through amp's loss scaling -- confirm in its implementation.
    if cfg.apex.use_mixed_precision:
        optimizer_config = DistApexOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    logger.info("Register Optimizer Hook...")
    # log_config=None here: logging hooks are registered manually below so
    # they can carry custom arguments and priorities.
    runner.register_training_hooks(cfg.lr_config,
                                   optimizer_config,
                                   cfg.checkpoint_config,
                                   log_config=None)

    # register self-defined logging hooks
    for info in cfg.log_config['hooks']:
        assert isinstance(info, dict) and 'type' in info
        if info['type'] in ['TensorboardLoggerHook']:
            logger.info("Register Tensorboard Logger Hook...")
            runner.register_hook(TensorboardLoggerHook(
                interval=cfg.log_config.interval,
                register_logWithIter_keyword=['loss']),
                                 priority='VERY_LOW')
        if info['type'] in ['TextLoggerHook']:
            logger.info("Register Text Logger Hook...")
            runner.register_hook(TextLoggerHook(
                interval=cfg.log_config.interval, ),
                                 priority='VERY_LOW')

    logger.info("Register SamplerSeed Hook...")
    runner.register_hook(DistSamplerSeedHook())
    logger.info("Register EmptyCache Hook...")
    runner.register_hook(EmptyCacheHook(before_epoch=True,
                                        after_iter=False,
                                        after_epoch=True),
                         priority='VERY_LOW')

    # register eval hooks
    if validate:
        interval = cfg.get('validate_interval', 1)
        if eval_dataset is not None:
            logger.info("Register Evaluation Hook...")
            runner.register_hook(
                DistStereoEvalHook(cfg, eval_dataset, interval))
        if vis_dataset is not None:
            logger.info("Register Visualization hook...")
            runner.register_hook(DistStereoVisHook(vis_dataset, cfg, interval))

    # Resume full training state, or just load weights.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
# ---- Example #30 ----
def _non_dist_train(model,
                    train_dataset,
                    cfg,
                    eval_dataset=None,
                    vis_dataset=None,
                    validate=False,
                    logger=None):
    """Single-machine training counterpart of ``_dist_train``.

    Note: ``eval_dataset``, ``vis_dataset`` and ``validate`` are accepted
    for interface parity with the distributed variant but unused here;
    ``logger`` must not be None (it is used unconditionally).
    """
    # One non-distributed loader over the training set.
    loader = build_data_loader(train_dataset,
                               cfg.data.imgs_per_gpu,
                               cfg.data.workers_per_gpu,
                               cfg.gpus,
                               dist=False)
    data_loaders = [loader]

    # DataParallel replication across the configured GPUs.
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # Runner with an explicitly built optimizer.
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level, logger)

    logger.info("Register Optimizer Hook...")
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    logger.info("Register EmptyCache Hook...")
    runner.register_hook(EmptyCacheHook(before_epoch=True,
                                        after_iter=False,
                                        after_epoch=True),
                         priority='VERY_LOW')

    # Resume full state, or just load weights.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)