Example #1
def _non_dist_train(model, dataset, cfg, validate=False):

    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if validate:
        if isinstance(model.module, RPN):
            runner.register_hook(CocoDistEvalRecallHook(cfg.data.val))
        elif cfg.data.val.type == 'CocoDataset':
            runner.register_hook(CocoDistEvalmAPHook(cfg.data.val))
        elif cfg.data.val.type in ['KittiLiDAR', 'KittiRGB']:
            runner.register_hook(KittiEvalmAPHook(cfg.data.val,
                                                  interval=cfg.eval_interval))
        else:
            runner.register_hook(DistEvalmAPHook(cfg.data.val))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
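Every snippet in this collection hands a `batch_processor` callback to the legacy mmcv `Runner`, which calls it once per batch and expects a dict with `loss`, `log_vars` and `num_samples`. As a rough sketch of what that callback usually looks like in mmdetection-1.x-style code (assuming the model's forward returns a dict of loss tensors and the batch carries an `img` DataContainer; the exact helper names vary between projects):

from collections import OrderedDict

import torch


def batch_processor(model, data, train_mode):
    # forward pass; assumed to return a dict of per-term loss tensors
    losses = model(**data)

    # reduce every term to a scalar and sum the ones whose name contains 'loss'
    log_vars = OrderedDict()
    for name, value in losses.items():
        if isinstance(value, torch.Tensor):
            log_vars[name] = value.mean()
        else:
            log_vars[name] = sum(v.mean() for v in value)
    loss = sum(v for k, v in log_vars.items() if 'loss' in k)
    log_vars['loss'] = loss
    log_vars = {k: v.item() for k, v in log_vars.items()}

    # the OptimizerHook backpropagates 'loss'; logger hooks report 'log_vars'
    return dict(loss=loss,
                log_vars=log_vars,
                num_samples=len(data['img'].data))
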
Example #2
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
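The `cfg.get('fp16', None)` lookup above keys off an optional config entry; when it is present, `cfg.optimizer_config` is merged into an `Fp16OptimizerHook`. An illustrative, mmdetection-style fragment that would take that branch (the value is an example only):

# the mere presence of this entry switches training to mixed precision
fp16 = dict(loss_scale=512.)
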
Example #3
def _non_dist_train(model, dataset_names, cfg, validate=False, **kwargs):

    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.tasks_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False,
                         customized_sampler=not dataset.test_mode)
        for dataset in dataset_names
    ]

    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    runner = Runner(model, batch_processors[cfg.batch_processor_type],
                    cfg.optimizer, cfg.work_dir, cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # resume
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs, **kwargs)
Example #4
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            len(cfg.gpus.train),
            dist=False)
    ]
    print('dataloader built')

    # put model on gpus
    model = MMDataParallel(model, device_ids=cfg.gpus.train).cuda()
    print('model paralleled')

    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)

    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
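For reference, the four arguments of `register_training_hooks` in these snippets are plain config dicts. A representative, mmdetection-style set is sketched below; the schedule and intervals are illustrative, not the values used by any particular example here:

lr_config = dict(policy='step',
                 warmup='linear',
                 warmup_iters=500,
                 warmup_ratio=1.0 / 3,
                 step=[8, 11])
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
checkpoint_config = dict(interval=1)  # save a checkpoint every epoch
log_config = dict(interval=50,        # log every 50 iterations
                  hooks=[dict(type='TextLoggerHook'),
                         dict(type='TensorboardLoggerHook')])
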
Example #5
def _non_dist_train(model, dataset_train, dataset_val, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset_train,
                         cfg.imgs_per_gpu,
                         cfg.workers_per_gpu,
                         len(cfg.gpus),
                         dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=cfg.gpus).cuda()
    # build runner
    optimizer = build_optimizer(model,
                                cfg.optimizer)
    runner = Runner(model, batch_processor,
                    optimizer,
                    cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config,
                                   cfg.optimizer_config,
                                   cfg.checkpoint_config,
                                   cfg.log_config)

    if validate:
        print('validate........................')
        runner.register_hook(NonDistEvalHook(dataset_val, cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #6
def train(
    work_dir,
    model_cfg,
    loss_cfg,
    dataset_cfg,
    optimizer_cfg,
    total_epochs,
    training_hooks,
    batch_size=None,
    gpu_batch_size=None,
    workflow=[('train', 1)],
    gpus=-1,
    log_level=0,
    workers=4,
    resume_from=None,
    load_from=None,
):

    # calculate batch size
    if gpus < 0:
        gpus = torch.cuda.device_count()
    if (batch_size is None) and (gpu_batch_size is not None):
        batch_size = gpu_batch_size * gpus
    assert batch_size is not None, 'Please specify batch_size or gpu_batch_size.'

    # prepare data loaders
    if isinstance(dataset_cfg, dict):
        dataset_cfg = [dataset_cfg]
    data_loaders = [
        torch.utils.data.DataLoader(dataset=call_obj(**d),
                                    batch_size=batch_size,
                                    shuffle=True,
                                    num_workers=workers,
                                    drop_last=True) for d in dataset_cfg
    ]

    # put model on gpus
    if isinstance(model_cfg, list):
        model = [call_obj(**c) for c in model_cfg]
        model = torch.nn.Sequential(*model)
    else:
        model = call_obj(**model_cfg)
    model.apply(weights_init)

    model = MMDataParallel(model, device_ids=range(gpus)).cuda()
    loss = call_obj(**loss_cfg)

    # build runner
    optimizer = call_obj(params=model.parameters(), **optimizer_cfg)
    runner = Runner(model, batch_processor, optimizer, work_dir, log_level)
    runner.register_training_hooks(**training_hooks)

    if resume_from:
        runner.resume(resume_from)
    elif load_from:
        runner.load_checkpoint(load_from)

    # run
    workflow = [tuple(w) for w in workflow]
    runner.run(data_loaders, workflow, total_epochs, loss=loss)
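This example builds every component through a `call_obj` helper that is not shown here. A hypothetical minimal implementation, assuming the config's `type` field is a dotted import path (the real helper in the source project may resolve names differently):

import importlib


def call_obj(type, **kwargs):
    # 'type' deliberately shadows the builtin so a config dict's 'type' key
    # maps straight onto this parameter, e.g.
    # call_obj(type='torch.optim.SGD', params=model.parameters(), lr=0.1)
    assert '.' in type, 'expected a dotted import path'
    module_name, _, class_name = type.rpartition('.')
    cls = getattr(importlib.import_module(module_name), class_name)
    return cls(**kwargs)
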
Example #7
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.01,
                                momentum=0.9,
                                weight_decay=0.0001)
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    get_root_logger(cfg.log_level))
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #8
def _non_dist_train(model, dataset, cfg, validate=False):
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in not-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, cfg.gpus,
            dist=False)
        for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level, cfg=cfg)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #9
def _non_dist_train(model, dataset, cfg, validate=False, **kwargs):
    # prepare data loaders
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
            #repeat_samples=cfg.train_cfg.repeat_samples,
            **kwargs)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    logger = _init_logger(log_dir=cfg.work_dir, level=getattr(logging, cfg.log_level))
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    logger)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #10
def _dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True)
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    # register hooks
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            runner.register_hook(CocoDistEvalRecallHook(cfg.data.val))
        else:
            if cfg.data.val.type == 'CocoDataset':
                runner.register_hook(CocoDistEvalmAPHook(cfg.data.val))
            else:
                runner.register_hook(DistEvalmAPHook(cfg.data.val))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
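The `_dist_train` / `_non_dist_train` pairs above are normally not called directly; a thin wrapper picks one based on how the job was launched. A minimal sketch of such a dispatcher, in the spirit of mmdetection 1.x's `train_detector` (signatures vary across the projects these snippets come from):

def train_detector(model, dataset, cfg, distributed=False, validate=False,
                   logger=None):
    if logger is None:
        logger = get_root_logger(cfg.log_level)
    # dispatch to the distributed or the single-machine code path
    if distributed:
        _dist_train(model, dataset, cfg, validate=validate)
    else:
        _non_dist_train(model, dataset, cfg, validate=validate)
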
Example #11
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if validate:
        val_dataset_cfg = cfg.data.val
        dataset_type = getattr(datasets, val_dataset_cfg.type)
        if issubclass(dataset_type, datasets.iMaterialistDataset):
            from mmdet.core import iMaterialistEvalmAPHook
            runner.register_hook(iMaterialistEvalmAPHook(val_dataset_cfg))
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #12
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(CfgSaverHook(cfg), priority='VERY_LOW')

    if validate:
        # TODO: should implement dist, nondist version evaluation
        if isinstance(model.module, RPN):
            runner.register_hook(CocoDistEvalRecallHook(cfg.data.val))
        else:
            if cfg.data.val.type == 'CocoDataset':
                runner.register_hook(CocoDistEvalmAPHook(cfg.data.val))
            else:
                runner.register_hook(NonDistEvalmAPHook(cfg.data.val))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #13
def _dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.videos_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True)
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # register hooks
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        if cfg.data.val.type in ['RawFramesDataset']:
            runner.register_hook(
                DistEvalTopKAccuracyHook(cfg.data.val, k=(1, 5)))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #14
def _non_dist_train(model, train_dataset, cfg, validate=False):
    build_fn = lambda dataset, cfg_dict: build_dataloader(
        dataset,
        cfg.data.imgs_per_gpu,
        cfg.data.workers_per_gpu,
        cfg.gpus,
        dist=False,
        balanced=cfg_dict.get('balanced', False))

    datasets = [train_dataset]
    data_loaders = [build_fn(train_dataset, cfg.data.train)]
    if validate:
        val_dataset = get_dataset(cfg.data.val)
        datasets.append(val_dataset)
        data_loaders.append(build_fn(val_dataset, cfg.data.val))
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #15
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]

    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # load_checkpoint
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    # training
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #16
def _dist_train(model, dataset_train, dataset_val, cfg, validate=False):
    # prepare data loaders
    data_loaders = [build_dataloader(dataset_train,
                                     cfg.imgs_per_gpu,
                                     cfg.workers_per_gpu,
                                     dist=True)
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda(), find_unused_parameters=True)
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    print('cfg work dir is ', cfg.work_dir)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    cfg.log_level)
    # register hooks
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config,
                                   optimizer_config,
                                   cfg.checkpoint_config,
                                   cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        print('validate........................')
        interval = cfg.get('validate_interval', 1)
        runner.register_hook(DistEvalMonoHook(dataset_val, interval, cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #17
def _non_dist_train(model, dataset, cfg, validate=False):
    # ipdb.set_trace()
    # prepare data loaders
    # returns a list of dataloader iterators; the dataset is wrapped with PyTorch's DataLoader
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]
    # put model on gpus. device_ids is a range rather than a list; with a single GPU it is range(0, 1), so iteration yields only device 0
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner. This only passes the arguments through; nothing is built or run yet, much like building a model only registers its modules without wiring them together
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    # register hooks
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    # resume from a checkpoint or load weights from a file
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #18
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    meta=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
            seed=cfg.seed) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(
        model,
        batch_processor,
        optimizer,
        cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            imgs_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=False,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(EvalHook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
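The `eval_cfg = cfg.get('evaluation', {})` line above pulls its keyword arguments from an optional config entry. An illustrative fragment (the metric name depends on the dataset, e.g. 'bbox'/'segm' for COCO or 'mAP' for VOC-style datasets):

# evaluate on the validation set after every epoch
evaluation = dict(interval=1, metric='bbox')
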
Example #19
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None):
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in not-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False) for ds in dataset
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if hasattr(cfg, 'extra_hooks'):
        import detector_utils.pytorch.utils.mmcv_custom_hooks
        for hook_args in cfg.extra_hooks:
            hook_type_name = hook_args['type']
            del hook_args['type']
            assert hasattr(detector_utils.pytorch.utils.mmcv_custom_hooks, hook_type_name), \
                f"Unknown hook name: {hook_type_name}"
            hook_type = getattr(detector_utils.pytorch.utils.mmcv_custom_hooks,
                                hook_type_name)
            hook = hook_type(**hook_args)
            runner.register_hook(hook)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #20
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    meta=None):

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False,
                         shuffle=True,
                         replace=getattr(cfg.data, 'sampling_replace', False),
                         seed=cfg.seed,
                         drop_last=getattr(cfg.data, 'drop_last', False))
        for ds in dataset
    ]

    if 'use_fp16' in cfg and cfg.use_fp16:
        raise NotImplementedError('apex do not support non_dist_train!')
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # register custom hooks
    for hook in cfg.get('custom_hooks', ()):
        priority = hook.pop('priority', None)
        if hook.type == 'DeepClusterHook':
            common_params = dict(dist_mode=False, data_loaders=data_loaders)
        else:
            common_params = dict(dist_mode=False)
        if priority is None:
            runner.register_hook(build_hook(hook, common_params))
        else:
            runner.register_hook(build_hook(hook, common_params), priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
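The custom-hook loop above expects an optional `custom_hooks` list in the config, where each entry carries a `type`, an optional `priority`, and the hook's own arguments. A purely structural illustration (the hook names and their required arguments are project-specific and omitted here):

custom_hooks = [
    # DeepClusterHook additionally receives the training data_loaders (see the
    # loop above); its own required arguments are omitted in this sketch
    dict(type='DeepClusterHook', priority='HIGH'),
    dict(type='ValidateHook'),  # falls back to the default priority
]
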
Example #21
def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                logger=None,
                timestamp=None,
                meta=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True,
                         seed=cfg.seed) for ds in dataset
    ]
    # put model on gpus
    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        # sometimes an error occurs here; enabling find_unused_parameters on the next line works around it
        find_unused_parameters=True)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(DistEvalHook(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #22
def train_smpl_detector_fuse(model, datasets, cfg, **kwargs):
    # prepare data loaders
    data_loaders = [
        build_dataloader_fuse(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False) for dataset in datasets
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    # TODO: Build up a logger here that inherit the hook class
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    val_dataset_cfg = cfg.data.val
    eval_cfg = cfg.get('evaluation', {})
    # No need to add hook for validation.
    # We shall return the losses inside the loss function.
    # I won't re-use the code for the Evaluation Hook.
    # The evaluation result will be passed to log only.

    # To create a `latest.pth` file for recursive training
    if kwargs.get('create_dummy', False):
        print('Create a dummy checkpoint for recursive training')
        runner._epoch -= 1
        runner.save_checkpoint(cfg.work_dir, filename_tmpl='dummy_{}.pth')
        runner._epoch += 1
        return

    pretrain_path = kwargs.get('load_pretrain', None)
    if kwargs.get('load_pretrain', None):
        print(f"Load pretrained model from {pretrain_path}")
        runner._epoch -= 1
        runner.load_checkpoint(pretrain_path)
        runner.save_checkpoint(cfg.work_dir, filename_tmpl='pretrained_{}.pth')
        runner._epoch += 1
        return

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs,
               time_limit=getattr(cfg, 'time_limit', None))
Example #23
def _dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    try:
        selsa_imgs = cfg.data.selsa_imgs
    except Exception:
        selsa_imgs = cfg.data.imgs_per_gpu

    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         selsa_imgs=selsa_imgs,
                         dist=True) for ds in dataset
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())

    mod = cfg.pop('mod', False)
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer, mod)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            runner.register_hook(
                CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg))
        else:
            dataset_type = DATASETS.get(val_dataset_cfg.type)
            if issubclass(dataset_type, datasets.CocoDataset):
                runner.register_hook(
                    CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg))
            else:
                runner.register_hook(
                    DistEvalmAPHook(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #24
def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True,
                         shuffle=True,
                         replace=getattr(cfg.data, 'sampling_replace', False),
                         seed=cfg.seed,
                         drop_last=getattr(cfg.data, 'drop_last', False),
                         prefetch=cfg.prefetch,
                         img_norm_cfg=cfg.img_norm_cfg) for ds in dataset
    ]
    optimizer = build_optimizer(model, cfg.optimizer)
    if 'use_fp16' in cfg and cfg.use_fp16:
        model, optimizer = apex.amp.initialize(model.cuda(),
                                               optimizer,
                                               opt_level="O1")
        print_log('**** Initializing mixed precision done. ****')

    # put model on gpus
    model = MMDistributedDataParallel(
        model if next(model.parameters()).is_cuda else model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False)

    # build runner
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register custom hooks
    for hook in cfg.get('custom_hooks', ()):
        if hook.type == 'DeepClusterHook':
            common_params = dict(dist_mode=True, data_loaders=data_loaders)
        else:
            common_params = dict(dist_mode=True)
        runner.register_hook(build_hook(hook, common_params))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #25
def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                logger=None,
                timestamp=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds, cfg.data.imgs_per_gpu, cfg.data.workers_per_gpu, dist=True)
        for ds in dataset
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(
        model, batch_processor, optimizer, cfg.work_dir, logger=logger)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        if isinstance(model.module, RPN):
            # TODO: implement recall hooks for other datasets
            runner.register_hook(
                CocoDistEvalRecallHook(val_dataset_cfg, **eval_cfg))
        else:
            dataset_type = DATASETS.get(val_dataset_cfg.type)
            if issubclass(dataset_type, datasets.CocoDataset) or issubclass(dataset_type, datasets.Acoustic):
                runner.register_hook(
                    CocoDistEvalmAPHook(val_dataset_cfg, **eval_cfg))
            else:
                runner.register_hook(
                    DistEvalmAPHook(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #26
def _non_dist_train(model, dataset, cfg, validate=False):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False) for ds in dataset
    ]

    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
                    cfg.log_level)
    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        if isinstance(model.module, RPN):
            raise NotImplementedError
        else:
            dataset_type = getattr(datasets, val_dataset_cfg.type)
            if issubclass(dataset_type, datasets.CocoDataset):
                runner.register_hook(
                    CocoNonDistEvalmAPHook(val_dataset_cfg, **eval_cfg))
            elif issubclass(dataset_type, datasets.SeqVIDDataset):
                runner.register_hook(
                    NonDistSeqEvalmAPHook(val_dataset_cfg, **eval_cfg))
            elif issubclass(dataset_type, datasets.PairVIDDataset) \
                    or issubclass(dataset_type, datasets.TwinVIDDataset):
                runner.register_hook(
                    NonDistPairEvalmAPHook(val_dataset_cfg, **eval_cfg))
            else:
                runner.register_hook(
                    NonDistEvalmAPHook(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #27
def train(
    work_dir,
    model_cfg,
    loss_cfg,
    dataset_cfg,
    optimizer_cfg,
    batch_size,
    total_epochs,
    training_hooks,
    workflow=[('train', 1)],
    gpus=1,
    log_level=0,
    workers=2,
    resume_from=None,
    load_from=None,
):

    # prepare data loaders
    if isinstance(dataset_cfg, dict):
        dataset_cfg = [dataset_cfg]
    data_loaders = [
        torch.utils.data.DataLoader(dataset=call_obj(**d),
                                    batch_size=batch_size,
                                    shuffle=True,
                                    num_workers=workers,
                                    drop_last=True) for d in dataset_cfg
    ]

    # put model on gpus
    if isinstance(model_cfg, list):
        model = [call_obj(**c) for c in model_cfg]
        model = torch.nn.Sequential(*model)
    else:
        model = call_obj(**model_cfg)
    model.apply(weights_init)
    print("Model size: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))
    model = MMDataParallel(model, device_ids=range(gpus)).cuda()
    loss = call_obj(**loss_cfg)

    # build runner
    optimizer = call_obj(params=model.parameters(), **optimizer_cfg)
    runner = Runner(model, batch_processor, optimizer, work_dir, log_level)
    runner.register_training_hooks(**training_hooks)

    if resume_from:
        runner.resume(resume_from)
    elif load_from:
        runner.load_checkpoint(load_from)

    # run
    workflow = [tuple(w) for w in workflow]
    runner.run(data_loaders, workflow, total_epochs, loss=loss)

    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
Example #28
def _dist_train(model, dataset, cfg, validate=False):
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer,
                                cfg.get('optimizer_exclude_arch'))

    arch_name = None
    optimizer_arch = None
    if 'optimizer_arch' in cfg:
        raise NotImplementedError

    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    optimizer_arch,
                    cfg.work_dir,
                    cfg.log_level,
                    arch_name=arch_name)

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg)
    else:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
        optimizer_arch_config = DistOptimizerArchHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   optimizer_arch_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        val_dataset_cfg = cfg.data.val
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(DistEvalHook(val_dataset_cfg, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    if 'optimizer_arch' in cfg:
        raise NotImplementedError
    else:
        data_loaders = [
            build_dataloader(dataset,
                             cfg.data.imgs_per_gpu,
                             cfg.data.workers_per_gpu,
                             dist=True)
        ]
        runner.run(data_loaders, None, cfg.workflow, cfg.total_epochs)
Example #29
def train(cfg_path, dataset_class):
    """借用mmcv的Runner框架进行训练,包括里边的hooks作为lr更新,loss计算的工具
    1. dataset的数据集输出打包了img/gt_bbox/label/,采用DataContainer封装
    2. Dataloader的default_collate用定制collate替换,从而支持dataset的多类型数据
    3. DataParallel外壳用定制MMDataparallel替换,从而支持DataContainer
    """
    # 初始化2个默认选项
    distributed = False
    parallel = True

    # get cfg
    cfg = Config.fromfile(cfg_path)

    # set backends
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # get logger
    logger = get_root_logger(cfg.log_level)
    logger.info('Distributed training: {}'.format(distributed))
    logger.info('DataParallel training: {}'.format(parallel))
    # build model & detector
    model = M2detDetector(cfg)
    if not parallel:
        model = model.cuda()
    else:
        model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # prepare data & dataloader
    # Runner expects the dataloaders in a list: each flow in the workflow maps to one dataloader
    dataset = get_dataset(cfg.data.train, dataset_class)
    batch_size = cfg.gpus * cfg.data.imgs_per_gpu
    num_workers = cfg.gpus * cfg.data.workers_per_gpu
    dataloader = [
        DataLoader(dataset,
                   batch_size=batch_size,
                   sampler=GroupSampler(dataset, cfg.data.imgs_per_gpu),
                   num_workers=num_workers,
                   collate_fn=partial(collate,
                                      samples_per_gpu=cfg.data.imgs_per_gpu),
                   pin_memory=False)
    ]

    # define runner and run mode (1. resume, 2. load, 3. train/test)
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if cfg.resume_from:  # resume training, e.g. './work_dirs/ssd300_voc/latest.pth'
        runner.resume(cfg.resume_from,
                      map_location=lambda storage, loc: storage)
    elif cfg.load_from:  # load weights for testing
        runner.load_checkpoint(cfg.load_from)
    # start running: the workflow distinguishes train from test
    runner.run(dataloader, cfg.workflow, cfg.total_epochs)
Example #30
def _non_dist_train(model, dataset, cfg, validate=False):
    if validate:
        raise NotImplementedError('Built-in validation is not implemented '
                                  'yet in not-distributed training. Use '
                                  'distributed training or test.py and '
                                  '*eval.py scripts instead.')
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer,
                                cfg.get('optimizer_exclude_arch'))

    arch_name = None
    optimizer_arch = None
    if 'optimizer_arch' in cfg:
        raise NotImplementedError

    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    optimizer_arch,
                    cfg.work_dir,
                    cfg.log_level,
                    arch_name=arch_name)

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=False)
    else:
        optimizer_config = cfg.optimizer_config
        optimizer_arch_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   optimizer_arch_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    if 'optimizer_arch' in cfg:
        raise NotImplementedError
    else:
        data_loaders = [
            build_dataloader(dataset,
                             cfg.data.imgs_per_gpu,
                             cfg.data.workers_per_gpu,
                             cfg.gpus,
                             dist=False)
        ]
        runner.run(data_loaders, None, cfg.workflow, cfg.total_epochs)