Example #1
def _non_dist_train(model, dataset, cfg, validate=False, **kwargs):
    logger = get_root_logger(cfg.log_level)
    # prepare data loaders
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
            repeat_samples=cfg.train_cfg.repeat_samples,
            **kwargs)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(
        model,
        #batch_processor=batch_processor,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger)
    #runner = Runner(model, batch_processor, optimizer, cfg.work_dir,
    #                logger)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #2
def test_logger(runner, by_epoch, eval_hook_priority):
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(
        data_loader, interval=1, by_epoch=by_epoch, save_best='acc')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_logger')
        optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
        runner = EpochBasedRunner(
            model=model, optimizer=optimizer, work_dir=tmpdir, logger=logger)
        runner.register_logger_hooks(
            dict(
                interval=1,
                hooks=[dict(type='TextLoggerHook', by_epoch=by_epoch)]))
        runner.register_timer_hook(dict(type='IterTimerHook'))
        runner.register_hook(eval_hook, priority=eval_hook_priority)
        runner.run([loader], [('train', 1)], 1)

        path = osp.join(tmpdir, next(scandir(tmpdir, '.json')))
        with open(path) as fr:
            fr.readline()  # skip first line which is hook_msg
            train_log = json.loads(fr.readline())
            assert train_log['mode'] == 'train' and 'time' in train_log
            val_log = json.loads(fr.readline())
            assert val_log['mode'] == 'val' and 'time' not in val_log
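Note: the Model and EvalDataset fixtures used by the tests in this listing are not shown. Below is a minimal sketch consistent with the assertions in Examples #2 and #15; the eval_result sequence and the returned keys are inferred from the best-checkpoint epochs and scores asserted there, so treat it as illustrative rather than the exact upstream fixtures.

import torch
import torch.nn as nn
from torch.utils.data import Dataset


class EvalDataset(Dataset):
    """Toy dataset whose evaluate() replays a fixed score sequence."""

    def __init__(self):
        # sequence inferred from the asserted best epochs/scores in Example #15
        self.eval_result = [1, 4, 3, 7, 2, -3, 4, 6]
        self.index = 0

    def __len__(self):
        return 1

    def __getitem__(self, idx):
        return torch.zeros(2)

    def evaluate(self, results, logger=None):
        acc = self.eval_result[self.index]
        self.index += 1
        return dict(acc=acc, score=acc, loss_top=acc)


class Model(nn.Module):
    """Linear model exposing the train_step()/val_step() API the runner expects."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(2, 1)

    def forward(self, x):
        return self.linear(x)

    def train_step(self, x, optimizer, **kwargs):
        return dict(loss=self(x))

    def val_step(self, x, optimizer, **kwargs):
        return dict(loss=self(x))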
Example #3
def _build_demo_runner():

    class Model(nn.Module):

        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(2, 1)

        def forward(self, x):
            return self.linear(x)

        def train_step(self, x, optimizer, **kwargs):
            return dict(loss=self(x))

        def val_step(self, x, optimizer, **kwargs):
            return dict(loss=self(x))

    model = Model()

    optimizer = torch.optim.SGD(model.parameters(), lr=0.02, momentum=0.95)

    log_config = dict(
        interval=1, hooks=[
            dict(type='TextLoggerHook'),
        ])

    tmp_dir = tempfile.mkdtemp()
    runner = EpochBasedRunner(
        model=model,
        work_dir=tmp_dir,
        optimizer=optimizer,
        logger=logging.getLogger())

    runner.register_logger_hooks(log_config)
    return runner
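A runner built this way can be driven directly with a toy loader; a minimal usage sketch (the (10, 2) input shape matches the nn.Linear(2, 1) model above):

import torch
from torch.utils.data import DataLoader

runner = _build_demo_runner()
loader = DataLoader(torch.ones((10, 2)))  # ten 2-d samples for Linear(2, 1)
runner.run([loader], [('train', 1)], 1)  # one 'train' epoch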
Example #4
def _non_dist_train(model, dataset, cfg, validate=False, logger=None):
    # prepare data loaders
    sampler_cfg = cfg.data.get('sampler_cfg', None)
    sampler = cfg.data.get('sampler', 'Group')
    data_loaders = [
        build_dataloader(
            dataset,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
            sampler=sampler,
            sampler_cfg=sampler_cfg)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(model, batch_processor, optimizer, cfg.work_dir, logger)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(SamplerSeedHook())

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        load_modules = cfg.get('load_modules', [])
        load_certain_checkpoint(runner.model, runner.logger, cfg.load_from, load_modules)
        #runner.load_checkpoint(cfg.load_from)

    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #5
def test_save_checkpoint():
    model = Model()
    runner = EpochBasedRunner(model=model, logger=logging.getLogger())

    with tempfile.TemporaryDirectory() as root:
        runner.save_checkpoint(root)

        latest_path = osp.join(root, 'latest.pth')
        epoch1_path = osp.join(root, 'epoch_1.pth')

        assert osp.exists(latest_path)
        assert osp.exists(epoch1_path)
        assert osp.realpath(latest_path) == osp.realpath(epoch1_path)

        torch.load(latest_path)
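The realpath assertion holds because save_checkpoint writes epoch_1.pth and, by default, also points latest.pth at it. A small sketch, assuming the create_symlink flag of mmcv's save_checkpoint, to opt out of the link:

import logging
import os.path as osp
import tempfile

from mmcv.runner import EpochBasedRunner

# Model: the train_step-capable fixture sketched after Example #2
runner = EpochBasedRunner(model=Model(), logger=logging.getLogger())
with tempfile.TemporaryDirectory() as root:
    runner.save_checkpoint(root, create_symlink=False)
    assert osp.exists(osp.join(root, 'epoch_1.pth'))
    assert not osp.exists(osp.join(root, 'latest.pth'))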
Example #6
def test_runner_with_parallel():

    def batch_processor():
        pass

    model = MMDataParallel(OldStyleModel())
    _ = EpochBasedRunner(model, batch_processor, logger=logging.getLogger())

    with pytest.raises(RuntimeError):
        # batch_processor and train_step() cannot both be set

        def batch_processor():
            pass

        model = MMDataParallel(Model())
        _ = EpochBasedRunner(
            model, batch_processor, logger=logging.getLogger())
Example #7
def _build_epoch_runner():

    model = Model()
    tmp_dir = tempfile.mkdtemp()

    runner = EpochBasedRunner(
        model=model, work_dir=tmp_dir, logger=get_logger('demo'))
    return runner
Example #8
def train_source_detector(model,
                          dataset,
                          cfg,
                          distributed=False,
                          timestamp=None,
                          meta=None):
    # dataset: [src_dataset]
    logger = get_root_logger(cfg.log_level)
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    model = MMDataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build model optimizer
    optimizer = build_optimizer(model, cfg.optimizer)

    # build runner
    runner = EpochBasedRunner(model,
                              optimizer=optimizer,
                              work_dir=cfg.work_dir,
                              logger=logger,
                              meta=meta)
    runner.timestamp = timestamp

    # register hooks; no momentum_config; optimizer_config is required by EpochBasedRunner
    runner.register_training_hooks(cfg.lr_config,
                                   cfg.optimizer_config,
                                   checkpoint_config=cfg.checkpoint_config,
                                   log_config=cfg.log_config)

    # if distributed:
    #     runner.register_hook(DistSamplerSeedHook())
    # if cfg.resume_from:
    #     runner.resume(cfg.resume_from)

    if cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #9
def _non_dist_train(model, dataset, cfg, logger, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.videos_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False)
    ]
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(model, batch_processor, optimizer, cfg.work_dir,
                              logger)
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #10
def test_epoch_based_runner():

    with pytest.warns(UserWarning):
        # batch_processor is deprecated
        model = OldStyleModel()

        def batch_processor():
            pass

        _ = EpochBasedRunner(model, batch_processor)

    with pytest.raises(TypeError):
        # batch_processor must be callable
        model = OldStyleModel()
        _ = EpochBasedRunner(model, batch_processor=0)

    with pytest.raises(AssertionError):
        # model must implement the method train_step()
        model = OldStyleModel()
        _ = EpochBasedRunner(model)

    with pytest.raises(TypeError):
        # work_dir must be a str or None
        model = Model()
        _ = EpochBasedRunner(model, work_dir=1)

    with pytest.raises(RuntimeError):
        # batch_processor and train_step() cannot both be set

        def batch_processor():
            pass

        model = Model()
        _ = EpochBasedRunner(model, batch_processor)

    # test work_dir
    model = Model()
    temp_root = tempfile.gettempdir()
    dir_name = ''.join(
        [random.choice(string.ascii_letters) for _ in range(10)])
    work_dir = osp.join(temp_root, dir_name)
    _ = EpochBasedRunner(model, work_dir=work_dir)
    assert osp.isdir(work_dir)
    _ = EpochBasedRunner(model, work_dir=work_dir)
    assert osp.isdir(work_dir)
    os.removedirs(work_dir)
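For reference, OldStyleModel stands for a legacy model that implements only forward() and therefore needs an external batch_processor, in contrast to Model, which implements train_step(). A minimal sketch of such a class (illustrative, not the exact upstream fixture):

import torch.nn as nn


class OldStyleModel(nn.Module):
    """Legacy-style model: forward() only, no train_step()/val_step(), so
    EpochBasedRunner accepts it only together with a batch_processor."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(2, 1)

    def forward(self, x):
        return self.linear(x)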
Example #11
def _build_demo_runner():
    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(2, 1)

        def forward(self, x):
            return self.linear(x)

        def train_step(self, x, optimizer, **kwargs):
            return dict(loss=self(x))

        def val_step(self, x, optimizer, **kwargs):
            return dict(loss=self(x))

    model = Model()
    tmp_dir = tempfile.mkdtemp()

    runner = EpochBasedRunner(model=model,
                              work_dir=tmp_dir,
                              logger=logging.getLogger())
    return runner
Example #12
def test_ddp_model_precise_bn():
    # test DDP model
    test_bigger_dataset = BiggerDataset()
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    precise_bn_hook = PreciseBNHook(loader, num_iters=5)
    assert precise_bn_hook.num_iters == 5
    assert precise_bn_hook.interval == 1
    model = ExampleModel()
    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=True)
    runner = EpochBasedRunner(model=model,
                              batch_processor=None,
                              optimizer=optimizer,
                              logger=logger)
    runner.register_hook(precise_bn_hook)
    runner.run([loader], [('train', 1)], 1)
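This snippet is the nested helper also visible at the end of Example #20; it relies on optimizer and logger from an enclosing scope. A hedged sketch of that missing setup, with ExampleModel assumed to be the fixture of the surrounding test module and the optimizer settings taken from Example #20:

from mmcv.runner import build_optimizer
from mmcv.utils import get_logger

model = ExampleModel()  # fixture assumed from the surrounding test module
optimizer = build_optimizer(
    model, dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
logger = get_logger('precise_bn')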
Example #13
def test_save_checkpoint():
    model = Model()
    runner = EpochBasedRunner(model=model, logger=logging.getLogger())

    with pytest.raises(TypeError):
        # meta should be None or dict
        runner.save_checkpoint('.', meta=list())

    with tempfile.TemporaryDirectory() as root:
        runner.save_checkpoint(root)

        latest_path = osp.join(root, 'latest.pth')
        epoch1_path = osp.join(root, 'epoch_1.pth')

        assert osp.exists(latest_path)
        assert osp.exists(epoch1_path)
        assert osp.realpath(latest_path) == osp.realpath(epoch1_path)

        torch.load(latest_path)
Example #14
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiment')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiment')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)

    # if just swa training is performed,
    # skip building the runner for the traditional training
    if not cfg.get('only_swa_training', False):
        # build runner
        optimizer = build_optimizer(model, cfg.optimizer)
        runner = EpochBasedRunner(model,
                                  optimizer=optimizer,
                                  work_dir=cfg.work_dir,
                                  logger=logger,
                                  meta=meta)
        # an ugly workaround to make .log and .log.json filenames the same
        runner.timestamp = timestamp

        # fp16 setting
        fp16_cfg = cfg.get('fp16', None)
        if fp16_cfg is not None:
            optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                                 **fp16_cfg,
                                                 distributed=distributed)
        elif distributed and 'type' not in cfg.optimizer_config:
            optimizer_config = OptimizerHook(**cfg.optimizer_config)
        else:
            optimizer_config = cfg.optimizer_config

        # register hooks
        runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                       cfg.checkpoint_config, cfg.log_config,
                                       cfg.get('momentum_config', None))
        if distributed:
            runner.register_hook(DistSamplerSeedHook())

        # register eval hooks
        if validate:
            # Support batch_size > 1 in validation
            val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
            if val_samples_per_gpu > 1:
                # Replace 'ImageToTensor' with 'DefaultFormatBundle'
                cfg.data.val.pipeline = replace_ImageToTensor(
                    cfg.data.val.pipeline)
            val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
            val_dataloader = build_dataloader(
                val_dataset,
                samples_per_gpu=val_samples_per_gpu,
                workers_per_gpu=cfg.data.workers_per_gpu,
                dist=distributed,
                shuffle=False)
            eval_cfg = cfg.get('evaluation', {})
            eval_hook = DistEvalHook if distributed else EvalHook
            runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

        # user-defined hooks
        if cfg.get('custom_hooks', None):
            custom_hooks = cfg.custom_hooks
            assert isinstance(custom_hooks, list), \
                f'custom_hooks expect list type, but got {type(custom_hooks)}'
            for hook_cfg in cfg.custom_hooks:
                assert isinstance(hook_cfg, dict), \
                    'Each item in custom_hooks expects dict type, but got ' \
                    f'{type(hook_cfg)}'
                hook_cfg = hook_cfg.copy()
                priority = hook_cfg.pop('priority', 'NORMAL')
                hook = build_from_cfg(hook_cfg, HOOKS)
                runner.register_hook(hook, priority=priority)

        if cfg.resume_from:
            runner.resume(cfg.resume_from)
        elif cfg.load_from:
            runner.load_checkpoint(cfg.load_from)
        runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
    else:
        # if just swa training is performed, there should be a starting model
        assert cfg.swa_resume_from is not None or cfg.swa_load_from is not None

    # perform swa training
    # build swa training runner
    if not cfg.get('swa_training', False):
        return
    from mmdet.core import SWAHook
    logger.info('Start SWA training')
    swa_optimizer = build_optimizer(model, cfg.swa_optimizer)
    swa_runner = EpochBasedRunner(model,
                                  optimizer=swa_optimizer,
                                  work_dir=cfg.work_dir,
                                  logger=logger,
                                  meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    swa_runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        swa_optimizer_config = Fp16OptimizerHook(**cfg.swa_optimizer_config,
                                                 **fp16_cfg,
                                                 distributed=distributed)
    elif distributed and 'type' not in cfg.swa_optimizer_config:
        swa_optimizer_config = OptimizerHook(**cfg.swa_optimizer_config)
    else:
        swa_optimizer_config = cfg.swa_optimizer_config

    # register hooks
    swa_runner.register_training_hooks(cfg.swa_lr_config, swa_optimizer_config,
                                       cfg.swa_checkpoint_config,
                                       cfg.log_config,
                                       cfg.get('momentum_config', None))
    if distributed:
        swa_runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        # Support batch_size > 1 in validation
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            # Replace 'ImageToTensor' with 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        swa_runner.register_hook(eval_hook(val_dataloader, **eval_cfg))
        swa_eval = True
        swa_eval_hook = eval_hook(val_dataloader, **eval_cfg)
    else:
        swa_eval = False
        swa_eval_hook = None

    # register swa hook
    swa_hook = SWAHook(swa_eval=swa_eval, eval_hook=swa_eval_hook)
    swa_runner.register_hook(swa_hook, priority='LOW')

    # register user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            swa_runner.register_hook(hook, priority=priority)

    if cfg.swa_resume_from:
        swa_runner.resume(cfg.swa_resume_from)
    elif cfg.swa_load_from:
        # use the best pretrained model as the starting model for swa training
        if cfg.swa_load_from == 'best_bbox_mAP.pth':
            best_model_path = os.path.join(cfg.work_dir, cfg.swa_load_from)
            assert os.path.exists(best_model_path)
            # avoid the best pretrained model being overwritten
            new_best_model_path = os.path.join(cfg.work_dir,
                                               'best_bbox_mAP_pretrained.pth')
            os.rename(best_model_path, new_best_model_path)
            cfg.swa_load_from = new_best_model_path
        swa_runner.load_checkpoint(cfg.swa_load_from)

    swa_runner.run(data_loaders, cfg.workflow, cfg.swa_total_epochs)
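The SWA branch above expects a family of swa_* keys on the config. A sketch of the minimal additions such a config might carry; the key names are taken from the function body, the values are placeholders:

# illustrative config fragment (mmcv Config syntax); values are placeholders
only_swa_training = False
swa_training = True
swa_resume_from = None
swa_load_from = 'best_bbox_mAP.pth'
swa_optimizer = dict(type='SGD', lr=1e-4, momentum=0.9, weight_decay=1e-4)
swa_optimizer_config = dict(grad_clip=None)
swa_lr_config = dict(policy='fixed')
swa_checkpoint_config = dict(interval=1)
swa_total_epochs = 12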
Example #15
def test_eval_hook():
    with pytest.raises(AssertionError):
        # `save_best` should be a str
        test_dataset = Model()
        data_loader = DataLoader(test_dataset)
        EvalHook(data_loader, save_best=True)

    with pytest.raises(TypeError):
        # dataloader must be a PyTorch DataLoader
        test_dataset = Model()
        data_loader = [DataLoader(test_dataset)]
        EvalHook(data_loader)

    with pytest.raises(ValueError):
        # key_indicator must be valid when rule_map is None
        test_dataset = ExampleDataset()
        data_loader = DataLoader(test_dataset)
        EvalHook(data_loader, save_best='unsupport')

    with pytest.raises(KeyError):
        # rule must be in keys of rule_map
        test_dataset = ExampleDataset()
        data_loader = DataLoader(test_dataset)
        EvalHook(data_loader, save_best='auto', rule='unsupport')

    # if eval_res is an empty dict, a warning is raised
    with pytest.warns(UserWarning) as record_warnings:

        class _EvalDataset(ExampleDataset):
            def evaluate(self, results, logger=None):
                return {}

        test_dataset = _EvalDataset()
        data_loader = DataLoader(test_dataset)
        eval_hook = EvalHook(data_loader, save_best='auto')
        runner = _build_epoch_runner()
        runner.register_hook(eval_hook)
        runner.run([data_loader], [('train', 1)], 1)
    # Many warnings may be thrown, so we only need to check that the
    # expected warning is among them
    expected_message = ('Since `eval_res` is an empty dict, the behavior to '
                        'save the best checkpoint will be skipped in this '
                        'evaluation.')
    for warning in record_warnings:
        if str(warning.message) == expected_message:
            break
    else:
        assert False

    test_dataset = ExampleDataset()
    loader = DataLoader(test_dataset)
    model = Model()
    data_loader = DataLoader(test_dataset)
    eval_hook = EvalHook(data_loader, save_best=None)

    with tempfile.TemporaryDirectory() as tmpdir:

        # total_epochs = 1
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 1)
        test_dataset.evaluate.assert_called_with(test_dataset,
                                                 [torch.tensor([1])],
                                                 logger=runner.logger)
        assert runner.meta is None or 'best_score' not in runner.meta[
            'hook_msgs']
        assert runner.meta is None or 'best_ckpt' not in runner.meta[
            'hook_msgs']

    # when `save_best` is set to 'auto', the first metric will be used.
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, interval=1, save_best='auto')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_4.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7

    # total_epochs = 8, return the best acc and corresponding epoch
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, interval=1, save_best='acc')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_4.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7

    # total_epochs = 8, return the best loss_top and corresponding epoch
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, interval=1, save_best='loss_top')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_loss_top_epoch_6.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == -3

    # total_epochs = 8, return the best score and corresponding epoch
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader,
                         interval=1,
                         save_best='score',
                         rule='greater')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_score_epoch_4.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7

    # total_epochs = 8, return the best score using less compare func
    # and indicate corresponding epoch
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, save_best='acc', rule='less')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_6.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == -3

    # Test the EvalHook when resume happened
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, save_best='acc')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 2)

        old_ckpt_path = osp.join(tmpdir, 'best_acc_epoch_2.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == old_ckpt_path
        assert osp.exists(old_ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 4

        resume_from = old_ckpt_path
        loader = DataLoader(ExampleDataset())
        eval_hook = EvalHook(data_loader, save_best='acc')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)

        runner.resume(resume_from)
        assert runner.meta['hook_msgs']['best_ckpt'] == old_ckpt_path
        assert osp.exists(old_ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 4

        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_4.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7
        assert not osp.exists(old_ckpt_path)

    # test EvalHook with custom test_fn and greater/less keys
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())

    eval_hook = EvalHook(data_loader,
                         save_best='acc',
                         test_fn=mock.MagicMock(return_value={}),
                         greater_keys=[],
                         less_keys=['acc'])

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_6.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == -3

    # test EvalHook with specified `out_dir`
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    out_dir = 's3://user/data'
    eval_hook = EvalHook(data_loader,
                         interval=1,
                         save_best='auto',
                         out_dir=out_dir)

    with patch.object(PetrelBackend, 'put') as mock_put, \
         patch.object(PetrelBackend, 'remove') as mock_remove, \
         patch.object(PetrelBackend, 'isfile') as mock_isfile, \
         tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        basename = osp.basename(runner.work_dir.rstrip(osp.sep))
        ckpt_path = f'{out_dir}/{basename}/best_acc_epoch_4.pth'

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert runner.meta['hook_msgs']['best_score'] == 7

    assert mock_put.call_count == 3
    assert mock_remove.call_count == 2
    assert mock_isfile.call_count == 2
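The final mock counts are consistent with the inferred eval sequence [1, 4, 3, 7, ...] under save_best='auto' (i.e. 'acc' with rule 'greater'): a new best checkpoint is uploaded at epochs 1, 2 and 4 (three put calls), and before the second and third uploads the stale best is deleted (two remove calls, each preceded by an isfile check).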
Example #16
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """Train model entry function.

    Args:
        model (nn.Module): The model to be trained.
        dataset (:obj:`Dataset`): Train dataset.
        cfg (dict): The config dict for training.
        distributed (bool): Whether to use distributed training.
            Default: False.
        validate (bool): Whether to do evaluation. Default: False.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None
    """
    logger = get_root_logger(log_level=cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    dataloader_setting = dict(
        videos_per_gpu=cfg.data.get('videos_per_gpu', {}),
        workers_per_gpu=cfg.data.get('workers_per_gpu', {}),
        # cfg.gpus will be ignored if distributed
        num_gpus=len(cfg.gpu_ids),
        dist=distributed,
        seed=cfg.seed)
    dataloader_setting = dict(dataloader_setting,
                              **cfg.data.get('train_dataloader', {}))

    data_loaders = [
        build_dataloader(ds, **dataloader_setting) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    if validate:
        eval_cfg = cfg.get('evaluation', {})
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        dataloader_setting = dict(
            videos_per_gpu=cfg.data.get('videos_per_gpu', {}),
            workers_per_gpu=cfg.data.get('workers_per_gpu', {}),
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            shuffle=False)
        dataloader_setting = dict(dataloader_setting,
                                  **cfg.data.get('val_dataloader', {}))
        val_dataloader = build_dataloader(val_dataset, **dataloader_setting)
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #17
def test_eval_hook():
    with pytest.raises(TypeError):
        # dataloader must be a PyTorch DataLoader
        test_dataset = ExampleModel()
        data_loader = [
            DataLoader(test_dataset,
                       batch_size=1,
                       sampler=None,
                       num_workers=0,
                       shuffle=False)
        ]
        EvalHook(data_loader)

    with pytest.raises(ValueError):
        # when `save_best` is True, `key_indicator` should not be None
        test_dataset = ExampleModel()
        data_loader = DataLoader(test_dataset,
                                 batch_size=1,
                                 sampler=None,
                                 num_workers=0,
                                 shuffle=False)
        EvalHook(data_loader, key_indicator=None)

    with pytest.raises(KeyError):
        # rule must be in keys of rule_map
        test_dataset = ExampleModel()
        data_loader = DataLoader(test_dataset,
                                 batch_size=1,
                                 sampler=None,
                                 num_workers=0,
                                 shuffle=False)
        EvalHook(data_loader, save_best=False, rule='unsupport')

    with pytest.raises(ValueError):
        # key_indicator must be valid when rule_map is None
        test_dataset = ExampleModel()
        data_loader = DataLoader(test_dataset,
                                 batch_size=1,
                                 sampler=None,
                                 num_workers=0,
                                 shuffle=False)
        EvalHook(data_loader, key_indicator='unsupport')

    optimizer_cfg = dict(type='SGD',
                         lr=0.01,
                         momentum=0.9,
                         weight_decay=0.0001)

    test_dataset = ExampleDataset()
    loader = DataLoader(test_dataset, batch_size=1)
    model = ExampleModel()
    optimizer = build_optimizer(model, optimizer_cfg)

    data_loader = DataLoader(test_dataset, batch_size=1)
    eval_hook = EvalHook(data_loader, save_best=False)
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  work_dir=tmpdir,
                                  logger=logger)
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 1)

        best_json_path = osp.join(tmpdir, 'best.json')
        assert not osp.exists(best_json_path)

    loader = DataLoader(EvalDataset(), batch_size=1)
    model = ExampleModel()
    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHook(data_loader,
                         interval=1,
                         save_best=True,
                         key_indicator='acc')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  work_dir=tmpdir,
                                  logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        best_json_path = osp.join(tmpdir, 'best.json')
        best_json = mmcv.load(best_json_path)
        real_path = osp.join(tmpdir, 'epoch_4.pth')

        assert best_json['best_ckpt'] == osp.realpath(real_path)
        assert best_json['best_score'] == 7
        assert best_json['key_indicator'] == 'acc'

    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHook(data_loader,
                         interval=1,
                         save_best=True,
                         key_indicator='score',
                         rule='greater')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  work_dir=tmpdir,
                                  logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        best_json_path = osp.join(tmpdir, 'best.json')
        best_json = mmcv.load(best_json_path)
        real_path = osp.join(tmpdir, 'epoch_4.pth')

        assert best_json['best_ckpt'] == osp.realpath(real_path)
        assert best_json['best_score'] == 7
        assert best_json['key_indicator'] == 'score'

    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHook(data_loader, rule='less', key_indicator='acc')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  work_dir=tmpdir,
                                  logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        best_json_path = osp.join(tmpdir, 'best.json')
        best_json = mmcv.load(best_json_path)
        real_path = osp.join(tmpdir, 'epoch_6.pth')

        assert best_json['best_ckpt'] == osp.realpath(real_path)
        assert best_json['best_score'] == -3
        assert best_json['key_indicator'] == 'acc'

    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHook(data_loader, key_indicator='acc')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  work_dir=tmpdir,
                                  logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 2)

        best_json_path = osp.join(tmpdir, 'best.json')
        best_json = mmcv.load(best_json_path)
        real_path = osp.join(tmpdir, 'epoch_2.pth')

        assert best_json['best_ckpt'] == osp.realpath(real_path)
        assert best_json['best_score'] == 4
        assert best_json['key_indicator'] == 'acc'

        resume_from = osp.join(tmpdir, 'latest.pth')
        loader = DataLoader(ExampleDataset(), batch_size=1)
        eval_hook = EvalHook(data_loader, key_indicator='acc')
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  work_dir=tmpdir,
                                  logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.resume(resume_from)
        runner.run([loader], [('train', 1)], 8)

        best_json_path = osp.join(tmpdir, 'best.json')
        best_json = mmcv.load(best_json_path)
        real_path = osp.join(tmpdir, 'epoch_4.pth')

        assert best_json['best_ckpt'] == osp.realpath(real_path)
        assert best_json['best_score'] == 7
        assert best_json['key_indicator'] == 'acc'
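Note that this example exercises an older EvalHook interface, in which save_best is a bool paired with a separate key_indicator and the best checkpoint is tracked through a best.json file, whereas Examples #2 and #15 use the newer interface, in which save_best names the metric directly and the best checkpoint is recorded in runner.meta['hook_msgs'].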
Example #18
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            round_up=True,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(model,
                              optimizer=optimizer,
                              work_dir=cfg.work_dir,
                              logger=logger,
                              meta=meta)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        logger.info("Start using FP16 mixed precision training.")
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    lr_config = get_lr_hook(cfg.lr_config, HOOKS)

    # register hooks
    runner.register_training_hooks(lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val)
        val_dataset.test_mode = True
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=cfg.data.samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            round_up=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #19
def test_build_lr_momentum_hook():
    model = Model()
    runner = EpochBasedRunner(model=model, logger=logging.getLogger())

    # test a policy name that is already in title case
    lr_config = dict(
        policy='CosineAnnealing',
        by_epoch=False,
        min_lr_ratio=0,
        warmup_iters=2,
        warmup_ratio=0.9)
    runner.register_lr_hook(lr_config)
    assert len(runner.hooks) == 1

    # test a policy name that is already in title case
    lr_config = dict(
        policy='Cyclic',
        by_epoch=False,
        target_ratio=(10, 1),
        cyclic_times=1,
        step_ratio_up=0.4)
    runner.register_lr_hook(lr_config)
    assert len(runner.hooks) == 2

    # test a policy name that is not in title case
    lr_config = dict(
        policy='cyclic',
        by_epoch=False,
        target_ratio=(0.85 / 0.95, 1),
        cyclic_times=1,
        step_ratio_up=0.4)
    runner.register_lr_hook(lr_config)
    assert len(runner.hooks) == 3

    # test a policy name that is in title case
    lr_config = dict(
        policy='Step',
        warmup='linear',
        warmup_iters=500,
        warmup_ratio=1.0 / 3,
        step=[8, 11])
    runner.register_lr_hook(lr_config)
    assert len(runner.hooks) == 4

    # test a policy name that is not in title case
    lr_config = dict(
        policy='step',
        warmup='linear',
        warmup_iters=500,
        warmup_ratio=1.0 / 3,
        step=[8, 11])
    runner.register_lr_hook(lr_config)
    assert len(runner.hooks) == 5

    # test a policy name that is already in title case
    mom_config = dict(
        policy='CosineAnnealing',
        min_momentum_ratio=0.99 / 0.95,
        by_epoch=False,
        warmup_iters=2,
        warmup_ratio=0.9 / 0.95)
    runner.register_momentum_hook(mom_config)
    assert len(runner.hooks) == 6

    # test a policy name that is already in title case
    mom_config = dict(
        policy='Cyclic',
        by_epoch=False,
        target_ratio=(0.85 / 0.95, 1),
        cyclic_times=1,
        step_ratio_up=0.4)
    runner.register_momentum_hook(mom_config)
    assert len(runner.hooks) == 7

    # test a policy name that is not in title case
    mom_config = dict(
        policy='cyclic',
        by_epoch=False,
        target_ratio=(0.85 / 0.95, 1),
        cyclic_times=1,
        step_ratio_up=0.4)
    runner.register_momentum_hook(mom_config)
    assert len(runner.hooks) == 8
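What these hook counts exercise is name resolution: the runner derives the hook class from the policy string, title-casing an all-lowercase name first, which is why 'cyclic' and 'Cyclic' both register successfully. A simplified sketch of that resolution step (paraphrased; the real register_lr_hook builds the hook through the HOOKS registry):

def resolve_lr_hook_type(policy: str) -> str:
    # 'cyclic' -> 'Cyclic' -> 'CyclicLrUpdaterHook';
    # 'CosineAnnealing' is already title-cased and passes through.
    if policy == policy.lower():
        policy = policy.title()
    return policy + 'LrUpdaterHook'


assert resolve_lr_hook_type('cyclic') == 'CyclicLrUpdaterHook'
assert resolve_lr_hook_type('step') == 'StepLrUpdaterHook'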
Example #20
def test_precise_bn():
    with pytest.raises(TypeError):
        # `data_loader` must be a PyTorch DataLoader
        test_dataset = ExampleModel()
        data_loader = DataLoader(test_dataset,
                                 batch_size=2,
                                 sampler=None,
                                 num_workers=0,
                                 shuffle=False)
        PreciseBNHook('data_loader')

    optimizer_cfg = dict(type='SGD',
                         lr=0.01,
                         momentum=0.9,
                         weight_decay=0.0001)

    test_dataset = ExampleDataset()
    loader = DataLoader(test_dataset, batch_size=2)
    model = ExampleModel()
    optimizer = build_optimizer(model, optimizer_cfg)

    data_loader = DataLoader(test_dataset, batch_size=2)
    precise_bn_loader = copy.deepcopy(data_loader)
    logger = get_logger('precise_bn')
    runner = EpochBasedRunner(model=model,
                              batch_processor=None,
                              optimizer=optimizer,
                              logger=logger)

    with pytest.raises(AssertionError):
        # num_iters should be no larger than the total
        # number of iterations
        precise_bn_hook = PreciseBNHook(precise_bn_loader, num_iters=5)
        runner.register_hook(precise_bn_hook)
        runner.run([loader], [('train', 1)], 1)

    # test non-DDP model
    test_bigger_dataset = BiggerDataset()
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    precise_bn_hook = PreciseBNHook(loader, num_iters=5)
    assert precise_bn_hook.num_iters == 5
    assert precise_bn_hook.interval == 1
    runner = EpochBasedRunner(model=model,
                              batch_processor=None,
                              optimizer=optimizer,
                              logger=logger)
    runner.register_hook(precise_bn_hook)
    runner.run([loader], [('train', 1)], 1)

    # test model w/ gn layer
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    precise_bn_hook = PreciseBNHook(loader, num_iters=5)
    assert precise_bn_hook.num_iters == 5
    assert precise_bn_hook.interval == 1
    model = GNExampleModel()
    runner = EpochBasedRunner(model=model,
                              batch_processor=None,
                              optimizer=optimizer,
                              logger=logger)
    runner.register_hook(precise_bn_hook)
    runner.run([loader], [('train', 1)], 1)

    # test model without bn layer
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    precise_bn_hook = PreciseBNHook(loader, num_iters=5)
    assert precise_bn_hook.num_iters == 5
    assert precise_bn_hook.interval == 1
    model = NoBNExampleModel()
    runner = EpochBasedRunner(model=model,
                              batch_processor=None,
                              optimizer=optimizer,
                              logger=logger)
    runner.register_hook(precise_bn_hook)
    runner.run([loader], [('train', 1)], 1)

    # test how precise it is
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    precise_bn_hook = PreciseBNHook(loader, num_iters=6)  # run all
    assert precise_bn_hook.num_iters == 6
    assert precise_bn_hook.interval == 1
    model = SingleBNModel()
    runner = EpochBasedRunner(model=model,
                              batch_processor=None,
                              optimizer=optimizer,
                              logger=logger)
    runner.register_hook(precise_bn_hook)
    runner.run([loader], [('train', 1)], 1)
    imgs_list = list()
    for _, data in enumerate(loader):
        imgs_list.append(np.array(data['imgs']))
    mean = np.mean([np.mean(batch) for batch in imgs_list])
    # Bessel's correction is used in PyTorch, therefore ddof=1
    var = np.mean([np.var(batch, ddof=1) for batch in imgs_list])
    assert np.equal(mean, np.array(model.bn.running_mean))
    assert np.equal(var, np.array(model.bn.running_var))

    @pytest.mark.skipif(not torch.cuda.is_available(),
                        reason='requires CUDA support')
    def test_ddp_model_precise_bn():
        # test DDP model
        test_bigger_dataset = BiggerDataset()
        loader = DataLoader(test_bigger_dataset, batch_size=2)
        precise_bn_hook = PreciseBNHook(loader, num_iters=5)
        assert precise_bn_hook.num_iters == 5
        assert precise_bn_hook.interval == 1
        model = ExampleModel()
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=True)
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  logger=logger)
        runner.register_hook(precise_bn_hook)
        runner.run([loader], [('train', 1)], 1)
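As written, the nested test_ddp_model_precise_bn is defined inside test_precise_bn but never invoked, so pytest will not collect or run it; Example #12 shows essentially the same body at module level, where it would actually execute.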
Example #21
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiment')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiment')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', True)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(model,
                              optimizer=optimizer,
                              work_dir=cfg.work_dir,
                              logger=logger,
                              meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
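A minimal sketch of how this entry point is typically driven. The config path and the field values below are illustrative assumptions, not part of the example above:

# Hypothetical driver for train_detector(); paths and values are placeholders.
from mmcv import Config
from mmdet.datasets import build_dataset
from mmdet.models import build_detector

cfg = Config.fromfile('configs/some_detector.py')  # assumed config file
cfg.work_dir = './work_dirs/demo'
cfg.gpu_ids = [0]
cfg.seed = None
model = build_detector(cfg.model)
datasets = [build_dataset(cfg.data.train)]
train_detector(model, datasets, cfg, distributed=False, validate=True)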
Example #22
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiment')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiment')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        # Support batch_size > 1 in validation
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            # Replace 'ImageToTensor' with 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
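For reference, each entry consumed by the custom-hook loop above is just a hook config dict plus an optional priority key. A minimal sketch (EMAHook is a real mmcv hook; the priority value is illustrative):

# Hypothetical config fragment for cfg.custom_hooks.
custom_hooks = [
    # 'priority' is popped from the dict before build_from_cfg() sees it
    dict(type='EMAHook', priority='HIGHEST'),
]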
Example #23
def train_reid(model,
               dataset,
               cfg,
               distributed=False,
               validate=False,
               timestamp=None,
               meta=None):
    logger = get_root_logger()

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            sampler=cfg.data.get('sampler', None),
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Set the `find_unused_parameters` parameter in torch's DDP
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(model,
                              optimizer=optimizer,
                              work_dir=cfg.work_dir,
                              logger=logger,
                              meta=meta)
    runner.timestamp = timestamp

    # register hooks
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hook
    if validate:
        test_dataset = build_dataset(cfg.data.test)
        test_dataloader = build_dataloader(
            test_dataset,
            samples_per_gpu=cfg.data.samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        runner.register_hook(EvalHook(test_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got: {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'each item in custom hooks expects dict type, but got: ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            logger.info(f'Register custom hook: {hook_cfg.type}')
            runner.register_hook(hook, priority=priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #24
def _dist_train(model, dataset, cfg, logger, validate=False):
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.videos_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True)
    ]
    # put model on gpus
    find_unused_parameters = cfg.get('find_unused_parameters', False)
    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=find_unused_parameters)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(model,
                              batch_processor=batch_processor,
                              optimizer=optimizer,
                              work_dir=cfg.work_dir,
                              logger=logger)
    # register hooks
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register eval hooks
    if validate:
        if cfg.data.val.type in ['RawFramesDataset', 'VideoDataset']:
            runner.register_hook(
                DistEvalTopKAccuracyHook(cfg.data.val, k=(1, 5)))
        if cfg.data.val.type == 'AVADataset':
            runner.register_hook(AVADistEvalmAPHook(cfg.data.val))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
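This older-style example hands a module-level batch_processor to the runner instead of relying on model.train_step(). A sketch of that contract, assuming a hypothetical parse_losses helper to reduce the raw loss dict:

def batch_processor(model, data, train_mode):
    # Called by the runner for every batch; must return a dict with
    # 'loss', 'log_vars' and 'num_samples'.
    losses = model(**data)
    loss, log_vars = parse_losses(losses)  # hypothetical reduction helper
    return dict(loss=loss, log_vars=log_vars, num_samples=len(data['img']))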
Example #25
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """Train model entry function.

    Args:
        model (nn.Module): The model to be trained.
        dataset (Dataset): Train dataset.
        cfg (dict): The config dict for training.
        distributed (bool): Whether to use distributed training.
            Default: False.
        validate (bool): Whether to do evaluation. Default: False.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None.
    """
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    # step 1: give default values and override (if exist) from cfg.data
    loader_cfg = {
        **dict(
            seed=cfg.get('seed'),
            drop_last=False,
            dist=distributed,
            num_gpus=len(cfg.gpu_ids)),
        **({} if torch.__version__ != 'parrots' else dict(
               prefetch_num=2,
               pin_memory=False,
           )),
        **dict((k, cfg.data[k]) for k in [
                   'samples_per_gpu',
                   'workers_per_gpu',
                   'shuffle',
                   'seed',
                   'drop_last',
                   'prefetch_num',
                   'pin_memory',
                   'persistent_workers',
               ] if k in cfg.data)
    }

    # step 2: cfg.data.train_dataloader has highest priority
    train_loader_cfg = dict(loader_cfg, **cfg.data.get('train_dataloader', {}))

    data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset]
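    # Note on the merge above: later **-unpacked entries override earlier
    # keys, so values from cfg.data beat the defaults, and
    # cfg.data.train_dataloader (merged in step 2) beats both, e.g.:
    #   {**dict(samples_per_gpu=2), **dict(samples_per_gpu=4)}    # -> 4
    #   dict(dict(samples_per_gpu=4), **dict(samples_per_gpu=8))  # -> 8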

    # determine whether to use adversarial training or not
    use_adversarial_train = cfg.get('use_adversarial_train', False)

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', True)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel

        if use_adversarial_train:
            # Use DistributedDataParallelWrapper for adversarial training
            model = DistributedDataParallelWrapper(
                model,
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False,
                find_unused_parameters=find_unused_parameters)
        else:
            model = MMDistributedDataParallel(
                model.cuda(),
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False,
                find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizers(model, cfg.optimizer)

    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    if use_adversarial_train:
        # The optimizer step process is included in the train_step function
        # of the model, so the runner should NOT include optimizer hook.
        optimizer_config = None
    else:
        # fp16 setting
        fp16_cfg = cfg.get('fp16', None)
        if fp16_cfg is not None:
            optimizer_config = Fp16OptimizerHook(
                **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
        elif distributed and 'type' not in cfg.optimizer_config:
            optimizer_config = OptimizerHook(**cfg.optimizer_config)
        else:
            optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        eval_cfg = cfg.get('evaluation', {})
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        dataloader_setting = dict(
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.get('workers_per_gpu', 1),
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            drop_last=False,
            shuffle=False)
        dataloader_setting = dict(dataloader_setting,
                                  **cfg.data.get('val_dataloader', {}))
        val_dataloader = build_dataloader(val_dataset, **dataloader_setting)
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #26
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """Train model entry function.

    Args:
        model (nn.Module): The model to be trained.
        dataset (:obj:`Dataset`): Train dataset.
        cfg (dict): The config dict for training.
        distributed (bool): Whether to use distributed training.
            Default: False.
        validate (bool): Whether to do evaluation. Default: False.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None.
    """
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiment')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiment')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            samples_per_epoch=cfg.data.get('samples_per_epoch', None),
            dist=distributed,
            seed=cfg.seed,
            persistent_workers=cfg.data.get('persistent_workers', False))
        for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        if find_unused_parameters:
            logger.info('set find_unused_parameters = True in DDP')
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(model,
                              optimizer=optimizer,
                              work_dir=cfg.work_dir,
                              logger=logger,
                              meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    optimizer_config = cfg.optimizer_config
    if 'type' not in cfg.optimizer_config:
        optimizer_config.type = 'Fp16OptimizerHook' \
            if fp16_cfg else 'OptimizerHook'
    if fp16_cfg:
        optimizer_config.update(fp16_cfg)
    if 'Fp16' in optimizer_config.type:
        optimizer_config.update(distributed=distributed)
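    # Net effect of the branch above (illustrative values):
    #   no fp16, no explicit type -> dict(grad_clip=None, type='OptimizerHook')
    #   fp16=dict(loss_scale=512.) -> dict(grad_clip=None, loss_scale=512.,
    #       type='Fp16OptimizerHook', distributed=distributed)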

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            persistent_workers=cfg.data.get('persistent_workers', False))
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Example #27
def test_eval_hook():
    with pytest.raises(AssertionError):
        # `save_best` should be a str
        test_dataset = ExampleDataset()
        data_loader = DataLoader(test_dataset)
        EvalHook(data_loader, save_best=True)

    with pytest.raises(TypeError):
        # dataloader must be a pytorch DataLoader
        test_dataset = ExampleDataset()
        data_loader = [DataLoader(test_dataset)]
        EvalHook(data_loader)

    with pytest.raises(ValueError):
        # key_indicator must be valid when rule_map is None
        test_dataset = ExampleDataset()
        data_loader = DataLoader(test_dataset)
        EvalHook(data_loader, save_best='unsupport')

    with pytest.raises(KeyError):
        # rule must be in keys of rule_map
        test_dataset = ExampleDataset()
        data_loader = DataLoader(test_dataset)
        EvalHook(data_loader, save_best='auto', rule='unsupport')

    test_dataset = ExampleDataset()
    loader = DataLoader(test_dataset)
    model = Model()
    data_loader = DataLoader(test_dataset)
    eval_hook = EvalHook(data_loader, save_best=None)

    with tempfile.TemporaryDirectory() as tmpdir:

        # total_epochs = 1
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 1)
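        # `evaluate` is presumably a class-level MagicMock on ExampleDataset,
        # which is why `self` (test_dataset) is recorded as the first
        # positional argument below.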
        test_dataset.evaluate.assert_called_with(
            test_dataset, [torch.tensor([1])], logger=runner.logger)
        assert runner.meta is None or 'best_score' not in runner.meta[
            'hook_msgs']
        assert runner.meta is None or 'best_ckpt' not in runner.meta[
            'hook_msgs']

    # when `save_best` is set to 'auto', the first metric will be used.
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, interval=1, save_best='auto')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_4.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7

    # total_epochs = 8, return the best acc and corresponding epoch
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, interval=1, save_best='acc')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_4.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7

    # total_epochs = 8, return the best loss_top and corresponding epoch
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, interval=1, save_best='loss_top')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_loss_top_epoch_6.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == -3

    # total_epochs = 8, return the best score and corresponding epoch
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(
        data_loader, interval=1, save_best='score', rule='greater')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_score_epoch_4.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7

    # total_epochs = 8, return the best score using the 'less' comparison
    # rule and the corresponding epoch
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, save_best='acc', rule='less')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_6.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == -3

    # Test the EvalHook when a resume happens
    data_loader = DataLoader(EvalDataset())
    eval_hook = EvalHook(data_loader, save_best='acc')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 2)

        old_ckpt_path = osp.join(tmpdir, 'best_acc_epoch_2.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == old_ckpt_path
        assert osp.exists(old_ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 4

        resume_from = old_ckpt_path
        loader = DataLoader(ExampleDataset())
        eval_hook = EvalHook(data_loader, save_best='acc')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)

        runner.resume(resume_from)
        assert runner.meta['hook_msgs']['best_ckpt'] == old_ckpt_path
        assert osp.exists(old_ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 4

        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_4.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == 7
        assert not osp.exists(old_ckpt_path)

    # test EvalHook with custom test_fn and greater/less keys
    loader = DataLoader(EvalDataset())
    model = Model()
    data_loader = DataLoader(EvalDataset())

    eval_hook = EvalHook(
        data_loader,
        save_best='acc',
        test_fn=mock.MagicMock(return_value={}),
        greater_keys=[],
        less_keys=['acc'])

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_6.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
        assert osp.exists(ckpt_path)
        assert runner.meta['hook_msgs']['best_score'] == -3
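To summarize what the asserts above pin down for this EvalHook variant (a reading of the test, not independent API documentation):

# - best checkpoints are written as best_{key}_epoch_{n}.pth and tracked via
#   runner.meta['hook_msgs']['best_ckpt'] / ['best_score']
# - keys containing 'loss' (e.g. 'loss_top') default to rule='less', while
#   'acc'/'score' default to rule='greater' unless rule is passed explicitly
# - greater_keys/less_keys allow a custom test_fn to flip those defaults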
Example #28
    # (snippet begins mid-script; a typical CIFAR-10 transform pipeline is
    # assumed here so the fragment stands on its own)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])
    trainset = CIFAR10(root='data',
                       train=True,
                       download=True,
                       transform=transform)
    trainloader = DataLoader(trainset,
                             batch_size=128,
                             shuffle=True,
                             num_workers=2)

    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    logger = get_logger('mmcv')
    # runner is a scheduler to manage the training
    runner = EpochBasedRunner(model,
                              optimizer=optimizer,
                              work_dir='./work_dir',
                              logger=logger,
                              max_epochs=4)

    # learning rate scheduler config
    lr_config = dict(policy='step', step=[2, 3])
    # configuration of optimizer
    optimizer_config = dict(grad_clip=None)
    # configuration of saving checkpoints periodically
    checkpoint_config = dict(interval=1)
    # save log periodically and multiple hooks can be used simultaneously
    log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')])
    # register hooks to runner and those hooks will be invoked automatically
    runner.register_training_hooks(lr_config=lr_config,
                                   optimizer_config=optimizer_config,
                                   checkpoint_config=checkpoint_config,
                                   log_config=log_config)
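    # (the scraped snippet stops above; launching training would be one more
    # call on the runner, sketched here; max_epochs=4 was set at construction)
    runner.run([trainloader], [('train', 1)])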
Example #29
def test_eval_hook(EvalHookCls):
    with pytest.raises(TypeError):
        # dataloader must be a pytorch DataLoader
        test_dataset = ExampleDataset()
        data_loader = [
            DataLoader(test_dataset,
                       batch_size=1,
                       sampler=None,
                       num_workers=0,
                       shuffle=False)
        ]
        EvalHookCls(data_loader)

    with pytest.raises(KeyError):
        # rule must be in keys of rule_map
        test_dataset = ExampleDataset()
        data_loader = DataLoader(test_dataset,
                                 batch_size=1,
                                 sampler=None,
                                 num_workers=0,
                                 shuffle=False)
        EvalHookCls(data_loader, save_best='auto', rule='unsupport')

    with pytest.raises(ValueError):
        # key_indicator must be valid when rule_map is None
        test_dataset = ExampleDataset()
        data_loader = DataLoader(test_dataset,
                                 batch_size=1,
                                 sampler=None,
                                 num_workers=0,
                                 shuffle=False)
        EvalHookCls(data_loader, save_best='unsupport')

    optimizer_cfg = dict(type='SGD',
                         lr=0.01,
                         momentum=0.9,
                         weight_decay=0.0001)

    test_dataset = ExampleDataset()
    loader = DataLoader(test_dataset, batch_size=1)
    model = ExampleModel()
    optimizer = build_optimizer(model, optimizer_cfg)

    data_loader = DataLoader(test_dataset, batch_size=1)
    eval_hook = EvalHookCls(data_loader, save_best=None)
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  work_dir=tmpdir,
                                  logger=logger)
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 1)
        assert runner.meta is None or 'best_score' not in runner.meta[
            'hook_msgs']
        assert runner.meta is None or 'best_ckpt' not in runner.meta[
            'hook_msgs']

    # when `save_best` is set to 'auto', the first metric will be used.
    loader = DataLoader(EvalDataset(), batch_size=1)
    model = ExampleModel()
    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHookCls(data_loader, interval=1, save_best='auto')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  work_dir=tmpdir,
                                  logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        real_path = osp.join(tmpdir, 'epoch_4.pth')
        link_path = osp.join(tmpdir, 'best_mAP.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
        assert osp.exists(link_path)
        assert runner.meta['hook_msgs']['best_score'] == 0.7

    loader = DataLoader(EvalDataset(), batch_size=1)
    model = ExampleModel()
    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHookCls(data_loader, interval=1, save_best='mAP')

    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  work_dir=tmpdir,
                                  logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        real_path = osp.join(tmpdir, 'epoch_4.pth')
        link_path = osp.join(tmpdir, 'best_mAP.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
        assert osp.exists(link_path)
        assert runner.meta['hook_msgs']['best_score'] == 0.7

    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHookCls(data_loader,
                            interval=1,
                            save_best='score',
                            rule='greater')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  work_dir=tmpdir,
                                  logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        real_path = osp.join(tmpdir, 'epoch_4.pth')
        link_path = osp.join(tmpdir, 'best_score.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
        assert osp.exists(link_path)
        assert runner.meta['hook_msgs']['best_score'] == 0.7

    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHookCls(data_loader, save_best='mAP', rule='less')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  work_dir=tmpdir,
                                  logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 8)

        real_path = osp.join(tmpdir, 'epoch_6.pth')
        link_path = osp.join(tmpdir, 'best_mAP.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
        assert osp.exists(link_path)
        assert runner.meta['hook_msgs']['best_score'] == 0.05

    data_loader = DataLoader(EvalDataset(), batch_size=1)
    eval_hook = EvalHookCls(data_loader, save_best='mAP')
    with tempfile.TemporaryDirectory() as tmpdir:
        logger = get_logger('test_eval')
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  work_dir=tmpdir,
                                  logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.run([loader], [('train', 1)], 2)

        real_path = osp.join(tmpdir, 'epoch_2.pth')
        link_path = osp.join(tmpdir, 'best_mAP.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
        assert osp.exists(link_path)
        assert runner.meta['hook_msgs']['best_score'] == 0.4

        resume_from = osp.join(tmpdir, 'latest.pth')
        loader = DataLoader(ExampleDataset(), batch_size=1)
        eval_hook = EvalHookCls(data_loader, save_best='mAP')
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  work_dir=tmpdir,
                                  logger=logger)
        runner.register_checkpoint_hook(dict(interval=1))
        runner.register_hook(eval_hook)
        runner.resume(resume_from)
        runner.run([loader], [('train', 1)], 8)

        real_path = osp.join(tmpdir, 'epoch_4.pth')
        link_path = osp.join(tmpdir, 'best_mAP.pth')

        assert runner.meta['hook_msgs']['best_ckpt'] == osp.realpath(real_path)
        assert osp.exists(link_path)
        assert runner.meta['hook_msgs']['best_score'] == 0.7
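Unlike Example #27, where the hook writes best_acc_epoch_{n}.pth directly, the variant tested here keeps the regular epoch_{n}.pth checkpoints and exposes the best one through a best_{key}.pth symlink, as the asserts show:

# runner.meta['hook_msgs']['best_ckpt'] stores the resolved real file,
# i.e. osp.realpath('epoch_4.pth'); 'best_mAP.pth' is only a symlink to it.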
Example #30
def test_epoch_based_runner():

    with pytest.warns(UserWarning):
        # batch_processor is deprecated
        model = OldStyleModel()

        def batch_processor():
            pass

        _ = EpochBasedRunner(
            model, batch_processor, logger=logging.getLogger())

    with pytest.raises(TypeError):
        # batch_processor must be callable
        model = OldStyleModel()
        _ = EpochBasedRunner(
            model, batch_processor=0, logger=logging.getLogger())

    with pytest.raises(TypeError):
        # optimizer must be an optimizer or a dict of optimizers
        model = Model()
        optimizer = 'NotAOptimizer'
        _ = EpochBasedRunner(
            model, optimizer=optimizer, logger=logging.getLogger())

    with pytest.raises(TypeError):
        # optimizer must be an optimizer or a dict of optimizers
        model = Model()
        optimizers = dict(
            optim1=torch.optim.Adam(model.parameters()),
            optim2='NotAOptimizer')
        _ = EpochBasedRunner(
            model, optimizer=optimizers, logger=logging.getLogger())

    with pytest.raises(TypeError):
        # logger must be a logging.Logger
        model = Model()
        _ = EpochBasedRunner(model, logger=None)

    with pytest.raises(TypeError):
        # meta must be a dict or None
        model = Model()
        _ = EpochBasedRunner(model, logger=logging.getLogger(), meta=['list'])

    with pytest.raises(AssertionError):
        # model must implement the method train_step()
        model = OldStyleModel()
        _ = EpochBasedRunner(model, logger=logging.getLogger())

    with pytest.raises(TypeError):
        # work_dir must be a str or None
        model = Model()
        _ = EpochBasedRunner(model, work_dir=1, logger=logging.getLogger())

    with pytest.raises(RuntimeError):
        # batch_processor and train_step() cannot both be set

        def batch_processor():
            pass

        model = Model()
        _ = EpochBasedRunner(
            model, batch_processor, logger=logging.getLogger())

    # test work_dir
    model = Model()
    temp_root = tempfile.gettempdir()
    dir_name = ''.join(
        [random.choice(string.ascii_letters) for _ in range(10)])
    work_dir = osp.join(temp_root, dir_name)
    _ = EpochBasedRunner(model, work_dir=work_dir, logger=logging.getLogger())
    assert osp.isdir(work_dir)
    _ = EpochBasedRunner(model, work_dir=work_dir, logger=logging.getLogger())
    assert osp.isdir(work_dir)
    os.removedirs(work_dir)
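Taken together, the type checks exercised above imply the following minimal valid construction (a sketch; Model is the test suite's helper class that implements train_step()):

model = Model()  # must implement train_step()
runner = EpochBasedRunner(
    model,
    optimizer=torch.optim.SGD(model.parameters(), lr=0.01),  # or a dict of them
    work_dir='./work_dir',       # str or None; created if missing
    logger=logging.getLogger(),  # must be a logging.Logger, not None
    meta=dict())                 # dict or None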