Example #1
def _build_iter_runner():

    model = Model()
    tmp_dir = tempfile.mkdtemp()

    runner = IterBasedRunner(
        model=model, work_dir=tmp_dir, logger=get_logger('demo'))
    return runner
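A minimal usage sketch for the helper above; the toy ExampleDataset, its batch format, and the two-iteration workflow are assumptions added for illustration, not part of the original snippet.

# hypothetical usage of _build_iter_runner (assumes a toy ExampleDataset whose
# batches the Model's train_step can consume)
from torch.utils.data import DataLoader

runner = _build_iter_runner()
loader = DataLoader(ExampleDataset(), batch_size=2)
# IterBasedRunner workflows count iterations, not epochs; newer mmcv versions
# prefer setting max_iters on the runner constructor instead of in run()
runner.run([loader], [('train', 2)], max_iters=2)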
Example #2
def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                logger=None,
                timestamp=None,
                meta=None):
    """Distributed training function.

    Args:
        model (nn.Module): The model to be trained.
        dataset (:obj:`Dataset`): Train dataset.
        cfg (dict): The config dict for training.
        validate (bool): Whether to do evaluation. Default: False.
        logger (logging.Logger | None): Logger for training. Default: None.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None.
    """
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.samples_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True,
                         drop_last=cfg.data.get('drop_last', False),
                         seed=cfg.seed) for ds in dataset
    ]
    # put model on gpus
    find_unused_parameters = cfg.get('find_unused_parameters', False)
    model = DistributedDataParallelWrapper(
        model,
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=find_unused_parameters)

    # build runner
    optimizer = build_optimizers(model, cfg.optimizers)
    runner = IterBasedRunner(model,
                             optimizer=optimizer,
                             work_dir=cfg.work_dir,
                             logger=logger,
                             meta=meta)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # register hooks
    runner.register_training_hooks(cfg.lr_config,
                                   checkpoint_config=cfg.checkpoint_config,
                                   log_config=cfg.log_config)

    # visual hook
    if cfg.get('visual_config', None) is not None:
        cfg.visual_config['output_dir'] = os.path.join(
            cfg.work_dir, cfg.visual_config['output_dir'])
        runner.register_hook(mmcv.build_from_cfg(cfg.visual_config, HOOKS))

    # evaluation hook
    if validate and cfg.get('evaluation', None) is not None:
        dataset = build_dataset(cfg.data.val)
        samples_per_gpu = cfg.data.get('val_samples_per_gpu',
                                       cfg.data.samples_per_gpu)
        workers_per_gpu = cfg.data.get('val_workers_per_gpu',
                                       cfg.data.workers_per_gpu)
        data_loader = build_dataloader(dataset,
                                       samples_per_gpu=samples_per_gpu,
                                       workers_per_gpu=workers_per_gpu,
                                       dist=True,
                                       shuffle=False)
        save_path = osp.join(cfg.work_dir, 'val_visuals')
        runner.register_hook(
            DistEvalIterHook(data_loader,
                             save_path=save_path,
                             **cfg.evaluation))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_iters)
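For orientation, a hedged sketch of the config fields the `_dist_train` above reads; the key names mirror the function body, while every value is an illustrative placeholder.

# illustrative config skeleton for _dist_train above; all values are placeholders
import mmcv

cfg = mmcv.Config(dict(
    seed=0,
    work_dir='./work_dirs/demo',                 # hypothetical path
    data=dict(samples_per_gpu=4, workers_per_gpu=2, drop_last=True,
              val=dict(type='ValDataset')),      # hypothetical dataset cfg
    optimizers=dict(generator=dict(type='Adam', lr=1e-4)),
    lr_config=dict(policy='Fixed', by_epoch=False),
    checkpoint_config=dict(interval=5000, by_epoch=False),
    log_config=dict(interval=100,
                    hooks=[dict(type='TextLoggerHook', by_epoch=False)]),
    visual_config=None,
    evaluation=dict(interval=5000),
    resume_from=None, load_from=None,
    workflow=[('train', 10000)], total_iters=100000))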
Example #3
def train_segmentor(model,
                    dataset,
                    cfg,
                    distributed=False,
                    validate=False,
                    timestamp=None,
                    meta=None):
    """Launch segmentor training."""
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed,
            drop_last=True) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)

    runner = IterBasedRunner(model=model,
                             batch_processor=None,
                             optimizer=optimizer,
                             work_dir=cfg.work_dir,
                             logger=logger,
                             meta=meta)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))

    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=4,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_iters)
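A short, hedged driver for train_segmentor above, loosely mirroring MMSegmentation's tools/train.py; the config path, gpu_ids, and meta dict are placeholders added here.

# hypothetical invocation of train_segmentor; the config file path is a placeholder
import mmcv
from mmseg.datasets import build_dataset
from mmseg.models import build_segmentor

cfg = mmcv.Config.fromfile('configs/example_config.py')   # placeholder path
cfg.gpu_ids = [0]
cfg.seed = None
model = build_segmentor(cfg.model)
datasets = [build_dataset(cfg.data.train)]
train_segmentor(model, datasets, cfg, distributed=False, validate=True,
                meta=dict(seed=cfg.seed))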
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' in cfg.data:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiment')
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiment')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    '''
    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    '''

    runner = IterBasedRunner(model,
                             optimizer=optimizer,
                             work_dir=cfg.work_dir,
                             logger=logger,
                             meta=meta)

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        # Support batch_size > 1 in validation
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            # Replace 'ImageToTensor' with 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    # runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
    runner.run(data_loaders, cfg.workflow, cfg.total_iters)
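Since the snippet above swaps MMDetection's default EpochBasedRunner for an IterBasedRunner, the schedule-related config entries have to be expressed in iterations; an illustrative fragment (all numbers are placeholders) might look like:

# iteration-based schedule fragment for the modified train_detector above;
# values are placeholders
lr_config = dict(policy='step', by_epoch=False, step=[60000, 80000])
checkpoint_config = dict(interval=10000, by_epoch=False)
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
total_iters = 90000
workflow = [('train', 10000)]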
Example #5
def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                logger=None,
                timestamp=None,
                meta=None):
    """Distributed training function.

    Args:
        model (nn.Module): The model to be trained.
        dataset (:obj:`Dataset`): Train dataset.
        cfg (dict): The config dict for training.
        validate (bool): Whether to do evaluation. Default: False.
        logger (logging.Logger | None): Logger for training. Default: None.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None.
    """
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    # step 1: give default values and override (if exist) from cfg.data
    loader_cfg = dict(
        seed=cfg.get('seed'),
        drop_last=False,
        dist=True,
        **({} if torch.__version__ != 'parrots' else dict(
            prefetch_num=2,
            pin_memory=False,
        )),
        **dict((k, cfg.data[k]) for k in [
            'samples_per_gpu',
            'workers_per_gpu',
            'shuffle',
            'seed',
            'drop_last',
            'prefetch_num',
            'pin_memory',
        ] if k in cfg.data))

    # step 2: cfg.data.train_dataloader has highest priority
    train_loader_cfg = dict(loader_cfg, **cfg.data.get('train_dataloader', {}))

    data_loaders = [build_dataloader(ds, **train_loader_cfg) for ds in dataset]

    # put model on gpus
    find_unused_parameters = cfg.get('find_unused_parameters', False)
    model = DistributedDataParallelWrapper(
        model,
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=find_unused_parameters)

    # build runner
    optimizer = build_optimizers(model, cfg.optimizers)
    runner = IterBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # register hooks
    runner.register_training_hooks(
        cfg.lr_config,
        checkpoint_config=cfg.checkpoint_config,
        log_config=cfg.log_config)

    # visual hook
    if cfg.get('visual_config', None) is not None:
        cfg.visual_config['output_dir'] = os.path.join(
            cfg.work_dir, cfg.visual_config['output_dir'])
        runner.register_hook(mmcv.build_from_cfg(cfg.visual_config, HOOKS))

    # evaluation hook
    if validate and cfg.get('evaluation', None) is not None:
        dataset = build_dataset(cfg.data.val)

        if ('val_samples_per_gpu' in cfg.data
                or 'val_workers_per_gpu' in cfg.data):
            warnings.warn('"val_samples_per_gpu/val_workers_per_gpu" have '
                          'been deprecated. Please use '
                          '"val_dataloader=dict(samples_per_gpu=1)" instead. '
                          'Details see '
                          'https://github.com/open-mmlab/mmediting/pull/201')

        val_loader_cfg = dict(
            loader_cfg,
            shuffle=False,
            drop_last=False,
            **dict((newk, cfg.data[oldk]) for oldk, newk in [
                ('val_samples_per_gpu', 'samples_per_gpu'),
                ('val_workers_per_gpu', 'workers_per_gpu'),
            ] if oldk in cfg.data),
            **cfg.data.get('val_dataloader', {}))

        data_loader = build_dataloader(dataset, **val_loader_cfg)
        save_path = osp.join(cfg.work_dir, 'val_visuals')
        runner.register_hook(
            DistEvalIterHook(
                data_loader, save_path=save_path, **cfg.evaluation))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_iters)
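A hedged cfg.data fragment illustrating the precedence implemented above (built-in defaults, then top-level cfg.data keys, then the train_dataloader/val_dataloader dicts); values are placeholders.

# illustrative cfg.data for the loader-building logic above; values are placeholders
data = dict(
    samples_per_gpu=4,                  # default for every split
    workers_per_gpu=2,
    train_dataloader=dict(samples_per_gpu=8, drop_last=True),   # highest priority
    val_dataloader=dict(samples_per_gpu=1),                     # replaces val_samples_per_gpu
    train=dict(type='TrainDataset'),    # hypothetical dataset cfgs
    val=dict(type='ValDataset'))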
Example #6
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        use_ddp_wrapper = cfg.get('use_ddp_wrapper', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        if use_ddp_wrapper:
            mmcv.print_log('Use DDP Wrapper.', 'mmgen')
            model = DistributedDataParallelWrapper(
                model.cuda(),
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False,
                find_unused_parameters=find_unused_parameters)
        else:
            model = MMDistributedDataParallel(
                model.cuda(),
                device_ids=[torch.cuda.current_device()],
                broadcast_buffers=False,
                find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)

    # build runner
    if cfg.optimizer:
        optimizer = build_optimizers(model, cfg.optimizer)
    # In GANs, we allow building the optimizer inside the GAN model.
    else:
        optimizer = None

    # allow users to define the runner
    if cfg.get('runner', None):
        runner = build_runner(
            cfg.runner,
            dict(model=model,
                 optimizer=optimizer,
                 work_dir=cfg.work_dir,
                 logger=logger,
                 meta=meta))
    else:
        runner = IterBasedRunner(model,
                                 optimizer=optimizer,
                                 work_dir=cfg.work_dir,
                                 logger=logger,
                                 meta=meta)
        # set whether to use dynamic DDP in training
        # is_dynamic_ddp=cfg.get('is_dynamic_ddp', False))
    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)

    # In GANs, we can directly optimize parameters in the `train_step` function.
    if cfg.get('optimizer_cfg', None) is None:
        optimizer_config = None
    elif fp16_cfg is not None:
        raise NotImplementedError('Fp16 has not been supported.')
        # optimizer_config = Fp16OptimizerHook(
        #     **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    # default to use OptimizerHook
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # update `out_dir` in the checkpoint hook
    if cfg.checkpoint_config is not None:
        cfg.checkpoint_config['out_dir'] = os.path.join(
            cfg.work_dir, cfg.checkpoint_config.get('out_dir', 'ckpt'))

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))

    # # DistSamplerSeedHook should be used with EpochBasedRunner
    # if distributed:
    #     runner.register_hook(DistSamplerSeedHook())

    # In general, we do NOT adopt the standard evaluation hook in GAN training.
    # Thus, if you want an eval hook, you need to further define the
    # 'evaluation' key in the config.
    # register eval hooks
    if validate and cfg.get('evaluation', None) is not None:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        # Support batch_size > 1 in validation
        val_loader_cfg = {
            'samples_per_gpu': 1,
            'shuffle': False,
            'workers_per_gpu': cfg.data.workers_per_gpu,
            **cfg.data.get('val_data_loader', {})
        }
        val_dataloader = build_dataloader(val_dataset,
                                          dist=distributed,
                                          **val_loader_cfg)
        eval_cfg = deepcopy(cfg.get('evaluation'))
        eval_cfg.update(dict(dist=distributed, dataloader=val_dataloader))
        eval_hook = build_from_cfg(eval_cfg, HOOKS)
        priority = eval_cfg.pop('priority', 'NORMAL')
        runner.register_hook(eval_hook, priority=priority)

    # user-defined hooks
    if cfg.get('custom_hooks', None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop('priority', 'NORMAL')
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_iters)
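Because train_model above builds the eval hook itself via build_from_cfg and injects `dataloader` and `dist` into its arguments, the evaluation entry must name a registered hook whose constructor accepts those keywords; a hedged sketch with hypothetical hook names:

# illustrative hook-related entries for train_model above; hook names are hypothetical
evaluation = dict(type='MyGenEvalHook', interval=10000)   # must accept `dataloader` and `dist`
custom_hooks = [dict(type='MyVisualizationHook', interval=1000, priority='VERY_LOW')]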
def test_precise_bn():
    optimizer_cfg = dict(type='SGD',
                         lr=0.01,
                         momentum=0.9,
                         weight_decay=0.0001)

    test_dataset = ExampleDataset()
    loader = DataLoader(test_dataset, batch_size=2)
    model = ExampleModel()
    optimizer = build_optimizer(model, optimizer_cfg)
    logger = get_logger('precise_bn')
    runner = EpochBasedRunner(model=model,
                              batch_processor=None,
                              optimizer=optimizer,
                              logger=logger,
                              max_epochs=1)

    with pytest.raises(AssertionError):
        # num_samples must be larger than 0
        precise_bn_hook = PreciseBNHook(num_samples=-1)
        runner.register_hook(precise_bn_hook)
        runner.run([loader], [('train', 1)])

    with pytest.raises(AssertionError):
        # interval must be larger than 0
        precise_bn_hook = PreciseBNHook(interval=0)
        runner.register_hook(precise_bn_hook)
        runner.run([loader], [('train', 1)])

    with pytest.raises(AssertionError):
        # interval must be larger than 0
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  logger=logger,
                                  max_epochs=1)
        precise_bn_hook = PreciseBNHook(interval=0)
        runner.register_hook(precise_bn_hook)
        runner.run([loader], [('train', 1)])

    with pytest.raises(AssertionError):
        # only supports EpochBasedRunner
        runner = IterBasedRunner(model=model,
                                 batch_processor=None,
                                 optimizer=optimizer,
                                 logger=logger,
                                 max_epochs=1)
        precise_bn_hook = PreciseBNHook(interval=2)
        runner.register_hook(precise_bn_hook)
        print_log(runner)
        runner.run([loader], [('train', 1)])

    # test non-DDP model
    test_bigger_dataset = BiggerDataset()
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    loaders = [loader]
    precise_bn_hook = PreciseBNHook(num_samples=4)
    assert precise_bn_hook.num_samples == 4
    assert precise_bn_hook.interval == 1
    runner = EpochBasedRunner(model=model,
                              batch_processor=None,
                              optimizer=optimizer,
                              logger=logger,
                              max_epochs=1)
    runner.register_hook(precise_bn_hook)
    runner.run(loaders, [('train', 1)])

    # test DP model
    test_bigger_dataset = BiggerDataset()
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    loaders = [loader]
    precise_bn_hook = PreciseBNHook(num_samples=4)
    assert precise_bn_hook.num_samples == 4
    assert precise_bn_hook.interval == 1
    model = MMDataParallel(model)
    runner = EpochBasedRunner(model=model,
                              batch_processor=None,
                              optimizer=optimizer,
                              logger=logger,
                              max_epochs=1)
    runner.register_hook(precise_bn_hook)
    runner.run(loaders, [('train', 1)])

    # test model w/ gn layer
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    loaders = [loader]
    precise_bn_hook = PreciseBNHook(num_samples=4)
    assert precise_bn_hook.num_samples == 4
    assert precise_bn_hook.interval == 1
    model = GNExampleModel()
    runner = EpochBasedRunner(model=model,
                              batch_processor=None,
                              optimizer=optimizer,
                              logger=logger,
                              max_epochs=1)
    runner.register_hook(precise_bn_hook)
    runner.run(loaders, [('train', 1)])

    # test model without bn layer
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    loaders = [loader]
    precise_bn_hook = PreciseBNHook(num_samples=4)
    assert precise_bn_hook.num_samples == 4
    assert precise_bn_hook.interval == 1
    model = NoBNExampleModel()
    runner = EpochBasedRunner(model=model,
                              batch_processor=None,
                              optimizer=optimizer,
                              logger=logger,
                              max_epochs=1)
    runner.register_hook(precise_bn_hook)
    runner.run(loaders, [('train', 1)])

    # test how precise it is
    loader = DataLoader(test_bigger_dataset, batch_size=2)
    loaders = [loader]
    precise_bn_hook = PreciseBNHook(num_samples=12)
    assert precise_bn_hook.num_samples == 12
    assert precise_bn_hook.interval == 1
    model = SingleBNModel()
    runner = EpochBasedRunner(model=model,
                              batch_processor=None,
                              optimizer=optimizer,
                              logger=logger,
                              max_epochs=1)
    runner.register_hook(precise_bn_hook)
    runner.run(loaders, [('train', 1)])
    imgs_list = list()
    for loader in loaders:
        for i, data in enumerate(loader):
            imgs_list.append(np.array(data['imgs']))
    mean = np.mean([np.mean(batch) for batch in imgs_list])
    # Bessel's correction is used in PyTorch, hence ddof=1
    var = np.mean([np.var(batch, ddof=1) for batch in imgs_list])
    assert np.equal(mean, np.array(
        model.bn.running_mean)), (mean, np.array(model.bn.running_mean))
    assert np.equal(var, np.array(
        model.bn.running_var)), (var, np.array(model.bn.running_var))

    @pytest.mark.skipif(not torch.cuda.is_available(),
                        reason='requires CUDA support')
    def test_ddp_model_precise_bn():
        # test DDP model
        test_bigger_dataset = BiggerDataset()
        loader = DataLoader(test_bigger_dataset, batch_size=2)
        loaders = [loader]
        precise_bn_hook = PreciseBNHook(num_samples=5)
        assert precise_bn_hook.num_samples == 5
        assert precise_bn_hook.interval == 1
        model = ExampleModel()
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=True)
        runner = EpochBasedRunner(model=model,
                                  batch_processor=None,
                                  optimizer=optimizer,
                                  logger=logger,
                                  max_epochs=1)
        runner.register_hook(precise_bn_hook)
        runner.run(loaders, [('train', 1)])
Example #8
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    meta=None):

    # use batch size instead of per-gpu batch size
    if getattr(cfg.data, 'batch_size', False):
        cfg.data.imgs_per_gpu = int(cfg.data.batch_size)
        print_log(
            f"Using {cfg.data.imgs_per_gpu} per gpu for batch size {cfg.data.batch_size}")

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
            shuffle=True,
            replace=getattr(cfg.data, 'sampling_replace', False),
            seed=cfg.seed,
            drop_last=getattr(cfg.data, 'drop_last', False),
            prefetch=cfg.prefetch,
            img_norm_cfg=cfg.img_norm_cfg) for ds in dataset
    ]

    if 'use_fp16' in cfg and cfg.use_fp16:
        raise NotImplementedError('apex do not support non_dist_train!')
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    optimizer_config = NonDistOptimizerHook(**cfg.optimizer_config)

    if not cfg.get('by_iter', False):
        runner = Runner(
            model,
            optimizer=optimizer,
            work_dir=cfg.work_dir,
            logger=logger,
            meta=meta)
    else:
        runner = IterBasedRunner(
            model,
            optimizer=optimizer,
            work_dir=cfg.work_dir,
            logger=logger,
            meta=meta)

    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # register custom hooks
    for hook in cfg.get('custom_hooks', ()):
        if hook.type == 'DeepClusterHook':
            common_params = dict(dist_mode=False, data_loaders=data_loaders)
        else:
            common_params = dict(dist_mode=False)
        runner.register_hook(build_hook(hook, common_params))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    if not cfg.get('by_iter', False):
        runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
    else:
        runner.run(data_loaders, cfg.workflow, cfg.total_iters)
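A hedged cfg fragment for _non_dist_train above, showing the batch_size conversion and the by_iter switch between epoch- and iteration-based runners; all values are placeholders.

# illustrative cfg entries for _non_dist_train above; values are placeholders
data = dict(batch_size=32,              # converted to imgs_per_gpu by the function
            workers_per_gpu=4,
            sampling_replace=False,
            drop_last=False)
by_iter = False                         # False -> Runner + total_epochs
total_epochs = 200
# by_iter = True would select IterBasedRunner and use cfg.total_iters instead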
Example #9
def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    # use batch size instead of per-gpu batch size
    if getattr(cfg.data, 'batch_size', False):
        num_gpus = torch.cuda.device_count()
        cfg.data.imgs_per_gpu = int(cfg.data.batch_size // num_gpus)
        print_log(
            f"Using {cfg.data.imgs_per_gpu} per gpu for batch size {cfg.data.batch_size}")

    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            dist=True,
            shuffle=True,
            replace=getattr(cfg.data, 'sampling_replace', False),
            seed=cfg.seed,
            drop_last=getattr(cfg.data, 'drop_last', False),
            prefetch=cfg.prefetch,
            img_norm_cfg=cfg.img_norm_cfg) for ds in dataset
    ]
    optimizer = build_optimizer(model, cfg.optimizer)
    if 'use_fp16' in cfg and cfg.use_fp16:
        model, optimizer = apex.amp.initialize(
            model.cuda(), optimizer, opt_level="O1")
        print_log('**** Mixed precision initialization done. ****')

    # put model on gpus
    model = MMDistributedDataParallel(
        model if next(model.parameters()).is_cuda else model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False)

    # build runner
    if not cfg.get('by_iter', False):
        runner = Runner(
            model,
            optimizer=optimizer,
            work_dir=cfg.work_dir,
            logger=logger,
            meta=meta)
    else:
        runner = IterBasedRunner(
            model,
            optimizer=optimizer,
            work_dir=cfg.work_dir,
            logger=logger,
            meta=meta)

    # an ugly workaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    if not cfg.get('by_iter', False):
        runner.register_hook(DistSamplerSeedHook())
    # register custom hooks
    for hook in cfg.get('custom_hooks', ()):
        if hook.type == 'DeepClusterHook':
            common_params = dict(dist_mode=True, data_loaders=data_loaders)
        else:
            common_params = dict(dist_mode=True)
        runner.register_hook(build_hook(hook, common_params))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    if not cfg.get('by_iter', False):
        runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
    else:
        runner.run(data_loaders, cfg.workflow, cfg.total_iters)
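Finally, a hedged fragment for the apex mixed-precision path in the `_dist_train` above; apex must be installed separately, and the values are placeholders.

# illustrative cfg entries enabling the apex O1 path in _dist_train above
use_fp16 = True                         # triggers apex.amp.initialize(..., opt_level='O1')
optimizer = dict(type='SGD', lr=0.03, momentum=0.9, weight_decay=1e-4)
optimizer_config = dict()               # kwargs forwarded to DistOptimizerHook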