Ejemplo n.º 1
0
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    meta=None):

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         cfg.gpus,
                         dist=False,
                         shuffle=True,
                         replace=getattr(cfg.data, 'sampling_replace', False),
                         seed=cfg.seed,
                         drop_last=getattr(cfg.data, 'drop_last', False))
        for ds in dataset
    ]

    if 'use_fp16' in cfg and cfg.use_fp16 == True:
        raise NotImplementedError('apex do not support non_dist_train!')
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    optimizer_config = cfg.optimizer_config
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # register custom hooks
    for hook in cfg.get('custom_hooks', ()):
        priority = hook.pop('priority', None)
        if hook.type == 'DeepClusterHook':
            common_params = dict(dist_mode=False, data_loaders=data_loaders)
        else:
            common_params = dict(dist_mode=False)
        if priority is None:
            runner.register_hook(build_hook(hook, common_params))
        else:
            runner.register_hook(build_hook(hook, common_params), priority)

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Ejemplo n.º 2
0
def _dist_train(model, dataset, cfg, logger=None, timestamp=None, meta=None):
    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(ds,
                         cfg.data.imgs_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True,
                         shuffle=True,
                         replace=getattr(cfg.data, 'sampling_replace', False),
                         seed=cfg.seed,
                         drop_last=getattr(cfg.data, 'drop_last', False),
                         prefetch=cfg.prefetch,
                         img_norm_cfg=cfg.img_norm_cfg) for ds in dataset
    ]
    optimizer = build_optimizer(model, cfg.optimizer)
    if 'use_fp16' in cfg and cfg.use_fp16:
        model, optimizer = apex.amp.initialize(model.cuda(),
                                               optimizer,
                                               opt_level="O1")
        print_log('**** Initializing mixed precision done. ****')

    # put model on gpus
    model = MMDistributedDataParallel(
        model if next(model.parameters()).is_cuda else model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False)

    # build runner
    runner = Runner(model,
                    batch_processor,
                    optimizer,
                    cfg.work_dir,
                    logger=logger,
                    meta=meta)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())
    # register custom hooks
    for hook in cfg.get('custom_hooks', ()):
        if hook.type == 'DeepClusterHook':
            common_params = dict(dist_mode=True, data_loaders=data_loaders)
        else:
            common_params = dict(dist_mode=True)
        runner.register_hook(build_hook(hook, common_params))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Ejemplo n.º 3
0
def _non_dist_train(model,
                    dataset,
                    cfg,
                    validate=False,
                    logger=None,
                    timestamp=None,
                    meta=None):

    # use batch size instead of per-gpu batch size
    if getattr(cfg.data, 'batch_size', False):
        cfg.data.imgs_per_gpu = int(cfg.data.batch_size)
        print_log(
            f"Using {cfg.data.imgs_per_gpu} per gpu for batch size {cfg.data.batch_size}")

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.imgs_per_gpu,
            cfg.data.workers_per_gpu,
            cfg.gpus,
            dist=False,
            shuffle=True,
            replace=getattr(cfg.data, 'sampling_replace', False),
            seed=cfg.seed,
            drop_last=getattr(cfg.data, 'drop_last', False),
            prefetch=cfg.prefetch,
            img_norm_cfg=cfg.img_norm_cfg) for ds in dataset
    ]

    if 'use_fp16' in cfg and cfg.use_fp16 == True:
        raise NotImplementedError('apex do not support non_dist_train!')
    # put model on gpus
    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    optimizer_config = NonDistOptimizerHook(**cfg.optimizer_config)

    if not cfg.get('by_iter', False):
        runner = Runner(
            model,
            optimizer=optimizer,
            work_dir=cfg.work_dir,
            logger=logger,
            meta=meta)
    else:
        runner = IterBasedRunner(
            model,
            optimizer=optimizer,
            work_dir=cfg.work_dir,
            logger=logger,
            meta=meta)

    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)

    # register custom hooks
    for hook in cfg.get('custom_hooks', ()):
        if hook.type == 'DeepClusterHook':
            common_params = dict(dist_mode=False, data_loaders=data_loaders)
        else:
            common_params = dict(dist_mode=False)
        runner.register_hook(build_hook(hook, common_params))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    if not cfg.get('by_iter', False):
        runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
    else:
        runner.run(data_loaders, cfg.workflow, cfg.total_iters)