# Example 1
def _dist_train(model, dataset, cfg, logger, validate=False):
    """Train a model with distributed data parallel (one process per GPU).

    Args:
        model: Model to train; it is moved onto the current CUDA device.
        dataset: Dataset wrapped into a single distributed dataloader.
        cfg: Config providing ``data`` (videos/workers per GPU, ``val``),
            ``optimizer``, ``optimizer_config``, ``lr_config``,
            ``checkpoint_config``, ``log_config``, ``work_dir``,
            ``workflow``, ``total_epochs`` and optional
            ``resume_from`` / ``load_from`` paths.
        logger: Logger handed to the runner.
        validate (bool): If True, register a distributed eval hook chosen
            by the validation dataset type.
    """
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.videos_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True)
    ]

    # Put the model on GPUs.
    # NOTE: find_unused_parameters is intentionally forced to True — the
    # previous code read cfg.get('find_unused_parameters', False) and then
    # unconditionally overwrote the result, so the config lookup was dead.
    model = MMDistributedDataParallel(
        model.cuda(),
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=True)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(model, batch_processor, optimizer, cfg.work_dir,
                              logger)

    # register training hooks (lr schedule, optimizer step, checkpointing,
    # logging) plus the hook that reseeds the distributed sampler each epoch
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())

    # register eval hooks matching the validation dataset type
    if validate:
        if cfg.data.val.type in ['RawFramesDataset', 'VideoDataset']:
            runner.register_hook(
                DistEvalTopKAccuracyHook(cfg.data.val, k=(1, 5)))
        if cfg.data.val.type == 'AVADataset':
            runner.register_hook(AVADistEvalmAPHook(cfg.data.val))

    # resume takes precedence over loading pretrained weights
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
# Example 2
def _dist_train(model, dataset, cfg, validate=False):
    """Train a model with distributed data parallel (one process per GPU).

    Args:
        model: Model to train; it is moved onto the current CUDA device.
        dataset: Dataset wrapped into a single distributed dataloader.
        cfg: Config providing ``data``, ``optimizer``, ``optimizer_config``,
            ``lr_config``, ``checkpoint_config``, ``log_config``,
            ``work_dir``, ``log_level``, ``workflow``, ``total_epochs`` and
            optional ``resume_from`` / ``load_from`` paths.
        validate (bool): If True, register a distributed eval hook chosen
            by the validation dataset type.
    """
    # prepare data loaders
    data_loaders = [
        build_dataloader(dataset,
                         cfg.data.videos_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True)
    ]
    # put model on gpus
    model = MMDistributedDataParallel(model.cuda(),
                                      device_ids=[torch.cuda.current_device()])

    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)

    # register training hooks plus the distributed-sampler reseed hook
    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())

    # register eval hooks matching the validation dataset type
    # (removed a block of commented-out Coco/RPN eval-hook code left over
    # from a detection codebase — it was dead text, not reachable code)
    if validate:
        if cfg.data.val.type in ['RawFramesDataset', 'VideoDataset']:
            runner.register_hook(
                DistEvalTopKAccuracyHook(cfg.data.val, k=(1, 5)))
        if cfg.data.val.type == 'AVADataset':
            runner.register_hook(AVADistEvalmAPHook(cfg.data.val))

    # resume takes precedence over loading pretrained weights
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
# Example 3
def _dist_train(model,
                dataset,
                cfg,
                validate=False,
                logger=None,
                ignores=None):
    """Train a model with distributed data parallel (one process per GPU).

    Optionally loads pretrained weights before wrapping the model, scales
    the LR warm-up length from epochs to iterations, and registers eval
    hooks chosen by the validation dataset type.

    Args:
        model: Model to train; it is moved onto the current CUDA device.
        dataset: Dataset wrapped into a single distributed dataloader.
        cfg: Config providing data/optimizer/hook settings, ``work_dir``,
            ``log_level``, ``workflow``, ``total_epochs`` and optional
            ``resume_from`` / ``load_from`` paths.
        validate (bool): If True, register a distributed eval hook.
        logger: Logger forwarded to checkpoint loading.
        ignores: Forwarded to ``load_checkpoint`` to skip matching weights.
    """
    loaders = [
        build_dataloader(dataset,
                         cfg.data.videos_per_gpu,
                         cfg.data.workers_per_gpu,
                         dist=True)
    ]
    steps_per_epoch = len(loaders[0])

    # let the model know the schedule length if it tracks training state
    if hasattr(model, 'update_state'):
        model.update_state(steps_per_epoch)

    # load pretrained weights before DDP wrapping so every rank starts equal
    if cfg.load_from:
        load_checkpoint(model,
                        cfg.load_from,
                        strict=False,
                        logger=logger,
                        show_converted=True,
                        ignores=ignores)

        if getattr(cfg, 'model_partial_init', False):
            model.reset_weights()

    # put model on gpus
    model = MMDistributedDataParallel(model.cuda())

    # build runner
    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                    cfg.log_level)

    # Warm-up fix: when warmup_iters is expressed in epochs (by_epoch
    # defaults to True), convert it to an iteration count.
    # NOTE(review): this mutates cfg in place — presumably called once per
    # process; calling twice would double-scale. Confirm against callers.
    if hasattr(cfg.lr_config, 'warmup_iters') and \
            getattr(cfg.lr_config, 'by_epoch', True):
        cfg.lr_config.warmup_iters *= steps_per_epoch

    # register training hooks plus the distributed-sampler reseed hook
    runner.register_training_hooks(cfg.lr_config,
                                   DistOptimizerHook(**cfg.optimizer_config),
                                   cfg.checkpoint_config, cfg.log_config)
    runner.register_hook(DistSamplerSeedHook())

    # register eval hooks matching the validation dataset type
    if validate:
        eval_epoch = getattr(cfg, 'eval_epoch', 1)
        val_type = cfg.data.val.type
        if val_type in ('RawFramesDataset', 'StreamDataset', 'VideoDataset'):
            runner.register_hook(
                DistEvalTopKAccuracyHook(
                    cfg.data.val,
                    eval_epoch,
                    k=(1, 5),
                    num_valid_classes=cfg.data.num_test_classes))
        elif val_type == 'AVADataset':
            runner.register_hook(AVADistEvalmAPHook(cfg.data.val, eval_epoch))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)

    runner.run(loaders, cfg.workflow, cfg.total_epochs)