Esempio n. 1
0
def train_detector(model,
                   dataset,
                   cfg,
                   distributed=False,
                   validate=False,
                   timestamp=None,
                   meta=None):
    """Train a detector while optimizing only the ``fc_cls`` parameters.

    Builds one data loader per training dataset, wraps the model for
    (distributed) data-parallel execution, creates an SGD optimizer over
    the parameters whose name contains ``"fc_cls"``, registers the
    training / evaluation / custom hooks on an ``EpochBasedRunner`` and
    finally starts the run.

    Args:
        model: The model to train; it is moved to GPU and wrapped here.
        dataset: A single training dataset or a list/tuple of them.
        cfg: Config object. Read here: ``log_level``, ``data``,
            ``gpu_ids``, ``seed``, ``optimizer``, ``optimizer_config``,
            ``lr_config``, ``checkpoint_config``, ``log_config``,
            ``work_dir``, ``workflow``, ``total_epochs``, ``resume_from``,
            ``load_from`` and the optional keys ``fp16``,
            ``find_unused_parameters``, ``momentum_config``,
            ``evaluation`` and ``custom_hooks``.
        distributed (bool): Use the distributed wrapper, sampler-seed hook
            and distributed eval hook. Default: False.
        validate (bool): Register an evaluation hook built from
            ``cfg.data.val``. Default: False.
        timestamp: Stored on the runner so that the ``.log`` and
            ``.log.json`` files share a name. Default: None.
        meta: Passed through to the runner. Default: None.
    """
    logger = get_root_logger(cfg.log_level)

    # Always work with a sequence of datasets: one loader per dataset.
    if not isinstance(dataset, (list, tuple)):
        dataset = [dataset]

    # Legacy key: "imgs_per_gpu" overrides "samples_per_gpu" when present.
    if 'imgs_per_gpu' in cfg.data:
        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
                       'Please use "samples_per_gpu" instead')
        if 'samples_per_gpu' not in cfg.data:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f'{cfg.data.imgs_per_gpu} in this experiments')
        else:
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f'={cfg.data.imgs_per_gpu} is used in this experiments')
        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    data_loaders = []
    for train_set in dataset:
        loader = build_dataloader(
            train_set,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed)
        data_loaders.append(loader)

    # Move the model to GPU and wrap it for parallel execution.
    if not distributed:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)
    else:
        # Forwarded to torch.nn.parallel.DistributedDataParallel.
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)

    # Hand-built optimizer (instead of build_optimizer): only parameters
    # whose name contains "fc_cls" are handed to SGD.
    head_params = []
    for param_name, param in model.named_parameters():
        if "fc_cls" not in param_name:
            continue
        head_params.append(param)
        print("Parameter name:" + param_name)

    optimizer = torch.optim.SGD(head_params,
                                lr=cfg.optimizer['lr'],
                                momentum=cfg.optimizer['momentum'],
                                weight_decay=cfg.optimizer['weight_decay'])

    runner = EpochBasedRunner(model,
                              optimizer=optimizer,
                              work_dir=cfg.work_dir,
                              logger=logger,
                              meta=meta)
    # Workaround so that the .log and .log.json filenames are the same.
    runner.timestamp = timestamp

    # Select the optimizer hook: fp16 hook if requested, a plain
    # OptimizerHook for distributed runs without an explicit hook type,
    # otherwise the raw config is passed through.
    optimizer_config = cfg.optimizer_config
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**optimizer_config,
                                             **fp16_cfg,
                                             distributed=distributed)
    elif distributed and 'type' not in optimizer_config:
        optimizer_config = OptimizerHook(**optimizer_config)

    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # Evaluation hook.
    if validate:
        # "samples_per_gpu" is popped from the val config so it is not
        # passed on to build_dataset.
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            # Replace 'ImageToTensor' with 'DefaultFormatBundle' so that
            # validation supports batch_size > 1.
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        hook_cls = DistEvalHook if distributed else EvalHook
        runner.register_hook(hook_cls(val_dataloader, **eval_cfg))

    # User-defined hooks.
    custom_hooks = cfg.get('custom_hooks', None)
    if custom_hooks:
        assert isinstance(custom_hooks, list), \
            f'custom_hooks expect list type, but got {type(custom_hooks)}'
        for hook_cfg in custom_hooks:
            assert isinstance(hook_cfg, dict), \
                'Each item in custom_hooks expects dict type, but got ' \
                f'{type(hook_cfg)}'
            # Copy first so that popping "priority" leaves cfg untouched.
            hook_kwargs = hook_cfg.copy()
            priority = hook_kwargs.pop('priority', 'NORMAL')
            runner.register_hook(build_from_cfg(hook_kwargs, HOOKS),
                                 priority=priority)

    # Resuming takes precedence over loading plain weights.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Esempio n. 2
0
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """Train model entry function.

    Two optimizer setups are supported, selected by ``cfg.fewshot_training``:

    * few-shot: every parameter whose name contains ``'backbone'`` is
      frozen and SGD runs over the remaining trainable parameters with
      ``cfg.optimizer_fewshot_training.lr``;
    * otherwise: SGD with two parameter groups (backbone / cls_head) whose
      learning rates are ``cfg.optimizer_backbone_training.lr[0]`` and
      ``lr[1]`` respectively.

    ``cfg.omnisource`` and ``cfg.fewshot_training`` are optional and treated
    as ``False`` when missing.

    Args:
        model (nn.Module): The model to be trained.
        dataset (:obj:`Dataset`): Train dataset, or a list/tuple of them.
        cfg (dict): The config dict for training.
        distributed (bool): Whether to use distributed training.
            Default: False.
        validate (bool): Whether to do evaluation. Default: False.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None
    """
    logger = get_root_logger(log_level=cfg.log_level)

    # Optional feature flags: read once, tolerate configs that omit them.
    omnisource = cfg.get('omnisource', False)
    fewshot_training = cfg.get('fewshot_training', False)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    dataloader_setting = dict(videos_per_gpu=cfg.data.get('videos_per_gpu', 1),
                              workers_per_gpu=cfg.data.get(
                                  'workers_per_gpu', 1),
                              num_gpus=len(cfg.gpu_ids),
                              dist=distributed,
                              seed=cfg.seed)
    # Entries of cfg.data.train_dataloader override the defaults above.
    dataloader_setting = dict(dataloader_setting,
                              **cfg.data.get('train_dataloader', {}))

    if omnisource:
        train_ratio = cfg.data.get('train_ratio', [1] * len(dataset))
        # The option can override videos_per_gpu (one value per dataset).
        omni_videos_per_gpu = cfg.data.get('omni_videos_per_gpu', None)
        if omni_videos_per_gpu is None:
            dataloader_settings = [dataloader_setting] * len(dataset)
        else:
            dataloader_settings = []
            for videos_per_gpu in omni_videos_per_gpu:
                this_setting = cp.deepcopy(dataloader_setting)
                this_setting['videos_per_gpu'] = videos_per_gpu
                dataloader_settings.append(this_setting)
            if len(dataloader_settings) != len(dataset):
                # zip() below stops at the shorter sequence, so surplus
                # datasets (or settings) would be dropped without notice.
                logger.warning(
                    '"omni_videos_per_gpu" has %d entries but %d datasets '
                    'were given; only the first %d will be used',
                    len(dataloader_settings), len(dataset),
                    min(len(dataloader_settings), len(dataset)))
        data_loaders = [
            build_dataloader(ds, **setting)
            for ds, setting in zip(dataset, dataloader_settings)
        ]
    else:
        data_loaders = [
            build_dataloader(ds, **dataloader_setting) for ds in dataset
        ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)

    # build optimizer
    if fewshot_training:
        # Freeze the backbone, then optimize whatever is still trainable.
        for key, value in model.named_parameters():
            if 'backbone' in key:
                value.requires_grad = False
        optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                           model.parameters()),
                                    cfg.optimizer_fewshot_training.lr,
                                    momentum=0.9,
                                    weight_decay=0.0005)
    else:
        # Separate learning rates for the backbone and the cls_head.
        backbone_lr, head_lr = cfg.optimizer_backbone_training.lr[:2]
        param_group = [
            {
                'params': model.module.backbone.parameters(),
                'lr': backbone_lr
            },
            {
                'params': model.module.cls_head.parameters(),
                'lr': head_lr
            },
        ]
        optimizer = torch.optim.SGD(param_group,
                                    head_lr,
                                    momentum=0.9,
                                    weight_decay=0.0005)

    # build runner
    Runner = OmniSourceRunner if omnisource else EpochBasedRunner
    # The runner is created without an optimizer and the hand-built one is
    # attached afterwards, as in the original code.
    runner = Runner(model,
                    optimizer=None,
                    work_dir=cfg.work_dir,
                    logger=logger,
                    meta=meta)
    runner.optimizer = optimizer

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
                                             **fp16_cfg,
                                             distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        if omnisource:
            runner.register_hook(OmniSourceDistSamplerSeedHook())
        else:
            runner.register_hook(DistSamplerSeedHook())

    if validate:
        eval_cfg = cfg.get('evaluation', {})
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        # Own name for the validation settings so they are not confused
        # with the training loader settings built above.
        val_dataloader_setting = dict(
            videos_per_gpu=cfg.data.get('videos_per_gpu', 2),
            workers_per_gpu=cfg.data.get('workers_per_gpu', 0),
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            shuffle=False)
        val_dataloader_setting = dict(val_dataloader_setting,
                                      **cfg.data.get('val_dataloader', {}))
        val_dataloader = build_dataloader(val_dataset,
                                          **val_dataloader_setting)
        eval_hook = DistEpochEvalHook if distributed else EpochEvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # Resuming takes precedence over loading plain weights.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)

    # Only the omni-source path passes the per-dataset ratio to run().
    runner_kwargs = dict(train_ratio=train_ratio) if omnisource else dict()
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs, **runner_kwargs)
Esempio n. 3
0
def train_caption_model(model,
                        dataset,
                        cfg,
                        distributed=False,
                        validate=False,
                        timestamp=None,
                        meta=None):
    """Train a caption model with an AdamW optimizer.

    Trainable parameters are split into two groups by whether their name
    contains ``"backbone"``; the backbone group uses
    ``cfg.lr_dict.lr_backbone`` and the other group falls back to
    ``cfg.lr_dict.lr``. Non-distributed runs use ``TextGenerateRunner``
    (configured with the first dataset's tokenizer); distributed runs use
    ``EpochBasedRunner``.

    Args:
        model: The model to train; it is moved to GPU and wrapped here.
        dataset: A single training dataset or a list/tuple of them.
        cfg: Config object with the data, optimizer, hook and runtime
            settings read below.
        distributed (bool): Whether to train in distributed mode.
            Default: False.
        validate (bool): Whether to register an evaluation hook.
            Default: False.
        timestamp: Stored on the runner so that the ``.log`` and
            ``.log.json`` files share a name. Default: None.
        meta: Passed through to the runner. Default: None.
    """
    logger = get_root_logger(cfg.log_level)

    # One data loader per training dataset.
    if not isinstance(dataset, (list, tuple)):
        dataset = [dataset]

    data_loaders = []
    for train_set in dataset:
        loader = build_dataloader(
            train_set,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed)
        data_loaders.append(loader)

    # Move the model to GPU and wrap it for parallel execution.
    if not distributed:
        model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
                               device_ids=cfg.gpu_ids)
    else:
        # Defaults to True here; forwarded to
        # torch.nn.parallel.DistributedDataParallel.
        find_unused_parameters = cfg.get('find_unused_parameters', True)
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)

    # Report the number of trainable scalars.
    n_parameters = 0
    for param in model.parameters():
        if param.requires_grad:
            n_parameters += param.numel()
    print("N_PARAMETERS", n_parameters)
    print('--------------------------------------')

    # AdamW optimizer, built by hand.
    # TODO: implement this through build_optimizer.
    backbone_names, backbone_params = [], []
    other_names, other_params = [], []
    for param_name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if "backbone" in param_name:
            backbone_names.append(param_name)
            backbone_params.append(param)
        else:
            other_names.append(param_name)
            other_params.append(param)

    param_dicts = [
        {"names": backbone_names,
         "params": backbone_params,
         "lr": cfg.lr_dict.lr_backbone},
        {"names": other_names,
         "params": other_params},
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=cfg.lr_dict.lr,
                                  weight_decay=cfg.weight_decay)

    # distributed -> EpochBasedRunner
    # non-distributed -> TextGenerateRunner
    if distributed:
        runner = EpochBasedRunner(model,
                                  optimizer=optimizer,
                                  work_dir=cfg.work_dir,
                                  logger=logger,
                                  meta=meta)
    else:
        runner = TextGenerateRunner(model,
                                    optimizer=optimizer,
                                    work_dir=cfg.work_dir,
                                    logger=logger,
                                    meta=meta)
        # Generate a sentence for one sample every `log_config.interval`
        # batches (50 by default according to the original note).
        runner.set_gen_iter(cfg.log_config.interval)
        # Tokenizer used for the train-sample generation.
        runner.set_tokenizer(dataset[0].tokenizer)
        # Decoding method used for the train-sample generation.
        runner.set_decoding_cfg(cfg.train_cfg.decoding_cfg)

    # Workaround so that the .log and .log.json filenames are the same.
    # TODO: configure the time setting inside Docker.
    runner.timestamp = timestamp

    # Select the optimizer hook: fp16 hook if requested, a plain
    # OptimizerHook for distributed runs without an explicit hook type,
    # otherwise the raw config is passed through.
    optimizer_config = cfg.optimizer_config
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(**optimizer_config,
                                             **fp16_cfg,
                                             distributed=distributed)
    elif distributed and 'type' not in optimizer_config:
        optimizer_config = OptimizerHook(**optimizer_config)

    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # Evaluation hook.
    if validate:
        # TODO: support batch_size > 1 in validation.
        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
        if val_samples_per_gpu > 1:
            # Replace 'ImageToTensor' with 'DefaultFormatBundle'.
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline)
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        hook_cls = DistEvalHook if distributed else EvalHook
        runner.register_hook(hook_cls(val_dataloader, **eval_cfg))

    # Resuming takes precedence over loading plain weights.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
Esempio n. 4
0
def main():
    """Command-line entry point: train a detector, or run a diagnostic mode.

    Steps, in order:

    1. Parse CLI arguments, load the config file and apply CLI overrides
       (work dir, workers, resume checkpoint, GPU count, ...).
    2. Initialise the (optional) distributed environment and scale the
       learning rate by the number of GPUs and images per GPU.
    3. Set up logging and random seeds, then build the detector.
    4. Either launch regular training through ``train_detector`` or, when
       ``--graph`` / ``--profiler`` / ``--speed`` is given, iterate over a
       data loader to dump the autograd graph, a profiler trace or
       per-iteration timings.
    """
    args = parse_args()

    cfg = Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    # update configs according to CLI args
    if args.dir is not None:
        if args.dir.startswith('//'):
            # A leading '//' means: use the remainder verbatim as work dir.
            cfg.work_dir = args.dir[2:]
        else:
            localhost = get_localhost().split('.')[0]
            # results from server saved to /private
            if 'gpu' in localhost:
                output_dir = '/private/huangchenxi/mmdet/outputs'
            else:
                output_dir = 'work_dirs'

            # A trailing '-c' requests resuming from a checkpoint located
            # by search_and_delete inside the task directory.
            if args.dir.endswith('-c'):
                args.dir = args.dir[:-2]
                args.resume_from = search_and_delete(
                    os.path.join(output_dir, args.dir),
                    prefix=cfg.work_dir,
                    suffix=localhost)
            # Make the run directory unique per start time and host.
            cfg.work_dir += time.strftime("_%m%d_%H%M") + '_' + localhost
            cfg.work_dir = os.path.join(output_dir, args.dir, cfg.work_dir)

    if args.workers_per_gpu != -1:
        cfg.data['workers_per_gpu'] = args.workers_per_gpu

    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    cfg.gpus = args.gpus

    # Diagnostic modes always run with a batch of one image per GPU.
    if args.profiler or args.speed:
        cfg.data.imgs_per_gpu = 1

    # Weights come from the checkpoint, so skip the pretrained backbone.
    if cfg.resume_from or cfg.load_from:
        cfg.model['pretrained'] = None

    # Test mode trains on the validation annotations/images.
    if args.test:
        cfg.data.train['ann_file'] = cfg.data.val['ann_file']
        cfg.data.train['img_prefix'] = cfg.data.val['img_prefix']

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
        num_gpus = args.gpus
        rank = 0
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)
        # NOTE(review): device_count() is the number of GPUs visible to
        # this process, not the world size -- confirm this is the intended
        # factor for multi-node runs.
        num_gpus = torch.cuda.device_count()
        rank, _ = get_dist_info()

    # Scale the learning rate with the effective batch size.
    if cfg.optimizer['type'] == 'SGD':
        cfg.optimizer['lr'] *= num_gpus * cfg.data.imgs_per_gpu / 256
    else:
        cfg.optimizer['lr'] *= ((num_gpus / 8) * (cfg.data.imgs_per_gpu / 2))

    # init logger before other steps
    logger = get_root_logger(nlogger, cfg.log_level)
    if rank == 0:
        logger.set_logger_dir(cfg.work_dir, 'd')
    logger.info("Config: ------------------------------------------\n" +
                cfg.text)
    logger.info('Distributed training: {}'.format(distributed))

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}'.format(args.seed))
        set_random_seed(args.seed)

    model = build_detector(cfg.model,
                           train_cfg=cfg.train_cfg,
                           test_cfg=cfg.test_cfg)
    if rank == 0:
        # Only the main process creates the summary writer.
        writer = set_writer(cfg.work_dir)

    if not args.graph and not args.profiler and not args.speed:
        # ---- regular training ----
        if distributed:
            model = MMDistributedDataParallel(model.cuda())
        else:
            model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()

        if isinstance(cfg.data.train, list):
            for t in cfg.data.train:
                logger.info("loading training set: " + str(t.ann_file))
            train_dataset = [build_dataset(t) for t in cfg.data.train]
            CLASSES = train_dataset[0].CLASSES
        else:
            logger.info("loading training set: " +
                        str(cfg.data.train.ann_file))
            train_dataset = build_dataset(cfg.data.train)
            logger.info("{} images loaded!".format(len(train_dataset)))
            CLASSES = train_dataset.CLASSES
        if cfg.checkpoint_config is not None:
            # save mmdet version, config file content and class names in
            # checkpoints as meta data
            cfg.checkpoint_config.meta = dict(mmdet_version=__version__,
                                              config=cfg.text,
                                              CLASSES=CLASSES)
        # add an attribute for visualization convenience
        if hasattr(model, 'module'):
            model.module.CLASSES = CLASSES
        else:
            model.CLASSES = CLASSES
        train_detector(model,
                       train_dataset,
                       cfg,
                       distributed=distributed,
                       validate=args.validate,
                       logger=logger,
                       runner_attr_dict={'task_name': args.dir})
    else:
        # ---- diagnostic modes: graph dump / profiler / speed test ----
        from mmcv.runner.checkpoint import load_checkpoint
        from mmdet.datasets import build_dataloader
        from mmdet.core.utils.model_utils import register_hooks
        from mmdet.apis.train import parse_losses

        model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
        if args.profiler == 'test' or args.speed == 'test':
            model.eval()
            dataset = build_dataset(cfg.data.test)
        else:
            model.train()
            dataset = build_dataset(cfg.data.train)

        if cfg.load_from and (args.profiler or args.speed):
            logger.info('load checkpoint from %s', cfg.load_from)
            load_checkpoint(model,
                            cfg.load_from,
                            map_location='cpu',
                            strict=True)

        data_loader = build_dataloader(dataset,
                                       cfg.data.imgs_per_gpu,
                                       cfg.data.workers_per_gpu,
                                       cfg.gpus,
                                       dist=False,
                                       shuffle=False)

        if args.graph:
            # Map parameter object ids to names for register_hooks.
            id_dict = {}
            for name, parameter in model.named_parameters():
                id_dict[id(parameter)] = name

        for i, data_batch in enumerate(data_loader):
            if args.graph:
                # One forward/backward pass is enough to dump the graph.
                outputs = model(**data_batch)
                loss, log_vars = parse_losses(outputs)
                get_dot = register_hooks(loss, id_dict)
                loss.backward()
                dot = get_dot()
                dot.save('graph.dot')
                break
            elif args.profiler:
                with torch.autograd.profiler.profile(use_cuda=True) as prof:
                    if args.profiler == 'train':
                        outputs = model(**data_batch)
                        loss, log_vars = parse_losses(outputs)
                        loss.backward()
                    else:
                        with torch.no_grad():
                            model(**data_batch, return_loss=False)

                # The profiler's events only become available once the
                # context has exited, so export and print after the `with`
                # block. Earlier iterations serve as warm-up; the trace of
                # iteration 20 is the one that is kept.
                if i == 20:
                    prof.export_chrome_trace('./trace.json')
                    logger.info(prof)
                    break
            elif args.speed:
                # CUDA kernels are launched asynchronously: synchronize
                # before starting and before stopping the clock so that the
                # measurement covers exactly this iteration's GPU work.
                torch.cuda.synchronize()
                start = time.perf_counter()
                if args.speed == 'train':
                    outputs = model(**data_batch)
                    loss, log_vars = parse_losses(outputs)
                    loss.backward()
                else:
                    with torch.no_grad():
                        model(**data_batch, return_loss=False)
                torch.cuda.synchronize()
                end = time.perf_counter()
                logger.info("{:.3f} s/iter, {:.1f} iters/s".format(
                    end - start, 1. / (end - start)))