# NOTE: the examples below call project-local helpers that are not shown here
# (main_utils, eval_utils, run_phase, build_dataloaders) and rely on the usual
# imports for the other names they use, e.g. `import yaml` and
# `import torch.backends.cudnn as cudnn`.

######################### Example 1 #########################
def main(gpu, ngpus, args, cfg):
    args.gpu = gpu
    args.world_size = ngpus

    # Prepare for training
    model_cfg, eval_dir, logger = eval_utils.prepare_environment(args, cfg)
    model, ckp_manager = eval_utils.build_model(model_cfg, cfg['model'],
                                                eval_dir, args, logger)
    optimizer, scheduler = main_utils.build_optimizer(model.parameters(),
                                                      cfg['optimizer'], logger)
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], cfg['num_workers'], False, logger)

    # Optionally resume from a checkpoint
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if cfg['resume'] or cfg['test_only']:
        start_epoch = ckp_manager.restore(model,
                                          optimizer,
                                          scheduler,
                                          restore_last=True,
                                          logger=logger)

    ######################### TRAINING #########################
    if not cfg['test_only']:
        logger.add_line("=" * 30 + "   Training   " + "=" * 30)
        for epoch in range(start_epoch, end_epoch):
            train_loader.dataset.shuffle_dataset()
            test_loader.dataset.shuffle_dataset()

            logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
            logger.add_line('LR: {}'.format(scheduler.get_lr()))
            run_phase('train', train_loader, model, optimizer, epoch, args,
                      cfg, logger)
            top1, _ = run_phase('test', test_loader, model, None, epoch, args,
                                cfg, logger)
            ckp_manager.save(model,
                             optimizer,
                             scheduler,
                             epoch,
                             eval_metric=top1)
            scheduler.step()

    ######################### TESTING #########################
    logger.add_line('\n' + '=' * 30 + ' Final evaluation ' + '=' * 30)
    cfg['dataset']['test']['clips_per_video'] = 25
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], cfg['num_workers'], False, logger)
    top1, top5 = run_phase('test', test_loader, model, None, end_epoch, args,
                           cfg, logger)
    top1_dense, top5_dense = run_phase('test_dense', dense_loader, model, None,
                                       end_epoch, args, cfg, logger)

    ######################### LOG RESULTS #########################
    logger.add_line('\n' + '=' * 30 + ' Evaluation done ' + '=' * 30)
    logger.add_line('Clip@1: {:6.2f}'.format(top1))
    logger.add_line('Clip@5: {:6.2f}'.format(top5))
    logger.add_line('Video@1: {:6.2f}'.format(top1_dense))
    logger.add_line('Video@5: {:6.2f}'.format(top5_dense))
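

# Every entry point in this listing follows the (gpu, ngpus, args, cfg) calling
# convention of torch.multiprocessing.spawn, which runs fn(i, *args) once per
# process with i = 0..nprocs-1. The launcher below is NOT part of the original
# snippets; it is a minimal sketch of how such a worker is typically spawned,
# e.g. _spawn_workers(main, args, cfg).
def _spawn_workers(worker_fn, args, cfg):
    import torch
    import torch.multiprocessing as mp

    ngpus = torch.cuda.device_count()
    # One process per visible GPU; each process receives its local rank as
    # `gpu`, followed by the remaining tuple (ngpus, args, cfg).
    mp.spawn(worker_fn, args=(ngpus, args, cfg), nprocs=ngpus)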


######################### Example 2 #########################
def main_worker(gpu, ngpus_per_node, args, cfg):
    args.gpu = gpu

    # Setup environment
    args = main_utils.initialize_distributed_backend(args, ngpus_per_node)
    logger, tb_writter, model_dir = main_utils.prep_environment(args, cfg)

    # Define model
    model = main_utils.build_model(cfg['model'], logger)
    model, args, cfg['dataset']['batch_size'], cfg['num_workers'] = \
        main_utils.distribute_model_to_cuda(model, args,
                                            cfg['dataset']['batch_size'],
                                            cfg['num_workers'],
                                            ngpus_per_node)

    # Define dataloaders
    train_loader, test_loader = main_utils.build_dataloaders(cfg['dataset'], cfg['num_workers'], args.distributed, logger)

    # Define criterion
    train_criterion = main_utils.build_criterion(cfg['loss'], logger=logger).cuda(gpu)

    # Define optimizer
    optimizer, scheduler = main_utils.build_optimizer(
        params=list(model.parameters()) + list(train_criterion.parameters()),
        cfg=cfg['optimizer'],
        logger=logger)
    ckp_manager = main_utils.CheckpointManager(model_dir, rank=args.rank)

    # Optionally resume from a checkpoint
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if cfg['resume']:
        if ckp_manager.checkpoint_exists(last=True):
            start_epoch = ckp_manager.restore(restore_last=True, model=model, optimizer=optimizer, train_criterion=train_criterion)
            scheduler.step(start_epoch)
            logger.add_line("Checkpoint loaded: '{}' (epoch {})".format(ckp_manager.last_checkpoint_fn(), start_epoch))
        else:
            logger.add_line("No checkpoint found at '{}'".format(ckp_manager.last_checkpoint_fn()))

    cudnn.benchmark = True

    ############################ TRAIN #########################################
    for epoch in range(start_epoch, end_epoch):
        if epoch in cfg['optimizer']['lr']['milestones']:
            ckp_manager.save(epoch, model=model, train_criterion=train_criterion, optimizer=optimizer, filename='checkpoint-ep{}.pth.tar'.format(epoch))
        if args.distributed:
            train_loader.sampler.set_epoch(epoch)
        train_loader.dataset.shuffle_dataset()

        # Train for one epoch
        logger.add_line('='*30 + ' Epoch {} '.format(epoch) + '='*30)
        logger.add_line('LR: {}'.format(scheduler.get_lr()[0]))
        run_phase('train', train_loader, model, optimizer, train_criterion, epoch, args, cfg, logger, tb_writter)
        run_phase('test', test_loader, model, optimizer, train_criterion, epoch, args, cfg, logger, tb_writter)
        ckp_manager.save(epoch+1, model=model, optimizer=optimizer, train_criterion=train_criterion)
        scheduler.step()
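

# What main_utils.CheckpointManager does internally is project-specific and is
# not shown in these snippets. The class below is only a sketch of the
# interface Example 2 relies on (keyword-module save/restore, checkpoint_exists,
# last_checkpoint_fn), under the assumption that it round-trips state_dicts via
# torch.save / torch.load; it is not the project's implementation.
import os

import torch


class _CheckpointManagerSketch:
    def __init__(self, checkpoint_dir, rank=0):
        os.makedirs(checkpoint_dir, exist_ok=True)
        self.checkpoint_dir = checkpoint_dir
        self.rank = rank  # in DDP, only rank 0 should write checkpoints

    def last_checkpoint_fn(self):
        return os.path.join(self.checkpoint_dir, 'checkpoint.pth.tar')

    def checkpoint_exists(self, last=True):
        return os.path.isfile(self.last_checkpoint_fn())

    def save(self, epoch, filename=None, **modules):
        if self.rank != 0:
            return
        # Store the epoch plus the state_dict of every module passed by keyword
        # (model, optimizer, train_criterion, ...).
        state = {'epoch': epoch}
        state.update({name: m.state_dict() for name, m in modules.items()})
        fn = (self.last_checkpoint_fn() if filename is None
              else os.path.join(self.checkpoint_dir, filename))
        torch.save(state, fn)

    def restore(self, restore_last=True, **modules):
        # Load on CPU, copy the saved state into each passed module, and return
        # the epoch to resume from.
        state = torch.load(self.last_checkpoint_fn(), map_location='cpu')
        for name, m in modules.items():
            m.load_state_dict(state[name])
        return state['epoch']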


######################### Example 3 #########################
def main_worker(gpu, ngpus, fold, args, cfg):
    args.gpu = gpu
    args.world_size = ngpus

    # Prepare folder and logger
    eval_dir, model_cfg, logger = eval_utils.prepare_environment(
        args, cfg, fold)

    # Model
    model, ckp_manager = eval_utils.build_model(model_cfg, cfg, eval_dir, args,
                                                logger)

    # Optimizer
    optimizer, scheduler = main_utils.build_optimizer(model.parameters(),
                                                      cfg['optimizer'], logger)

    # Datasets
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)

    ################################ Train ################################
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if (cfg['resume']
            or args.test_only) and ckp_manager.checkpoint_exists(last=True):
        start_epoch = ckp_manager.restore(model,
                                          optimizer,
                                          scheduler,
                                          restore_last=True)
        logger.add_line("Loaded checkpoint '{}' (epoch {})".format(
            ckp_manager.last_checkpoint_fn(), start_epoch))

    if not cfg['test_only']:
        logger.add_line("=" * 30 + "   Training   " + "=" * 30)
        for epoch in range(start_epoch, end_epoch):
            scheduler.step(epoch=epoch)
            if args.distributed:
                train_loader.sampler.set_epoch(epoch)
                test_loader.sampler.set_epoch(epoch)

            logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
            logger.add_line('LR: {}'.format(scheduler.get_lr()))
            run_phase('train', train_loader, model, optimizer, epoch, args,
                      cfg, logger)
            run_phase('test', test_loader, model, None, epoch, args, cfg,
                      logger)
            ckp_manager.save(model, optimizer, scheduler, epoch)

    ################################ Eval ################################
    logger.add_line('\n' + '=' * 30 + ' Final evaluation ' + '=' * 30)
    cfg['dataset']['test']['clips_per_video'] = 25
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)
    top1_dense, top5_dense = run_phase('test_dense', dense_loader, model, None,
                                       end_epoch, args, cfg, logger)
    top1, top5 = run_phase('test', test_loader, model, None, end_epoch, args,
                           cfg, logger)

    logger.add_line('\n' + '=' * 30 + ' Evaluation done ' + '=' * 30)
    for ft in top1:
        logger.add_line('')
        logger.add_line('[{}] Clip@1: {:6.2f}'.format(ft, top1[ft]))
        logger.add_line('[{}] Clip@5: {:6.2f}'.format(ft, top5[ft]))
        logger.add_line('[{}] Video@1: {:6.2f}'.format(ft, top1_dense[ft]))
        logger.add_line('[{}] Video@5: {:6.2f}'.format(ft, top5_dense[ft]))
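

# In the evaluation block above, 'test' reports clip-level accuracy (Clip@1/@5)
# while 'test_dense' reports video-level accuracy (Video@1/@5): the predictions
# of the 25 clips sampled from each video are aggregated before the top-k
# comparison. The helper below is NOT the project's run_phase; it is only a
# sketch of that aggregation step, assuming clip logits of shape
# (num_clips, num_classes) for one video and softmax-averaging as the pooling.
def _video_level_prediction(clip_logits):
    import torch

    # Average the per-clip class probabilities, then pick the video-level class.
    probs = torch.softmax(clip_logits, dim=1).mean(dim=0)
    return probs.argmax().item()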


######################### Example 4 #########################
def main_worker(gpu, ngpus, args, cfg):
    args.gpu = gpu
    ngpus_per_node = ngpus

    # Setup environment
    args = main_utils.initialize_distributed_backend(
        args, ngpus_per_node)  ### Use other method instead
    logger, tb_writter, model_dir = main_utils.prep_environment(args, cfg)

    # Define model
    model = main_utils.build_model(cfg['model'], logger)
    model, args = main_utils.distribute_model_to_cuda(model, args)

    # Define dataloaders
    train_loader = main_utils.build_dataloaders(
        cfg['dataset'], cfg['num_workers'], args.multiprocessing_distributed,
        logger)

    # Define criterion
    train_criterion = main_utils.build_criterion(cfg['loss'], logger=logger)
    train_criterion = train_criterion.cuda()

    # Define optimizer
    optimizer, scheduler = main_utils.build_optimizer(
        params=list(model.parameters()) + list(train_criterion.parameters()),
        cfg=cfg['optimizer'],
        logger=logger)
    ckp_manager = main_utils.CheckpointManager(
        model_dir, rank=args.rank, dist=args.multiprocessing_distributed)

    # Optionally resume from a checkpoint
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if cfg['resume']:
        if ckp_manager.checkpoint_exists(last=True):
            start_epoch = ckp_manager.restore(restore_last=True,
                                              model=model,
                                              optimizer=optimizer,
                                              train_criterion=train_criterion)
            scheduler.step(start_epoch)
            logger.add_line("Checkpoint loaded: '{}' (epoch {})".format(
                ckp_manager.last_checkpoint_fn(), start_epoch))
        else:
            logger.add_line("No checkpoint found at '{}'".format(
                ckp_manager.last_checkpoint_fn()))

    cudnn.benchmark = True

    ############################ TRAIN #########################################
    test_freq = cfg.get('test_freq', 1)
    for epoch in range(start_epoch, end_epoch):
        if (epoch % 10) == 0:
            ckp_manager.save(epoch,
                             model=model,
                             train_criterion=train_criterion,
                             optimizer=optimizer,
                             filename='checkpoint-ep{}.pth.tar'.format(epoch))

        if args.multiprocessing_distributed:
            train_loader.sampler.set_epoch(epoch)

        # Train for one epoch
        logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
        logger.add_line('LR: {}'.format(scheduler.get_lr()))
        run_phase('train', train_loader, model, optimizer, train_criterion,
                  epoch, args, cfg, logger, tb_writter)
        scheduler.step(epoch)

        if ((epoch % test_freq) == 0) or (epoch == end_epoch - 1):
            ckp_manager.save(epoch + 1,
                             model=model,
                             optimizer=optimizer,
                             train_criterion=train_criterion)


######################### Example 5 #########################
def main(gpu, ngpus, args, cfg):
    args.gpu = gpu
    args.world_size = ngpus

    # Prepare for training
    model_cfg, eval_dir, logger = eval_utils.prepare_environment(args, cfg)
    cfg.setdefault('scratch', False)
    cfg.setdefault('ft_all', False)
    model, ckp_manager, ckp = eval_utils.build_model(model_cfg,
                                                     cfg['model'],
                                                     eval_dir,
                                                     args,
                                                     logger,
                                                     return_ckp=True,
                                                     scratch=cfg['scratch'])
    params = list(model.parameters()) if cfg['ft_all'] else model.head_params()

    if cfg['use_transf'] != 'none':
        with open(args.model_cfg) as f:
            loss_cfg = yaml.safe_load(f)['loss']
        align_criterion = main_utils.build_criterion(loss_cfg,
                                                     logger=logger).cuda(gpu)
        align_criterion.load_state_dict(ckp['train_criterion'])
        if type(align_criterion).__name__ == 'MultiTask':
            # Keep only the first (alignment) loss of the multi-task criterion
            align_criterion = align_criterion.losses[0]
        if cfg['ft_all']:
            params += list(align_criterion.parameters())
    else:
        align_criterion = None

    optimizer, scheduler = main_utils.build_optimizer(params, cfg['optimizer'],
                                                      logger)
    train_loader, test_loader = build_dataloaders(cfg['dataset'],
                                                  cfg['num_workers'],
                                                  args.distributed, logger)

    # Optionally resume from a checkpoint
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if 'resume' in cfg:
        args.resume = cfg['resume']
    if 'test_only' in cfg:
        args.test_only = cfg['test_only']
    if args.resume or args.test_only:
        start_epoch = ckp_manager.restore(model,
                                          optimizer,
                                          scheduler,
                                          restore_last=True,
                                          logger=logger)

    ######################### TRAINING #########################
    if not args.test_only:
        logger.add_line("=" * 30 + "   Training   " + "=" * 30)
        for epoch in range(start_epoch, end_epoch):
            if args.distributed:
                train_loader.sampler.set_epoch(epoch)
                test_loader.sampler.set_epoch(epoch)
            train_loader.dataset.shuffle_dataset()
            test_loader.dataset.shuffle_dataset()

            logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
            logger.add_line('LR: {}'.format(scheduler.get_lr()))
            run_phase('train', train_loader, model, optimizer, epoch, args,
                      cfg, logger, align_criterion)
            top1 = run_phase('test', test_loader, model, None, epoch, args,
                             cfg, logger, align_criterion)
            ckp_manager.save(model,
                             optimizer,
                             scheduler,
                             epoch,
                             criterion=align_criterion,
                             eval_metric=top1)
            scheduler.step()

    ######################### TESTING #########################
    logger.add_line('\n' + '=' * 30 + ' Final evaluation ' + '=' * 30)
    top1 = run_phase('test', test_loader, model, None, end_epoch, args, cfg,
                     logger, align_criterion)

    ######################### LOG RESULTS #########################
    logger.add_line('\n' + '=' * 30 + ' Evaluation done ' + '=' * 30)
    logger.add_line('Clip@1: {:6.2f}'.format(top1))
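

# Example 5 optimizes either all parameters (ft_all) or only the classifier
# head via model.head_params(). What head_params() returns is project-specific;
# the helper below is a sketch of that selection under the assumption that the
# frozen backbone parameters are named 'feature_extractor.*', the convention
# Example 6 uses when building its warmup optimizer.
def _head_params(model, freeze_backbone=True):
    head = []
    for name, p in model.named_parameters():
        if 'feature_extractor' in name:
            if freeze_backbone:
                p.requires_grad = False  # linear-probe style: backbone stays fixed
        else:
            head.append(p)
    return head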


######################### Example 6 #########################
def main_worker(gpu, ngpus, fold, args, cfg):
    args.gpu = gpu
    args.world_size = ngpus

    # Prepare folder and logger
    eval_dir, model_cfg, logger = eval_utils.prepare_environment(
        args, cfg, fold)

    # Model
    model, ckp_manager = eval_utils.build_model(model_cfg, cfg, eval_dir, args,
                                                logger)

    # Optimizer
    optimizer, scheduler = main_utils.build_optimizer(model.parameters(),
                                                      cfg['optimizer'], logger)

    # Datasets
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)

    ################################ Train ################################
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if cfg['resume'] and ckp_manager.checkpoint_exists(last=True):
        start_epoch = ckp_manager.restore(model,
                                          optimizer,
                                          scheduler,
                                          restore_last=True)
        logger.add_line("Loaded checkpoint '{}' (epoch {})".format(
            ckp_manager.last_checkpoint_fn(), start_epoch))

    if not cfg['test_only']:
        logger.add_line("=" * 30 + "   Training   " + "=" * 30)

        # Warmup. Train classifier for a few epochs.
        if start_epoch == 0 and cfg['optimizer'].get('warmup_classifier', False):
            n_wu_epochs = cfg['optimizer'].get('warmup_epochs', 5)
            cls_opt, _ = main_utils.build_optimizer(
                params=[p for n, p in model.named_parameters()
                        if 'feature_extractor' not in n],
                cfg={
                    'name': cfg['optimizer']['name'],
                    'weight_decay': cfg['optimizer']['weight_decay'],
                    'lr': {
                        'base_lr': cfg['optimizer']['lr']['base_lr'],
                        'milestones': [n_wu_epochs],
                        'gamma': 1.,
                    },
                })
            for epoch in range(n_wu_epochs):
                run_phase('train', train_loader, model, cls_opt, epoch, args,
                          cfg, logger)
                top1, _ = run_phase('test', test_loader, model, None, epoch,
                                    args, cfg, logger)

        # Main training loop
        for epoch in range(start_epoch, end_epoch):
            scheduler.step(epoch=epoch)
            if args.distributed:
                train_loader.sampler.set_epoch(epoch)
                test_loader.sampler.set_epoch(epoch)

            logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
            logger.add_line('LR: {}'.format(scheduler.get_lr()))
            run_phase('train', train_loader, model, optimizer, epoch, args,
                      cfg, logger)
            top1, _ = run_phase('test', test_loader, model, None, epoch, args,
                                cfg, logger)
            ckp_manager.save(model,
                             optimizer,
                             scheduler,
                             epoch,
                             eval_metric=top1)

    ################################ Eval ################################
    logger.add_line('\n' + '=' * 30 + ' Final evaluation ' + '=' * 30)
    # Evaluate clip-level predictions with 25 clips per video for metric stability
    cfg['dataset']['test']['clips_per_video'] = 25
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)
    top1, top5 = run_phase('test', test_loader, model, None, end_epoch, args,
                           cfg, logger)
    top1_dense, top5_dense = run_phase('test_dense', dense_loader, model, None,
                                       end_epoch, args, cfg, logger)

    logger.add_line('\n' + '=' * 30 + ' Evaluation done ' + '=' * 30)
    logger.add_line('Clip@1: {:6.2f}'.format(top1))
    logger.add_line('Clip@5: {:6.2f}'.format(top5))
    logger.add_line('Video@1: {:6.2f}'.format(top1_dense))
    logger.add_line('Video@5: {:6.2f}'.format(top5_dense))
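

# main_utils.build_optimizer is not shown in these snippets. The sketch below
# only illustrates an optimizer/scheduler pair that is consistent with the
# config keys the examples read (name, weight_decay, lr.base_lr, lr.milestones,
# lr.gamma) and with the scheduler.step()/get_lr() calls above; the choice of
# SGD with momentum 0.9 as the fallback is an assumption.
def _build_optimizer_sketch(params, cfg):
    import torch

    if cfg['name'].lower() == 'adam':
        optimizer = torch.optim.Adam(params, lr=cfg['lr']['base_lr'],
                                     weight_decay=cfg['weight_decay'])
    else:
        optimizer = torch.optim.SGD(params, lr=cfg['lr']['base_lr'],
                                    momentum=0.9,
                                    weight_decay=cfg['weight_decay'])
    # Step-wise learning-rate decay at the configured epoch milestones.
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=cfg['lr']['milestones'], gamma=cfg['lr']['gamma'])
    return optimizer, scheduler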