Example #1
def print_dict(d, ident=''):
    # `logger` comes from the enclosing scope (see prepare_environment in Example #2).
    for k in d:
        if isinstance(d[k], dict):
            logger.add_line("{}{}".format(ident, k))
            print_dict(d[k], ident='  ' + ident)
        else:
            logger.add_line("{}{}: {}".format(ident, k, str(d[k])))
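# A minimal usage sketch of print_dict. The SimpleLogger stand-in below is
# hypothetical; the repo's actual logger is utils.logger.Logger (see Example #2).
class SimpleLogger:
    def add_line(self, line):
        print(line)

logger = SimpleLogger()
print_dict({'optimizer': {'name': 'sgd', 'lr': {'base_lr': 0.01}}, 'num_epochs': 10})
# optimizer
#   name: sgd
#   lr
#     base_lr: 0.01
# num_epochs: 10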
Example #2
def prepare_environment(args, cfg, fold):
    if args.distributed:
        while True:
            try:
                dist.init_process_group(
                    backend='nccl',
                    init_method='tcp://localhost:{}'.format(args.port),
                    world_size=args.world_size,
                    rank=args.gpu)
                break
            except RuntimeError:
                args.port = str(int(args.port) + 1)

    with open(args.model_cfg) as f:
        model_cfg = yaml.safe_load(f)['model']
    eval_dir = '{}/{}/eval-{}/fold-{:02d}'.format(model_cfg['model_dir'],
                                                  model_cfg['name'],
                                                  cfg['benchmark']['name'],
                                                  fold)
    os.makedirs(eval_dir, exist_ok=True)
    with open('{}/config.yaml'.format(eval_dir), 'w') as f:
        yaml.safe_dump(cfg, f)

    logger = utils.logger.Logger(quiet=args.quiet,
                                 log_fn='{}/eval.log'.format(eval_dir),
                                 rank=args.gpu)
    if any('SLURM' in env for env in os.environ):
        logger.add_line("=" * 30 + "   SLURM   " + "=" * 30)
        for env in os.environ:
            if 'SLURM' in env:
                logger.add_line('{:30}: {}'.format(env, os.environ[env]))
    logger.add_line("=" * 30 + "   Config   " + "=" * 30)

    def print_dict(d, ident=''):
        for k in d:
            if isinstance(d[k], dict):
                logger.add_line("{}{}".format(ident, k))
                print_dict(d[k], ident='  ' + ident)
            else:
                logger.add_line("{}{}: {}".format(ident, k, str(d[k])))

    print_dict(cfg)
    logger.add_line("=" * 30 + "   Model Config   " + "=" * 30)
    print_dict(model_cfg)

    return eval_dir, model_cfg, logger
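# Hypothetical launch sketch (flag names below are assumptions, not the repo's CLI):
# prepare_environment expects one worker process per GPU. torch.multiprocessing.spawn
# passes the process index as the first argument, which main_worker (next example)
# uses as `gpu`.
import argparse
import torch
import torch.multiprocessing as mp
import yaml

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('cfg')
    parser.add_argument('model_cfg')
    parser.add_argument('--fold', type=int, default=1)
    parser.add_argument('--port', default='23456')
    parser.add_argument('--quiet', action='store_true')
    parser.add_argument('--test-only', dest='test_only', action='store_true')
    parser.add_argument('--distributed', action='store_true')
    args = parser.parse_args()

    with open(args.cfg) as f:
        cfg = yaml.safe_load(f)

    ngpus = torch.cuda.device_count()
    if args.distributed:
        mp.spawn(main_worker, nprocs=ngpus, args=(ngpus, args.fold, args, cfg))
    else:
        main_worker(0, ngpus, args.fold, args, cfg)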
Example #3
def main_worker(gpu, ngpus, fold, args, cfg):
    args.gpu = gpu
    args.world_size = ngpus

    # Prepare folder and logger
    eval_dir, model_cfg, logger = eval_utils.prepare_environment(
        args, cfg, fold)

    # Model
    model, ckp_manager = eval_utils.build_model(model_cfg, cfg, eval_dir, args,
                                                logger)

    # Optimizer
    optimizer, scheduler = main_utils.build_optimizer(model.parameters(),
                                                      cfg['optimizer'], logger)

    # Datasets
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)

    ################################ Train ################################
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if (cfg['resume']
            or args.test_only) and ckp_manager.checkpoint_exists(last=True):
        start_epoch = ckp_manager.restore(model,
                                          optimizer,
                                          scheduler,
                                          restore_last=True)
        logger.add_line("Loaded checkpoint '{}' (epoch {})".format(
            ckp_manager.last_checkpoint_fn(), start_epoch))

    if not cfg['test_only']:
        logger.add_line("=" * 30 + "   Training   " + "=" * 30)
        for epoch in range(start_epoch, end_epoch):
            scheduler.step(epoch=epoch)
            if args.distributed:
                train_loader.sampler.set_epoch(epoch)
                test_loader.sampler.set_epoch(epoch)

            logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
            logger.add_line('LR: {}'.format(scheduler.get_lr()))
            run_phase('train', train_loader, model, optimizer, epoch, args,
                      cfg, logger)
            run_phase('test', test_loader, model, None, epoch, args, cfg,
                      logger)
            ckp_manager.save(model, optimizer, scheduler, epoch)

    ################################ Eval ################################
    logger.add_line('\n' + '=' * 30 + ' Final evaluation ' + '=' * 30)
    cfg['dataset']['test']['clips_per_video'] = 25
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)
    top1_dense, top5_dense = run_phase('test_dense', dense_loader, model, None,
                                       end_epoch, args, cfg, logger)
    top1, top5 = run_phase('test', test_loader, model, None, end_epoch, args,
                           cfg, logger)

    logger.add_line('\n' + '=' * 30 + ' Evaluation done ' + '=' * 30)
    for ft in top1:
        logger.add_line('')
        logger.add_line('[{}] Clip@1: {:6.2f}'.format(ft, top1[ft]))
        logger.add_line('[{}] Clip@5: {:6.2f}'.format(ft, top5[ft]))
        logger.add_line('[{}] Video@1: {:6.2f}'.format(ft, top1_dense[ft]))
        logger.add_line('[{}] Video@5: {:6.2f}'.format(ft, top5_dense[ft]))
Example #4
def run_phase(phase, loader, model, optimizer, epoch, args, cfg, logger):
    from utils import metrics_utils
    logger.add_line('\n{}: Epoch {}'.format(phase, epoch))
    feature_names = cfg['model']['args']['feat_names']
    batch_time = metrics_utils.AverageMeter('Time', ':6.3f', 100)
    data_time = metrics_utils.AverageMeter('Data', ':6.3f', 100)
    loss_meters = {
        ft: metrics_utils.AverageMeter('Loss', ':.4e', 0)
        for ft in feature_names
    }
    top1_meters = {
        ft: metrics_utils.AverageMeter('Acc@1', ':6.2f', 0)
        for ft in feature_names
    }
    top5_meters = {
        ft: metrics_utils.AverageMeter('Acc@5', ':6.2f', 0)
        for ft in feature_names
    }
    progress = {
        'timers':
        utils.logger.ProgressMeter(len(loader),
                                   meters=[batch_time, data_time],
                                   phase=phase,
                                   epoch=epoch,
                                   logger=logger)
    }
    progress.update({
        ft: utils.logger.ProgressMeter(
            len(loader),
            meters=[loss_meters[ft], top1_meters[ft], top5_meters[ft]],
            phase=phase,
            epoch=epoch,
            logger=logger)
        for ft in feature_names
    })

    # switch to train/test mode
    model.train(phase == 'train')

    if phase in {'test_dense', 'test'}:
        model = BatchWrapper(model, cfg['dataset']['batch_size'])

    end = time.time()
    criterion = torch.nn.CrossEntropyLoss()
    softmax = torch.nn.Softmax(dim=1)
    for it, sample in enumerate(loader):
        data_time.update(time.time() - end)

        video = sample['frames']
        target = sample['label'].cuda()
        if args.gpu is not None:
            video = video.cuda(args.gpu, non_blocking=True)

        if phase == 'test_dense':
            batch_size, clips_per_sample = video.shape[0], video.shape[1]
            video = video.flatten(0, 1).contiguous()

        # compute outputs
        if phase == 'train':
            logits = model(video)
        else:
            with torch.no_grad():
                logits = model(video)

        # compute loss and measure accuracy
        total_loss = 0.
        for ft in feature_names:
            if phase == 'test_dense':
                confidence = softmax(logits[ft]).view(batch_size,
                                                      clips_per_sample,
                                                      -1).mean(1)
                target_tiled = target.unsqueeze(1).repeat(
                    1, clips_per_sample).view(-1)
                loss = criterion(logits[ft], target_tiled)
            else:
                confidence = softmax(logits[ft])
                loss = criterion(logits[ft], target)
            total_loss += loss

            with torch.no_grad():
                acc1, acc5 = metrics_utils.accuracy(confidence,
                                                    target,
                                                    topk=(1, 5))
                loss_meters[ft].update(loss.item(), target.size(0))
                top1_meters[ft].update(acc1[0].item(), target.size(0))
                top5_meters[ft].update(acc5[0].item(), target.size(0))

        # compute gradient and do SGD step
        if phase == 'train':
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if (it + 1) % 100 == 0 or it == 0 or it + 1 == len(loader):
            for ft in progress:
                progress[ft].display(it + 1)

    if args.distributed:
        for ft in progress:
            progress[ft].synchronize_meters(args.gpu)
            progress[ft].display(len(loader) * args.world_size)

    top1 = {ft: top1_meters[ft].avg for ft in feature_names}
    top5 = {ft: top5_meters[ft].avg for ft in feature_names}
    return top1, top5
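# BatchWrapper is defined elsewhere in the repo. A minimal sketch of what such a
# wrapper could look like, assuming its job is to split the large dense-evaluation
# batch (batch_size * clips_per_sample clips) into chunks of batch_size and
# re-assemble the per-feature logits dict that run_phase indexes as logits[ft].
import torch

class BatchWrapper:
    def __init__(self, model, batch_size):
        self.model = model
        self.batch_size = batch_size

    def __call__(self, x):
        outs = [self.model(x[i:i + self.batch_size])
                for i in range(0, x.shape[0], self.batch_size)]
        return {ft: torch.cat([o[ft] for o in outs], dim=0) for ft in outs[0]}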
Example #5
def build_model(feat_cfg, eval_cfg, eval_dir, args, logger):
    import models
    pretrained_net = models.__dict__[feat_cfg['arch']](**feat_cfg['args'])

    # Load from checkpoint
    checkpoint_fn = '{}/{}/checkpoint.pth.tar'.format(feat_cfg['model_dir'],
                                                      feat_cfg['name'])
    ckp = torch.load(checkpoint_fn, map_location='cpu')
    # Strip the 'module.' prefix added by (Distributed)DataParallel checkpoints.
    pretrained_net.load_state_dict(
        {k.replace('module.', ''): v
         for k, v in ckp['model'].items()})

    # Wrap with linear-head classifiers
    if eval_cfg['model']['name'] == 'ClassificationWrapper':
        model = ClassificationWrapper(
            feature_extractor=pretrained_net.video_model,
            **eval_cfg['model']['args'])
        ckp_manager = CheckpointManager(eval_dir, rank=args.gpu)
    elif eval_cfg['model']['name'] == 'MOSTWrapper':
        model = MOSTModel(feature_extractor=pretrained_net.video_model,
                          **eval_cfg['model']['args'])
        ckp_manager = MOSTCheckpointManager(eval_dir, rank=args.gpu)
    else:
        raise ValueError('Unknown wrapper: {}'.format(eval_cfg['model']['name']))

    # Log model description
    logger.add_line("=" * 30 + "   Model   " + "=" * 30)
    logger.add_line(str(model))
    logger.add_line("=" * 30 + "   Parameters   " + "=" * 30)
    logger.add_line(main_utils.parameter_description(model))
    logger.add_line("=" * 30 + "   Pretrained model   " + "=" * 30)
    logger.add_line("File: {}\nEpoch: {}".format(checkpoint_fn, ckp['epoch']))

    # Distribute
    model = distribute_model_to_cuda(model, args, eval_cfg)

    return model, ckp_manager
Example #6
def build_dataloaders(cfg, fold, num_workers, distributed, logger):
    logger.add_line("=" * 30 + "   Train DB   " + "=" * 30)
    train_loader = build_dataloader(cfg, cfg['train'], fold, num_workers,
                                    distributed)
    logger.add_line(str(train_loader.dataset))

    logger.add_line("=" * 30 + "   Test DB   " + "=" * 30)
    test_loader = build_dataloader(cfg, cfg['test'], fold, num_workers,
                                   distributed)
    logger.add_line(str(test_loader.dataset))

    logger.add_line("=" * 30 + "   Dense DB   " + "=" * 30)
    dense_loader = build_dataloader(cfg, cfg['test_dense'], fold, num_workers,
                                    distributed)
    logger.add_line(str(dense_loader.dataset))

    return train_loader, test_loader, dense_loader
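# build_dataloader itself is not shown here. A minimal sketch of the sampler wiring
# it likely needs (names below are illustrative, not the repo's implementation):
# with a DistributedSampler each rank sees a disjoint shard, and the per-epoch
# shuffle is driven by loader.sampler.set_epoch(epoch) in the training loops above.
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

def make_loader(dataset, batch_size, num_workers, distributed):
    sampler = DistributedSampler(dataset) if distributed else None
    return DataLoader(dataset,
                      batch_size=batch_size,
                      shuffle=(sampler is None),
                      sampler=sampler,
                      num_workers=num_workers,
                      pin_memory=True)

# Usage with a dummy dataset (single-process, non-distributed):
loader = make_loader(TensorDataset(torch.randn(128, 8), torch.zeros(128).long()),
                     batch_size=16, num_workers=0, distributed=False)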
Example #7
def main_worker(gpu, ngpus, args, cfg):
    args.gpu = gpu
    ngpus_per_node = ngpus

    # Setup environment
    args = main_utils.initialize_distributed_backend(
        args, ngpus_per_node)  ### Use other method instead
    logger, tb_writter, model_dir = main_utils.prep_environment(args, cfg)

    # Define model
    model = main_utils.build_model(cfg['model'], logger)
    model, args = main_utils.distribute_model_to_cuda(model, args)

    # Define dataloaders
    train_loader = main_utils.build_dataloaders(
        cfg['dataset'], cfg['num_workers'], args.multiprocessing_distributed,
        logger)

    # Define criterion
    train_criterion = main_utils.build_criterion(cfg['loss'], logger=logger)
    train_criterion = train_criterion.cuda()

    # Define optimizer
    optimizer, scheduler = main_utils.build_optimizer(
        params=list(model.parameters()) + list(train_criterion.parameters()),
        cfg=cfg['optimizer'],
        logger=logger)
    ckp_manager = main_utils.CheckpointManager(
        model_dir, rank=args.rank, dist=args.multiprocessing_distributed)

    # Optionally resume from a checkpoint
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if cfg['resume']:
        if ckp_manager.checkpoint_exists(last=True):
            start_epoch = ckp_manager.restore(restore_last=True,
                                              model=model,
                                              optimizer=optimizer,
                                              train_criterion=train_criterion)
            scheduler.step(start_epoch)
            logger.add_line("Checkpoint loaded: '{}' (epoch {})".format(
                ckp_manager.last_checkpoint_fn(), start_epoch))
        else:
            logger.add_line("No checkpoint found at '{}'".format(
                ckp_manager.last_checkpoint_fn()))

    cudnn.benchmark = True

    ############################ TRAIN #########################################
    test_freq = cfg.get('test_freq', 1)
    for epoch in range(start_epoch, end_epoch):
        if (epoch % 10) == 0:
            ckp_manager.save(epoch,
                             model=model,
                             train_criterion=train_criterion,
                             optimizer=optimizer,
                             filename='checkpoint-ep{}.pth.tar'.format(epoch))

        if args.multiprocessing_distributed:
            train_loader.sampler.set_epoch(epoch)

        # Train for one epoch
        logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
        logger.add_line('LR: {}'.format(scheduler.get_lr()))
        run_phase('train', train_loader, model, optimizer, train_criterion,
                  epoch, args, cfg, logger, tb_writter)
        scheduler.step(epoch)

        if ((epoch % test_freq) == 0) or (epoch == end_epoch - 1):
            ckp_manager.save(epoch + 1,
                             model=model,
                             optimizer=optimizer,
                             train_criterion=train_criterion)
Example #8
def run_phase(phase, loader, model, optimizer, criterion, epoch, args, cfg,
              logger, tb_writter):
    from utils import metrics_utils
    logger.add_line('\n{}: Epoch {}'.format(phase, epoch))
    batch_time = metrics_utils.AverageMeter('Time', ':6.3f', window_size=100)
    data_time = metrics_utils.AverageMeter('Data', ':6.3f', window_size=100)
    loss_meter = metrics_utils.AverageMeter('Loss', ':.3e')
    loss_meter_npid1 = metrics_utils.AverageMeter('Loss_npid1', ':.3e')
    loss_meter_npid2 = metrics_utils.AverageMeter('Loss_npid2', ':.3e')
    loss_meter_cmc1 = metrics_utils.AverageMeter('Loss_cmc1', ':.3e')
    loss_meter_cmc2 = metrics_utils.AverageMeter('Loss_cmc2', ':.3e')
    progress = utils.logger.ProgressMeter(len(loader), [
        batch_time, data_time, loss_meter, loss_meter_npid1, loss_meter_npid2,
        loss_meter_cmc1, loss_meter_cmc2
    ],
                                          phase=phase,
                                          epoch=epoch,
                                          logger=logger,
                                          tb_writter=tb_writter)

    # switch to train mode during the 'train' phase, eval mode otherwise
    model.train(phase == 'train')

    end = time.time()
    device = args.gpu if args.gpu is not None else 0
    for i, sample in enumerate(loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if phase == 'train':
            embedding = model(sample)
        else:
            with torch.no_grad():
                embedding = model(sample)

        # compute loss
        loss, loss_debug = criterion(embedding)
        loss_meter.update(loss.item(), embedding[0].size(0))
        loss_meter_npid1.update(loss_debug[0].item(), embedding[0].size(0))
        loss_meter_npid2.update(loss_debug[1].item(), embedding[0].size(0))
        loss_meter_cmc1.update(loss_debug[2].item(), embedding[0].size(0))
        loss_meter_cmc2.update(loss_debug[3].item(), embedding[0].size(0))

        # compute gradient and do SGD step during training
        if phase == 'train':
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print to terminal and tensorboard
        step = epoch * len(loader) + i
        if (i + 1) % cfg['print_freq'] == 0 or i == 0 or i + 1 == len(loader):
            progress.display(i + 1)

    # Sync metrics across all GPUs and print final averages
    if args.multiprocessing_distributed:
        progress.synchronize_meters(args.gpu)
        progress.display(len(loader) * args.world_size)

    if tb_writter is not None:
        for meter in progress.meters:
            tb_writter.add_scalar('{}-epoch/{}'.format(phase, meter.name),
                                  meter.avg, epoch)
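# The meters come from the repo's metrics_utils. Below is a minimal AverageMeter
# sketch with the same constructor shape (name, format string, optional window
# size); this is an assumption about the real implementation, shown only to
# clarify what update() and .avg track.
from collections import deque

class AverageMeter:
    def __init__(self, name, fmt=':f', window_size=0):
        self.name, self.fmt, self.window_size = name, fmt, window_size
        self.reset()

    def reset(self):
        self.sum, self.count, self.avg = 0.0, 0, 0.0
        self.window = deque(maxlen=self.window_size) if self.window_size else None

    def update(self, val, n=1):
        if self.window is not None:
            self.window.append((val, n))
            self.sum = sum(v * k for v, k in self.window)
            self.count = sum(k for _, k in self.window)
        else:
            self.sum += val * n
            self.count += n
        self.avg = self.sum / max(self.count, 1)

    def __str__(self):
        return ('{name} {avg' + self.fmt + '}').format(name=self.name, avg=self.avg)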
Example #9
def run_phase(phase, loader, model, optimizer, criterion, epoch, args, cfg,
              logger, tb_writter):
    from utils import metrics_utils
    logger.add_line('\n{}: Epoch {}'.format(phase, epoch))
    batch_time = metrics_utils.AverageMeter('Time', ':6.3f', window_size=100)
    data_time = metrics_utils.AverageMeter('Data', ':6.3f', window_size=100)
    loss_meter = metrics_utils.AverageMeter('Loss', ':.3e')
    progress = utils.logger.ProgressMeter(len(loader),
                                          [batch_time, data_time, loss_meter],
                                          phase=phase,
                                          epoch=epoch,
                                          logger=logger,
                                          tb_writter=tb_writter)

    # switch to train mode during the 'train' phase, eval mode otherwise
    model.train(phase == 'train')

    end = time.time()
    device = args.gpu if args.gpu is not None else 0
    for i, sample in enumerate(loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # Prepare batch
        video = sample['frames']
        audio = sample['audio']
        index = sample['index']
        video = video.cuda(device, non_blocking=True)
        audio = audio.cuda(device, non_blocking=True)
        index = index.cuda(device, non_blocking=True)

        # compute audio and video embeddings
        if phase == 'train':
            video_emb, audio_emb = model(video, audio)
        else:
            with torch.no_grad():
                video_emb, audio_emb = model(video, audio)

        # compute loss
        loss, loss_debug = criterion(video_emb, audio_emb, index)
        loss_meter.update(loss.item(), video.size(0))

        # compute gradient and do SGD step during training
        if phase == 'train':
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print to terminal and tensorboard
        step = epoch * len(loader) + i
        if (i + 1) % cfg['print_freq'] == 0 or i == 0 or i + 1 == len(loader):
            progress.display(i + 1)
            if tb_writter is not None:
                for key in loss_debug:
                    tb_writter.add_scalar('{}-batch/{}'.format(phase, key),
                                          loss_debug[key].item(), step)

    # Sync metrics across all GPUs and print final averages
    if args.distributed:
        progress.synchronize_meters(args.gpu)
        progress.display(len(loader) * args.world_size)

    if tb_writter is not None:
        for meter in progress.meters:
            tb_writter.add_scalar('{}-epoch/{}'.format(phase, meter.name),
                                  meter.avg, epoch)
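# ProgressMeter.synchronize_meters(args.gpu) is also defined elsewhere. A hedged
# sketch of the usual approach, assuming each meter exposes sum/count/avg as in the
# AverageMeter sketch above: all-reduce the (sum, count) pair so every rank reports
# the global average rather than its local shard.
import torch
import torch.distributed as dist

def synchronize_meter(meter, gpu):
    t = torch.tensor([meter.sum, float(meter.count)],
                     dtype=torch.float64, device=gpu)
    dist.all_reduce(t, op=dist.ReduceOp.SUM)
    meter.sum, meter.count = t[0].item(), int(t[1].item())
    meter.avg = meter.sum / max(meter.count, 1)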
Example #10
def main_worker(gpu, ngpus, fold, args, cfg):
    args.gpu = gpu
    args.world_size = ngpus

    # Prepare folder and logger
    eval_dir, model_cfg, logger = eval_utils.prepare_environment(
        args, cfg, fold)

    # Model
    model, ckp_manager = eval_utils.build_model(model_cfg, cfg, eval_dir, args,
                                                logger)

    # Optimizer
    optimizer, scheduler = main_utils.build_optimizer(model.parameters(),
                                                      cfg['optimizer'], logger)

    # Datasets
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)

    ################################ Train ################################
    start_epoch, end_epoch = 0, cfg['optimizer']['num_epochs']
    if cfg['resume'] and ckp_manager.checkpoint_exists(last=True):
        start_epoch = ckp_manager.restore(model,
                                          optimizer,
                                          scheduler,
                                          restore_last=True)
        logger.add_line("Loaded checkpoint '{}' (epoch {})".format(
            ckp_manager.last_checkpoint_fn(), start_epoch))

    if not cfg['test_only']:
        logger.add_line("=" * 30 + "   Training   " + "=" * 30)

        # Warmup. Train classifier for a few epochs.
        if start_epoch == 0 and cfg['optimizer'].get('warmup_classifier', False):
            n_wu_epochs = cfg['optimizer'].get('warmup_epochs', 5)
            cls_opt, _ = main_utils.build_optimizer(
                params=[
                    p for n, p in model.named_parameters()
                    if 'feature_extractor' not in n
                ],
                cfg={
                    'lr': {
                        'base_lr': cfg['optimizer']['lr']['base_lr'],
                        'milestones': [
                            n_wu_epochs,
                        ],
                        'gamma': 1.
                    },
                    'weight_decay': cfg['optimizer']['weight_decay'],
                    'name': cfg['optimizer']['name']
                })
            for epoch in range(n_wu_epochs):
                run_phase('train', train_loader, model, cls_opt, epoch, args,
                          cfg, logger)
                top1, _ = run_phase('test', test_loader, model, None, epoch,
                                    args, cfg, logger)

        # Main training loop
        for epoch in range(start_epoch, end_epoch):
            scheduler.step(epoch=epoch)
            if args.distributed:
                train_loader.sampler.set_epoch(epoch)
                test_loader.sampler.set_epoch(epoch)

            logger.add_line('=' * 30 + ' Epoch {} '.format(epoch) + '=' * 30)
            logger.add_line('LR: {}'.format(scheduler.get_lr()))
            run_phase('train', train_loader, model, optimizer, epoch, args,
                      cfg, logger)
            top1, _ = run_phase('test', test_loader, model, None, epoch, args,
                                cfg, logger)
            ckp_manager.save(model,
                             optimizer,
                             scheduler,
                             epoch,
                             eval_metric=top1)

    ################################ Eval ################################
    logger.add_line('\n' + '=' * 30 + ' Final evaluation ' + '=' * 30)
    # Evaluate clip-level predictions with 25 clips per video for metric stability.
    cfg['dataset']['test']['clips_per_video'] = 25
    train_loader, test_loader, dense_loader = eval_utils.build_dataloaders(
        cfg['dataset'], fold, cfg['num_workers'], args.distributed, logger)
    top1, top5 = run_phase('test', test_loader, model, None, end_epoch, args,
                           cfg, logger)
    top1_dense, top5_dense = run_phase('test_dense', dense_loader, model, None,
                                       end_epoch, args, cfg, logger)

    logger.add_line('\n' + '=' * 30 + ' Evaluation done ' + '=' * 30)
    logger.add_line('Clip@1: {:6.2f}'.format(top1))
    logger.add_line('Clip@5: {:6.2f}'.format(top5))
    logger.add_line('Video@1: {:6.2f}'.format(top1_dense))
    logger.add_line('Video@5: {:6.2f}'.format(top5_dense))