Example #1

for epoch in range(start_epoch, args['n_epochs']):

    print('Starting epoch {}'.format(epoch))
    scheduler.step(epoch)  # update the learning rate for this epoch (passing epoch is deprecated in newer PyTorch)

    train_loss = train(epoch)
    val_loss, val_iou = val(epoch)

    print('===> train loss: {:.2f}'.format(train_loss))
    print('===> val loss: {:.2f}, val iou: {:.2f}'.format(val_loss, val_iou))

    logger.add('train', train_loss)
    logger.add('val', val_loss)
    logger.add('iou', val_iou)
    logger.plot(save=args['save'], save_dir=args['save_dir'])

    is_best = val_iou > best_iou
    best_iou = max(val_iou, best_iou)

    if args['save']:
        state = {
            'epoch': epoch,
            'best_iou': best_iou,
            'model_state_dict': model.state_dict(),
            'optim_state_dict': optimizer.state_dict(),
            'logger_data': logger.data
        }
        save_checkpoint(state, is_best)
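The save_checkpoint helper used above is not defined in this example; a minimal sketch, assuming it writes the latest state to a save directory and keeps a copy of the best-scoring checkpoint, could look like this (the save_dir and filename arguments are placeholders, not part of the original code):

import os
import shutil
import torch

def save_checkpoint(state, is_best, save_dir='.', filename='checkpoint.pth'):
    # persist the latest training state (epoch, best iou, model/optimizer state dicts)
    path = os.path.join(save_dir, filename)
    torch.save(state, path)
    # keep a separate copy of the best-performing checkpoint
    if is_best:
        shutil.copyfile(path, os.path.join(save_dir, 'model_best.pth'))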
Example #2
def main():
    args = parse_args()

    cfg = from_file(args.config)
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    if args.load_from is not None:
        cfg.load_from = args.load_from
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.seed is not None:
        cfg.seed = args.seed
    if args.gpus is not None:
        cfg.gpus = args.gpus
    # set random seeds
    if cfg.seed is not None:
        print('Set random seed to {}'.format(cfg.seed))
        set_random_seed(cfg.seed)

    if not os.path.exists(cfg.work_dir):
        os.makedirs(cfg.work_dir)

    ################ 1 DATA ###################
    print('Training model on {} dataset...'.format(cfg.data['dataset']))
    batch_size = cfg.data['batch_size'] * cfg.gpus
    train_dataset = UCF101Dataset(data_file=cfg.data['train_file'], img_tmpl=cfg.data['train_img_tmp'],
                                  clip_len=cfg.data['train_clip_len'], size=cfg.data['size'], mode='train', shuffle=True)
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=8)
    val_dataset = UCF101Dataset(data_file=cfg.data['val_file'], img_tmpl=cfg.data['val_img_tmp'],
                                clip_len=cfg.data['val_clip_len'], size=cfg.data['size'], mode='val', shuffle=False)
    val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=8)

    ################ 2 MODEL ##################
    model = S3DG(num_class=cfg.model['num_class'])
    if cfg.load_from is not None:
        print('Initializing the model from pretrained weights {}.'.format(cfg.load_from))
        load_pretrained_model(model, pretrained_path=cfg.load_from)
    else:
        print('Initializing the model from scratch.')

    # NOTE: training and resumed training must use the same number of GPUs, because
    # nn.DataParallel prefixes parameter names in the checkpoint with 'module.'
    if cfg.resume_from is not None:
        load_checkpoint_model(model, checkpoint_path=cfg.resume_from)

    if torch.cuda.device_count() > 1:  
        print('use %d gpus' % (torch.cuda.device_count()))
        model = nn.DataParallel(model, device_ids=range(cfg.gpus))
    else:
        print('use 1 gpu')

    print('Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))
    model.to(device)

    # ################### 3 CRITERION and OPTIMIZER #########################
    criterion = nn.CrossEntropyLoss().to(device)  # standard crossentropy loss for classification
    # criterion = nn.BCEWithLogitsLoss().to(device)
    optimizer = optim.SGD(model.parameters(), lr=cfg.lr, momentum=0.9, weight_decay=5e-4)
    # set lr scheduler
    if cfg.lr_scheduler is not None:
        if cfg.lr_scheduler['type'] == 'step':
            scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=cfg.lr_scheduler['step'], gamma=cfg.lr_scheduler['gamma'])
        elif cfg.lr_scheduler['type'] == 'multistep':
            scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=cfg.lr_scheduler['step'], gamma=cfg.lr_scheduler['gamma'])
        elif cfg.lr_scheduler['type'] == 'exponent':
            scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=cfg.lr_scheduler['gamma'])
    
    log_path = cfg.work_dir
    # IF RESUME
    if cfg.resume_from is not None:
        checkpoint = torch.load(cfg.resume_from)
        print("Resume training from checkpoint: {}...".format(cfg.resume_from))
        optimizer.load_state_dict(checkpoint['opt_dict'])
        scheduler.load_state_dict(checkpoint['lr_dict'])
        resume_epoch = checkpoint['epoch'] + 1
        logger = Logger(os.path.join(log_path, 'log.txt'), resume=True)
    else:
        print("Training model from start...")
        resume_epoch = 0
        logger = Logger(os.path.join(log_path, 'log.txt'))
        logger.set_names(['Learning Rate', 'Train Loss', 'Val Loss', 'Train Acc.', 'Val Acc.'])

    # tensorboard 
    log_dir = os.path.join(cfg.work_dir, datetime.now().strftime('%b%d_%H-%M-%S'))
    writer = SummaryWriter(log_dir=log_dir)

    ################## 4 BEGIN TRAINING #########################
    num_epochs = cfg.num_epochs
    save_epoch = cfg.interval
    save_dir = cfg.work_dir
    display = cfg.display

    best_acc = 0.0
    best_epoch = 0

    for epoch in tqdm(range(resume_epoch, num_epochs)):
        print('\n----------------- Training -------------------')
        print('Epoch: {}/{}'.format(epoch, num_epochs-1))
        train_loss, train_acc = train(train_dataloader, model, criterion, optimizer, epoch, writer, display)
        if args.validate:
            print('\n----------------- Validation -------------------')
            print('Epoch: {}/{}'.format(epoch, num_epochs-1))
            val_loss, val_acc = validation(val_dataloader, model, criterion, optimizer, epoch, writer, display)
            if val_acc >= best_acc:
                best_acc = val_acc
                best_epoch = epoch
            print("\nThe best validation top1-accuracy: {:.3f}%, the best epoch: {}".format(best_acc,best_epoch))

        # EPOCH
        lr = optimizer.state_dict()['param_groups'][0]['lr']
        if args.validate:
            logger.append([lr, train_loss, val_loss, train_acc, val_acc])
        else:
            logger.append([lr, train_loss, 0.0, train_acc, 0.0])  # no validation metrics this epoch
        writer.add_scalar('train/learning_rate', lr, epoch)

        if cfg.lr_scheduler is not None:
            scheduler.step()

        if epoch % save_epoch == 0:
            torch.save({
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'opt_dict': optimizer.state_dict(),
                'lr_dict': scheduler.state_dict()
            }, os.path.join(save_dir, 'epoch-' + str(epoch) + '.pth'))

    writer.close()
    logger.close()
    logger.plot()
    savefig(os.path.join(log_path, 'log.eps'))
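The set_random_seed helper called near the top of main() is also not shown; a minimal sketch, assuming it seeds Python, NumPy, and PyTorch (CPU and CUDA), could look like this:

import random
import numpy as np
import torch

def set_random_seed(seed):
    # seed every common source of randomness so runs are reproducible
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)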