# Example #1
def main():
    """Train a single ConvColumn model with SGD, validating every epoch.

    Reads hyper-parameters from the module-level ``config`` dict and CLI
    flags from the module-level ``args``; updates the global ``best_prec1``
    and writes checkpoints and learning-curve plots under
    ``config['output_dir']/config['model_name']``.
    """
    global args, best_prec1

    # set run output folder
    model_name = config["model_name"]
    output_dir = config["output_dir"]
    print("=> Output folder for this run -- {}".format(model_name))
    save_dir = os.path.join(output_dir, model_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        os.makedirs(os.path.join(save_dir, 'plots'))

    # adds a handler for Ctrl+C
    def signal_handler(sig, frame):
        """
        Remove the output dir if you exit with Ctrl+C while it is
        still empty (no file written yet).
        It prevents the noise of experimental runs.
        """
        num_files = len(glob.glob(save_dir + "/*"))
        if num_files < 1:
            shutil.rmtree(save_dir)
        print('You pressed Ctrl+C!')
        sys.exit(0)
    # assign Ctrl+C signal handler
    signal.signal(signal.SIGINT, signal_handler)

    # create model
    model = ConvColumn(config['num_classes'])

    # multi GPU setting
    if args.use_gpu:
        model = torch.nn.DataParallel(model, device_ids=gpus).to(device)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(config['checkpoint']):
            # report the path actually loaded (previously printed args.resume)
            print("=> loading checkpoint '{}'".format(config['checkpoint']))
            checkpoint = torch.load(config['checkpoint'])
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            # NOTE(review): optimizer state is not restored on resume —
            # confirm that is intended.
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(config['checkpoint'], checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(
                config['checkpoint']))

    # center crop + ImageNet normalization, shared by train and val
    transform = Compose([
        CenterCrop(84),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406],
                  std=[0.229, 0.224, 0.225])
    ])

    train_data = VideoFolder(root=config['train_data_folder'],
                             csv_file_input=config['train_data_csv'],
                             csv_file_labels=config['labels_csv'],
                             clip_size=config['clip_size'],
                             nclips=1,
                             step_size=config['step_size'],
                             is_val=False,
                             transform=transform,
                             )

    print(" > Using {} processes for data loader.".format(
        config["num_workers"]))
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=config['batch_size'], shuffle=True,
        num_workers=config['num_workers'], pin_memory=True,
        drop_last=True)

    val_data = VideoFolder(root=config['val_data_folder'],
                           csv_file_input=config['val_data_csv'],
                           csv_file_labels=config['labels_csv'],
                           clip_size=config['clip_size'],
                           nclips=1,
                           step_size=config['step_size'],
                           is_val=True,
                           transform=transform,
                           )

    val_loader = torch.utils.data.DataLoader(
        val_data,
        batch_size=config['batch_size'], shuffle=False,
        num_workers=config['num_workers'], pin_memory=True,
        drop_last=False)

    assert len(train_data.classes) == config["num_classes"]

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(device)

    # define optimizer
    lr = config["lr"]
    last_lr = config["last_lr"]
    momentum = config['momentum']
    weight_decay = config['weight_decay']
    optimizer = torch.optim.SGD(model.parameters(), lr,
                                momentum=momentum,
                                weight_decay=weight_decay)

    if args.eval_only:
        validate(val_loader, model, criterion, train_data.classes_dict)
        return

    # set callbacks
    plotter = PlotLearning(os.path.join(
        save_dir, "plots"), config["num_classes"])
    # decay LR when val_loss has not improved by 0.6 for 3 checks
    lr_decayer = MonitorLRDecay(0.6, 3)
    val_loss = 9999999  # sentinel: "no validation loss seen yet"

    # set end condition by num epochs; -1 means "train until LR floor"
    num_epochs = int(config["num_epochs"])
    if num_epochs == -1:
        num_epochs = 999999

    print(" > Training is getting started...")
    print(" > Training takes {} epochs.".format(num_epochs))
    start_epoch = args.start_epoch if args.resume else 0

    for epoch in range(start_epoch, num_epochs):
        lr = lr_decayer(val_loss, lr)
        print(" > Current LR : {}".format(lr))

        # stop once the LR has decayed below the configured floor
        if lr < last_lr and last_lr > 0:
            print(" > Training is done by reaching the last learning rate {}".
                  format(last_lr))
            # NOTE(review): exits with status 1 on normal completion —
            # confirm callers rely on that, otherwise 0 would be cleaner.
            sys.exit(1)

        # train for one epoch
        train_loss, train_top1, train_top5 = train(
            train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        # NOTE(review): called without train_data.classes_dict here but
        # with it in the eval_only branch — verify validate()'s signature.
        val_loss, val_top1, val_top5 = validate(val_loader, model, criterion)

        # plot learning curves for this epoch
        plotter_dict = {}
        plotter_dict['loss'] = train_loss
        plotter_dict['val_loss'] = val_loss
        plotter_dict['acc'] = train_top1
        plotter_dict['val_acc'] = val_top1
        plotter_dict['learning_rate'] = lr
        plotter.plot(plotter_dict)

        # remember best prec@1 and save checkpoint
        # NOTE(review): best_prec1 must be initialised at module level —
        # when not resuming it is never assigned before this comparison.
        is_best = val_top1 > best_prec1
        best_prec1 = max(val_top1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': "Conv4Col",
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
        }, is_best, config)
def trainEnsemble():
    """Train an ensemble classifier on top of five frozen ConvColumn models.

    Loads pre-trained checkpoints for each column network, puts them in
    eval mode, and trains only the small ``Classifier`` head (Adam) on the
    concatenation of the five column outputs.  Uses the module-level
    ``config``/``args`` and updates the global ``best_prec1``.
    """
    global args, best_prec1

    # set run output folder
    model_name = "classifier"
    output_dir = config["output_dir"]

    save_dir = os.path.join(output_dir, model_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        os.makedirs(os.path.join(save_dir, 'plots'))

    # adds a handler for Ctrl+C
    def signal_handler(sig, frame):
        """
        Remove the output dir if you exit with Ctrl+C while it is
        still empty (no file written yet).
        It prevents the noise of experimental runs.
        """
        num_files = len(glob.glob(save_dir + "/*"))
        if num_files < 1:
            shutil.rmtree(save_dir)
        print('You pressed Ctrl+C!')
        sys.exit(0)

    # assign Ctrl+C signal handler
    signal.signal(signal.SIGINT, signal_handler)

    def _load_checkpoint(net, path):
        """Load weights from *path* into *net*.

        On success also updates ``args.start_epoch`` and the global
        ``best_prec1`` from the checkpoint (so after all loads they hold
        the values of the LAST checkpoint loaded).  Prints the path that
        was actually checked/loaded (the original code wrongly printed
        ``args.resume`` / ``config['checkpoint']`` here).
        """
        global best_prec1
        if os.path.isfile(path):
            print("=> loading checkpoint '{}'".format(path))
            checkpoint = torch.load(path)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            net.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                path, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(path))

    # create the five column models plus the trainable classifier head;
    # each is wrapped in DataParallel before its checkpoint is loaded
    model0 = ConvColumn6(config['num_classes'])
    model0 = torch.nn.DataParallel(model0, device_ids=gpus).to(device)
    _load_checkpoint(
        model0, "trainings/jpeg_model/jester_conv6/checkpoint.pth.tar")

    model1 = ConvColumn7(config['num_classes'])
    model1 = torch.nn.DataParallel(model1, device_ids=gpus).to(device)
    _load_checkpoint(
        model1, "trainings/jpeg_model/jester_conv7/model_best.pth.tar")

    classifier = Classifier(config['num_classes'])
    classifier = torch.nn.DataParallel(classifier, device_ids=gpus).to(device)
    _load_checkpoint(
        classifier, "trainings/jpeg_model/classifier/model_best.pth.tar")

    model3 = ConvColumn9(config['num_classes'])
    model3 = torch.nn.DataParallel(model3, device_ids=gpus).to(device)
    _load_checkpoint(
        model3, "trainings/jpeg_model/jester_conv9/model_best.pth.tar")

    model2 = ConvColumn8(config['num_classes'])
    model2 = torch.nn.DataParallel(model2, device_ids=gpus).to(device)
    _load_checkpoint(
        model2, "trainings/jpeg_model/jester_conv8/model_best.pth.tar")

    model4 = ConvColumn5(config['num_classes'])
    model4 = torch.nn.DataParallel(model4, device_ids=gpus).to(device)
    _load_checkpoint(
        model4, "trainings/jpeg_model/ConvColumn5/model_best.pth.tar")

    # light augmentation for training; plain crop + normalize for eval
    transform_train = Compose([
        RandomAffine(degrees=[-10, 10],
                     translate=[0.15, 0.15],
                     scale=[0.9, 1.1],
                     shear=[-5, 5]),
        CenterCrop(84),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    transform_valid = Compose([
        CenterCrop(84),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    train_data = VideoFolder(
        root=config['train_data_folder'],
        csv_file_input=config['train_data_csv'],
        csv_file_labels=config['labels_csv'],
        clip_size=config['clip_size'],
        nclips=1,
        step_size=config['step_size'],
        is_val=False,
        transform=transform_train,
    )

    print(" > Using {} processes for data loader.".format(
        config["num_workers"]))
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=config['batch_size'],
        shuffle=True,
        num_workers=config['num_workers'],
        pin_memory=True,
        drop_last=True)

    val_data = VideoFolder(
        root=config['val_data_folder'],
        csv_file_input=config['val_data_csv'],
        csv_file_labels=config['labels_csv'],
        clip_size=config['clip_size'],
        nclips=1,
        step_size=config['step_size'],
        is_val=True,
        transform=transform_valid,
    )

    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=config['batch_size'],
                                             shuffle=False,
                                             num_workers=config['num_workers'],
                                             pin_memory=True,
                                             drop_last=False)

    # collect the per-sample file ids of the validation set
    # (i.path[16:] strips a fixed-length prefix — presumably the data
    # folder path; TODO confirm against VideoFolder.csv_data)
    list_id_files = []
    for i in val_data.csv_data:
        list_id_files.append(i.path[16:])
    print(len(list_id_files))

    ###########

    assert len(train_data.classes) == config["num_classes"]

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(device)

    # define optimizer (Adam on the classifier head only; the columns
    # stay frozen)
    lr = config["lr"]
    last_lr = config["last_lr"]
    momentum = config['momentum']
    weight_decay = config['weight_decay']
    optimizer = torch.optim.Adam(classifier.parameters(), lr=lr, amsgrad=True)

    # set callbacks
    plotter = PlotLearning(os.path.join(save_dir, "plots"),
                           config["num_classes"])
    # decay LR when val_loss has not improved by 0.6 for 3 checks
    lr_decayer = MonitorLRDecay(0.6, 3)
    val_loss = 9999999  # sentinel: "no validation loss seen yet"

    # set end condition by num epochs; -1 means "train until LR floor"
    num_epochs = int(config["num_epochs"])
    if num_epochs == -1:
        num_epochs = 999999

    if args.test_only:
        print("test")
        test_data = VideoFolder_test(
            root=config['val_data_folder'],
            csv_file_input=config['test_data_csv'],
            clip_size=config['clip_size'],
            nclips=1,
            step_size=config['step_size'],
            is_val=True,
            transform=transform_valid,
        )

        test_loader = torch.utils.data.DataLoader(
            test_data,
            batch_size=config['batch_size'],
            shuffle=False,
            num_workers=config['num_workers'],
            pin_memory=True,
            drop_last=False)

        list_id_files_test = []
        for i in test_data.csv_data:
            list_id_files_test.append(i.path[16:])
        print(len(list_id_files_test))
        # NOTE(review): only models 1-3 are passed here while the training
        # loop uses models 0-4 — verify test_ensemble()'s signature.
        test_ensemble(test_loader, classifier, model1, model2, model3,
                      list_id_files_test, criterion, train_data.classes_dict)
        return

    if args.eval_only:
        # NOTE(review): argument list differs from the per-epoch call
        # below (no model0/model4, extra classes_dict) — verify
        # validate_ensemble()'s signature.
        val_loss, val_top1, val_top5 = validate_ensemble(
            val_loader, classifier, model1, model2, model3, list_id_files,
            criterion, train_data.classes_dict)
        return

    # freeze the columns; only the classifier head trains
    model0.eval()
    model1.eval()
    model2.eval()
    model3.eval()
    model4.eval()
    classifier.train()

    train_writer = tensorboardX.SummaryWriter("logs")

    for epoch in range(0, num_epochs):
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        lr = lr_decayer(val_loss, lr)
        print(" > Current LR : {}".format(lr))

        # stop once the LR has decayed below the configured floor
        if lr < last_lr and last_lr > 0:
            print(" > Training is done by reaching the last learning rate {}".
                  format(last_lr))
            sys.exit(1)
        for i, (input, target) in enumerate(train_loader):
            input, target = input.to(device), target.to(device)

            # the frozen columns only produce features — no gradients
            with torch.no_grad():

                # compute output and loss
                output0, feature0 = model0(input)
                output1, feature1 = model1(input)
                output2, feature2 = model2(input)
                output3, feature3 = model3(input)
                output4, feature4 = model4(input)
                # concatenate the five column outputs along dim 1 as the
                # classifier input
                sav = torch.cat((output0, output1, output2, output3, output4),
                                1)
            classifier.zero_grad()
            class_video = classifier(sav)
            loss = criterion(class_video, target)

            # measure accuracy and record loss
            # NOTE(review): output stays on `device` while target is moved
            # to CPU — confirm accuracy() handles the mixed devices.
            prec1, prec5 = accuracy(class_video.detach(),
                                    target.detach().cpu(),
                                    topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1.item(), input.size(0))
            top5.update(prec5.item(), input.size(0))

            # compute gradient and do optimizer step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % config["print_freq"] == 0:
                # fixed: print the real epoch number (was hard-coded 0)
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          epoch,
                          i,
                          len(train_loader),
                          loss=losses,
                          top1=top1,
                          top5=top5))

        val_loss, val_top1, val_top5 = validate_ensemble(
            val_loader, classifier, model0, model1, model2, model3, model4,
            list_id_files, criterion)

        # fixed: log the scalar average at step epoch+1 (the original
        # passed the loss tensor as the value and losses.avg as the step)
        train_writer.add_scalar('loss', losses.avg, epoch + 1)
        train_writer.add_scalar('top1', top1.avg, epoch + 1)
        train_writer.add_scalar('top5', top5.avg, epoch + 1)

        train_writer.add_scalar('val_loss', val_loss, epoch + 1)
        train_writer.add_scalar('val_top1', val_top1, epoch + 1)
        train_writer.add_scalar('val_top5', val_top5, epoch + 1)

        # remember best prec@1 and save checkpoint
        is_best = val_top1 > best_prec1
        best_prec1 = max(val_top1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': "Classifier",
                'state_dict': classifier.state_dict(),
                'best_prec1': best_prec1,
            }, is_best, config)