Example #1
0
    def __init__(self, root, mode, test_dir, train_dir, save_model_withname=None,
                 save_error_withname=None, checkpoint=None):
        """Set up trainer state: hyperparameters, tensorboard writer,
        datasets/dataloaders, and the model/optimizer/scheduler.

        Args:
            root: root directory of the dataset.
            mode: 'train' selects the training summary writer; any other
                value is treated as test.
            test_dir: directory (relative to root) with validation clips.
            train_dir: directory (relative to root) with training clips.
            save_model_withname: optional path/name for model checkpoints.
            save_error_withname: optional path/name for error logs.
            checkpoint: optional checkpoint handed to load_model to resume.
        """
        self.root = root
        self.mode = mode
        self.test_dir = test_dir
        self.train_dir = train_dir
        self.save_model_withname = save_model_withname
        self.save_error_withname = save_error_withname
        self.checkpoint = checkpoint
        self.batch_size = 50
        self.learning_rate = 0.0001
        self.validation_loop = 0

        # Separate tensorboard runs for training vs. testing sessions.
        if self.mode == 'train':
            self.writer = tensorboardX.SummaryWriter(comment="train")
        else:
            self.writer = tensorboardX.SummaryWriter(comment="test")

        # Random augmentation for training; deterministic center crop for eval.
        self.train_transforms = transforms.Compose([
            videotransforms.RandomCrop(112),
            videotransforms.RandomHorizontalFlip(),
        ])
        self.test_transforms = transforms.Compose([videotransforms.CenterCrop(112)])

        self.dataset = VisualTactile(self.root, self.train_dir, self.train_transforms)
        self.dataloader = torch.utils.data.DataLoader(
            self.dataset, batch_size=self.batch_size, shuffle=True,
            num_workers=1, pin_memory=True)

        # Validation runs one clip at a time, in fixed order.
        self.val_dataset = VisualTactile(self.root, self.test_dir, self.test_transforms)
        self.val_dataloader = torch.utils.data.DataLoader(
            self.val_dataset, batch_size=1, shuffle=False,
            num_workers=1, pin_memory=True)

        self.model, self.optimizer, self.scheduler = self.load_model(self.checkpoint)
def extract_feature(args):
    """Extract intermediate I3ResNet features and save one .npy per video.

    A forward hook on ``layer3`` captures the activations; each batch's
    features are squeezed and written to ``args.save_path + '<vid>.npy'``.

    Args:
        args: parsed command-line namespace; uses train_split, root,
            frame_nb, interval, batch_size, resnet_nb, class_nb,
            model_path and save_path.

    Raises:
        ValueError: if args.resnet_nb is not one of 50/101/152.
    """
    transform = transforms.Compose([videotransforms.RandomCrop(224)])
    dataset = Dataset(args.train_split, 'val', args.root, args.frame_nb,
                      args.interval, transform)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=24,  # 24 on jobs
        pin_memory=True)

    if args.resnet_nb == 50:
        resnet = torchvision.models.resnet50(pretrained=True)
        print('load resnet50 pretrained model...')
    elif args.resnet_nb == 101:
        resnet = torchvision.models.resnet101(pretrained=True)
        print('load resnet101 pretrained model...')
    elif args.resnet_nb == 152:
        resnet = torchvision.models.resnet152(pretrained=True)
        print('load resnet152 pretrained model...')
    else:
        # BUG FIX: .format() was previously applied to the ValueError
        # instance (raise ValueError('...').format(...)), which raised
        # AttributeError instead of the intended message.
        raise ValueError(
            'resnet_nb should be in [50|101|152] but got {}'.format(
                args.resnet_nb))

    i3resnet = I3ResNet(copy.deepcopy(resnet),
                        args.frame_nb,
                        args.class_nb,
                        side_task=False,
                        conv_class=True)
    # Checkpoint was saved from nn.DataParallel: strip the 'module.' prefix.
    state_dict = torch.load(args.model_path)
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = k[7:]  # remove 'module.'
        new_state_dict[name] = v

    i3resnet.load_state_dict(new_state_dict)
    print('loaded saved state_dict...')

    i3resnet.eval()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    i3resnet = i3resnet.to(device)

    with torch.no_grad():
        hook = Hook(i3resnet.layer3)
        print('registered Hook...')
        for i, data in enumerate(dataloader):
            vid, img_cpu, action_cpu, reson_cpu = data
            img = Variable(img_cpu.to(device))
            # Forward pass only to trigger the hook; the prediction itself
            # is not used.
            i3resnet(img)
            feature = np.squeeze(hook.output.cpu().data.numpy())
            np.save(args.save_path + '{}.npy'.format(vid[0]), feature)

            print('Saved feature numbers:', i + 1)
    print('finished extracting features')
Example #3
0
def load_data(dataset_path, batch_size=5, num_workers=10):
    """Build VidOR training/validation datasets and dataloaders.

    Args:
        dataset_path: root path of the VidOR dataset.
        batch_size: batch size for the training loader (validation uses 1).
        num_workers: worker processes per dataloader.

    Returns:
        Tuple of (datasets, dataloaders, weights): dicts keyed by
        'train'/'val', plus ``1 - class_weights`` as a float32 array.
    """
    # Random crop + flip augmentation for training; center crop for eval.
    augment = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    center_crop = transforms.Compose([videotransforms.CenterCrop(224)])

    train_set = VidorDataset(dataset_path, 'training', augment)
    cls_weights = train_set.get_weights()
    loader_train = torch.utils.data.DataLoader(train_set,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=num_workers,
                                               pin_memory=True)

    val_set = VidorDataset(dataset_path, 'validation', center_crop)
    loader_val = torch.utils.data.DataLoader(val_set,
                                             batch_size=1,
                                             shuffle=True,
                                             num_workers=num_workers,
                                             pin_memory=True)

    datasets = {'train': train_set, 'val': val_set}
    dataloaders = {'train': loader_train, 'val': loader_val}
    weights = np.asarray(1 - cls_weights, dtype=np.float32)
    return datasets, dataloaders, weights
Example #4
0
def train(num_epoch=100, root='/home/selfdriving/mrcnn/bdd12k/',
          train_split='/home/selfdriving/I3D/data/bdd12k.json', batch_size=4,
          save_model='models/', frame_nb=64, class_nb=7, resnet_nb=50):
    """Train an I3ResNet multi-label action classifier on BDD12k clips.

    Args:
        num_epoch: number of training epochs.
        root: directory containing the extracted frames.
        train_split: path to the JSON split description.
        batch_size: clips per batch.
        save_model: directory prefix where checkpoints are written.
        frame_nb: frames per clip fed to the model.
        class_nb: number of output classes.
        resnet_nb: ResNet backbone depth to inflate (50, 101 or 152).

    Raises:
        ValueError: if resnet_nb is not one of 50/101/152.

    NOTE(review): this function still reads the module-level ``args`` for
    ``args.val`` — confirm that flag is defined by the surrounding script.
    """
    # setup dataset
    transform = transforms.Compose([
        videotransforms.RandomCrop(224)
    ])

    dataset = Dataset(train_split, 'train', root, transform)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=16,
                                             pin_memory=True)

    if args.val:
        # BUG FIX: the validation dataset previously received the
        # `transforms` module instead of the composed `transform` pipeline.
        val_dataset = Dataset(train_split, 'val', root, transform)
        val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=True,
                                                     num_workers=16,
                                                     pin_memory=True)

    # setup the model
    # BUG FIX: use the function parameters (resnet_nb / frame_nb / class_nb)
    # instead of the module-level args, which silently ignored them.
    if resnet_nb == 50:
        resnet = torchvision.models.resnet50(pretrained=True)
    elif resnet_nb == 101:
        resnet = torchvision.models.resnet101(pretrained=True)
    elif resnet_nb == 152:
        resnet = torchvision.models.resnet152(pretrained=True)
    else:
        # BUG FIX: .format() was applied to the ValueError instance, which
        # raised AttributeError instead of the intended message.
        raise ValueError(
            'resnet_nb should be in [50|101|152] but got {}'.format(resnet_nb))

    i3resnet = I3ResNet(copy.deepcopy(resnet), frame_nb, class_nb,
                        conv_class=True)

    # set CPU/GPU devices
    i3resnet.train()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    i3resnet = i3resnet.to(device)
    i3resnet = nn.DataParallel(i3resnet)  # multiple GPUs

    # Per-class weights compensate the class imbalance.
    class_weights = [0.4, 2, 2, 2, 2, 2, 1]
    w = torch.FloatTensor(class_weights).cuda()
    criterion = nn.BCEWithLogitsLoss(pos_weight=w).cuda()
    optimizer = optim.Adam(i3resnet.parameters(), lr=0.0001, weight_decay=0.001)

    # train it
    for epoch in range(num_epoch):
        print('Epoch {}/{}'.format(epoch, num_epoch))
        print('-' * 10)

        lossArr = []
        AccuracyArr = []

        # Iterate over data.
        for i, data in enumerate(dataloader):
            tic = time.time()
            # get the inputs
            inputs, labels = data

            # wrap them in Variable
            inputs = Variable(inputs.to(device))  # BxCxTxHxW, e.g. 4x3x64x224x224
            labels = Variable(labels.to(device))  # Bxclass_nb, e.g. 4x7

            optimizer.zero_grad()
            pred = i3resnet(inputs)

            loss = criterion(pred, labels)
            loss.backward()
            optimizer.step()
            loss_cpu = np.array(loss.cpu().data.item())

            lossArr.append(loss_cpu)
            meanLoss = np.mean(np.array(lossArr))

            # Multi-label accuracy: per-sample F1 on thresholded sigmoid.
            predict = torch.sigmoid(pred) >= 0.5
            f1 = f1_score(labels.cpu().data.numpy(), predict.cpu().data.numpy(),
                          average='samples')
            AccuracyArr.append(f1)

            if i % 10 == 0:
                toc = time.time()
                print('time elapsed', toc - tic)
                print('prediction logits:{}'.format(predict))

                print('ground truth:{}'.format(labels.cpu().data.numpy()))
                print('Epoch %d Iteration %d: Loss %.5f Accumulated Loss %.5f' % (
                    epoch, i, lossArr[-1], meanLoss))
                print('Epoch %d Iteration %d: F1 %.5f Accumulated F1 %.5f' % (
                    epoch, i, AccuracyArr[-1], np.mean(np.array(AccuracyArr))))

        # Checkpoint every 5 epochs.
        if (epoch + 1) % 5 == 0:
            torch.save(i3resnet.state_dict(), save_model + 'net_%d.pth' % (epoch + 1))
        # Validate every epoch when requested.
        if args.val and (epoch + 1) % 1 == 0:
            print("Validation...")
            run_test(val_dataloader, i3resnet, device)

    torch.save(i3resnet.state_dict(), save_model + 'net_Final.pth')
    # NOTE(review): this section appears to be spliced in from a different
    # training script — LR, BATCH_SIZE, CLIP_SIZE, EPOCHS, SAVE_DIR and
    # `mode` are not defined anywhere in view, and it runs inside train()
    # only by accident of indentation. Confirm its origin before running.
    PIN_MEMORY = True
    
    print('LR =', LR)
    print('BATCH_SIZE =', BATCH_SIZE)
    print('CLIP_SIZE =', CLIP_SIZE)
    print('EPOCHS =', EPOCHS)
    print('SAVE_DIR =', SAVE_DIR)

    # Book-keeping
    if not os.path.exists(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    with open(SAVE_DIR + 'info.txt', 'w+') as f:
        f.write('LR = {}\nBATCH_SIZE = {}\nEPOCHS = {}\n'.format(LR, BATCH_SIZE, EPOCHS))

    # Transforms
    train_transforms = transforms.Compose([videotransforms.RandomCrop(224),
                                            videotransforms.RandomHorizontalFlip(),
                                            ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    # Datasets and Dataloaders
    train_dataset = Dataset(train_split, 'training', root, mode, train_transforms)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=36, pin_memory=True)

    val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=36, pin_memory=True)    

    dataloaders = {'train': train_dataloader, 'val': val_dataloader}

    # Load pre-trained I3D model
    i3d = InceptionI3d(400, in_channels=3) # pre-trained model has 400 output classes
Example #6
0
def run(init_lr=0.1,
        max_steps=1e8,
        mode='rgb',
        dataset='thumos',
        root_train='/mnt/data_a/alex/PyTorch_I3D/thumos/validation/',
        root_eval='/mnt/data_a/alex/PyTorch_I3D/thumos/test/',
        train_split='/mnt/data_a/alex/PyTorch_I3D/thumos/validation/validation_thumos.json',
        eval_split='/mnt/data_a/alex/PyTorch_I3D/thumos/test/test_thumos.json',
        batch_size=4,
        batch_size_eval=4,
        save_model='',
        snippets=64,
        saving_steps=5000,
        num_steps_per_update=1,
        num_classes=65,
        crf=False,
        num_updates_crf=1,
        reg_crf=-1,
        use_cls=False,
        pairwise_cond_crf=False,
        reg_type='l2'):
    """Train and periodically evaluate an I3D model (optionally with a CRF
    head) for temporal action localization on THUMOS-style data.

    Alternates 'train'/'val' phases each epoch, accumulates gradients over
    num_steps_per_update batches, logs losses/mAP/GAP via log_value, and
    checkpoints every saving_steps optimizer steps.

    NOTE(review): mixes function parameters with the module-level ``args``
    (args.save_model vs the save_model parameter) — confirm both are set
    consistently by the caller.
    NOTE(review): ``loss.data[0]`` indexing is PyTorch <= 0.3 style; on
    modern PyTorch this raises and would need ``loss.item()``.
    """
    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, 'training', root_train, mode, snippets,
                      train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=8,
                                             pin_memory=True,
                                             drop_last=True)

    val_dataset = Dataset(eval_split, 'testing', root_eval, mode, snippets,
                          test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size_eval,
                                                 shuffle=True,
                                                 num_workers=8,
                                                 pin_memory=True,
                                                 drop_last=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}

    # setup model
    steps = 0
    epoch = 0
    if not os.path.exists(args.save_model):
        subprocess.call('mkdir ' + args.save_model, shell=True)
    configure(args.save_model + "tensorboard_logger", flush_secs=5)

    # resume the training or load the pre-trained I3D
    # last_checkpoint raising (e.g. empty dir) means "no checkpoint":
    # fall back to the Kinetics/ImageNet pre-trained weights.
    checkpoint = -1
    try:
        checkpoint = last_checkpoint(args.save_model)
    except:
        print("Loading the pre-trained I3D")
        if mode == 'flow':
            i3d = InceptionI3d(400,
                               in_channels=2,
                               use_crf=crf,
                               num_updates_crf=num_updates_crf,
                               pairwise_cond_crf=pairwise_cond_crf)
            # Merge the partial pre-trained weights into the full state dict
            # so CRF-specific parameters keep their fresh initialization.
            total_dict = i3d.state_dict()
            partial_dict = torch.load('models/flow_imagenet.pt')
            total_dict.update(partial_dict)
            i3d.load_state_dict(total_dict)

        else:
            i3d = InceptionI3d(400,
                               in_channels=3,
                               use_crf=crf,
                               num_updates_crf=num_updates_crf,
                               pairwise_cond_crf=pairwise_cond_crf)
            total_dict = i3d.state_dict()
            partial_dict = torch.load('models/rgb_imagenet.pt')
            total_dict.update(partial_dict)
            i3d.load_state_dict(total_dict)

        i3d.replace_logits(num_classes)

    if (checkpoint != -1):
        # Resuming: rebuild the model with the task's class count and
        # restore weights; recover the step counter from the filename.
        if mode == 'flow':
            i3d = InceptionI3d(num_classes,
                               in_channels=2,
                               use_crf=crf,
                               num_updates_crf=num_updates_crf,
                               pairwise_cond_crf=pairwise_cond_crf)

        else:
            i3d = InceptionI3d(num_classes,
                               in_channels=3,
                               use_crf=crf,
                               num_updates_crf=num_updates_crf,
                               pairwise_cond_crf=pairwise_cond_crf)

        i3d.load_state_dict(torch.load(args.save_model + checkpoint))
        steps = int(checkpoint[:-3])
        # Estimate the epoch from processed frames; the denominators are
        # dataset-specific total frame counts (presumably THUMOS vs. the
        # other dataset — verify against the dataset definitions).
        if dataset == 'thumos':
            epoch = int(steps * snippets * batch_size * num_steps_per_update /
                        1214016)
        else:
            epoch = int(steps * snippets * batch_size * num_steps_per_update /
                        5482688)

    # push the pipeline on multiple gpus if possible
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    # setup optimizer
    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                              milestones=[1000],
                                              gamma=0.1)
    # Replay scheduler steps so the resumed LR matches the step counter.
    if steps > 0:
        for i in range(steps):
            lr_sched.step()

    # train the model
    while steps < max_steps:
        epoch += 1
        print('-' * 10)
        print('Epoch {}'.format(epoch))
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)

        # each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                print('Entering training loop...')
                i3d.train()
            else:
                print('Entering validation loop...')
                i3d.eval()
                time_init_eval = time.time()

            # Accumulate predictions/labels across batches for mAP/GAP.
            cumul_pred = Cumulator(num_classes)
            cumul_labels = Cumulator(num_classes)

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            tot_loss_updt = 0.0
            tot_loc_loss_updt = 0.0
            tot_cls_loss_updt = 0.0
            tot_reg_loss_updt = 0.0
            num_iter = 0
            optimizer.zero_grad()
            count_batch = 0
            gap_train = 0

            print("Losses initialized to 0")

            # Iterate over data.
            for data in dataloaders[phase]:
                time_init_batch = time.time()
                count_batch += 1
                num_iter += 1
                # get the inputs
                inputs, labels = data

                # wrap them in Variable
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())

                # forward
                if crf:
                    per_frame_logits_ante_crf, per_frame_logits = i3d(inputs)
                else:
                    per_frame_logits = i3d(inputs)

                # upsample to input size
                per_frame_logits = F.upsample(per_frame_logits,
                                              t,
                                              mode='linear')
                if crf:
                    per_frame_logits_ante_crf = F.upsample(
                        per_frame_logits_ante_crf, t, mode='linear')

                # accumulate predictions and ground truths
                pred_np = pt_var_to_numpy(nn.Sigmoid()(per_frame_logits))
                cumul_pred.append(pred_np)
                labels_np = pt_var_to_numpy(labels)
                cumul_labels.append(labels_np)

                # compute localization loss
                # With the CRF head, both pre- and post-CRF logits are
                # supervised.
                if crf:
                    loc_loss = F.binary_cross_entropy_with_logits(
                        per_frame_logits,
                        labels) + F.binary_cross_entropy_with_logits(
                            per_frame_logits_ante_crf, labels)
                else:
                    loc_loss = F.binary_cross_entropy_with_logits(
                        per_frame_logits, labels)
                tot_loc_loss += loc_loss.data[0]
                tot_loc_loss_updt += loc_loss.data[0]

                # compute classification loss (with max-pooling along time B x C x T)
                if crf:
                    cls_loss = F.binary_cross_entropy_with_logits(
                        torch.max(per_frame_logits, dim=2)[0],
                        torch.max(
                            labels,
                            dim=2)[0]) + F.binary_cross_entropy_with_logits(
                                torch.max(per_frame_logits_ante_crf, dim=2)[0],
                                torch.max(labels, dim=2)[0])
                else:
                    cls_loss = F.binary_cross_entropy_with_logits(
                        torch.max(per_frame_logits, dim=2)[0],
                        torch.max(labels, dim=2)[0])
                tot_cls_loss += cls_loss.data[0]
                tot_cls_loss_updt += cls_loss.data[0]

                # compute regularization loss for the crf module
                if crf and (reg_crf > 0 and not pairwise_cond_crf):
                    reg_loss = get_reg_loss(i3d, 'crf', reg_type)
                    tot_reg_loss_updt += reg_loss.data[0]
                elif crf and (reg_crf > 0 and pairwise_cond_crf):
                    reg_loss = get_reg_loss(i3d, 'psi_0',
                                            reg_type) + get_reg_loss(
                                                i3d, 'psi_1', reg_type)
                    tot_reg_loss_updt += reg_crf * reg_loss.data[0]
                else:
                    reg_loss = 0

                # put all the losses together
                # Divided by num_steps_per_update because gradients are
                # accumulated over that many batches before stepping.
                if use_cls:
                    loss = (0.5 * loc_loss + 0.5 * cls_loss +
                            reg_crf * reg_loss) / num_steps_per_update
                else:
                    loss = (loc_loss +
                            reg_crf * reg_loss) / num_steps_per_update

                tot_loss += loss.data[0]
                tot_loss_updt += loss.data[0]
                loss.backward()

                # NOTE(review): backward() also runs in the 'val' phase;
                # gradients are discarded by the next zero_grad but this
                # wastes memory/compute — confirm it is intentional.
                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    examples_processed_updt = num_steps_per_update * batch_size * snippets
                    examples_processed_tot = count_batch * batch_size * snippets
                    # [1:] skips the Cumulator's initial placeholder row.
                    map_train = map_calculator(cumul_pred.accumuled[1:],
                                               cumul_labels.accumuled[1:])
                    gap_train = ap_calculator(
                        cumul_pred.accumuled[1:].flatten(),
                        cumul_labels.accumuled[1:].flatten())
                    print(
                        'TRAINING - Epoch: {} Step: {} Examples processed {} Loc Loss: {:.6f} Cls Loss: {:.6f} Tot Loss: {:.6f} Reg Loss: {:.6f} mAP: {:.6f} GAP: {:.6f}'
                        .format(
                            epoch, steps, examples_processed_tot,
                            tot_loc_loss_updt / examples_processed_updt,
                            tot_cls_loss_updt / examples_processed_updt,
                            tot_loss_updt / (batch_size * snippets), reg_crf *
                            tot_reg_loss_updt / examples_processed_updt,
                            map_train, gap_train))
                    log_value('Training_loc_loss',
                              tot_loc_loss_updt / examples_processed_updt,
                              steps)
                    log_value('Training_cls_loss',
                              tot_cls_loss_updt / examples_processed_updt,
                              steps)
                    log_value('Training_reg_loss',
                              tot_reg_loss_updt / examples_processed_updt,
                              steps)
                    log_value('Training_tot_loss',
                              tot_loss_updt / (batch_size * snippets), steps)
                    log_value('Training_mAP', map_train, steps)
                    log_value('Training_GAP', gap_train, steps)
                    tot_loss_updt, tot_loc_loss_updt, tot_cls_loss_updt, tot_reg_loss_updt = 0.0, 0.0, 0.0, 0.0
                    cumul_pred.clear()
                    cumul_labels.clear()
                    cumul_pred = Cumulator(num_classes)
                    cumul_labels = Cumulator(num_classes)

                if ((steps % saving_steps)
                        == 0) & (phase == 'train') & (num_iter == 0):
                    # save model
                    print("EPOCH: {} Step: {} - Saving model...".format(
                        epoch, steps))
                    torch.save(i3d.module.state_dict(),
                               save_model + str(steps).zfill(6) + '.pt')
                    tot_loss = tot_loc_loss = tot_cls_loss = 0.
 
                if phase == 'val':
                    time_end_batch = time.time()
                    examples_processed_tot = count_batch * batch_size_eval * snippets
                    print(
                        'EVAL - Epoch: {} Step: {} Examples processed {} - Time for batch: {}'
                        .format(epoch, steps, examples_processed_tot,
                                time_end_batch - time_init_batch))
                    log_value('Evaluation time',
                              time_end_batch - time_init_batch,
                              examples_processed_tot)

            if phase == 'val':
                examples_processed_tot = count_batch * batch_size_eval * snippets
                map_val = map_calculator(cumul_pred.accumuled[1:],
                                         cumul_labels.accumuled[1:])
                gap_val = ap_calculator(cumul_pred.accumuled[1:].flatten(),
                                        cumul_labels.accumuled[1:].flatten())
                time_end_eval = time.time()
                print(
                    'EVAL - Epoch: {} Step: {} Loc Loss: {:.6f} Cls Loss: {:.6f} Tot Loss: {:.6f} mAP: {:.4f} GAP: {:.4f} Total time: {}'
                    .format(
                        epoch, steps, tot_loc_loss / examples_processed_tot,
                        tot_cls_loss / examples_processed_tot, tot_loss_updt *
                        num_steps_per_update / examples_processed_tot, map_val,
                        gap_val, time_end_eval - time_init_eval))
                log_value('Validation_subset_loc_loss',
                          tot_loc_loss / examples_processed_tot, steps)
                log_value('Validation_subset_cls_loss',
                          tot_cls_loss / examples_processed_tot, steps)
                # NOTE(review): unlike every other call, this log_value is
                # missing the trailing `steps` argument — likely a bug.
                log_value(
                    'Validation_subset_tot_loss', tot_loss_updt *
                    num_steps_per_update / examples_processed_tot)
                log_value('Validation_subset_mAP', map_val, steps)
                log_value('Validation_subset_GAP', gap_val, steps)
                cumul_pred.clear()
                cumul_labels.clear()
def train():
    """Train the two-stream TAL_Net on cabin/face video pairs.

    Reads all hyperparameters from the command line (get_parse), trains
    with SGD + StepLR, validates every epoch, logs to a module-level
    tensorboard ``writer``, and checkpoints after each epoch (tracking
    the best class accuracy).

    NOTE(review): depends on a module-level ``writer`` and on a project
    function named ``eval`` (which shadows the builtin) — both defined
    outside this view; confirm they exist at the call site.
    """
    args = get_parse()
    cabin_video_dir = args.cabin_video_dir
    face_video_dir = args.face_video_dir
    train_data_path = args.train_data_path
    val_data_path = args.val_data_path
    train_batch_size = args.train_batch_size
    val_batch_size = args.val_batch_size
    num_epochs = args.num_epochs
    learning_rate = args.learning_rate
    weight_decay = args.weight_decay
    display_steps = args.display_steps
    ckp_dir = args.ckp_dir
    save_path = args.save_path
    num_classes = args.num_classes
    weight = args.weight

    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    if not os.path.exists(ckp_dir):
        os.makedirs(ckp_dir)

    print('Start to load data')
    # Random crop for training; deterministic center crop for validation.
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.ToTensor(),
        videotransforms.ClipNormalize()
    ])
    val_transforms = transforms.Compose([
        videotransforms.CenterCrop(224),
        videotransforms.ToTensor(),
        videotransforms.ClipNormalize()
    ])
    train_dataset = IVBSSDataset(cabin_video_dir, face_video_dir,
                                 train_data_path, train_transforms)
    val_dataset = IVBSSDataset(cabin_video_dir, face_video_dir, val_data_path,
                               val_transforms)
    # Sampling with replacement, so an "epoch" is len(dataset) draws.
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=train_batch_size,
                                  sampler=RandomSampler(train_dataset,
                                                        replacement=True),
                                  collate_fn=collate_fn,
                                  drop_last=True)
    total_steps = num_epochs * len(train_dataloader)
    print('Total number of training samples is {0}'.format(len(train_dataset)))
    print('Total number of validation samples is {0}'.format(len(val_dataset)))
    print('Total number of training steps is {0}'.format(total_steps))

    model = TAL_Net(num_classes)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=learning_rate,
                                momentum=0.9,
                                weight_decay=weight_decay)
    #     optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=10,
                                                gamma=0.1)
    start_epoch = 0

    # Both I3D streams start from the same pre-trained weights.
    if args.pretrained_I3D_model is not None:
        print('Load pretrained I3D model')
        pretrained_I3D_model = torch.load(args.pretrained_I3D_model)
        model.I3D_1.load_state_dict(pretrained_I3D_model)
        model.I3D_2.load_state_dict(pretrained_I3D_model)

    # Resuming a checkpoint overrides model/optimizer/scheduler state.
    if args.ckp_path is not None:
        print('Load checkpoint')
        start_epoch, model, optimizer, scheduler = load_ckp(
            args.ckp_path, model, optimizer, scheduler)

    model.to(device)
    model.train()

    print('Start to train')
    num_step = 0
    best_acc = 0.0
    for epoch in range(start_epoch, num_epochs):
        running_loss = 0.0
        class_running_loss = 0.0
        chunk_inclusion_running_loss = 0.0
        for i, (cabin_imgs, face_imgs, labels, start_labels,
                end_labels) in enumerate(train_dataloader):
            cabin_imgs = cabin_imgs.to(device)
            face_imgs = face_imgs.to(device)
            labels = labels.to(device)
            start_labels = start_labels.to(device)
            end_labels = end_labels.to(device)
            optimizer.zero_grad()
            # The model returns (loss, class_loss, chunk_inclusion_loss, ...);
            # only the first three are used here.
            loss, class_loss, chunk_inclusion_loss = model(
                cabin_imgs, face_imgs, labels, start_labels, end_labels,
                weight)[:3]
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            class_running_loss += class_loss.item()
            chunk_inclusion_running_loss += chunk_inclusion_loss.item()
            # Report and reset the running averages every display_steps.
            if (i + 1) % display_steps == 0:
                print(
                    'epoch:{0}/{1}, step:{2}/{3}, loss:{4:.4f}, class_loss:{5:.4f}, chunk_inclusion_loss:{6:.4f}'
                    .format(epoch + 1, num_epochs, i + 1,
                            len(train_dataloader),
                            running_loss / display_steps,
                            class_running_loss / display_steps,
                            chunk_inclusion_running_loss / display_steps))
                running_loss = 0.0
                class_running_loss = 0.0
                chunk_inclusion_running_loss = 0.0
            num_step += 1
            writer.add_scalars(
                'Loss/train', {
                    'total_loss': loss,
                    'class_loss': class_loss,
                    'chunk_inclusion_loss': chunk_inclusion_loss
                }, num_step)

        scheduler.step()

        print('Start to validate')
        #         eval_loss, eval_class_loss, eval_chunk_inclusion_loss, class_accuracy = eval(train_dataset, train_batch_size, model, weight, device)
        eval_loss, eval_class_loss, eval_chunk_inclusion_loss, class_accuracy = eval(
            val_dataset, val_batch_size, model, weight, device)
        writer.add_scalars(
            'Loss/valid', {
                'total_loss': eval_loss,
                'class_loss': eval_class_loss,
                'chunk_inclusion_loss': eval_chunk_inclusion_loss
            }, epoch)
        writer.add_scalar('Accuracy/valid', class_accuracy, epoch)

        print(
            'Toal loss on validation dataset: {0:.4f}, class loss on validation dataset: {1:.4f}, chunk inclusion loss on validation dataset: {2:.4f}, class accuracy on validation dataset: {3:.4f}'
            .format(eval_loss, eval_class_loss, eval_chunk_inclusion_loss,
                    class_accuracy))

        # Track the best validation accuracy for save_ckp's is_best flag.
        is_best = class_accuracy > best_acc
        best_acc = max(class_accuracy, best_acc)

        checkpoint = {
            'epoch': epoch + 1,
            'model': model.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()
        }

        ckp_name = 'epoch_' + str(epoch + 1) + '.pt'
        save_ckp(checkpoint, ckp_dir, ckp_name, is_best, save_path)
        print('Save the checkpoint after {} epochs'.format(epoch + 1))

    writer.close()
Example #8
0
def run(init_lr=0.1,
        max_steps=64e3,
        mode='rgb',
        root='../../SSBD/ssbd_clip_segment/data/',
        train_split='../../SSBD/Annotations/annotations_charades.json',
        batch_size=1,
        save_model=''):
    """Fine-tune a pretrained XDC video encoder on SSBD clip segments.

    Loads the r2plus1d_18 XDC backbone from torch.hub with a 3-class head,
    freezes everything except the fc head and block '4.1', and trains with
    SGD plus gradient accumulation. After each pass over the validation
    split, the model weights are saved to ``save_model + '.pt'`` whenever
    validation accuracy improves.

    Args:
        init_lr: initial SGD learning rate.
        max_steps: number of optimizer updates (not mini-batches) to run.
        mode: modality tag forwarded to ``Dataset`` (e.g. 'rgb').
        root: root directory of the clip data.
        train_split: path to the annotations json.
        batch_size: mini-batch size for both loaders.
        save_model: path prefix for the best-model checkpoint.
    """
    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, 'training', root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=4,
                                             pin_memory=True)

    val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=True,
                                                 num_workers=4,
                                                 pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}

    # dataloaders = {'train': dataloader}
    # datasets = {'train': dataset}

    # setup the model

    # Pretrained XDC backbone with a fresh 3-way classification head.
    xdc = torch.hub.load('HumamAlwassel/XDC',
                         'xdc_video_encoder',
                         pretraining='r2plus1d_18_xdc_ig65m_kinetics',
                         num_classes=3)
    # if mode == 'flow':
    #     i3d = InceptionI3d(400, in_channels=2)
    #     i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
    # else:
    #     i3d = InceptionI3d(400, in_channels=3)
    #     i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    # i3d.replace_logits(8)
    # #i3d.load_state_dict(torch.load('/ssd/models/000920.pt'))
    # i3d.cuda()
    # i3d = nn.DataParallel(i3d)
    xdc.cuda()
    xdc = nn.DataParallel(xdc).cuda()

    # Freeze all parameters except the fc head and block '4.1'.
    for name, param in xdc.named_parameters():
        if 'fc' not in name and '4.1' not in name:
            param.requires_grad = False

    lr = init_lr
    optimizer = optim.SGD(xdc.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.0000001)
    # NOTE(review): stepped once per optimizer update below, so the
    # milestones [300, 1000] are measured in updates, not epochs.
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

    num_steps_per_update = 4  # accum gradient
    steps = 0
    best_val = 0
    # new_flag = 0
    # train it
    while steps < max_steps:  #for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)
        # new_state_dict = OrderedDict()
        # state_dict = torch.load(save_model+'.pt')
        # for k, v in state_dict.items():
        #     name = "module."+k # add module.
        #     new_state_dict[name] = v
        # xdc.load_state_dict(new_state_dict)
        # new_flag = 0
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                xdc.train(True)
            else:
                xdc.train(False)  # Set model to evaluate mode

            tot_loss = 0.0
            # tot_loc_loss = 0.0
            # tot_cls_loss = 0.0
            num_iter = 0
            total = 0  # running count of correct predictions
            n = 0      # running count of examples seen
            optimizer.zero_grad()

            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs
                inputs, labels = data

                # wrap them in Variable
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())

                per_frame_logits = xdc(inputs)
                # print(per_frame_logits.shape)
                # print(labels.shape)
                # upsample to input size
                # per_frame_logits = F.upsample(per_frame_logits, t, mode='linear')

                # compute localization loss
                # loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
                # tot_loc_loss += loc_loss.data.item()

                # compute classification loss (with max-pooling along time B x C x T)
                # cls_loss = F.binary_cross_entropy_with_logits(torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0])
                # print(torch.max(per_frame_logits, dim=2)[0])
                # print(torch.max(labels, dim=2)[0])
                # Accuracy: compare argmax of logits against argmax of
                # (presumably one-hot) labels — TODO confirm label format.
                correct = per_frame_logits.argmax(1).eq(labels.argmax(1))
                total += correct.float().sum().item()
                n += batch_size
                # tot_cls_loss += cls_loss.data.item()

                # Loss is scaled down so that accumulated gradients over
                # num_steps_per_update batches match one full batch.
                loss = F.binary_cross_entropy_with_logits(
                    per_frame_logits, labels) / num_steps_per_update
                tot_loss += loss.data.item()
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if steps % 10 == 0:
                        print('{} Tot Loss: {:.4f} Accuracy: {:.4f}'.format(
                            phase, tot_loss / 10, total / n))
                        # save model
                        # if(steps % 10000 == 0):
                        # torch.save(xdc.module.state_dict(), save_model+str(steps).zfill(6)+'.pt')
                        # tot_loss = tot_loc_loss = tot_cls_loss = 0.
                        tot_loss = 0
                        total = 0
                        n = 0
            if phase == 'val':
                print('{} Tot Loss: {:.4f} Accuracy: {:.4f}'.format(
                    phase, (tot_loss * num_steps_per_update) / num_iter,
                    total / n))
                # Checkpoint whenever validation accuracy improves.
                if (total / n > best_val):
                    best_val = total / n
                    torch.save(xdc.module.state_dict(), save_model + '.pt')
def run(
        dataset_path,
        annotation_path,
        init_lr,
        frames_per_clip,
        mode,
        logdir,
        frame_skip,
        batch_size,
        refine,
        refine_epoch,
        pretrained_model,
        max_steps,
):
    """Train an InceptionI3d model with per-frame localization + clip
    classification losses, logging to TensorBoard and checkpointing.

    Only the 'logits' and 'Mixed_5c' parameter groups are trained; the
    rest of the backbone stays frozen. Training uses gradient
    accumulation over ``num_steps_per_update`` batches, and test batches
    are evaluated interleaved with training so both curves advance at
    the same fractional rate.

    Args:
        dataset_path: root directory of the clip data.
        annotation_path: path to the annotations file.
        init_lr: initial Adam learning rate.
        frames_per_clip: clip length passed to ``Dataset``.
        mode: 'flow' selects the 2-channel flow weights, otherwise RGB.
        logdir: directory for TensorBoard logs and checkpoints.
        frame_skip: temporal subsampling factor passed to ``Dataset``.
        batch_size: mini-batch size for both loaders.
        refine: if True, resume from checkpoint ``refine_epoch``.
        refine_epoch: epoch number of the checkpoint to resume from.
        pretrained_model: suffix of the pretrained weight file to load.
        max_steps: number of epochs ('steps' here count epochs).
    """
    os.makedirs(logdir, exist_ok=True)

    # setup dataset
    train_transforms = transforms.Compose(
        [
            videotransforms.RandomCrop(224),
            videotransforms.RandomHorizontalFlip(),
        ]
    )
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    train_dataset = Dataset(
        dataset_path=dataset_path,
        annotation_path=annotation_path,
        transform=train_transforms,
        index_filename="train_dataset_index.txt",
        frame_skip=frame_skip,
        frames_per_clip=frames_per_clip,
    )

    print("Number of clips in the train dataset:{}".format(len(train_dataset)))

    test_dataset = Dataset(
        dataset_path=dataset_path,
        annotation_path=annotation_path,
        transform=test_transforms,
        index_filename="test_dataset_index.txt",
        frame_skip=frame_skip,
        frames_per_clip=frames_per_clip,
    )

    print("Number of clips in the test dataset:{}".format(len(test_dataset)))

    # Oversample rare classes so each class contributes equally per epoch.
    weights = utils.make_weights_for_balanced_classes(train_dataset.clip_list, train_dataset.clip_label_count)
    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=3,
        pin_memory=True
    )

    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=3,
        pin_memory=True
    )

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('models/flow_' + pretrained_model + '.pt'))
    else:
        i3d = InceptionI3d(157, in_channels=3)
        i3d.load_state_dict(torch.load('models/rgb_' + pretrained_model + '.pt'))

    num_classes = len(train_dataset.action_name_list)
    i3d.replace_logits(num_classes)

    for name, param in i3d.named_parameters():  # freeze i3d parameters
        if 'logits' in name:
            param.requires_grad = True
        elif 'Mixed_5c' in name:
            param.requires_grad = True
        else:
            param.requires_grad = False

    if refine:
        if refine_epoch == 0:
            raise ValueError("You set the refine epoch to 0. No need to refine, just retrain.")
        refine_model_filename = os.path.join(logdir, str(refine_epoch).zfill(6) + '.pt')
        checkpoint = torch.load(refine_model_filename)
        i3d.load_state_dict(checkpoint["model_state_dict"])

    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = init_lr

    optimizer = optim.Adam(i3d.parameters(), lr=lr, weight_decay=1E-6)
    # Milestones are in epochs, since lr_sched.step() runs once per epoch.
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [15, 30, 45, 60])

    if refine:
        lr_sched.load_state_dict(checkpoint["lr_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

    train_writer = SummaryWriter(os.path.join(logdir, 'train'))
    test_writer = SummaryWriter(os.path.join(logdir, 'test'))

    num_steps_per_update = 4 * 5  # accum gradient - try to have number of examples per update match original code 8*5*4
    # eval_steps  = 5
    steps = 0
    # train it
    n_examples = 0
    train_num_batch = len(train_dataloader)
    test_num_batch = len(test_dataloader)
    refine_flag = True

    while steps <= max_steps:  # for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)
        # When resuming, fast-forward the scheduler and counters through the
        # already-trained epochs without touching the data.
        if steps <= refine_epoch and refine and refine_flag:
            lr_sched.step()
            steps += 1
            n_examples += len(train_dataset.clip_list)
            continue
        else:
            refine_flag = False
        # Each epoch has a training and validation phase

        test_batchind = -1
        test_fraction_done = 0.0
        test_enum = enumerate(test_dataloader)
        tot_loss = 0.0
        tot_loc_loss = 0.0
        tot_cls_loss = 0.0
        num_iter = 0
        optimizer.zero_grad()

        # Iterate over data.
        avg_acc = []
        for train_batchind, data in enumerate(train_dataloader):

            num_iter += 1
            # get the inputs
            inputs, labels, vid_idx, frame_pad = data

            # wrap them in Variable
            inputs = Variable(inputs.cuda(), requires_grad=True)
            labels = Variable(labels.cuda())

            t = inputs.size(2)
            per_frame_logits = i3d(inputs)
            # Upsample temporal logits back to the input clip length.
            per_frame_logits = F.interpolate(per_frame_logits, t, mode='linear', align_corners=True)

            # compute localization loss
            loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
            tot_loc_loss += loc_loss.item()

            # compute classification loss (with max-pooling along time B x C x T)
            cls_loss = F.binary_cross_entropy_with_logits(torch.max(per_frame_logits, dim=2)[0],
                                                          torch.max(labels, dim=2)[0])
            tot_cls_loss += cls_loss.item()

            # Scale so gradients accumulated over num_steps_per_update
            # batches are equivalent to one full-batch update.
            loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update

            tot_loss += loss.item()
            loss.backward()

            acc = utils.accuracy_v2(torch.argmax(per_frame_logits, dim=1), torch.argmax(labels, dim=1))
            # acc = utils.accuracy(per_frame_logits, labels)
            avg_acc.append(acc.item())
            train_fraction_done = (train_batchind + 1) / train_num_batch
            print('[{}] train Acc: {}, Loss: {:.4f} [{} / {}]'.format(steps, acc.item(), loss.item(), train_batchind,
                                                                      len(train_dataloader)))
            # Apply the accumulated gradients; the final (possibly shorter)
            # accumulation window at epoch end is also flushed.
            if num_iter == num_steps_per_update or train_batchind == len(train_dataloader) - 1:
                n_steps = num_steps_per_update
                if train_batchind == len(train_dataloader) - 1:
                    n_steps = num_iter
                n_examples += batch_size * n_steps
                print('updating the model...')
                print('train Total Loss: {:.4f}'.format(tot_loss / n_steps))
                optimizer.step()
                optimizer.zero_grad()
                train_writer.add_scalar('loss', tot_loss / n_steps, n_examples)
                train_writer.add_scalar('cls loss', tot_cls_loss / n_steps, n_examples)
                train_writer.add_scalar('loc loss', tot_loc_loss / n_steps, n_examples)
                train_writer.add_scalar('Accuracy', np.mean(avg_acc), n_examples)
                train_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], n_examples)
                num_iter = 0
                tot_loss = 0.

            # Interleave test batches so the test curve keeps pace with the
            # fraction of the train epoch completed.
            if test_fraction_done <= train_fraction_done and test_batchind + 1 < test_num_batch:
                i3d.train(False)  # Set model to evaluate mode
                test_batchind, data = next(test_enum)
                inputs, labels, vid_idx, frame_pad = data

                # wrap them in Variable
                inputs = Variable(inputs.cuda(), requires_grad=True)
                labels = Variable(labels.cuda())

                with torch.no_grad():
                    per_frame_logits = i3d(inputs)
                    per_frame_logits = F.interpolate(per_frame_logits, t, mode='linear', align_corners=True)

                    # compute localization loss
                    loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)

                    # compute classification loss (with max-pooling along time B x C x T)
                    cls_loss = F.binary_cross_entropy_with_logits(torch.max(per_frame_logits, dim=2)[0],
                                                                  torch.max(labels, dim=2)[0])

                    loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                    acc = utils.accuracy_v2(torch.argmax(per_frame_logits, dim=1), torch.argmax(labels, dim=1))

                print('[{}] test Acc: {}, Loss: {:.4f} [{} / {}]'.format(steps, acc.item(), loss.item(), test_batchind,
                                                                         len(test_dataloader)))
                test_writer.add_scalar('loss', loss.item(), n_examples)
                # NOTE(review): 'cls loss' logs loc_loss and 'loc loss' logs
                # cls_loss — the tags appear swapped; confirm before trusting
                # these two test curves.
                test_writer.add_scalar('cls loss', loc_loss.item(), n_examples)
                test_writer.add_scalar('loc loss', cls_loss.item(), n_examples)
                test_writer.add_scalar('Accuracy', acc.item(), n_examples)
                test_fraction_done = (test_batchind + 1) / test_num_batch
                i3d.train(True)
        # Checkpoint every other epoch.
        if steps % 2 == 0:
            # save model
            torch.save({"model_state_dict": i3d.module.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "lr_state_dict": lr_sched.state_dict()},
                       logdir + str(steps).zfill(6) + '.pt')
        steps += 1
        lr_sched.step()
    train_writer.close()
    test_writer.close()
Example #10
0
def train(init_lr, max_steps, mode, root_folder, train_split, batch_size, load_model, save_model):
    """Fine-tune InceptionI3d on Charades-style clips (157 classes).

    Trains with SGD and gradient accumulation over ``num_steps_per_update``
    mini-batches, alternating a train and a val phase each outer loop, and
    saves a checkpoint every 10 optimizer updates.

    Args:
        init_lr: initial SGD learning rate.
        max_steps: number of optimizer updates (not mini-batches) to run.
        mode: 'rgb' or 'flow'; selects input channels and pretrained weights.
        root_folder: root directory of the clip data.
        train_split: path to the split/annotation file.
        batch_size: mini-batch size for both loaders.
        load_model: optional path to a fine-tuned state dict to resume from.
        save_model: path prefix for periodic checkpoints.
    """
    train_transforms = transforms.Compose([videotransforms.RandomCrop(224), videotransforms.RandomHorizontalFlip()])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, 'training', root_folder, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=36, pin_memory=True)
    val_dataset = Dataset(train_split, 'testing', root_folder, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=36, pin_memory=True)
    dataloaders = {'train': dataloader, 'val': val_dataloader}

    i3d = InceptionI3d(400, in_channels=2 if mode == 'flow' else 3)  # setup the model
    i3d.load_state_dict(torch.load('models/{}_imagenet.pt'.format(mode)))
    i3d.replace_logits(157)
    if load_model:
        i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(), lr=lr, momentum=0.9, weight_decay=0.0000001)
    # Milestones are in optimizer updates: lr_sched.step() runs per update.
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

    steps = 0
    num_steps_per_update = 4  # accum gradient
    while steps < max_steps:  # train it

        print('Step {:6d} / {}'.format(steps, max_steps))
        print('-' * 10)

        for phase in ['train', 'val']:  # each epoch has a training and validation phase

            i3d.train(phase == 'train')  # eval only during validation phase
            num_iter, tot_loss, tot_loc_loss, tot_cls_loss = 0, 0.0, 0.0, 0.0
            optimizer.zero_grad()
            for data in dataloaders[phase]:  # iterate over data

                num_iter += 1
                inputs, labels = data  # get the inputs
                inputs = Variable(inputs.cuda())  # wrap them in Variable
                labels = Variable(labels.cuda())
                t = inputs.size(2)

                per_frame_logits = i3d(inputs)
                # Upsample temporal logits back to the input clip length.
                # F.upsample is removed in modern PyTorch; interpolate with
                # align_corners=True matches the sibling training code here.
                per_frame_logits = F.interpolate(per_frame_logits, t, mode='linear', align_corners=True)

                loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)  # compute localization loss
                # .item() replaces the long-removed 0-dim `tensor.data[0]` indexing.
                tot_loc_loss += loc_loss.item()

                cls_loss = F.binary_cross_entropy_with_logits(torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0])
                tot_cls_loss += cls_loss.item()  # compute classification loss (with max-pooling along time B x C x T)

                # Scale so gradients accumulated over num_steps_per_update
                # batches are equivalent to one full-batch update.
                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.item()
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':

                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if steps % 10 == 0:

                        print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
                            phase, tot_loc_loss / (10 * num_steps_per_update),
                            tot_cls_loss / (10 * num_steps_per_update), tot_loss / 10))
                        torch.save(i3d.module.state_dict(), save_model + str(steps).zfill(6)+'.pt')  # save model
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.

            if phase == 'val':
                print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
                    phase, tot_loc_loss / num_iter, tot_cls_loss / num_iter,
                    (tot_loss * num_steps_per_update) / num_iter))
Example #11
0
def main():
    """Train an I3D variant (plain / bilinear / SE / SE-bilinear) on
    UCF101, HMDB51 or Kinetics, driven by the module-level ``args``.

    Writes a run header and per-epoch results to a plain-text log, builds
    modality-specific train/val loaders, loads pretrained flow/RGB weights,
    then runs the epoch loop with periodic validation, checkpointing the
    best prec@1 seen so far.
    """
    best_prec1 = 0
    # prec1 is written to the log every epoch below but only recomputed on
    # eval epochs; initialize it so epoch 1 does not hit an unbound name
    # when args.eval_freq > 1.
    prec1 = 0
    with open(
            'logs/' + args.dataset + '/' + args.arch + '_' + args.mode +
            '_validation.txt', 'a') as f:
        f.write("=============================================")
        f.write('\n')
        f.write("lr: ")
        f.write(str(args.lr))
        f.write(" lr_step: ")
        f.write(str(args.lr_steps))
        f.write(" dataset: ")
        f.write(str(args.dataset))
        f.write(" modality: ")
        f.write(str(args.mode))
        f.write(" dropout: ")
        f.write(str(args.dropout))
        f.write(" batch size: ")
        f.write(str(args.batch_size))
        f.write('\n')
    # Per-dataset class count, clip length, and frame filename template.
    if args.dataset == 'ucf101':
        num_class = 101
        data_length = 64
        image_tmpl = "frame{:06d}.jpg"
    elif args.dataset == 'hmdb51':
        num_class = 51
        data_length = 64
        image_tmpl = "img_{:05d}.jpg"
    elif args.dataset == 'kinetics':
        num_class = 400
        data_length = 64
        image_tmpl = "img_{:05d}.jpg"
    else:
        raise ValueError('Unknown dataset ' + args.dataset)

    val_logger = Logger(
        'logs/' + args.dataset + '/' + args.arch + '_' + args.mode +
        '_val.log', ['epoch', 'acc'])
    # define loss function (criterion) and optimizer
    #======================data transform=============

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip()
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    #=======================design the dataset==============
    train_dataset = I3dDataSet("",
                               args.train_list,
                               num_segments=1,
                               new_length=data_length,
                               modality=args.mode,
                               dataset=args.dataset,
                               image_tmpl=image_tmpl if args.mode
                               in ["rgb", "RGBDiff"] else args.flow_prefix +
                               "{}_{:05d}.jpg",
                               transform=train_transforms,
                               test_mode=False)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=8,
                                               pin_memory=True)

    val_dataset = I3dDataSet("",
                             args.val_list,
                             num_segments=1,
                             new_length=data_length,
                             modality=args.mode,
                             dataset=args.dataset,
                             image_tmpl=image_tmpl if args.mode
                             in ["rgb", "RGBDiff"] else args.flow_prefix +
                             "{}_{:05d}.jpg",
                             random_shift=False,
                             transform=test_transforms,
                             test_mode=False)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=8,
                                             pin_memory=True)

    dataloaders = {'train': train_loader, 'val': val_loader}
    datasets = {'train': train_dataset, 'val': val_dataset}

    #=============================set the model ==================
    # setup the model
    if args.mode == 'flow':
        if args.arch == 'i3d':
            from net.i3d import I3D
            i3d = I3D(modality='flow',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        elif args.arch == 'bilinear_i3d':
            from net.bilinear_i3d import I3D
            i3d = I3D(modality='flow',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        elif args.arch == 'se_i3d':
            from net.se_i3d import I3D
            i3d = I3D(modality='flow',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        elif args.arch == 'se_bilinear_i3d':
            from net.se_bilinear_i3d import I3D
            i3d = I3D(modality='flow',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        else:
            # Previously the Exception was constructed but never raised,
            # which let execution fall through to an unbound `i3d`.
            raise Exception("not support now!")
        i3d.eval()
        pretrain_dict = torch.load('pretrained_models/model_flow.pth')
        model_dict = i3d.state_dict()
        weight_dict = weight_transform(model_dict, pretrain_dict)
        i3d.load_state_dict(weight_dict)
    else:
        #i3d = InceptionI3d(400, in_channels=3)
        if args.arch == 'i3d':
            from net.i3d import I3D
            i3d = I3D(modality='rgb',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        elif args.arch == 'se_i3d':
            from net.se_i3d import I3D
            i3d = I3D(modality='rgb',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        elif args.arch == 'bilinear_i3d':
            from net.bilinear_i3d import I3D
            i3d = I3D(modality='rgb',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        elif args.arch == 'se_bilinear_i3d':
            from net.se_bilinear_i3d import I3D
            i3d = I3D(modality='rgb',
                      num_classes=num_class,
                      dropout_prob=args.dropout)
        else:
            raise Exception("not support now!")
        i3d.eval()
        pretrain_dict = torch.load('pretrained_models/model_rgb.pth')
        model_dict = i3d.state_dict()
        weight_dict = weight_transform(model_dict, pretrain_dict)
        i3d.load_state_dict(weight_dict)

    i3d.cuda()
    #print(i3d)
    #============================set SGD, critization and lr ==================
    optimizer = torch.optim.SGD(i3d.parameters(),
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                dampening=0,
                                nesterov=False)
    model = nn.DataParallel(i3d)
    criterion = torch.nn.NLLLoss().cuda()
    # NOTE(review): C=51 is hard-coded while num_class varies per dataset —
    # confirm DisturbLabel is only intended for hmdb51 runs.
    disturb = DisturbLabel(alpha=10, C=51)
    # criterion = FocalLoss(gamma = 0).cuda()
    #print(model)

    writer = SummaryWriter()  #create log folders for plot
    timer = Timer()
    for epoch in range(1, args.epochs):
        timer.tic()
        adjust_learning_rate(optimizer, epoch, args.lr_steps)

        # train for one epoch
        train_prec1, train_loss = train(train_loader, model, criterion,
                                        optimizer, epoch, disturb)
        writer.add_scalar('Train/Accu', train_prec1, epoch)
        writer.add_scalar('Train/Loss', train_loss, epoch)
        # evaluate on validation set
        if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1:
            prec1, val_loss = validate(val_loader, model, criterion,
                                       (epoch + 1) * len(train_loader))
            writer.add_scalar('Val/Accu', prec1, epoch)
            writer.add_scalar('Val/Loss', val_loss, epoch)
            writer.add_scalars('data/Acc', {
                'train_prec1': train_prec1,
                'val_prec1': prec1
            }, epoch)
            writer.add_scalars('data/Loss', {
                'train_loss': train_loss,
                'val_loss': val_loss
            }, epoch)
            #scheduler.step(val_loss)
            # remember best prec@1 and save checkpoint
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                }, is_best, best_prec1)
            val_logger.log({'epoch': epoch, 'acc': prec1})
        timer.toc()
        left_time = timer.average_time * (args.epochs - epoch)
        print("best_prec1 is: {}".format(best_prec1))
        print("left time is: {}".format(timer.format(left_time)))
        with open(
                'logs/' + args.dataset + '/' + args.arch + '_' + args.mode +
                '_validation.txt', 'a') as f:
            f.write(str(epoch))
            f.write(" ")
            f.write(str(train_prec1))
            f.write(" ")
            f.write(str(prec1))
            f.write(" ")
            f.write(timer.format(timer.diff))
            f.write('\n')
    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
Example #12
0
import torch
from pytorch_i3d import InceptionI3d
from torchvision import transforms
import videotransforms
from mit_data import MITDataset, make_label_binarizer

# CSV files describing the binary-class experiment: clip index and
# train/val split assignment.
INDEX_FILE = "experiment/binary_class/binary_class.csv"
SPLIT_FILE = "experiment/binary_class/split.csv"

batch_size = 1
# NOTE(review): train crop is 225 while test crop is 224 — likely a typo;
# confirm the model accepts both sizes (or fix to 224) before relying on it.
train_transforms = transforms.Compose([
    videotransforms.RandomCrop(225),
    videotransforms.RandomHorizontalFlip(),
])
test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

# Training split with augmentation transforms.
dataset = MITDataset(mode="train",
                     transforms=train_transforms,
                     index_file=INDEX_FILE,
                     split_file=SPLIT_FILE)

dataloader = torch.utils.data.DataLoader(dataset,
                                         batch_size=batch_size,
                                         shuffle=True,
                                         num_workers=10,
                                         pin_memory=True)

# Validation split with the deterministic center-crop transform.
val_dataset = MITDataset(mode="val",
                         transforms=test_transforms,
                         index_file=INDEX_FILE,
                         split_file=SPLIT_FILE)
Example #13
0
def eval(args):
    """Evaluate a saved I3ResNet multi-label model on the validation split.

    Builds the val loader, instantiates an I3ResNet on top of a torchvision
    ResNet-{50,101,152} backbone, loads the saved (DataParallel-prefixed)
    state dict, and reports per-sample and per-class F1 scores.

    Args:
        args: namespace with train_split, root, frame_nb, interval,
            batch_size, resnet_nb, class_nb, and model_path attributes.

    Raises:
        ValueError: if args.resnet_nb is not one of 50, 101, 152.
    """
    transform = transforms.Compose([videotransforms.RandomCrop(224)])

    val_dataset = Dataset(args.train_split, 'val', args.root, args.frame_nb,
                          args.interval, transform)

    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=24,  # on jobs
        pin_memory=True)
    if args.resnet_nb == 50:
        resnet = torchvision.models.resnet50(pretrained=True)
        print('load resnet50 pretrained model...')
    elif args.resnet_nb == 101:
        resnet = torchvision.models.resnet101(pretrained=True)
        print('load resnet101 pretrained model...')
    elif args.resnet_nb == 152:
        resnet = torchvision.models.resnet152(pretrained=True)
        print('load resnet152 pretrained model...')
    else:
        # Fixed: .format() was previously called on the ValueError instance
        # (outside the constructor), raising AttributeError instead of the
        # intended message.
        raise ValueError(
            'resnet_nb should be in [50|101|152] but got {}'.format(
                args.resnet_nb))

    i3resnet = I3ResNet(copy.deepcopy(resnet),
                        args.frame_nb,
                        args.class_nb,
                        conv_class=True)

    state_dict = torch.load(args.model_path)

    # The checkpoint was saved from a DataParallel wrapper; strip the
    # 'module.' prefix from every key before loading.
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = k[7:]  # remove 'module'.
        new_state_dict[name] = v

    i3resnet.load_state_dict(new_state_dict)
    print('loaded saved state_dict...')

    i3resnet.eval()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    i3resnet = i3resnet.to(device)
    # i3resnet = nn.DataParallel(i3resnet)

    AccuracyArr = []
    # Row 0 is a zero placeholder; per-batch per-class F1 rows are stacked
    # beneath it, so the running mean is slightly diluted by that first row.
    accuracy = np.zeros((1, args.class_nb))
    with torch.no_grad():
        for i, data in enumerate(val_dataloader):
            tic = time.time()
            # Read data
            img_cpu, label_cpu = data
            img = Variable(img_cpu.to(device))
            label = Variable(label_cpu.to(device))

            pred = i3resnet(img)

            # Calculate accuracy: threshold sigmoid outputs at 0.5 for
            # multi-label prediction, then score with F1.
            predict = torch.sigmoid(pred) > 0.5
            f1_sample = f1_score(label_cpu.data.numpy(),
                                 predict.cpu().data.numpy(),
                                 average='samples')  # here!!!
            f1 = f1_score(label_cpu.data.numpy(),
                          predict.cpu().data.numpy(),
                          average=None)

            AccuracyArr.append(f1_sample)
            accuracy = np.vstack((accuracy, f1))

            if i % 10 == 0:
                toc = time.time()
                print('validation dataset batch:', i)
                print('prediction logits:{}'.format(
                    predict.cpu().data.numpy()))
                print('ground truth:{}'.format(label_cpu.data.numpy()))
                print('f1 score:', f1_sample, 'accumulated f1 score:',
                      np.mean(np.array(AccuracyArr)))  #
                print('f1 average:', np.mean(accuracy, axis=0))
                print('Time elapsed:', toc - tic)

            torch.cuda.empty_cache()

    print("Finished Validation")
def run(init_lr=0.01, max_steps=200, mode='rgb', root='/media/pritesh/Entertainment/Visual-Tactile_Dataset/dataset/',\
        train_split='train.txt', test_split='test.txt', batch_size=5, save_model=''):
    """Train and validate a FusionNet built around a frozen InceptionI3d.

    Each of the ``max_steps`` epochs runs a 'train' phase followed by a
    'val' phase.  Per-iteration BCE loss is written to tensorboardX and
    appended to 'i3d_video.txt'; model weights and a checkpoint are saved
    every 50 steps during the train phase.

    Args:
        init_lr: initial SGD learning rate (decayed by the MultiStepLR schedule).
        max_steps: number of epochs to run.
        mode: modality tag forwarded to ``Dataset`` (e.g. 'rgb').
        root: dataset root directory.
        train_split: file listing the training samples.
        test_split: file listing the validation samples.
        batch_size: mini-batch size for both loaders.
        save_model: path prefix for periodic model snapshots.
    """
    writer = tensorboardX.SummaryWriter()
    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=3,
                                             pin_memory=True)

    val_dataset = Dataset(test_split, root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=True,
                                                 num_workers=3,
                                                 pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}

    # setup the model: pretrained RGB I3D with a single output logit,
    # backbone frozen so only the fusion head receives gradient updates.
    sm = InceptionI3d(400, in_channels=3)
    sm.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    #tm = InceptionI3d(400, in_channels=2)
    #tm.load_state_dict(torch.load('models/flow_imagenet.pt'))
    sm.replace_logits(1)
    sm = freeze_network_layer(sm)
    #add your network here
    fusedNet = FusionNet(sm)
    if torch.cuda.is_available():
        fusedNet.cuda()
        fusedNet = nn.DataParallel(fusedNet)

    lr = init_lr
    optimizer = optim.SGD(fusedNet.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [50, 100, 150, 200])

    steps = 0
    # truncate the loss log before training starts
    with open('i3d_video.txt', 'w') as file:
        file.write("train and validation loss file\n")
    # train it
    while steps < max_steps:  #for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            print('phase : {}'.format(phase))
            if phase == 'train':
                fusedNet.train(True)
            else:
                fusedNet.train(False)  # Set model to evaluate mode

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            count = 0
            optimizer.zero_grad()
            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs; only the front-view video and labels are used
                f_vid, l_vid, tactile, pos, labels = data

                if torch.cuda.is_available():
                    inputs = Variable(f_vid.cuda())
                    t = inputs.size(2)
                    labels = Variable(labels.cuda())
                else:
                    inputs = Variable(f_vid)
                    t = inputs.size(2)
                    labels = Variable(labels)

                per_frame_logits = fusedNet(inputs.float())
                #print('prediction output = ', per_frame_logits.shape)
                #print('labels = ',labels.shape)
                # compute classification loss (with max-pooling along time B x C x T)
                per_frame_logits = per_frame_logits.squeeze(1)
                cls_loss = F.binary_cross_entropy_with_logits(
                    per_frame_logits.double(), labels.double())
                tot_cls_loss += cls_loss.item()
                # NOTE(review): backward() + optimizer.step() run in BOTH
                # phases, so the model is also updated on validation batches.
                # Confirm this is intentional; normally the val phase should
                # be wrapped in torch.no_grad() with no optimizer step.
                cls_loss.backward()
                # printed lr is always init_lr, even after lr_sched decays it
                print('{} Loss: {:.4f} and lr: {}'.format(
                    phase, tot_cls_loss / num_iter, init_lr))
                with open('i3d_video.txt', 'a') as file:
                    file.write("%f\n" % (tot_cls_loss / num_iter))
                optimizer.step()
                optimizer.zero_grad()
                if phase == 'val':
                    writer.add_scalar('error/' + phase,
                                      (tot_cls_loss / num_iter), num_iter)
                else:
                    writer.add_scalar('error/' + phase,
                                      (tot_cls_loss / num_iter), num_iter)
                    if (steps % 50 == 0):
                        torch.save(
                            fusedNet.module.state_dict(),
                            save_model + phase + str(steps).zfill(6) + '.pt')
                        save_checkpoint(fusedNet, optimizer, lr_sched, steps)
            #save error at every epoch
            writer.add_scalar('errorAtEpoch/' + phase,
                              (tot_cls_loss / num_iter), steps)
            tot_cls_loss = 0.
        #if(steps%50 == 0):
        #    torch.save(fusedNet.module.state_dict(), save_model+phase+str(steps).zfill(6)+'.pt')
        #    save_checkpoint(fusedNet, optimizer, lr_sched, steps)
        steps += 1
        lr_sched.step()
Example #15
0
def run(num_vids,
        init_lr=0.1,
        max_steps=64e3,
        mode='rgb',
        root='/ssd/Charades_v1_rgb',
        train_split='charades/charades.json',
        val_split='charades/charades.json',
        batch_size=8 * 5,
        save_model='',
        num_classes=2):
    """Run a single validation pass of a trained two-class I3D checkpoint.

    Loads the state dict at ``save_model``, iterates the validation loader
    once, and prints the averaged classification loss.

    Args:
        num_vids: number of validation videos (informational; not used here).
        init_lr: learning rate for the SGD optimizer (created for parity
            with the training script; never stepped in this function).
        max_steps: kept for interface parity; evaluation runs exactly once.
        mode: 'flow' selects a 2-channel model, anything else 3-channel RGB.
        root: directory containing the extracted video frames.
        train_split: training annotation JSON (unused in this function).
        val_split: validation annotation JSON.
        batch_size: validation mini-batch size.
        save_model: path of the checkpoint (.pt) to evaluate.
        num_classes: unused; the classification head is hard-wired to 2.
    """
    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    val_dataset = Dataset(val_split, 'testing', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 num_workers=0,
                                                 pin_memory=True)

    dataloaders = {'train': None, 'val': val_dataloader}
    datasets = {'train': None, 'val': val_dataset}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
    else:
        i3d = InceptionI3d(400, in_channels=3)

    i3d.replace_logits(2)
    i3d.load_state_dict(torch.load(save_model))

    #i3d.load_state_dict(torch.load('/ssd/models/000920.pt'))
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

    num_steps_per_update = 4  # accum gradient
    steps = 0
    while steps < max_steps:
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)

        # Validation only: the 'train' entry of `dataloaders` is None.
        for phase in ['val']:
            if phase == 'train':
                i3d.train(True)
            else:
                i3d.train(False)  # Set model to evaluate mode

            tot_acc = 0.0
            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()

            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                print(num_iter)
                # get the inputs
                inputs, labels = data

                # wrap them in Variable
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())

                per_frame_logits = i3d(inputs)
                # upsample the temporal axis back to the input length.
                # BUGFIX: F.upsample is deprecated; F.interpolate is its
                # documented drop-in replacement with identical defaults.
                per_frame_logits = F.interpolate(per_frame_logits,
                                                 t,
                                                 mode='linear')

                # max-pool over time: B x C x T -> B x C
                logits, _ = torch.max(per_frame_logits, dim=2)
                labels, _ = torch.max(labels, dim=2)

                # compute classification loss (with max-pooling along time B x C x T)
                cls_loss = F.binary_cross_entropy_with_logits(logits, labels)
                tot_cls_loss += cls_loss.item()

                loss = (cls_loss) / num_steps_per_update
                tot_loss += loss.item()
                loss.backward()

                predictions = torch.nn.Softmax(dim=-1)(logits)

                bin_predictions = predictions >= 0.5

                # fraction of correctly asserted positive labels per batch
                acc = (bin_predictions *
                       labels.byte()).float().sum() / batch_size
                tot_acc += acc

                if num_iter % 1 == 0:
                    print("{}/1500".format(num_iter))

            if phase == 'val':
                print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.
                      format(phase, tot_loc_loss / num_iter,
                             tot_cls_loss / num_iter,
                             (tot_loss * num_steps_per_update) / num_iter))

        # BUGFIX: `steps` was never advanced, so the original while-loop
        # re-ran validation forever. One full pass gives the final metrics.
        break
def train(args):
    """Train a Conv2dRNN sign-language classifier on WLASL with wandb logging.

    Sets up data loaders, optionally resumes from a checkpoint, then runs
    ``args.n_epochs`` epochs of gradient-accumulated training followed by
    validation, saving the best model by validation loss and stopping early
    after ``args.early_stop`` epochs without improvement.

    Args:
        args: parsed argument namespace; fields used here include save_dir,
            data_path, lr, batch_size, n_epochs, early_stop, freeze_vgg,
            debug_dataset, subset, resume_train, use_lr_scheduler and the
            lr_schedule_* settings.
    """
    # Init wandb
    # NOTE(review): assumes save_dir starts with '../runs/'; otherwise the
    # run name is silently truncated at the wrong offset — confirm callers.
    run = wandb.init(name=args.save_dir[len('../runs/'):],
                     config=args,
                     project='sign-language-recognition')

    # Create directory for model checkpoints and log
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    # Save args
    with open(os.path.join(args.save_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=2)

    # Logger
    logger = create_logger(args.save_dir)

    # Set gpu
    if torch.cuda.is_available():
        i = get_free_gpu()
        device = get_device(gpu=i)
    else:
        device = 'cpu'
    logger.info('using device: {}'.format(device))

    # Prepare early stop
    stopped = False
    best_epoch = 0
    # 1-element tensor; float < tensor comparisons below still work
    best_loss = torch.Tensor([float('Inf')])

    # Data

    # micro-batch that fits in GPU memory; the effective batch size
    # args.batch_size is reached via gradient accumulation below
    if args.freeze_vgg:
        real_batch_size = 3
    else:
        real_batch_size = 2  # can't fit more into gpu memory

    json_file = os.path.join(args.data_path, 'WLASL_v0.3.json')
    videos_folder = os.path.join(args.data_path, 'videos')
    keypoints_folder = os.path.join(args.data_path, 'keypoints')
    train_transforms = transforms.Compose([videotransforms.RandomCrop(224)])
    val_transforms = train_transforms

    # Debug data: overfit on a small fixed sample; val == train loader
    if args.debug_dataset:
        train_dataset = WLASL(json_file=json_file,
                              videos_folder=videos_folder,
                              keypoints_folder=keypoints_folder,
                              transforms=train_transforms,
                              split='train',
                              subset=args.subset)
        train_dl = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=real_batch_size,
                                               sampler=DebugSampler(
                                                   args.debug_dataset,
                                                   len(train_dataset)))
        val_dl = train_dl
    else:
        train_dataset = WLASL(json_file=json_file,
                              videos_folder=videos_folder,
                              keypoints_folder=keypoints_folder,
                              transforms=train_transforms,
                              split='train',
                              subset=args.subset)
        train_dl = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=real_batch_size,
                                               shuffle=True)

        val_dataset = WLASL(json_file=json_file,
                            videos_folder=videos_folder,
                            keypoints_folder=keypoints_folder,
                            transforms=val_transforms,
                            split='val',
                            subset=args.subset)
        val_dl = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=real_batch_size,
                                             shuffle=True)
    logger.info('data loaded')

    # Model, loss, optimizer
    m = Conv2dRNN(args).to(device)
    optimizer = torch.optim.Adam(m.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss()

    # Resume train
    start_epoch = 0
    if args.resume_train:
        checkpoint = torch.load(os.path.join(args.save_dir,
                                             'checkpoint.pt.tar'),
                                map_location=torch.device('cpu'))
        best_epoch = checkpoint['epoch']
        m.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        m = m.to(device)
        best_loss = checkpoint['best_val_loss']
        start_epoch = best_epoch + 1

        # Change learning rate (override whatever lr the checkpoint carried)
        for g in optimizer.param_groups:
            g['lr'] = args.lr

        logger.info(
            'Resuming training from epoch {} with best loss {:.4f}'.format(
                start_epoch, best_loss))

    # learning rate scheduler
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=args.lr_schedule_factor,
        patience=args.lr_schedule_patience,
        threshold=args.lr_schedule_threshold)

    # Watch model with wandb
    run.watch(m, log='all', log_freq=5)

    # Print args
    logger.info('using args: \n' +
                json.dumps(vars(args), sort_keys=True, indent=2))

    # Train loop
    for t in range(args.n_epochs):
        t += start_epoch
        # Train
        losses = AverageMeter()
        batch_time = AverageMeter()
        m.train()

        start_t = time.time()
        for i, batch in enumerate(train_dl):

            # Run the forward pass multiple times and accumulate gradient (to be able to use large batch size)
            X = batch['X'].to(device)
            label = batch['label'].to(device)

            # [per frame logits, mean of all frames logits]
            logits = m(X)

            # Create label for each logit
            label = torch.cat([l.repeat(logits.shape[1], 1) for l in label],
                              dim=0)

            # Squeeze time sequence and batch into one dimension
            logits = logits.reshape(logits.shape[0] * logits.shape[1],
                                    logits.shape[2])

            loss = criterion(logits, label.squeeze())
            loss.backward()
            losses.update(loss.item())

            # NOTE(review): the usual accumulation condition is
            # (i + 1) % k == 0; this variant also steps at i == 0 (after a
            # single micro-batch) and raises ZeroDivisionError when
            # args.batch_size < real_batch_size — confirm intended.
            if (i % (args.batch_size // real_batch_size)) == 0:
                # Optimize with accumulated gradient
                optimizer.step()
                optimizer.zero_grad()

                batch_time.update(time.time() - start_t)
                start_t = time.time()

        train_loss = losses.avg

        # Validate
        with torch.no_grad():
            top1 = AverageMeter()
            top5 = AverageMeter()
            top10 = AverageMeter()
            losses = AverageMeter()

            m.eval()
            for batch in val_dl:
                X = batch['X'].to(device)
                label = batch['label'].to(device)

                # [per frame logits, mean of all frames logits]
                logits = m(X)

                # Create label for each logit
                label = torch.cat(
                    [l.repeat(logits.shape[1], 1) for l in label], dim=0)

                # Squeeze time sequence and batch into one dimension
                logits = logits.reshape(logits.shape[0] * logits.shape[1],
                                        logits.shape[2])

                losses.update(criterion(logits, label.squeeze()).item())

                # Update metrics
                acc1, acc5, acc10 = topk_accuracy(logits,
                                                  label,
                                                  topk=(1, 5, 10))
                top1.update(acc1.item())
                top5.update(acc5.item())
                top10.update(acc10.item())

            val_loss = losses.avg

            # Save best model
            if val_loss < best_loss:
                best_loss, best_epoch = val_loss, t
            # Check early stop; this `break` exits the epoch loop, skipping
            # the logging/scheduler code below for the final epoch
                save_best(args, t, m, optimizer, best_loss)

            if t >= best_epoch + args.early_stop:
                logger.info('EARLY STOP')
                break

        # Log info
        logger.info(
            'epoch: {} train loss: {:.4f} val loss: {:.4f} top1acc {:.4f} top5acc {:.4f} top10acc {:.4f} lr: {:.2e} time per batch {:.1f} s'
            .format(t + 1, train_loss, val_loss, top1.avg, top5.avg, top10.avg,
                    optimizer.param_groups[0]['lr'], batch_time.avg))

        # Wandb log
        run.log({
            'train_loss': train_loss,
            'val_loss': val_loss,
            'top1_acc': top1.avg,
            'top5_acc': top5.avg,
            'top10_acc': top10.avg,
            'lr': optimizer.param_groups[0]['lr']
        })

        # Scheduler step
        if args.use_lr_scheduler:
            scheduler.step(val_loss)
Example #17
0
def run(num_vids,
        sync_bn=True,
        init_lr=0.1,
        max_steps=64e3,
        mode='rgb',
        root='/ssd/Charades_v1_rgb',
        train_split='charades/charades.json',
        val_split='charades/charades.json',
        batch_size=8 * 5,
        save_model='',
        num_classes=2):
    """Distributed (DDP) train/eval loop for a two-class I3D model.

    Which phase runs is selected by the module-level ``args.phase``
    ('train' or 'val'); training uses a DistributedSampler across
    ``args.ngpu`` replicas with 4-step gradient accumulation, and
    checkpoints are written to ``save_model`` by rank 0.

    Args:
        num_vids: total video count, used only to size the tqdm bar.
        sync_bn: convert BatchNorm layers to SyncBatchNorm when True.
        init_lr: initial SGD learning rate (decayed by MultiStepLR).
        max_steps: stop after this many optimizer steps.
        mode: 'flow' selects a 2-channel model, otherwise 3-channel RGB.
        root: directory with the extracted video frames.
        train_split / val_split: annotation JSON paths.
        batch_size: per-process mini-batch size.
        save_model: checkpoint directory ('train') or file to load ('val').
        num_classes: unused; the head is hard-wired to 2 logits.

    NOTE(review): relies on a module-global ``args`` (ngpu, local_rank,
    phase, save_model) that is not a parameter — confirm it is set before
    this function is called.
    """
    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.Tile(64),
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, 'training', root, mode, train_transforms)

    # shard the training set across the distributed replicas
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset,
        num_replicas=args.ngpu,
        rank=args.local_rank,
    )

    def remove_bad_vids(batch):
        # drop samples the Dataset returned as None (unreadable videos)
        batch = list(filter(lambda x: x is not None, batch))
        return torch.utils.data.dataloader.default_collate(batch)

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             num_workers=2,
                                             pin_memory=True,
                                             sampler=sampler,
                                             drop_last=True,
                                             collate_fn=remove_bad_vids)

    val_dataset = Dataset(val_split, 'testing', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 num_workers=2,
                                                 pin_memory=True,
                                                 drop_last=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    i3d.replace_logits(2)

    if sync_bn:
        print("Using SyncBatchNorm")
        i3d = torch.nn.SyncBatchNorm.convert_sync_batchnorm(i3d)

    if args.phase == 'val':
        print("Loading model {}".format(args.save_model))
        i3d.load_state_dict(torch.load(args.save_model))

    i3d.cuda()

    i3d = torch.nn.parallel.DistributedDataParallel(
        i3d,
        device_ids=[args.local_rank],
        output_device=args.local_rank,
    )

    # Send model to its device
    device = torch.device('cuda:{}'.format(args.local_rank))
    i3d = i3d.to(device)

    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

    num_steps_per_update = 4  # accum gradient
    steps = 0
    # train it
    # NOTE(review): in 'val' phase `steps` is never incremented, so this
    # outer loop re-runs validation indefinitely — confirm intended.
    while steps < max_steps:  #for epoch in range(num_epochs):
        if args.local_rank == 0:
            print('Step {}/{}'.format(steps, max_steps))
            print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in [args.phase]:  #, 'val']:
            if phase == 'train':
                i3d.train(True)
            else:
                i3d.train(False)  # Set model to evaluate mode
                if args.local_rank != 0: break

            tot_acc = 0.0
            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()

            # Iterate over data.
            #for data in dataloaders[phase]:
            for it, data in tqdm(enumerate(dataloaders[phase]),
                                 total=num_vids // (args.ngpu * batch_size),
                                 ncols=100):
                num_iter += 1

                inputs, labels = data

                # Send input to device
                inputs, labels = inputs.to(device), labels.to(device)

                # wrap them in Variable
                # NOTE(review): redundant — the tensors were already moved
                # with .to(device) above; Variable is a no-op since torch 0.4
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())

                per_frame_logits = i3d(inputs)
                # upsample to input size
                # NOTE(review): F.upsample is deprecated in favor of
                # F.interpolate (same signature and defaults)
                per_frame_logits = F.upsample(per_frame_logits,
                                              t,
                                              mode='linear')

                logits, _ = torch.max(per_frame_logits, dim=2)
                labels, _ = torch.max(labels, dim=2)

                # compute classification loss (with max-pooling along time B x C x T)
                cls_loss = F.binary_cross_entropy_with_logits(logits, labels)
                tot_cls_loss += cls_loss.item()

                loss = (cls_loss) / num_steps_per_update
                tot_loss += loss.item()
                loss.backward()

                predictions = torch.nn.Softmax(dim=-1)(logits)

                bin_predictions = predictions >= 0.5

                # fraction of correctly asserted positive labels per batch
                acc = (bin_predictions *
                       labels.byte()).float().sum() / batch_size
                tot_acc += acc
                if num_iter % 10 == 0 and phase == 'val':
                    print(
                        '{} Accuracy: {:.4f} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'
                        .format(phase, tot_acc / num_iter,
                                tot_loc_loss / num_iter,
                                tot_cls_loss / num_iter,
                                (tot_loss * num_steps_per_update) / num_iter))

                # apply the accumulated gradient every 4 micro-batches
                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if args.local_rank == 0 and steps % 10 == 0:
                        # NOTE(review): accuracy divides by
                        # num_steps_per_update while the losses divide by
                        # 10*num_steps_per_update — denominators disagree
                        print(
                            '{} Accuracy: {:.4f} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'
                            .format(phase, tot_acc / (num_steps_per_update),
                                    tot_loc_loss / (10 * num_steps_per_update),
                                    tot_cls_loss / (10 * num_steps_per_update),
                                    tot_loss / 10))
                        # save model
                        os.system('mkdir -p {}'.format(save_model))
                        torch.save(
                            i3d.module.state_dict(),
                            os.path.join(save_model,
                                         str(steps).zfill(6)) + '.pt')
                        tot_acc = tot_loss = tot_loc_loss = tot_cls_loss = 0.

            if phase == 'val':
                print(
                    '{} Accuracy: {:.4f} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'
                    .format(phase, tot_acc / num_iter, tot_loc_loss / num_iter,
                            tot_cls_loss / num_iter,
                            (tot_loss * num_steps_per_update) / num_iter))
Example #18
0
def run(init_lr=0.1,
        max_steps=64e3,
        mode='rgb',
        root='/storage/truppr/CHARADES/Charades_v1_rgb',
        train_split='charades/charades.json',
        batch_size=16,
        save_model=''):
    """Fine-tune pretrained I3D on Charades (157 classes) with train/val phases.

    Supports 'rgb', 'flow', or 'both' modes; gradients are accumulated over
    4 mini-batches per optimizer step and a checkpoint is saved every 10
    steps during training.

    Args:
        init_lr: initial SGD learning rate (decayed by MultiStepLR).
        max_steps: stop after this many optimizer steps.
        mode: 'rgb', 'flow', or 'both' (see NOTE below re 'both').
        root: directory with the extracted video frames.
        train_split: annotation JSON; also reused for the 'testing' split.
        batch_size: mini-batch size for both loaders.
        save_model: path prefix for periodic model snapshots.

    NOTE(review): the 'both' mode is incomplete — inside the data loop it
    references an undefined `flow_inputs`, and `optimizer` / `lr_sched`
    are only created in the non-'both' branch, so the step/save code would
    raise NameError. Only 'rgb' and 'flow' are currently runnable.
    """
    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    # print(root)
    print("creating training set...")
    dataset = Dataset(train_split, 'training', root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=18,
                                             pin_memory=True)

    print("creating validation set...")
    val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=True,
                                                 num_workers=18,
                                                 pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}

    # setup the model
    print("setting up the model...")
    if mode == 'flow' or mode == 'rgb':
        if mode == 'flow':
            i3d = InceptionI3d(400, in_channels=2)
            i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
        elif mode == 'rgb':
            i3d = InceptionI3d(400, in_channels=3)
            i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))
        i3d.replace_logits(157)  # number of classes... originally 157
        i3d.cuda(0)
        i3d = nn.DataParallel(i3d)

    elif mode == 'both':
        i3d_rgb = InceptionI3d(400, in_channels=3)
        i3d_rgb.load_state_dict(torch.load('models/rgb_imagenet.pt'))

        i3d_flow = InceptionI3d(400, in_channels=2)
        i3d_flow.load_state_dict(torch.load('models/flow_imagenet.pt'))

        i3d_rgb.replace_logits(157)  # number of classes... originally 157
        i3d_flow.replace_logits(157)

        i3d_rgb.cuda(0)
        i3d_flow.cuda(0)

        i3d_rgb = nn.DataParallel(i3d_rgb)
        i3d_flow = nn.DataParallel(i3d_flow)

    lr = init_lr

    if mode == 'both':
        optimizer_rgb = optim.SGD(i3d_rgb.parameters(),
                                  lr=lr,
                                  momentum=0.9,
                                  weight_decay=0.0000001)
        optimizer_flow = optim.SGD(i3d_flow.parameters(),
                                   lr=lr,
                                   momentum=0.9,
                                   weight_decay=0.0000001)
        lr_sched_rgb = optim.lr_scheduler.MultiStepLR(optimizer_rgb,
                                                      [300, 1000])
        lr_sched_flow = optim.lr_scheduler.MultiStepLR(optimizer_flow,
                                                       [300, 1000])
    else:
        optimizer = optim.SGD(i3d.parameters(),
                              lr=lr,
                              momentum=0.9,
                              weight_decay=0.0000001)
        lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

    num_steps_per_update = 4  # accum gradient
    steps = 0
    # train it
    while steps < max_steps:  #for epoch in range(num_epochs):
        # print 'Step {}/{}'.format(steps, max_steps)
        # print '-' * 10
        print('Step ' + str(steps) + '/' + str(max_steps))
        print('-' * 25)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                print("training model...")
                if mode == 'both':
                    i3d_rgb.train(True)
                    i3d_flow.train(True)
                    optimizer_rgb.zero_grad()
                    optimizer_flow.zero_grad()
                else:
                    i3d.train(True)
                    optimizer.zero_grad()
            else:
                print("validating model...")
                if mode == 'both':
                    i3d_rgb.train(False)
                    i3d_flow.train(False)
                    optimizer_rgb.zero_grad()
                    optimizer_flow.zero_grad()
                else:
                    i3d.train(False)  # Set model to evaluate mode
                    optimizer.zero_grad()

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            # optimizer.zero_grad()
            print("zeroed...")
            # print(len(dataloaders["train"]))
            # print(dataloaders["train"])
            # Iterate over data.
            for data in dataloaders[phase]:
                # print("starting iter...")

                num_iter += 1
                # get the inputs
                inputs, labels = data

                print("data size: ", inputs.shape, " label: ", labels)

                # wrap them in Variable
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())

                torch.set_printoptions(profile="full")
                print("labels:\n", labels)
                print("labels:\n", labels.shape)
                print("Inputs: \n", inputs.shape)
                torch.set_printoptions(profile="default")

                if mode == 'both':
                    per_frame_logits = i3d_rgb(inputs)
                    # NOTE(review): `flow_inputs` is not defined anywhere in
                    # this function — this line raises NameError. The 'both'
                    # path also never upsamples per_frame_logits to length t.
                    per_flows_logits = i3d_flow(flow_inputs)
                else:
                    per_frame_logits = i3d(inputs)

                    # upsample to input size
                    # NOTE(review): F.upsample is deprecated in favor of
                    # F.interpolate (same signature and defaults)
                    per_frame_logits = F.upsample(per_frame_logits,
                                                  t,
                                                  mode='linear')

                # compute localization loss (per-frame BCE over time)
                loc_loss = F.binary_cross_entropy_with_logits(
                    per_frame_logits, labels)
                tot_loc_loss += loc_loss.item()

                # compute classification loss (with max-pooling along time B x C x T)
                cls_loss = F.binary_cross_entropy_with_logits(
                    torch.max(per_frame_logits, dim=2)[0],
                    torch.max(labels, dim=2)[0])
                tot_cls_loss += cls_loss.item()

                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.item()
                loss.backward()

                # apply the accumulated gradient every 4 mini-batches
                # NOTE(review): uses `optimizer`/`lr_sched`/`i3d`, which do
                # not exist in 'both' mode (only the *_rgb/*_flow variants do)
                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if steps % 10 == 0:
                        # print '{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(phase, tot_loc_loss/(10*num_steps_per_update), tot_cls_loss/(10*num_steps_per_update), tot_loss/10)
                        print(
                            str(phase) + ' Loc Loss: ' +
                            str(tot_loc_loss / (10 * num_steps_per_update)) +
                            ' Cls Loss: ' + str(tot_cls_loss /
                                                (10 * num_steps_per_update)) +
                            ' Tot Loss: ' + str(tot_loss / 10))
                        # save model
                        torch.save(
                            i3d.module.state_dict(),
                            save_model + str(steps).zfill(6) + '-' +
                            str(tot_loss / 10) + '.pt')
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.
                #else:
                #    print(str(phase) + ' Loc Loss: ' + str(tot_loc_loss/(10*num_steps_per_update)) + ' Cls Loss: ' + str(tot_cls_loss/(10*num_steps_per_update)) + ' Tot Loss: ' + str(tot_loss/10))

            if phase == 'val':
                print(
                    str(phase) + ' Loc Loss: ' +
                    str(tot_loc_loss / num_iter).zfill(4) + ' Cls Loss: ' +
                    str(tot_cls_loss / num_iter).zfill(4) + ' Tot Loss: ' +
                    str((tot_loss * num_steps_per_update) / num_iter).zfill(4))
                # print '{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(phase, tot_loc_loss/num_iter, tot_cls_loss/num_iter, (tot_loss*num_steps_per_update)/num_iter)
            print("whoops...")
Example #19
0
def run(init_lr=0.0001, max_steps=64e3, frames_per_clip=16, dataset_path='/media/sitzikbs/6TB/ANU_ikea_dataset/',
        train_filename='train_cross_env.txt', testset_filename='test_cross_env.txt',
        db_filename='../ikea_dataset_frame_labeler/ikea_annotation_db', logdir='',
        frame_skip=1, batch_size=8, camera='dev3', refine=False, refine_epoch=0, load_mode='vid', input_type='rgb',
        model_name='c3d'):
    """Train a C3D or P3D clip classifier on the IKEA assembly dataset.

    Checkpoints (model, optimizer and LR-scheduler state) are written into
    ``logdir`` every other step; TensorBoard summaries go to
    ``logdir/train`` and ``logdir/test``. Evaluation batches are interleaved
    with training so test progress tracks train progress within each pass.

    Args:
        init_lr: initial Adam learning rate.
        max_steps: stop once this many optimizer steps have been taken.
        frames_per_clip: frames fed to the model per clip.
        dataset_path: dataset root directory.
        train_filename: train split description file.
        testset_filename: test split description file.
        db_filename: annotation database consumed by ``Dataset``.
        logdir: output directory for checkpoints and summaries.
        frame_skip: temporal subsampling factor.
        batch_size: clips per batch.
        camera: camera view to load (e.g. 'dev3').
        refine: resume from the checkpoint saved at step ``refine_epoch``.
        refine_epoch: step index of the checkpoint to resume from; must be
            non-zero when ``refine`` is set.
        load_mode: dataset loading mode (e.g. 'vid').
        input_type: input modality (e.g. 'rgb').
        model_name: 'c3d' or 'p3d'.

    Raises:
        ValueError: for an unsupported ``model_name``, or when ``refine`` is
            requested with ``refine_epoch == 0``.
    """

    os.makedirs(logdir, exist_ok=True)

    # setup dataset -- crop size depends on the backbone
    img_size = 112 if model_name == 'c3d' else 160  # 224
    train_transforms = transforms.Compose([videotransforms.RandomCrop(img_size),
                                           videotransforms.RandomHorizontalFlip(),
                                           ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(img_size)])

    train_dataset = Dataset(dataset_path, db_filename=db_filename, train_filename=train_filename,
                            transform=train_transforms, set='train', camera=camera, frame_skip=frame_skip,
                            frames_per_clip=frames_per_clip, resize=None, mode=load_mode, input_type=input_type)
    print("Number of clips in the dataset:{}".format(len(train_dataset)))
    # class-balanced sampling: weight each clip inversely to its class frequency
    weights = utils.make_weights_for_balanced_classes(train_dataset.clip_set, train_dataset.clip_label_count)
    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=sampler,
                                                   num_workers=6, pin_memory=False)

    test_dataset = Dataset(dataset_path, db_filename=db_filename, train_filename=train_filename,
                           test_filename=testset_filename, transform=test_transforms, set='test', camera=camera,
                           frame_skip=frame_skip, frames_per_clip=frames_per_clip, resize=None, mode=load_mode,
                           input_type=input_type)

    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=6,
                                                  pin_memory=False)

    # setup the model
    num_classes = train_dataset.num_classes
    if model_name == 'c3d':
        model = c3d.C3D()
        model.load_state_dict(torch.load('c3d.pickle'))
        model.replace_logits(num_classes)
    elif model_name == 'p3d':
        model = p3d.P3D199(pretrained=True, modality='RGB', num_classes=num_classes)
    else:
        raise ValueError("unsupported model")

    if refine:
        if refine_epoch == 0:
            raise ValueError("You set the refine epoch to 0. No need to refine, just retrain.")
        refine_model_filename = os.path.join(logdir, str(refine_epoch).zfill(6) + '.pt')
        checkpoint = torch.load(refine_model_filename)
        model.load_state_dict(checkpoint["model_state_dict"])

    model.cuda()
    model = nn.DataParallel(model)

    lr = init_lr
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1E-6)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [10, 20, 30, 40])
    criterion = nn.CrossEntropyLoss()  # standard crossentropy loss for classification

    if refine:
        lr_sched.load_state_dict(checkpoint["lr_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

    train_writer = SummaryWriter(os.path.join(logdir, 'train'))
    test_writer = SummaryWriter(os.path.join(logdir, 'test'))

    num_steps_per_update = 4  # accum gradient - try to have number of examples per update match original code 8*5*4
    steps = 0
    # train it
    n_examples = 0
    train_num_batch = len(train_dataloader)
    test_num_batch = len(test_dataloader)
    refine_flag = True

    while steps < max_steps:  # for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)
        # when refining, fast-forward the step counter and LR schedule up to
        # the checkpointed step without touching any data
        if steps <= refine_epoch and refine and refine_flag:
            lr_sched.step()
            steps += 1
            n_examples += len(train_dataset.clip_set)
            continue
        else:
            refine_flag = False

        test_batchind = -1
        test_fraction_done = 0.0
        test_enum = enumerate(test_dataloader, 0)
        tot_loss = 0.0
        num_iter = 0
        optimizer.zero_grad()

        # Iterate over data.
        avg_acc = []
        for train_batchind, data in enumerate(train_dataloader):

            num_iter += 1
            # get the inputs
            inputs, labels, vid_idx, frame_pad = data

            # wrap them in Variable
            inputs = Variable(inputs.cuda(), requires_grad=True)
            labels = Variable(labels.cuda())
            labels = torch.argmax(labels, dim=1)  # one-hot -> class index

            logits = model(inputs)
            t = inputs.size(2)
            # stretch clip-level logits to one prediction per input frame
            per_frame_logits = torch.nn.functional.interpolate(logits.unsqueeze(-1), t, mode='linear', align_corners=True)
            probs = torch.nn.functional.softmax(per_frame_logits, dim=1)
            preds = torch.max(probs, 1)[1]

            loss = criterion(per_frame_logits, labels)
            tot_loss += loss.item()
            loss.backward()

            acc = utils.accuracy_v2(torch.argmax(per_frame_logits, dim=1), labels)

            avg_acc.append(acc.item())
            train_fraction_done = (train_batchind + 1) / train_num_batch
            print('[{}] train Acc: {}, Loss: {:.4f} [{} / {}]'.format(steps, acc.item(), loss.item(), train_batchind, len(train_dataloader)))
            # apply accumulated gradients every num_steps_per_update batches,
            # or on the final (possibly shorter) accumulation window
            if (num_iter == num_steps_per_update or train_batchind == len(train_dataloader) - 1):
                n_steps = num_steps_per_update
                if train_batchind == len(train_dataloader) - 1:
                    n_steps = num_iter
                n_examples += batch_size * n_steps
                print('updating the model...')
                print('train Total Loss: {:.4f}'.format(tot_loss / n_steps))
                optimizer.step()
                optimizer.zero_grad()
                train_writer.add_scalar('loss', tot_loss / n_steps, n_examples)
                train_writer.add_scalar('Accuracy', np.mean(avg_acc), n_examples)
                train_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], n_examples)
                num_iter = 0
                tot_loss = 0.

            # interleave evaluation so test progress tracks train progress
            if test_fraction_done <= train_fraction_done and test_batchind + 1 < test_num_batch:
                model.train(False)  # Set model to evaluate mode
                test_batchind, data = next(test_enum)
                inputs, labels, vid_idx, frame_pad = data

                # no gradients are needed for evaluation (ran under no_grad
                # anyway; the original's requires_grad=True was a no-op)
                inputs = Variable(inputs.cuda())
                labels = Variable(labels.cuda())
                labels = torch.argmax(labels, dim=1)

                with torch.no_grad():
                    logits = model(inputs)
                    t = inputs.size(2)
                    per_frame_logits = torch.nn.functional.interpolate(logits.unsqueeze(-1), t, mode='linear',
                                                                       align_corners=True)
                    probs = torch.nn.functional.softmax(per_frame_logits, dim=1)
                    preds = torch.max(probs, 1)[1]

                    loss = criterion(per_frame_logits, labels)
                    acc = utils.accuracy_v2(torch.argmax(per_frame_logits, dim=1), labels)

                print('[{}] test Acc: {}, Loss: {:.4f} [{} / {}]'.format(steps, acc.item(), loss.item(), test_batchind,
                                                                         len(test_dataloader)))
                test_writer.add_scalar('loss', loss.item(), n_examples)
                test_writer.add_scalar('Accuracy', acc.item(), n_examples)
                test_fraction_done = (test_batchind + 1) / test_num_batch
                model.train(True)
        if steps % 2 == 0:
            # save model -- use os.path.join so the checkpoint lands inside
            # logdir and matches the path the refine branch loads from (the
            # original concatenated logdir + filename, which silently writes
            # to the wrong location unless logdir ends with a separator)
            torch.save({"model_state_dict": model.module.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "lr_state_dict": lr_sched.state_dict()},
                       os.path.join(logdir, str(steps).zfill(6) + '.pt'))

        steps += 1
        lr_sched.step()
    train_writer.close()
    test_writer.close()
Example #20
0
def run(init_lr=0.1, max_step=64e3, mode='rgb', root='/ssd/Charades_v1_rgb', train_split='charades/charades.json', batch_size=8*5, save_model=''):
    """Train I3D on Charades with gradient accumulation.

    Alternates a full training pass and a full validation pass per outer
    iteration; the model is checkpointed every 10 optimizer steps.

    Args:
        init_lr: initial SGD learning rate.
        max_step: stop once this many optimizer steps have been taken.
        mode: 'rgb' or 'flow' -- selects input channels and pretrained weights.
        root: root directory of the Charades frames.
        train_split: JSON split file consumed by ``Dataset``.
        save_model: filename prefix for checkpoints.
    """
    # The loop below historically referenced `max_steps`, which was never
    # defined (NameError at runtime); alias the public parameter rather than
    # renaming it, so existing keyword callers keep working.
    max_steps = max_step

    # setup dataset
    train_transforms = transforms.Compose([videotransforms.RandomCrop(224),
                                           # fixed typo: was RandomHorisontalFlip (AttributeError);
                                           # every other caller in this file uses RandomHorizontalFlip
                                           videotransforms.RandomHorizontalFlip(),
                                           ])
    test_transforms = transforms.Compose([videotransforms.RandomCrop(224)])

    dataset = Dataset(train_split, 'training', root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=36, pin_memory=True)

    val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=36, pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}
    datasets = {'train': dataset, 'val': val_dataset}

    # setup the model: pretrained weights must match the input modality
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    i3d.replace_logits(157)  # 157 Charades action classes
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(), lr=lr, momentum=0.9, weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

    num_steps_per_update = 4  # accum gradient
    steps = 0
    # train it
    while steps < max_steps:
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                i3d.train(True)
            else:
                i3d.train(False)  # Set model to evaluate mode

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()

            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs
                inputs, labels = data

                # wrap them in Variable
                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())

                per_frame_logits = i3d(inputs)
                # upsample to input size (F.upsample is deprecated in favor of
                # F.interpolate; same behavior for 1-D linear resampling)
                per_frame_logits = F.interpolate(per_frame_logits, t, mode='linear')

                # compute localization loss
                loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
                # .item() replaces the pre-0.4 `loss.data[0]`, which raises on
                # 0-dim tensors in modern PyTorch
                tot_loc_loss += loc_loss.item()

                # compute classification loss (with max-pooling along time B x C x T)
                cls_loss = F.binary_cross_entropy_with_logits(torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0])
                tot_cls_loss += cls_loss.item()

                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.item()
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if steps % 10 == 0:
                        # Python 2 print statement converted to the function form
                        print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
                            phase, tot_loc_loss / (10 * num_steps_per_update),
                            tot_cls_loss / (10 * num_steps_per_update), tot_loss / 10))
                        # save model
                        torch.save(i3d.module.state_dict(), save_model + str(steps).zfill(6) + '.pt')
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.
            if phase == 'val':
                print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
                    phase, tot_loc_loss / num_iter, tot_cls_loss / num_iter,
                    (tot_loss * num_steps_per_update) / num_iter))
def run(configs,
        mode='rgb',
        root='/ssd/Charades_v1_rgb',
        train_split='charades/charades.json',
        save_model='',
        weights=None,
        datasets=None):
    """Fine-tune an ImageNet-pretrained I3D for gloss classification.

    Each epoch runs a 'train' phase and a 'test' phase. Gradients are
    accumulated over ``configs.update_per_step`` batches before each
    optimizer step. The best (or every-other-epoch) test model is saved
    under ``save_model`` with the validation accuracy in the filename.

    Args:
        configs: config object providing ``batch_size``, ``init_lr``,
            ``adam_weight_decay``, ``update_per_step`` and ``max_steps``.
        mode: 'rgb' or 'flow' -- selects input channels and pretrained weights.
        root: root directory of the video dataset.
        train_split: JSON split file consumed by ``Dataset``.
        save_model: path prefix used when checkpointing.
        weights: optional state-dict path loaded after the logit layer is
            replaced (e.g. to resume fine-tuning).
        datasets: optional {'train': ..., 'test': ...} pre-built datasets;
            built from ``train_split`` when None.
    """
    print(configs)

    # setup dataset: standard torch augmentation for train, center crop for test
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    if datasets is None:
        print('Setting up training dataset...')
        dataset = Dataset(train_split, 'train', root, mode, train_transforms)

        # Same for test dataset
        print('Setting up validation dataset...')
        val_dataset = Dataset(train_split, 'test', root, mode, test_transforms)
    else:
        print('Loading in datasets...')
        dataset = datasets['train']
        val_dataset = datasets['test']

    # num_workers=0 works around a docker dataloader bug:
    # https://github.com/pytorch/pytorch/issues/1355#issuecomment-555091916
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=configs.batch_size,
                                             shuffle=True,
                                             num_workers=0,
                                             pin_memory=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=configs.batch_size,
                                                 shuffle=True,
                                                 num_workers=0,
                                                 pin_memory=False)
    # phase name -> loader, used by the per-phase loop below
    dataloaders = {
        'train': dataloader,
        'test': val_dataloader
    }
    datasets = {'train': dataset, 'test': val_dataset}

    # setup the model: pretrained weights must match the input modality
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('weights/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('weights/rgb_imagenet.pt'))

    # The pretrained head has 400 outputs; replace it with one sized for
    # this dataset's classes (transfer learning).
    num_classes = dataset.num_classes
    i3d.replace_logits(num_classes)

    if weights:
        print('loading weights {}'.format(weights))
        i3d.load_state_dict(torch.load(weights))

    # Transfer to CUDA and make training parallel
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = configs.init_lr
    weight_decay = configs.adam_weight_decay
    optimizer = optim.Adam(i3d.parameters(), lr=lr, weight_decay=weight_decay)

    num_steps_per_update = configs.update_per_step  # accum gradient
    steps = 0
    epoch = 0

    best_val_score = 0

    # Reduce the LR by `factor` after `patience` epochs without improvement
    # in the monitored (minimized) validation loss.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           'min',
                                                           patience=5,
                                                           factor=0.3)

    ce_loss = nn.CrossEntropyLoss()
    ce2_loss = nn.CrossEntropyLoss()
    num_epochs = 400
    while steps < configs.max_steps and epoch < num_epochs:
        print('Epoch #{}/{}'.format(epoch + 1, num_epochs))
        print()
        print('Step {}/{}'.format(steps, configs.max_steps))
        print('-' * 10)

        epoch += 1
        # Each epoch has a training and a test phase
        for phase in ['train', 'test']:
            if phase == 'train':
                i3d.train(True)
            else:
                i3d.train(False)  # Set model to evaluate mode

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()

            # np.int was removed in NumPy 1.24; np.int64 keeps the old
            # platform-long semantics for the count matrix
            confusion_matrix = np.zeros((num_classes, num_classes),
                                        dtype=np.int64)
            # Iterate over data.
            # Each batch: batch_size x 3 (RGB) x time_steps x 224 x 224
            num_steps_loader = len(dataloaders[phase])
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs
                if data == -1:  # bracewell does not compile opencv with ffmpeg, strange errors occur resulting in no video loaded
                    continue

                # labels: batch_size x time_steps, one class id per frame
                inputs, labels, vid = data

                # If there are no examples, continue
                if inputs.shape[0] == 0:
                    continue

                gt_numpy = labels[:, 0]

                # Remove faulty videos (labelled -1)
                inputs = inputs[gt_numpy != -1]
                labels = labels[gt_numpy != -1]

                inputs = inputs.cuda()
                t = inputs.size(2)  # number of input frames
                labels = labels.cuda()

                # Forward: batch_size x num_classes x T' (pooled in time)
                per_frame_logits = i3d(inputs, pretrained=False)

                # Upsample the pooled time axis back to t so there is one
                # logit vector per input frame (F.upsample is deprecated)
                per_frame_logits = F.interpolate(per_frame_logits,
                                                 t,
                                                 mode='linear',
                                                 align_corners=True)

                # localization loss over all frames
                loc_loss = ce_loss(per_frame_logits, labels)
                tot_loc_loss += loc_loss.data.item()

                # Max over time: batch_size x num_classes, the most confident
                # prediction per class for each clip
                predictions = torch.max(per_frame_logits, dim=2)[0]

                # labels repeats the same class id across frames, so the
                # first column is the clip-level ground truth
                gt = labels[..., 0]

                # classification loss on the time-pooled predictions
                cls_loss = ce2_loss(predictions, gt)
                tot_cls_loss += cls_loss.data.item()

                # confusion matrix: row = ground truth, column = prediction
                for i in range(per_frame_logits.shape[0]):
                    confusion_matrix[gt[i],
                                     torch.argmax(predictions[i]).item()] += 1

                # total loss: equal mix of localization and classification,
                # scaled down for gradient accumulation
                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.data.item()
                if num_iter == num_steps_per_update // 2:
                    print(epoch, steps, loss.data.item())
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    if steps % 10 == 0:
                        acc = float(np.trace(confusion_matrix)) / np.sum(
                            confusion_matrix)
                        print('Step {}/{} within epoch - max steps: {}'.format(
                            steps, num_steps_loader, configs.max_steps))
                        print(
                            'Epoch {} {} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f} Accu :{:.4f}'
                            .format(epoch, phase,
                                    tot_loc_loss / (10 * num_steps_per_update),
                                    tot_cls_loss / (10 * num_steps_per_update),
                                    tot_loss / 10, acc))
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.
            if phase == 'test':
                val_score = float(
                    np.trace(confusion_matrix)) / np.sum(confusion_matrix)
                # checkpoint on new best accuracy, and on every other epoch
                if val_score > best_val_score or epoch % 2 == 0:
                    best_val_score = val_score
                    model_name = save_model + "nslt_" + str(
                        num_classes) + "_" + str(steps).zfill(
                            6) + '_%3f.pt' % val_score

                    torch.save(i3d.module.state_dict(), model_name)
                    print(model_name)

                print(
                    'VALIDATION: {} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f} Accu :{:.4f}'
                    .format(phase, tot_loc_loss / num_iter,
                            tot_cls_loss / num_iter,
                            (tot_loss * num_steps_per_update) / num_iter,
                            val_score))

                scheduler.step(tot_loss * num_steps_per_update / num_iter)
Example #22
0
def run(configs,
        mode='rgb',
        root='/ssd/Charades_v1_rgb',
        train_split='charades/charades.json',
        save_model='',
        num_classes=None,
        weights=None):
    """Fine-tune an ImageNet-pretrained I3D with BCE localization /
    classification losses.

    Each epoch runs a 'train' and a 'test' phase with gradient accumulation
    over ``configs.update_per_step`` batches. Checkpoints are written under
    ``save_model`` whenever the validation accuracy improves (and on every
    other epoch).

    Args:
        configs: config object providing ``batch_size``, ``init_lr``,
            ``adam_weight_decay``, ``update_per_step`` and ``max_steps``.
        mode: 'rgb' or 'flow' -- selects input channels and pretrained weights.
        root: root directory of the video dataset.
        train_split: JSON split file consumed by ``Dataset``.
        save_model: path prefix used when checkpointing.
        num_classes: class count passed to ``Dataset``; the model head is
            sized from ``dataset.num_classes`` after construction.
        weights: optional state-dict path loaded after logit replacement.
    """
    print(configs)

    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split,
                      'train',
                      root,
                      mode,
                      num_classes=num_classes,
                      transforms=train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=configs.batch_size,
                                             shuffle=True,
                                             num_workers=4,
                                             pin_memory=True)

    val_dataset = Dataset(train_split,
                          'test',
                          root,
                          mode,
                          num_classes=num_classes,
                          transforms=test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=configs.batch_size,
                                                 shuffle=True,
                                                 num_workers=4,
                                                 pin_memory=False)

    dataloaders = {'train': dataloader, 'test': val_dataloader}
    datasets = {'train': dataset, 'test': val_dataset}

    # setup the model: pretrained weights must match the input modality
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('weights/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('weights/rgb_imagenet.pt'))

    # replace the 400-way pretrained head with one sized for this dataset
    num_classes = dataset.num_classes
    i3d.replace_logits(num_classes)

    if weights:
        print('loading weights {}'.format(weights))
        i3d.load_state_dict(torch.load(weights))

    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = configs.init_lr
    weight_decay = configs.adam_weight_decay
    optimizer = optim.Adam(i3d.parameters(), lr=lr, weight_decay=weight_decay)

    num_steps_per_update = configs.update_per_step  # accum gradient
    steps = 0
    epoch = 0

    best_val_score = 0
    # reduce LR by `factor` after `patience` epochs without val-loss improvement
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           'min',
                                                           patience=5,
                                                           factor=0.3)
    while steps < configs.max_steps and epoch < 400:
        print('Step {}/{}'.format(steps, configs.max_steps))
        print('-' * 10)

        epoch += 1
        # Each epoch has a training and validation phase
        for phase in ['train', 'test']:
            if phase == 'train':
                i3d.train(True)
            else:
                i3d.train(False)  # Set model to evaluate mode

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()

            # np.int was removed in NumPy 1.24; np.int64 keeps the old
            # platform-long semantics for the count matrix
            confusion_matrix = np.zeros((num_classes, num_classes),
                                        dtype=np.int64)
            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs
                if data == -1:  # bracewell does not compile opencv with ffmpeg, strange errors occur resulting in no video loaded
                    continue

                inputs, labels, vid = data

                inputs = inputs.cuda()
                t = inputs.size(2)  # number of input frames
                labels = labels.cuda()

                per_frame_logits = i3d(inputs, pretrained=False)
                # upsample the pooled time axis back to the input length
                # (F.upsample is deprecated in favor of F.interpolate)
                per_frame_logits = F.interpolate(per_frame_logits,
                                                 t,
                                                 mode='linear')

                # compute localization loss over all frames
                loc_loss = F.binary_cross_entropy_with_logits(
                    per_frame_logits, labels)
                tot_loc_loss += loc_loss.data.item()

                # max-pool along time: B x C clip-level scores / targets
                predictions = torch.max(per_frame_logits, dim=2)[0]
                gt = torch.max(labels, dim=2)[0]

                # classification loss on the time-pooled tensors (reuses
                # predictions/gt instead of recomputing the same maxes)
                cls_loss = F.binary_cross_entropy_with_logits(predictions, gt)
                tot_cls_loss += cls_loss.data.item()

                # confusion matrix: row = ground truth, column = prediction
                for i in range(per_frame_logits.shape[0]):
                    confusion_matrix[torch.argmax(gt[i]).item(),
                                     torch.argmax(predictions[i]).item()] += 1

                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.data.item()
                if num_iter == num_steps_per_update // 2:
                    print(epoch, steps, loss.data.item())
                loss.backward()

                if num_iter == num_steps_per_update and phase == 'train':
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    if steps % 10 == 0:
                        acc = float(np.trace(confusion_matrix)) / np.sum(
                            confusion_matrix)
                        print(
                            'Epoch {} {} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f} Accu :{:.4f}'
                            .format(epoch, phase,
                                    tot_loc_loss / (10 * num_steps_per_update),
                                    tot_cls_loss / (10 * num_steps_per_update),
                                    tot_loss / 10, acc))
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.
            if phase == 'test':
                val_score = float(
                    np.trace(confusion_matrix)) / np.sum(confusion_matrix)
                # checkpoint on new best accuracy, and on every other epoch
                if val_score > best_val_score or epoch % 2 == 0:
                    best_val_score = val_score
                    model_name = save_model + "nslt_" + str(
                        num_classes) + "_" + str(steps).zfill(
                            6) + '_%3f.pt' % val_score

                    torch.save(i3d.module.state_dict(), model_name)
                    print(model_name)

                print(
                    'VALIDATION: {} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f} Accu :{:.4f}'
                    .format(phase, tot_loc_loss / num_iter,
                            tot_cls_loss / num_iter,
                            (tot_loss * num_steps_per_update) / num_iter,
                            val_score))

                scheduler.step(tot_loss * num_steps_per_update / num_iter)
Example #23
0
def run(init_lr=0.0001, max_steps=200, mode='rgb', root='/media/pritesh/Entertainment/Visual-Tactile_Dataset/dataset/',\
        train_split='train.txt', test_split='test.txt', batch_size=5, save_model=''):
    """Fine-tune an I3D network on the visual-tactile dataset.

    Runs ``max_steps`` epochs, each consisting of a 'train' and a 'val'
    phase.  The running classification loss of every batch is printed and
    appended to ``i3d_video.txt``, and a checkpoint is written after each
    phase as ``save_model + phase + <epoch>.pt``.

    Args:
        init_lr: initial SGD learning rate.
        max_steps: number of epochs (outer-loop iterations).
        mode: 'flow' (2 input channels) or anything else for RGB
            (3 input channels); selects the ImageNet-pretrained weights.
        root: dataset root directory passed to ``Dataset``.
        train_split: split file for the training set.
        test_split: split file for the validation set.
        batch_size: mini-batch size for both loaders.
        save_model: path prefix for saved checkpoints.
    """
    # Data augmentation: random crop + horizontal flip for training,
    # deterministic center crop for validation.
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=3,
                                             pin_memory=True)

    val_dataset = Dataset(test_split, root, mode, test_transforms)
    # Validation order does not affect the averaged loss; no shuffling.
    val_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 num_workers=3,
                                                 pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}

    # Build the model from the matching ImageNet-pretrained checkpoint and
    # replace the classifier with a single-logit head.
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    i3d.replace_logits(1)
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    optimizer = optim.SGD(i3d.parameters(),
                          lr=init_lr,
                          momentum=0.9,
                          weight_decay=0.0000001)
    # Milestones are epoch counts, so the scheduler is stepped once per
    # epoch below.  (Previously it was stepped after every batch, which
    # exhausted the [100, 140] schedule within the first epochs.)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [100, 140])

    with open('i3d_video.txt', 'w') as file:
        file.write("train and validation loss file\n")

    # Train it: one training pass and one validation pass per epoch.
    steps = 0
    while steps < max_steps:
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)

        for phase in ['train', 'val']:
            print('phase : {}'.format(phase))
            # train(True) for the training phase, evaluation mode otherwise.
            i3d.train(phase == 'train')

            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()

            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # Only the front-view video and the labels are used here;
                # the other modalities are unpacked and ignored.
                f_vid, l_vid, tactile, pos, labels = data
                inputs = f_vid.cuda()
                t = inputs.size(2)
                labels = labels.cuda()

                # Gradients are only needed while training; running the
                # validation forward pass under no_grad prevents the
                # validation set from updating the model (the original
                # code back-propagated and stepped the optimizer during
                # validation as well).
                with torch.set_grad_enabled(phase == 'train'):
                    per_frame_logits = i3d(inputs.float())
                    # Upsample the temporal logits back to the input length.
                    per_frame_logits = F.interpolate(per_frame_logits,
                                                     t,
                                                     mode='linear',
                                                     align_corners=False)
                    # Classification loss with max-pooling along time
                    # (B x C x T -> B x C), squeezed to a single logit.
                    per_frame_logits = torch.max(per_frame_logits, dim=2)[0]
                    per_frame_logits = per_frame_logits.squeeze(1)
                    cls_loss = F.binary_cross_entropy_with_logits(
                        per_frame_logits.double(), labels.double())

                tot_cls_loss += cls_loss.item()
                # Report the learning rate actually in use, not the
                # initial one (the original printed the stale init_lr).
                current_lr = optimizer.param_groups[0]['lr']
                print('{} Loss: {:.4f} and lr: {}'.format(
                    phase, tot_cls_loss / num_iter, current_lr))
                with open('i3d_video.txt', 'a') as file:
                    file.write("%f\n" % (tot_cls_loss / num_iter))

                if phase == 'train':
                    cls_loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()

            # Checkpoint after every phase of every epoch.
            torch.save(i3d.module.state_dict(),
                       save_model + phase + str(steps).zfill(6) + '.pt')
        # One scheduler step per epoch, matching the [100, 140] milestones.
        lr_sched.step()
        steps += 1