Example #1
0
def extract_attn_feats(model,
                       DATASET,
                       LABELS,
                       CLASS_IDS,
                       BATCH_SIZE,
                       SEQ_SIZE=16,
                       STEP=16,
                       partition='train',
                       nstrokes=-1,
                       base_name=""):
    '''
    Extract sequence features from AutoEncoder.
    
    Parameters:
    -----------
    encoder, decoder : attn_model.Encoder 
        relative path to the checkpoint file for Autoencoder
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    BATCH_SIZE : int
        size for batch of clips
    SEQ_SIZE : int
        no. of frames in a clip
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23) ...
    partition : str
        'train' / 'test' / 'val' : Videos to be considered
    nstrokes : int
        partial extraction of features (do not execute for entire dataset)
    base_name : str
        path containing the pickled feature dumps
    
    Returns:
    --------
    features_dictionary, stroke_names
    
    '''

    ###########################################################################
    # Read the strokes
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    #####################################################################

    if partition == 'train':
        partition_lst = train_lst
    elif partition == 'val':
        partition_lst = val_lst
    elif partition == 'test':
        partition_lst = test_lst
    else:
        print("Partition should be : train / val / test")
        return

    ###########################################################################
    # Create a Dataset
    # Clip level transform. Use this with framewiseTransform flag turned off
    clip_transform = transforms.Compose([
        videotransforms.CenterCrop(224),
        videotransforms.ToPILClip(),
        videotransforms.Resize((112, 112)),
        #                                         videotransforms.RandomCrop(112),
        videotransforms.ToTensor(),
        #                                         videotransforms.Normalize(),
        #videotransforms.RandomHorizontalFlip(),\
    ])
    part_dataset = CricketStrokesDataset(partition_lst,
                                         DATASET,
                                         LABELS,
                                         CLASS_IDS,
                                         frames_per_clip=SEQ_SIZE,
                                         step_between_clips=STEP,
                                         train=False,
                                         framewiseTransform=False,
                                         transform=clip_transform)

    data_loader = DataLoader(dataset=part_dataset,
                             batch_size=BATCH_SIZE,
                             shuffle=False)

    ###########################################################################
    # Validate / Evaluate
    model.eval()
    stroke_names = []
    trajectories, stroke_traj = [], []
    num_strokes = 0
    prev_stroke = None
    print("Total Batches : {} :: BATCH_SIZE : {}".format(
        data_loader.__len__(), BATCH_SIZE))
    ###########################################################################
    for bno, (inputs, vid_path, stroke, labels) in enumerate(data_loader):
        # inputs of shape BATCH x SEQ_LEN x FEATURE_DIM
        inputs = inputs.permute(0, 2, 1, 3, 4).float()
        inputs = inputs.to(device)
        #        print("Batch No : {} / {}".format(bno, len(data_loader)))
        # forward
        # track history if only in train
        with torch.set_grad_enabled(False):

            recon_x, mu, logvar = model(inputs)

#            dec_out_lst = []
#            dec_out_lst.append(out_mu)
#
#            outputs = torch.stack(dec_out_lst, dim=1)

# convert to start frames and end frames from tensors to lists
        stroke = [s.tolist() for s in stroke]
        # outputs are the reconstructed features. Use compressed enc_out values(maybe wtd.).
        inputs_lst, batch_stroke_names = autoenc_utils.separate_stroke_tensors(mu, \
                                                                    vid_path, stroke)

        # for sequence of features from batch segregated extracted features.
        if bno == 0:
            prev_stroke = batch_stroke_names[0]

        for enc_idx, enc_input in enumerate(inputs_lst):
            # get no of sequences that can be extracted from enc_input tensor
            nSeqs = enc_input.size(0)
            if prev_stroke != batch_stroke_names[enc_idx]:
                # append old stroke to trajectories
                if len(stroke_traj) > 0:
                    num_strokes += 1
                    trajectories.append(stroke_traj)
                    stroke_names.append(prev_stroke)
                    stroke_traj = []

#            enc_output = model.encoder(enc_input.to(device))
#            enc_output = enc_output.squeeze(axis=1).cpu().data.numpy()
            enc_output = enc_input.cpu().data.numpy()

            # convert to [[[stroke1(size 32 each) ... ], [], ...], [ [], ... ]]
            stroke_traj.extend(
                [enc_output[i, :] for i in range(enc_output.shape[0])])
            prev_stroke = batch_stroke_names[enc_idx]

        if nstrokes > -1 and num_strokes >= nstrokes:
            break

    # for last batch only if extracted for full dataset
    if len(stroke_traj) > 0 and nstrokes < 0:
        trajectories.append(stroke_traj)
        stroke_names.append(batch_stroke_names[-1])

    # convert to dictionary of features with keys as stroke names(with ext).
    features = {}
    for i, t in enumerate(trajectories):
        features[stroke_names[i]] = np.array(t)


#    trajectories, stroke_names = autoenc_utils.group_strokewise(trajectories, stroke_names)

    return features, stroke_names
Example #2
0
def main(DATASET,
         LABELS,
         CLASS_IDS,
         BATCH_SIZE,
         ANNOTATION_FILE,
         SEQ_SIZE=16,
         STEP=16,
         nstrokes=-1,
         N_EPOCHS=25,
         base_name=""):
    '''
    Extract sequence features from AutoEncoder.
    
    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    SEQ_SIZE : int
        no. of frames in a clip (min. 16 for 3D CNN extraction)
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23) ...
    partition : str
        'all' / 'train' / 'test' / 'val' : Videos to be considered
    nstrokes : int
        partial extraction of features (do not execute for entire dataset)
    
    Returns:
    --------
    trajectories, stroke_names
    
    '''
    ###########################################################################
    # seed everything

    if not os.path.isdir(base_name):
        os.makedirs(base_name)

    # Read the strokes
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    ###########################################################################
    # Create a Dataset
    # Clip level transform. Use this with framewiseTransform flag turned off
    clip_transform = transforms.Compose([
        videotransforms.CenterCrop(224),
        videotransforms.ToPILClip(),
        videotransforms.Resize((112, 112)),
        #                                         videotransforms.RandomCrop(112),
        videotransforms.ToTensor(),
        videotransforms.Normalize(),
        #videotransforms.RandomHorizontalFlip(),\
    ])
    # or use CricketStrokesFlowDataset
    train_dataset = CricketStrokesDataset(train_lst,
                                          DATASET,
                                          LABELS,
                                          CLASS_IDS,
                                          frames_per_clip=SEQ_SIZE,
                                          step_between_clips=STEP,
                                          train=True,
                                          framewiseTransform=False,
                                          transform=clip_transform)
    # or use CricketStrokesFlowDataset
    val_dataset = CricketStrokesDataset(val_lst,
                                        DATASET,
                                        LABELS,
                                        CLASS_IDS,
                                        frames_per_clip=SEQ_SIZE,
                                        step_between_clips=STEP,
                                        train=False,
                                        framewiseTransform=False,
                                        transform=clip_transform)

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True)

    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=False)

    data_loaders = {"train": train_loader, "test": val_loader}

    ###########################################################################

    labs_keys, labs_values = attn_utils.get_cluster_labels(ANNOTATION_FILE)

    num_classes = len(list(set(labs_values)))

    ###########################################################################
    # load model and set loss function
    model = conv_encdec_model.ConvVAE()

    model = model.to(device)
    #    # load checkpoint:

    # Setup the loss fxn
    criterion = nn.MSELoss()

    #    # Layers to finetune. Last layer should be displayed
    print("Params to learn:")
    params_to_update = []
    for name, param in model.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("ConvVAE : {}".format(name))

    # Observe that all parameters are being optimized


#    optimizer_ft = torch.optim.Adam(params_to_update, lr=0.001)
    optimizer_ft = torch.optim.SGD(params_to_update, lr=0.01, momentum=0.9)

    # Decay LR by a factor of 0.1 every 7 epochs
    lr_scheduler = StepLR(optimizer_ft, step_size=10, gamma=0.1)

    #    # Observe that all parameters are being optimized
    #    optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)

    ###########################################################################
    # Training the model
    start = time.time()

    model = train_model(model,
                        data_loaders,
                        criterion,
                        optimizer_ft,
                        lr_scheduler,
                        labs_keys,
                        labs_values,
                        seq=8,
                        num_epochs=N_EPOCHS)

    end = time.time()

    print("Total Execution time for {} epoch : {}".format(
        N_EPOCHS, (end - start)))
    ###########################################################################
    # Save only the model params
    model_name = os.path.join(base_name,
                              "conv_vae_ep" + str(N_EPOCHS) + "_SGD.pt")

    #    torch.save(model.state_dict(), model_name)
    #    print("Model saved to disk... : {}".format(model_name))    # Load model checkpoints

    # Loading the saved model
    model_name = os.path.join(base_name,
                              "conv_vae_ep" + str(N_EPOCHS) + "_SGD.pt")
    if os.path.isfile(model_name):
        model.load_state_dict(torch.load(model_name))
        print("Loading ConvVAE weights... : {}".format(model_name))

    ###########################################################################

    print("Writing prediction dictionary....")
    #    pred_out_dict = predict(encoder, decoder, data_loaders, criterion, labs_keys,
    #                            labs_values, phase='test')
    if not os.path.isfile(os.path.join(base_name, "conv_vae_train.pkl")):
        if not os.path.exists(base_name):
            os.makedirs(base_name)
        feats_dict, stroke_names = extract_attn_feats(model, DATASET, LABELS,
                                                      CLASS_IDS, BATCH_SIZE,
                                                      SEQ_SIZE, 16, 'train',
                                                      -1, base_name)
        with open(os.path.join(base_name, "conv_vae_train.pkl"), "wb") as fp:
            pickle.dump(feats_dict, fp)
        with open(os.path.join(base_name, "conv_vae_snames_train.pkl"),
                  "wb") as fp:
            pickle.dump(stroke_names, fp)
    if not os.path.isfile(os.path.join(base_name, "conv_vae_val.pkl")):
        if not os.path.exists(base_name):
            os.makedirs(base_name)
        feats_dict, stroke_names = extract_attn_feats(model, DATASET, LABELS,
                                                      CLASS_IDS, BATCH_SIZE,
                                                      SEQ_SIZE, 16, 'val', -1,
                                                      base_name)
        with open(os.path.join(base_name, "conv_vae_val.pkl"), "wb") as fp:
            pickle.dump(feats_dict, fp)
        with open(os.path.join(base_name, "conv_vae_snames_val.pkl"),
                  "wb") as fp:
            pickle.dump(stroke_names, fp)
    if not os.path.isfile(os.path.join(base_name, "conv_vae_test.pkl")):
        if not os.path.exists(base_name):
            os.makedirs(base_name)
        feats_dict, stroke_names = extract_attn_feats(model, DATASET, LABELS,
                                                      CLASS_IDS, BATCH_SIZE,
                                                      SEQ_SIZE, 16, 'test', -1,
                                                      base_name)
        with open(os.path.join(base_name, "conv_vae_test.pkl"), "wb") as fp:
            pickle.dump(feats_dict, fp)
        with open(os.path.join(base_name, "conv_vae_snames_test.pkl"),
                  "wb") as fp:
            pickle.dump(stroke_names, fp)

    print("#Parameters ConvVAE : {} ".format(
        autoenc_utils.count_parameters(model)))

    return model
Example #3
0
def finetune_3DCNN(DATASET, LABELS, CLASS_IDS, BATCH_SIZE, ANNOTATION_FILE, 
                   SEQ_SIZE=16, STEP=16, nstrokes=-1, N_EPOCHS=25, base_name=""):
    '''
    Extract sequence features from AutoEncoder.
    
    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    SEQ_SIZE : int
        no. of frames in a clip (min. 16 for 3D CNN extraction)
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23) ...
    partition : str
        'all' / 'train' / 'test' / 'val' : Videos to be considered
    nstrokes : int
        partial extraction of features (do not execute for entire dataset)
    
    Returns:
    --------
    trajectories, stroke_names
    
    '''
    
    ###########################################################################
    # Read the strokes 
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))
        
    ###########################################################################
    # Create a Dataset    
    # Clip level transform. Use this with framewiseTransform flag turned off
    clip_transform = transforms.Compose([videotransforms.CenterCrop(224),
                                         videotransforms.ToPILClip(), 
                                         videotransforms.Resize((112, 112)),
#                                         videotransforms.RandomCrop(112), 
                                         videotransforms.ToTensor(), 
                                         videotransforms.Normalize(),
                                        #videotransforms.RandomHorizontalFlip(),\
                                        ])

    train_dataset = CricketStrokesDataset(train_lst, DATASET, LABELS, CLASS_IDS, 
                                         frames_per_clip=SEQ_SIZE, 
                                         step_between_clips=STEP, train=True, 
                                         framewiseTransform=False,
                                         transform=clip_transform)
    val_dataset = CricketStrokesDataset(val_lst, DATASET, LABELS, CLASS_IDS, 
                                         frames_per_clip=SEQ_SIZE, 
                                         step_between_clips=STEP, train=False, 
                                         framewiseTransform=False,
                                         transform=clip_transform)
    
    train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    
    val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    data_loaders = {"train": train_loader, "test": val_loader}

    ###########################################################################
    
    labs_keys, labs_values = get_cluster_labels(ANNOTATION_FILE)
    
    num_classes = len(list(set(labs_values)))
    
    ###########################################################################    
    # load model and set loss function
    model = torchvision.models.video.r3d_18(pretrained=True, progress=True)
    
    for ft in model.parameters():
        ft.requires_grad = False
    
    inp_feat_size = model.fc.in_features
    model.fc = nn.Linear(inp_feat_size, num_classes)
    model = model.to(device)
    
    # load checkpoint:
    if os.path.isfile(os.path.join(base_name, "3dresnet18_ep"+str(N_EPOCHS)+"_Adam.pt")):
        model.load_state_dict(torch.load(os.path.join(base_name, "3dresnet18_ep"+str(N_EPOCHS)+"_Adam.pt")))
    
    # Setup the loss fxn
    criterion = nn.CrossEntropyLoss()
    
    # Layers to finetune. Last layer should be displayed
    params_to_update = model.parameters()
    print("Params to learn:")
    
    params_to_update = []
    for name, param in model.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("\t",name)
    
    # Observe that all parameters are being optimized
    optimizer_ft = torch.optim.Adam(params_to_update, lr=0.001)
    
    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = StepLR(optimizer_ft, step_size=8, gamma=0.1)
    
#    # Observe that all parameters are being optimized
#    optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)
    
#    ###########################################################################
#    # Training the model    
#    
#    start = time.time()
#    
#    model_ft = train_model(model, data_loaders, criterion, optimizer_ft, 
#                           exp_lr_scheduler, labs_keys, labs_values,
#                           num_epochs=N_EPOCHS)
#        
#    end = time.time()
#    
#    # save the best performing model
#    save_model_checkpoint(base_name, model_ft, N_EPOCHS, "Adam")
#    
#    print("Total Execution time for {} epoch : {}".format(N_EPOCHS, (end-start)))
    
    ###########################################################################
    # Validate / Evaluate
    stroke_names = []
    trajectories, stroke_traj = [], []
    num_strokes = 0
    model = model.eval()
#    extractor = Clip2Vec()
    #INPUT_SIZE = extractor.layer_output_size
    prev_stroke = None
    
    print("Total Batches : {} :: BATCH_SIZE : {}".format(data_loaders['test'].__len__(), BATCH_SIZE))
    assert SEQ_SIZE>=16, "SEQ_SIZE should be >= 16"
    for bno, (inputs, vid_path, stroke, _) in enumerate(data_loaders['test']):
        # get video clips (B, SL, C, H, W)
        print("Batch No : {}".format(bno))
        # Extract spatio-temporal features from clip using 3D ResNet (For SL >= 16)
        inputs = inputs.permute(0, 2, 1, 3, 4).float()
        inputs = extractor.get_vec(inputs)
        
        # convert to start frames and end frames from tensors to lists
        stroke = [s.tolist() for s in stroke]
        inputs_lst, batch_stroke_names = autoenc_utils.separate_stroke_tensors(inputs, \
                                                                    vid_path, stroke)
        
        if bno == 0:
            prev_stroke = batch_stroke_names[0]
        
        for enc_idx, enc_input in enumerate(inputs_lst):
            # get no of sequences that can be extracted from enc_input tensor
            nSeqs = enc_input.size(0)
            if prev_stroke != batch_stroke_names[enc_idx]:
                # append old stroke to trajectories
                if len(stroke_traj) > 0:
                    num_strokes += 1
                    trajectories.append(stroke_traj)
                    stroke_names.append(prev_stroke)
                    stroke_traj = []
            
            enc_output = enc_input
            enc_output = enc_output.squeeze(axis=1).cpu().data.numpy()
            # convert to [[[stroke1(size 32 each) ... ], [], ...], [ [], ... ]]
            stroke_traj.extend([enc_output[i, :] for i in range(enc_output.shape[0])])
            prev_stroke = batch_stroke_names[enc_idx]
            
                
        if nstrokes >=-1 and num_strokes == nstrokes:
            break
       
    # for last batch only if extracted for full dataset
    if len(stroke_traj) > 0 and nstrokes < 0:
        trajectories.append(stroke_traj)
        stroke_names.append(batch_stroke_names[-1])
        
    trajectories, stroke_names = autoenc_utils.group_strokewise(trajectories, stroke_names)
    #stroke_vecs, stroke_names =  aggregate_outputs(sequence_outputs, seq_stroke_names)
    return trajectories, stroke_names
def extract_3DCNN_feats(DATASET, LABELS, CLASS_IDS, BATCH_SIZE, SEQ_SIZE=16, STEP=16, \
                        model_path=None, nclasses=5, partition='all', nstrokes=-1):
    '''
    Extract sequence features from AutoEncoder.
    
    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    SEQ_SIZE : int
        no. of frames in a clip (min. 16 for 3D CNN extraction)
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23) ...
    partition : str
        'all' / 'train' / 'test' / 'val' : Videos to be considered
    nstrokes : int
        partial extraction of features (do not execute for entire dataset)
    
    Returns:
    --------
    trajectories, stroke_names
    
    '''
    
    ###########################################################################
    # Read the strokes 
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))
    
    #####################################################################
    
    if partition == 'all':
        partition_lst = train_lst
        partition_lst.extend(val_lst)
        partition_lst.extend(test_lst)
    elif partition == 'train':
        partition_lst = train_lst
    elif partition == 'val':
        partition_lst = val_lst
    elif partition == 'test':
        partition_lst = test_lst
        
    ###########################################################################
    # Create a Dataset    
    # Clip level transform. Use this with framewiseTransform flag turned off
    clip_transform = transforms.Compose([videotransforms.CenterCrop(224),
                                         videotransforms.ToPILClip(), 
                                         videotransforms.Resize((112, 112)),
#                                         videotransforms.RandomCrop(112), 
                                         videotransforms.ToTensor(), 
                                         videotransforms.Normalize(),
                                        #videotransforms.RandomHorizontalFlip(),\
                                        ])

    part_dataset = CricketStrokesDataset(partition_lst, DATASET, LABELS, CLASS_IDS, 
                                         frames_per_clip=SEQ_SIZE, 
                                         step_between_clips=STEP, train=True, 
                                         framewiseTransform=False,
                                         transform=clip_transform)
    
    data_loader = DataLoader(dataset=part_dataset, batch_size=BATCH_SIZE, shuffle=False)

    ###########################################################################
    # Validate / Evaluate
    stroke_names = []
    trajectories, stroke_traj = [], []
    num_strokes = 0
    extractor = Clip2Vec(model_path, nclasses)
    #INPUT_SIZE = extractor.layer_output_size
    prev_stroke = None
    
    print("Total Batches : {} :: BATCH_SIZE : {}".format(data_loader.__len__(), BATCH_SIZE))
    assert SEQ_SIZE>=16, "SEQ_SIZE should be >= 16"
    for bno, (inputs, vid_path, stroke, _) in enumerate(data_loader):
        # get video clips (B, SL, C, H, W)
        print("Batch No : {}".format(bno))
        # Extract spatio-temporal features from clip using 3D ResNet (For SL >= 16)
        inputs = inputs.permute(0, 2, 1, 3, 4).float()
        inputs = extractor.get_vec(inputs)
        
        # convert to start frames and end frames from tensors to lists
        stroke = [s.tolist() for s in stroke]
        inputs_lst, batch_stroke_names = autoenc_utils.separate_stroke_tensors(inputs, \
                                                                    vid_path, stroke)
        
        if bno == 0:
            prev_stroke = batch_stroke_names[0]
        
        for enc_idx, enc_input in enumerate(inputs_lst):
            # get no of sequences that can be extracted from enc_input tensor
            nSeqs = enc_input.size(0)
            if prev_stroke != batch_stroke_names[enc_idx]:
                # append old stroke to trajectories
                if len(stroke_traj) > 0:
                    num_strokes += 1
                    trajectories.append(stroke_traj)
                    stroke_names.append(prev_stroke)
                    stroke_traj = []
            
            enc_output = enc_input
            enc_output = enc_output.squeeze(axis=1).cpu().data.numpy()
            # convert to [[[stroke1(size 32 each) ... ], [], ...], [ [], ... ]]
            stroke_traj.extend([enc_output[i, :] for i in range(enc_output.shape[0])])
            prev_stroke = batch_stroke_names[enc_idx]
            
                
        if nstrokes >=-1 and num_strokes == nstrokes:
            break
       
    # for last batch only if extracted for full dataset
    if len(stroke_traj) > 0 and nstrokes < 0:
        trajectories.append(stroke_traj)
        stroke_names.append(batch_stroke_names[-1])
        
    trajectories, stroke_names = autoenc_utils.group_strokewise(trajectories, stroke_names)
    #stroke_vecs, stroke_names =  aggregate_outputs(sequence_outputs, seq_stroke_names)
    return trajectories, stroke_names
def main(DATASET,
         LABELS,
         CLASS_IDS,
         BATCH_SIZE,
         ANNOTATION_FILE,
         SEQ_SIZE=16,
         STEP=16,
         nstrokes=-1,
         N_EPOCHS=25,
         base_name=""):
    '''
    Extract sequence features from AutoEncoder.
    
    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    SEQ_SIZE : int
        no. of frames in a clip (min. 16 for 3D CNN extraction)
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23) ...
    partition : str
        'all' / 'train' / 'test' / 'val' : Videos to be considered
    nstrokes : int
        partial extraction of features (do not execute for entire dataset)
    
    Returns:
    --------
    trajectories, stroke_names
    
    '''
    ###########################################################################
    # seed everything
    seed = 1234
    attn_utils.seed_everything(seed)
    if not os.path.isdir(base_name):
        os.makedirs(base_name)

    # Read the strokes
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    ###########################################################################
    # Create a Dataset
    # Clip level transform. Use this with framewiseTransform flag turned off
    train_transform = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.ToPILClip(),
        videotransforms.Resize((112, 112)),
        videotransforms.ToTensor(),
        videotransforms.Normalize(),
        #videotransforms.RandomHorizontalFlip(),\
    ])
    test_transform = transforms.Compose([
        videotransforms.CenterCrop(224),
        videotransforms.ToPILClip(),
        videotransforms.Resize((112, 112)),
        videotransforms.ToTensor(),
        videotransforms.Normalize(),
        #videotransforms.RandomHorizontalFlip(),\
    ])
    train_dataset = CricketStrokesDataset(train_lst,
                                          DATASET,
                                          LABELS,
                                          CLASS_IDS,
                                          frames_per_clip=SEQ_SIZE,
                                          step_between_clips=STEP,
                                          train=True,
                                          framewiseTransform=False,
                                          transform=train_transform)
    val_dataset = CricketStrokesDataset(val_lst,
                                        DATASET,
                                        LABELS,
                                        CLASS_IDS,
                                        frames_per_clip=SEQ_SIZE,
                                        step_between_clips=STEP,
                                        train=False,
                                        framewiseTransform=False,
                                        transform=test_transform)

    ###########################################################################

    labs_keys, labs_values = attn_utils.get_cluster_labels(ANNOTATION_FILE)

    num_classes = len(list(set(labs_values)))

    # created weighted Sampler for class imbalance
    if not os.path.isfile(
            os.path.join(
                base_name, "weights_c" + str(num_classes) + "_" +
                str(len(train_dataset)) + ".pkl")):
        samples_weight = attn_utils.get_sample_weights(train_dataset,
                                                       labs_keys, labs_values,
                                                       train_lst)
        with open(
                os.path.join(
                    base_name, "weights_c" + str(num_classes) + "_" +
                    str(len(train_dataset)) + ".pkl"), "wb") as fp:
            pickle.dump(samples_weight, fp)
    with open(
            os.path.join(
                base_name, "weights_c" + str(num_classes) + "_" +
                str(len(train_dataset)) + ".pkl"), "rb") as fp:
        samples_weight = pickle.load(fp)
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              sampler=sampler,
                              worker_init_fn=np.random.seed(12))

    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=False)

    data_loaders = {"train": train_loader, "test": val_loader}

    ###########################################################################
    # load model and set loss function
    encoder = conv_attn_model.Conv3DEncoder(HIDDEN_SIZE, 1, bidirectional)
    #    encoder = conv_attn_model.Conv3DAttention(HIDDEN_SIZE, num_classes, 1, 196, bidirectional)
    decoder = conv_attn_model.Conv3DDecoder(HIDDEN_SIZE, num_classes, 1, 1,
                                            bidirectional)
    #    decoder = conv_encdec_model.Conv3DDecoder(HIDDEN_SIZE, HIDDEN_SIZE, 1, 196, bidirectional)
    #    model = attn_model.Encoder(10, 20, bidirectional)

    #    for ft in model.parameters():
    #        ft.requires_grad = False
    #    inp_feat_size = model.fc.in_features
    #    model.fc = nn.Linear(inp_feat_size, num_classes)
    #    model = model.to(device)
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    #    # load checkpoint:

    # Setup the loss fxn
    criterion = nn.CrossEntropyLoss()
    #    criterion = nn.MSELoss()

    #    # Layers to finetune. Last layer should be displayed
    print("Params to learn:")
    params_to_update = []
    for name, param in encoder.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("Encoder : {}".format(name))
    for name, param in decoder.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("Decoder : {}".format(name))

    # Observe that all parameters are being optimized


#    optimizer_ft = torch.optim.Adam(params_to_update, lr=0.001)
#    optimizer_ft = torch.optim.SGD(params_to_update, lr=0.01, momentum=0.9)
    encoder_optimizer = torch.optim.SGD(encoder.parameters(),
                                        lr=0.01,
                                        momentum=0.9)
    decoder_optimizer = torch.optim.SGD(decoder.parameters(),
                                        lr=0.01,
                                        momentum=0.9)
    #    decoder_optimizer = None

    # Decay LR by a factor of 0.1 every 7 epochs
    lr_scheduler = StepLR(encoder_optimizer, step_size=10, gamma=0.1)

    #    # Observe that all parameters are being optimized
    #    optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)

    #    ###########################################################################
    # Training the model
    start = time.time()

    (encoder, decoder) = train_model(encoder,
                                     decoder,
                                     data_loaders,
                                     criterion,
                                     encoder_optimizer,
                                     decoder_optimizer,
                                     lr_scheduler,
                                     labs_keys,
                                     labs_values,
                                     num_epochs=N_EPOCHS)

    end = time.time()

    # save the best performing model
    attn_utils.save_attn_model_checkpoint(base_name, (encoder, decoder),
                                          N_EPOCHS, "SGD")
    # Load model checkpoints
    encoder, decoder = attn_utils.load_attn_model_checkpoint(
        base_name, encoder, decoder, N_EPOCHS, "SGD")

    print("Total Execution time for {} epoch : {}".format(
        N_EPOCHS, (end - start)))

    ###########################################################################

    #    features_val, stroke_names_id_val = attn_utils.read_feats(os.path.join(base_name, ft_dir),
    #                                                              feat_val, snames_val)
    print("Writing prediction dictionary....")
    pred_out_dict, acc = predict(encoder,
                                 decoder,
                                 data_loaders,
                                 criterion,
                                 labs_keys,
                                 labs_values,
                                 phase='test')

    with open(os.path.join(base_name, "pred_dict.pkl"), "wb") as fp:
        pickle.dump(pred_out_dict, fp)

    # save the output wts and related information
    print("#Parameters Encoder : {} ".format(
        autoenc_utils.count_parameters(encoder)))
    print("#Parameters Decoder : {} ".format(
        autoenc_utils.count_parameters(decoder)))

    return encoder, decoder
Example #6
0
def main(DATASET,
         LABELS,
         CLASS_IDS,
         BATCH_SIZE,
         ANNOTATION_FILE,
         SEQ_SIZE=16,
         STEP=16,
         nstrokes=-1,
         N_EPOCHS=25,
         base_name=""):
    '''
    Extract sequence features from AutoEncoder.
    
    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    SEQ_SIZE : int
        no. of frames in a clip (min. 16 for 3D CNN extraction)
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23) ...
    partition : str
        'all' / 'train' / 'test' / 'val' : Videos to be considered
    nstrokes : int
        partial extraction of features (do not execute for entire dataset)
    
    Returns:
    --------
    trajectories, stroke_names
    
    '''
    if not os.path.isdir(base_name):
        os.makedirs(base_name)
    seed = 1234
    attn_utils.seed_everything(seed)
    ###########################################################################
    # Read the strokes
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    ###########################################################################
    # Create a Dataset
    # Clip level transform. Use this with framewiseTransform flag turned off
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(300),
        videotransforms.ToPILClip(),
        videotransforms.Resize((112, 112)),
        videotransforms.ToTensor(),
        videotransforms.Normalize(),
        #                                           videotransforms.ScaledNormMinMax(),
    ])
    test_transforms = transforms.Compose([
        videotransforms.CenterCrop(300),
        videotransforms.ToPILClip(),
        videotransforms.Resize((112, 112)),
        videotransforms.ToTensor(),
        videotransforms.Normalize(),
        #                                          videotransforms.ScaledNormMinMax(),
    ])
    train_dataset = CricketStrokesDataset(train_lst,
                                          DATASET,
                                          LABELS,
                                          CLASS_IDS,
                                          frames_per_clip=SEQ_SIZE,
                                          step_between_clips=STEP,
                                          train=True,
                                          framewiseTransform=False,
                                          transform=train_transforms)
    val_dataset = CricketStrokesDataset(val_lst,
                                        DATASET,
                                        LABELS,
                                        CLASS_IDS,
                                        frames_per_clip=SEQ_SIZE,
                                        step_between_clips=STEP,
                                        train=False,
                                        framewiseTransform=False,
                                        transform=test_transforms)

    ###########################################################################

    labs_keys, labs_values = attn_utils.get_cluster_labels(ANNOTATION_FILE)

    num_classes = len(list(set(labs_values)))

    # created weighted Sampler for class imbalance
    if not os.path.isfile(
            os.path.join(
                base_name, "weights_c" + str(num_classes) + "_" +
                str(len(train_dataset)) + ".pkl")):
        samples_weight = attn_utils.get_sample_weights(train_dataset,
                                                       labs_keys, labs_values,
                                                       train_lst)
        with open(
                os.path.join(
                    base_name, "weights_c" + str(num_classes) + "_" +
                    str(len(train_dataset)) + ".pkl"), "wb") as fp:
            pickle.dump(samples_weight, fp)
    with open(
            os.path.join(
                base_name, "weights_c" + str(num_classes) + "_" +
                str(len(train_dataset)) + ".pkl"), "rb") as fp:
        samples_weight = pickle.load(fp)
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              sampler=sampler,
                              worker_init_fn=np.random.seed(12))

    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=False)

    data_loaders = {"train": train_loader, "test": val_loader}

    ###########################################################################
    # load model and set loss function
    model = conv_attn_model.C3DGRUv2Orig(HIDDEN_SIZE, 1, num_classes,
                                         bidirectional)
    model_pretrained = c3d.C3D()
    model_pretrained.load_state_dict(
        torch.load("../localization_rnn/" + wts_path))
    #    model_pretrained = c3d_pre.C3D()
    #    model_pretrained.fc8 = nn.Linear(4096, 5)
    #    model_pretrained.load_state_dict(torch.load(pretrained_c3d_wts))
    copy_pretrained_weights(model_pretrained, model)
    # reset the last layer (default requires_grad is True)
    #    model.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
    #    for ft in model.parameters():
    #        ft.requires_grad = False
    #    inp_feat_size = model.fc.in_features
    #    model.fc = nn.Linear(inp_feat_size, num_classes)
    model = model.to(device)

    # Setup the loss fxn
    criterion = nn.CrossEntropyLoss()
    #    criterion = nn.MSELoss()

    #    # Layers to finetune. Last layer should be displayed
    print("Params to learn:")
    params_to_update = []
    for name, param in model.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("\t {}".format(name))

    # Observe that all parameters are being optimized


#    optimizer_ft = torch.optim.Adam(params_to_update, lr=0.01)
    optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)

    # Decay LR by a factor of 0.1 every 7 epochs
    lr_scheduler = StepLR(optimizer_ft, step_size=30, gamma=0.1)

    ###########################################################################
    # Training the model
    start = time.time()

    model = train_model(model,
                        data_loaders,
                        criterion,
                        optimizer_ft,
                        lr_scheduler,
                        labs_keys,
                        labs_values,
                        num_epochs=N_EPOCHS)

    end = time.time()

    # save the best performing model
    attn_utils.save_model_checkpoint(base_name, model, N_EPOCHS,
                                     "SGD_c8_c3dgruEp60Step30")
    # Load model checkpoints
    model = attn_utils.load_weights(base_name, model, N_EPOCHS,
                                    "SGD_c8_c3dgruEp60Step30")

    print("Total Execution time for {} epoch : {}".format(
        N_EPOCHS, (end - start)))

    #    ###########################################################################

    print("Predicting ...")
    acc = predict(model, data_loaders, labs_keys, labs_values, phase='test')

    print("#Parameters : {} ".format(autoenc_utils.count_parameters(model)))

    return model
Example #7
0
def main(DATASET,
         LABELS,
         CLASS_IDS,
         BATCH_SIZE,
         ANNOTATION_FILE,
         SEQ_SIZE=16,
         STEP=16,
         nstrokes=-1,
         N_EPOCHS=25,
         base_name=''):
    '''
    Extract sequence features from AutoEncoder.
    
    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    SEQ_SIZE : int
        no. of frames in a clip (min. 16 for 3D CNN extraction)
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23) ...
    partition : str
        'all' / 'train' / 'test' / 'val' : Videos to be considered
    nstrokes : int
        partial extraction of features (do not execute for entire dataset)
    
    Returns:
    --------
    trajectories, stroke_names
    
    '''
    ###########################################################################

    attn_utils.seed_everything(1234)

    if not os.path.isdir(log_path):
        os.makedirs(log_path)

    # Read the strokes
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    #    features, stroke_names_id = attn_utils.read_feats(feat_path, feat, snames)

    #    ###########################################################################
    #
    #    features_val, stroke_names_id_val = attn_utils.read_feats(feat_path, feat_val,
    #                                                              snames_val)
    ###########################################################################
    # Create a Dataset
    train_transforms = transforms.Compose([
        T.CenterCrop(300),
        T.ToPILClip(),
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize(),
    ])
    test_transforms = transforms.Compose([
        T.CenterCrop(300),
        T.ToPILClip(),
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize(),
    ])

    #    ft_path = os.path.join(base_name, feat_path, feat)
    train_dataset = CricketStrokeClipsDataset(train_lst,
                                              DATASET,
                                              LABELS,
                                              CLASS_IDS,
                                              frames_per_clip=SEQ_SIZE,
                                              extracted_frames_per_clip=16,
                                              step_between_clips=STEP,
                                              train=True,
                                              framewiseTransform=False,
                                              transform=train_transforms)
    #    ft_path_val = os.path.join(base_name, feat_path, feat_val)
    val_dataset = CricketStrokeClipsDataset(val_lst,
                                            DATASET,
                                            LABELS,
                                            CLASS_IDS,
                                            frames_per_clip=SEQ_SIZE,
                                            extracted_frames_per_clip=16,
                                            step_between_clips=STEP,
                                            train=False,
                                            framewiseTransform=False,
                                            transform=test_transforms)

    # get labels
    labs_keys, labs_values = attn_utils.get_cluster_labels(ANNOTATION_FILE)
    #    # created weighted Sampler for class imbalance
    #    samples_weight = attn_utils.get_sample_weights(train_dataset, labs_keys, labs_values,
    #                                                   train_lst)
    #    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True)
    #                              sampler=sampler, worker_init_fn=np.random.seed(12))

    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=False)

    data_loaders = {"train": train_loader, "test": val_loader}

    num_classes = len(list(set(labs_values)))

    ###########################################################################

    # load model and set loss function
    model = siamese_net.SiameseI3DNet(400, in_channels=3)
    model.i3d.load_state_dict(
        torch.load(
            '/home/arpan/VisionWorkspace/pytorch-i3d/models/rgb_imagenet.pt'))
    #    model.i3d.replace_logits(2)
    #    model = load_weights(log_path, model, N_EPOCHS,
    #                                    "S"+str(SEQ_SIZE)+"C"+str(cluster_size)+"_SGD")

    lr = 0.1
    optimizer = optim.SGD(model.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.0000001)
    #    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [10, 25]) # [300, 1000])
    # Decay LR by a factor of 0.1 every 7 epochs
    lr_sched = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    # Setup the loss fxn
    criterion = ContrastiveLoss()
    model = model.to(device)
    #    print("Params to learn:")
    params_to_update = []
    for name, param in model.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("\t", name)


#    # Observe that all parameters are being optimized
##    optimizer_ft = torch.optim.Adam(model.parameters(), lr=0.001)
#    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
#
#    # Decay LR by a factor of 0.1 every 7 epochs
#    scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

#    lr = 5.0 # learning rate
#    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
#    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
###########################################################################
# Training the model

    start = time.time()

    model = train_model(model,
                        data_loaders,
                        criterion,
                        optimizer,
                        lr_sched,
                        labs_keys,
                        labs_values,
                        num_epochs=N_EPOCHS)

    end = time.time()

    #    # save the best performing model
    save_model_checkpoint(log_path, model, N_EPOCHS,
                          "S" + str(SEQ_SIZE) + "_SGD")
    # Load model checkpoints
    model = load_weights(log_path, model, N_EPOCHS,
                         "S" + str(SEQ_SIZE) + "_SGD")

    print("Total Execution time for {} epoch : {}".format(
        N_EPOCHS, (end - start)))

    #    ###########################################################################

    #    acc = predict(features_val, stroke_names_id_val, model, data_loaders, labs_keys,
    #                  labs_values, SEQ_SIZE, phase='test')

    ###########################################################################

    #    # Extract attention model features
    #    if not os.path.isfile(os.path.join(log_path, "siamgru_feats.pkl")):
    #        if not os.path.exists(log_path):
    #            os.makedirs(log_path)
    #        #    # Extract Grid OF / HOOF features {mth = 2, and vary nbins}
    #        print("Training extraction ... ")
    #        feats_dict, stroke_names = extract_trans_feats(model, DATASET, LABELS,
    #                                                      CLASS_IDS, BATCH_SIZE, SEQ_SIZE,
    #                                                      SEQ_SIZE-1, partition='train', nstrokes=nstrokes,
    #                                                      base_name=log_path)
    #
    #        with open(os.path.join(log_path, "siamgru_feats.pkl"), "wb") as fp:
    #            pickle.dump(feats_dict, fp)
    #        with open(os.path.join(log_path, "siamgru_snames.pkl"), "wb") as fp:
    #            pickle.dump(stroke_names, fp)
    #
    #    if not os.path.isfile(os.path.join(log_path, "siamgru_feats_val.pkl")):
    #        print("Validation extraction ....")
    #        feats_dict_val, stroke_names_val = extract_trans_feats(model, DATASET, LABELS,
    #                                                      CLASS_IDS, BATCH_SIZE, SEQ_SIZE,
    #                                                      SEQ_SIZE-1, partition='val', nstrokes=nstrokes,
    #                                                      base_name=log_path)
    #
    #        with open(os.path.join(log_path, "siamgru_feats_val.pkl"), "wb") as fp:
    #            pickle.dump(feats_dict_val, fp)
    #        with open(os.path.join(log_path, "siamgru_snames_val.pkl"), "wb") as fp:
    #            pickle.dump(stroke_names_val, fp)
    #
    #    if not os.path.isfile(os.path.join(log_path, "siamgru_feats_test.pkl")):
    #        print("Testing extraction ....")
    #        feats_dict_val, stroke_names_val = extract_trans_feats(model, DATASET, LABELS,
    #                                                      CLASS_IDS, BATCH_SIZE, SEQ_SIZE,
    #                                                      SEQ_SIZE-1, partition='test', nstrokes=nstrokes,
    #                                                      base_name=log_path)
    #
    #        with open(os.path.join(log_path, "siamgru_feats_test.pkl"), "wb") as fp:
    #            pickle.dump(feats_dict_val, fp)
    #        with open(os.path.join(log_path, "siamgru_snames_test.pkl"), "wb") as fp:
    #            pickle.dump(stroke_names_val, fp)

    # call count_paramters(model)  for displaying total no. of parameters
    print("#Parameters : {} ".format(autoenc_utils.count_parameters(model)))
    return 0
Example #8
0
def extract_3DCNN_feats(DATASET, LABELS, BATCH_SIZE, SEQ_SIZE=16, STEP=16, \
                        foldno=1, train=True, nclasses=51, model_path=None, \
                        nstrokes=-1):
    '''
    Extract sequence features from AutoEncoder.
    
    Parameters:
    -----------
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to txt file defining classes, similar to THUMOS
    BATCH_SIZE : int
        size for batch of clips
    SEQ_SIZE : int
        no. of frames in a clip (min. 16 for 3D CNN extraction)
    STEP : int
        stride for next example. If SEQ_SIZE=16, STEP=8, use frames (0, 15), (8, 23) ...
    partition : str
        'all' / 'train' / 'test' / 'val' : Videos to be considered
    nstrokes : int
        partial extraction of features (do not execute for entire dataset)
    
    Returns:
    --------
    trajectories, stroke_names
    
    '''

    ###########################################################################
    ###########################################################################
    # Create a Dataset
    # Clip level transform. Use this with framewiseTransform flag turned off
    clip_transform = transforms.Compose([
        T.CenterCrop(224),
        T.ToPILClip(),
        T.Resize((112, 112)),
        #                                         T.RandomCrop(112),
        T.ToHMDBTensor(),
        #                                         T.Normalize(),
        #T.RandomHorizontalFlip(),\
    ])
    part_dataset = hmdb.HMDB51(DATASET,
                               LABELS,
                               SEQ_SIZE,
                               step_between_clips=STEP,
                               fold=foldno,
                               train=train,
                               transform=clip_transform)

    data_loader = DataLoader(dataset=part_dataset,
                             batch_size=BATCH_SIZE,
                             shuffle=False)

    ###########################################################################
    # Validate / Evaluate
    stroke_names = []
    trajectories, stroke_traj = [], []
    num_strokes = 0
    extractor = Clip2Vec(model_path, nclasses)
    #INPUT_SIZE = extractor.layer_output_size
    prev_stroke = None

    print("Total Batches : {} :: BATCH_SIZE : {}".format(
        data_loader.__len__(), BATCH_SIZE))
    assert SEQ_SIZE >= 16, "SEQ_SIZE should be >= 16"
    for bno, (inputs, vid_path, start_pts, end_pts,
              _) in enumerate(data_loader):
        # get video clips (B, SL, C, H, W)
        print("Batch No : {}".format(bno))
        # Extract spatio-temporal features from clip using 3D ResNet (For SL >= 16)
        inputs = inputs.permute(0, 2, 1, 3, 4).float()
        inputs = extractor.get_vec(inputs)

        # convert to start frames and end frames from tensors to lists
        inputs_lst, batch_stroke_names = separate_video_tensors(
            inputs, vid_path)

        if bno == 0:
            prev_stroke = batch_stroke_names[0]

        for enc_idx, enc_input in enumerate(inputs_lst):
            # get no of sequences that can be extracted from enc_input tensor
            if prev_stroke != batch_stroke_names[enc_idx]:
                # append old stroke to trajectories
                if len(stroke_traj) > 0:
                    num_strokes += 1
                    trajectories.append(stroke_traj)
                    stroke_names.append(prev_stroke)
                    stroke_traj = []

            enc_output = enc_input
            enc_output = enc_output.squeeze(axis=1).cpu().data.numpy()
            # convert to [[[stroke1(size 32 each) ... ], [], ...], [ [], ... ]]
            stroke_traj.extend(
                [enc_output[i, :] for i in range(enc_output.shape[0])])
            prev_stroke = batch_stroke_names[enc_idx]

        if nstrokes >= -1 and num_strokes == nstrokes:
            break

    # for last batch only if extracted for full dataset
    if len(stroke_traj) > 0 and nstrokes < 0:
        trajectories.append(stroke_traj)
        stroke_names.append(batch_stroke_names[-1])

    # group_strokewise not needed here as videos are trimmed. Was needed for strokes
    # to generate a lists of stroke feature lists for an untrimmed video.


#    trajectories, stroke_names = group_strokewise(trajectories, stroke_names)

    traj_dict = {}
    for i, vid in enumerate(stroke_names):
        traj_dict[vid] = np.array(trajectories[i])
    return traj_dict, stroke_names
Example #9
0
def run(init_lr=0.1, max_steps=64e3, mode='rgb', root='', batch_size=32, save_model='i3dIter1k_'):
    
    num_epochs = 30
    seed_everything()
    if not os.path.isdir(log_path):
        os.makedirs(log_path)
    
    # setup dataset
    train_transforms = transforms.Compose([T.RandomCrop(224),
                                         T.ToPILClip(), 
                                         T.Resize((224, 224)),
#                                         T.RandomCrop(112), 
                                         T.ToTensor(), 
                                         T.Normalize(),
                                        #T.RandomHorizontalFlip(),\
                                        ])
    test_transforms = transforms.Compose([T.CenterCrop(224),
                                         T.ToPILClip(), 
                                         T.Resize((224, 224)),
#                                         T.RandomCrop(112), 
                                         T.ToTensor(), 
                                         T.Normalize(),
                                        #T.RandomHorizontalFlip(),\
                                        ])    
#    train_transforms = transforms.Compose([T.RandomCrop(224),
#                                           T.RandomHorizontalFlip(),
#    ])
#    test_transforms = transforms.Compose([T.CenterCrop(224)])

    dataset = HMDB51(DATASET, LABELS, 16, step_between_clips = 1, 
                     fold=1, train=True, transform=train_transforms)
#    samples_weight = get_hmdb_sample_weights(dataset)
#    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
#    dataset = Dataset(train_split, 'training', root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)#sampler=sampler, worker_init_fn=np.random.seed(12)) #shuffle=True) #, num_workers=36, pin_memory=True)
    val_dataset = HMDB51(DATASET, LABELS, 16, step_between_clips = 1, 
                     fold=1, train=False, transform=test_transforms)
#    val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False) #, num_workers=36, pin_memory=True)    

    dataloaders = {'train': dataloader, 'test': val_dataloader}
    datasets = {'train': dataset, 'test': val_dataset}

#    vis_samples(dataset, True)
    
    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('/home/arpan/VisionWorkspace/pytorch-i3d/models/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('/home/arpan/VisionWorkspace/pytorch-i3d/models/rgb_imagenet.pt'))
    i3d.replace_logits(51)
    #i3d.load_state_dict(torch.load('/ssd/models/000920.pt'))
    i3d = i3d.to(device)
#    i3d = nn.DataParallel(i3d)

    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(), lr=lr, momentum=0.9, weight_decay=0.0000001)
#    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [10, 25]) # [300, 1000])
    # Decay LR by a factor of 0.1 every 7 epochs
    lr_sched = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
#    criterion = nn.CrossEntropyLoss()
    num_steps_per_update = 4 # accum gradient
    steps = 0
    # train it
    start = time.time()
#    print("No. of Iters / Epoch ; {}".format(len(dataloaders['train'])))
#    for epoch in range(num_epochs): #while steps < max_steps:
##        print( 'Step {}/{}'.format(steps, max_steps))
#        print('Epoch {}/{}'.format(epoch+1, num_epochs))
#        print('-' * 10)
#
#        # Each epoch has a training and validation phase
#        for phase in ['train', 'test']:
#            if phase == 'train':
#                i3d.train(True)
#            else:
#                i3d.train(False)  # Set model to evaluate mode
#                
#            tot_loss = 0.0
#            tot_loc_loss = 0.0
#            tot_cls_loss = 0.0
#            num_iter = 0
#            
#            running_corrects = 0
#            count = [0.] * 51
#            
#            # Iterate over data.
#            for bno, (inputs, vid_path, start_pts, end_pts, labels) in enumerate(dataloaders[phase]):
#                num_iter += 1
#                # wrap them in Variable
#                inputs = inputs.permute(0, 2, 1, 3, 4).float()      # for PIL and ToTensor
##                inputs = inputs.permute(0, 4, 1, 2, 3).float()      # for Raw Crops
#                inputs = inputs.to(device)
##                t = inputs.size(2)
#                labels = labels.to(device)
#
#                iter_counts = Counter(labels.tolist())
#                for k,v in iter_counts.items():
#                    count[k]+=v
#                    
#                optimizer.zero_grad()
#                
#                per_frame_logits = i3d(inputs)  # get B x N_CLASSES X 1
#                per_frame_logits = per_frame_logits.squeeze(2)
#                # upsample to input size
##                per_frame_logits = F.upsample(per_frame_logits, t, mode='linear')
#
#                # compute localization loss
##                loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
##                tot_loc_loss += loc_loss.data[0]
#
#                # compute classification loss (with max-pooling along time B x C x T)
##                cls_loss = F.binary_cross_entropy_with_logits(torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0])
##                tot_cls_loss += cls_loss.data[0]
#                cls_loss = F.cross_entropy(per_frame_logits, labels)
#
##                loss = (0.5*loc_loss + 0.5*cls_loss)/num_steps_per_update
#                loss = cls_loss     #/num_steps_per_update
#                tot_loss += loss.item()
##                loss.backward()
#                
##                print("{}  : bno : {}".format(phase, bno))
#                                    # backward + optimize only if in training phase
#                if phase == 'train':
#                    loss.backward()
#                    optimizer.step()
#                    
#                running_corrects += torch.sum(torch.max(per_frame_logits, 1)[1] == labels.data)
#
###                if num_iter == num_steps_per_update and phase == 'train':
##                if phase == 'train':
##                    steps += 1
##                    num_iter = 0
##                    optimizer.step()
##                    optimizer.zero_grad()
##                    lr_sched.step()
##                    if steps % 10 == 0:
##                        print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(phase, tot_loc_loss/(10*num_steps_per_update), tot_cls_loss/(10*num_steps_per_update), tot_loss/10))
##                        # save model
##                        torch.save(i3d.state_dict(), save_model+str(steps).zfill(6)+'.pt')
##                        tot_loss = tot_loc_loss = tot_cls_loss = 0.
##                if (bno + 1) % 10 == 0:
##                    print('{} : {}/{} Loss: {:.4f} Corrects: {:.4f}'.format(phase, 
##                          bno, len(dataloaders[phase]), tot_loc_loss, running_corrects))
#                if bno == 1000:
#                    break
#            if phase == 'train':
#                lr_sched.step()
#                print("Category Weights : {}".format(count))
#            epoch_loss = tot_loss / (16*(bno+1))  #len(dataloaders[phase].dataset)
#            epoch_acc = running_corrects.double() / (16*(bno+1)) #  len(dataloaders[phase].dataset)
#            print('{} Loss: {:.6f} Acc: {:.6f} LR: {}'.format(phase, epoch_loss, epoch_acc, 
#                  lr_sched.get_last_lr()[0]))
#            
##            if phase == 'val':
##                print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(phase, tot_loc_loss/num_iter, tot_cls_loss/num_iter, (tot_loss*num_steps_per_update)/num_iter) )
#                
#            if (epoch+1) % 10 == 0:
#                torch.save(i3d.state_dict(), os.path.join(log_path, save_model+str(epoch+1).zfill(6)+'.pt'))
                
    i3d.load_state_dict(torch.load(os.path.join(log_path, save_model+str(num_epochs).zfill(6)+'.pt')))
    
                
    end = time.time()
    print("Total Execution time for {} epoch : {}".format(num_epochs, (end-start)))
    
    ###########################################################################
    
    # Predictions
    
    predict(i3d, dataloaders, 16, 'test')