import os
import copy
import time
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter
from torch.utils.data import DataLoader

# NOTE: attn_utils, autoenc_utils, StrokeFeatureSequenceDataset and the globals
# device, log_path, cluster_size, HIST_DIFFS, SBD_MODEL are project-local and
# defined/imported elsewhere in this repository.


def train_model(features, stroke_names_id, model, dataloaders, criterion,
                optimizer, scheduler, labs_keys, labs_values, num_epochs=25):
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and a validation phase
        for phase in ['train', 'test']:
            if phase == 'train':
                model.train()   # Set model to training mode
            else:
                model.eval()    # Set model to evaluation mode

            running_loss = 0.0
            running_corrects = 0.0

            # Iterate over data.
            for bno, (inputs, vid_path, stroke, _, labels) in enumerate(dataloaders[phase]):
                # inputs of shape BATCH x SEQ_LEN x FEATURE_DIM
                labels = attn_utils.get_batch_labels(vid_path, stroke, labs_keys,
                                                     labs_values, inputs.shape[1])
                inputs = inputs.float()
                inp_emb = attn_utils.get_long_tensor(inputs)    # comment out for SA
                inputs = inp_emb.to(device)                     # comment out for SA
                inputs = inputs.t().contiguous()                # convert to (SEQ, BATCH)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                output = model(inputs)  # output size (SEQ_SIZE, BATCH, NCLASSES)
                output = output.permute(1, 0, 2).contiguous()
                # flatten to (BATCH*SEQ_SIZE, NCLASSES)
                output = F.softmax(output.view(-1, output.shape[-1]), dim=1)
                loss = criterion(output, labels)

                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                    optimizer.step()

                # per-timestep predictions for the accuracy statistics
                _, preds = torch.max(output, 1)

                # statistics
                running_loss += loss.item()
                running_corrects += torch.sum(preds == labels.data)

            # loss is averaged over batches; corrects are counted per time step,
            # so accuracy is normalized by SEQ_LEN * dataset size
            epoch_loss = running_loss / len(dataloaders[phase])
            epoch_acc = running_corrects.double() / (inputs.size(0) * len(dataloaders[phase].dataset))
            print('{} Loss: {:.4f} Acc: {:.4f} LR: {}'.format(phase, epoch_loss,
                                                              epoch_acc, scheduler.get_lr()[0]))
            if phase == 'train':
                scheduler.step()

            # deep copy the model for best test accuracy
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60,
                                                        time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
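# A minimal usage sketch for train_model() above. The model, dataloaders and
# label mappings are built elsewhere in this script; the `_demo_` name and the
# loss/optimizer/scheduler choices are illustrative assumptions, not the
# pipeline's confirmed configuration.
def _demo_train_model(model, dataloaders, labs_keys, labs_values):
    criterion = nn.CrossEntropyLoss()   # assumed classification criterion
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    # features / stroke_names_id are unused inside train_model, so None is safe here
    return train_model(None, None, model, dataloaders, criterion, optimizer,
                       scheduler, labs_keys, labs_values, num_epochs=25)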
def predict(features, stroke_names_id, model, dataloaders, labs_keys,
            labs_values, seq, phase="val"):
    assert phase == "val" or phase == "test", "Incorrect Phase."
    model = model.eval()
    gt_list, pred_list, stroke_ids = [], [], []

    # Iterate over data.
    for bno, (inputs, vid_path, stroke, _, labels) in enumerate(dataloaders[phase]):
        # inputs of shape BATCH x SEQ_LEN x FEATURE_DIM
        seq = inputs.shape[1]
        labels = attn_utils.get_batch_labels(vid_path, stroke, labs_keys,
                                             labs_values, seq)
        inputs = inputs.float()
        inp_emb = attn_utils.get_long_tensor(inputs)    # comment out for SA
        inputs = inp_emb.to(device)                     # comment out for SA
        inputs = inputs.t().contiguous()                # convert to (SEQ, BATCH)
        labels = labels.to(device)

        # forward; no gradients are needed in the val/test phases
        with torch.no_grad():
            outputs = model(inputs)  # output size (SEQ_SIZE, BATCH, NCLASSES)
            outputs = outputs.permute(1, 0, 2).contiguous()
            outputs = F.softmax(outputs.view(-1, outputs.shape[-1]), dim=1)

        gt_list.append(labels.tolist())
        pred_list.append((torch.max(outputs, 1)[1]).tolist())
        for i, vid in enumerate(vid_path):
            stroke_ids.extend([vid + "_" + str(stroke[0][i].item()) + "_" +
                               str(stroke[1][i].item())] * seq)

    ###########################################################################

    confusion_mat = np.zeros((model.decoder.out_features, model.decoder.out_features))
    gt_list = [g for batch_list in gt_list for g in batch_list]
    pred_list = [p for batch_list in pred_list for p in batch_list]

    # Save prediction and ground truth labels
    predictions = {"gt": gt_list, "pred": pred_list}
    with open(os.path.join(log_path, "preds_Seq" + str(seq) + "_C" + str(cluster_size) + ".pkl"), "wb") as fp:
        pickle.dump(predictions, fp)
    with open(os.path.join(log_path, "preds_Seq" + str(seq) + "_C" + str(cluster_size) + ".pkl"), "rb") as fp:
        predictions = pickle.load(fp)

    gt_list = predictions['gt']
    pred_list = predictions['pred']

    # # get boundaries (worse accuracy when used)
    # vkeys = list(set([v.rsplit('_', 2)[0] for v in stroke_ids]))
    # boundaries = read_boundaries(vkeys, HIST_DIFFS, SBD_MODEL)

    # majority vote over the per-frame predictions of each stroke
    prev_gt = stroke_ids[0]
    val_labels, pred_labels, vid_preds = [], [], []
    for i, pr in enumerate(pred_list):
        if prev_gt != stroke_ids[i]:
            # find the most frequent category predicted for the previous stroke
            val_labels.append(gt_list[i - 1])
            pred_labels.append(max(set(vid_preds), key=vid_preds.count))
            vid_preds = []
            prev_gt = stroke_ids[i]
        vid_preds.append(pr)
    val_labels.append(gt_list[-1])
    pred_labels.append(max(set(vid_preds), key=vid_preds.count))

    ###########################################################################

    correct = 0
    for i, true_val in enumerate(val_labels):
        if pred_labels[i] == true_val:
            correct += 1
        confusion_mat[pred_labels[i], true_val] += 1

    print('#' * 30)
    print("GRU Sequence Classification Results:")
    print("%d/%d Correct" % (correct, len(pred_labels)))
    print("Accuracy = {}".format(float(correct) / len(pred_labels)))
    print("Confusion matrix")
    print(confusion_mat)
    return float(correct) / len(pred_labels)
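# The per-stroke aggregation in both predict() variants in this file is a plain
# majority vote over frame-level predictions. The same idea written with
# collections.Counter, for clarity (a hypothetical helper; tie-breaking may
# differ from max(set(...), key=....count)):
def _demo_majority_vote(preds):
    # e.g. _demo_majority_vote([2, 2, 0, 2, 1]) -> 2
    return Counter(preds).most_common(1)[0][0]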
def extract_trans_feats(model, DATASET, LABELS, CLASS_IDS, BATCH_SIZE,
                        SEQ_SIZE=16, STEP=16, partition='train', nstrokes=-1,
                        base_name=""):
    '''
    Extract sequence features from the Transformer model.

    Parameters:
    -----------
    model : tt.TransformerModel
        TransformerModel object
    DATASET : str
        path to the video dataset
    LABELS : str
        path containing stroke labels
    CLASS_IDS : str
        path to the file containing class IDs
    BATCH_SIZE : int
        size for a batch of clips
    SEQ_SIZE : int
        no. of frames in a clip
    STEP : int
        stride for the next example. If SEQ_SIZE=16 and STEP=8, use frames
        (0, 15), (8, 23), ...
    partition : str
        'train' / 'test' / 'val' : videos to be considered
    nstrokes : int
        partial extraction of features (do not execute for the entire dataset)
    base_name : str
        path containing the pickled feature dumps

    Returns:
    --------
    features_dictionary, stroke_names
    '''
    ###########################################################################
    # Divide the highlight dataset files into training, validation and test sets
    train_lst, val_lst, test_lst = autoenc_utils.split_dataset_files(DATASET)
    print("No. of training videos : {}".format(len(train_lst)))

    ###########################################################################
    # Select the file list and feature dump path for the requested partition
    if partition == 'train':
        partition_lst = train_lst
        ft_path = os.path.join(base_name, "C" + str(cluster_size) + "_train.pkl")
    elif partition == 'val':
        partition_lst = val_lst
        ft_path = os.path.join(base_name, "C" + str(cluster_size) + "_val.pkl")
    elif partition == 'test':
        partition_lst = test_lst
        ft_path = os.path.join(base_name, "C" + str(cluster_size) + "_test.pkl")
    else:
        print("Partition should be : train / val / test")
        return

    ###########################################################################
    # Create a Dataset
    part_dataset = StrokeFeatureSequenceDataset(ft_path, partition_lst, DATASET,
                                                LABELS, CLASS_IDS,
                                                frames_per_clip=SEQ_SIZE,
                                                extracted_frames_per_clip=2,
                                                step_between_clips=STEP,
                                                train=True)
    data_loader = DataLoader(dataset=part_dataset, batch_size=BATCH_SIZE,
                             shuffle=False)

    ###########################################################################
    # Validate / Evaluate
    model.eval()
    stroke_names = []
    trajectories, stroke_traj = [], []
    num_strokes = 0
    prev_stroke = None

    print("Total Batches : {} :: BATCH_SIZE : {}".format(len(data_loader), BATCH_SIZE))
    ###########################################################################
    for bno, (inputs, vid_path, stroke, labels) in enumerate(data_loader):
        # inputs of shape BATCH x SEQ_LEN x FEATURE_DIM
        inputs = inputs.float()
        inp_emb = attn_utils.get_long_tensor(inputs)        # comment out for SA
        inputs = inp_emb.t().contiguous().to(device)        # comment out for SA

        # forward pass; no gradients are needed for feature extraction
        with torch.no_grad():
            outputs = model.get_vec(inputs)  # output size (SEQ_SIZE, BATCH, NCLUSTERS)
            outputs = outputs.transpose(0, 1).contiguous()  # to (BATCH, SEQ_SIZE, NCLUSTERS)

        # convert the start/end frame tensors to lists
        stroke = [s.tolist() for s in stroke]
        # outputs are the extracted features; segregate them stroke-wise
        # (compressed enc_out values, maybe weighted, could be used instead)
        inputs_lst, batch_stroke_names = autoenc_utils.separate_stroke_tensors(outputs,
                                                                               vid_path, stroke)

        if bno == 0:
            prev_stroke = batch_stroke_names[0]

        for enc_idx, enc_input in enumerate(inputs_lst):
            # no. of sequences that can be extracted from the enc_input tensor
            nSeqs = enc_input.size(0)
            if prev_stroke != batch_stroke_names[enc_idx]:
                # append the completed stroke to trajectories
                if len(stroke_traj) > 0:
                    num_strokes += 1
                    trajectories.append(stroke_traj)
                    stroke_names.append(prev_stroke)
                    stroke_traj = []

            enc_output = enc_input.cpu().data.numpy()
            # accumulate as [[stroke1 vectors (size 32 each) ...], [stroke2 vectors ...], ...]
            stroke_traj.extend([enc_output[i, j, :] for i in range(enc_output.shape[0])
                                for j in range(enc_output.shape[1])])
            prev_stroke = batch_stroke_names[enc_idx]

        if nstrokes > -1 and num_strokes >= nstrokes:
            break

    # for the last batch, only if extracting for the full dataset
    if len(stroke_traj) > 0 and nstrokes < 0:
        trajectories.append(stroke_traj)
        stroke_names.append(batch_stroke_names[-1])

    # convert to a dictionary of features keyed by stroke names (with extension)
    features = {}
    for i, t in enumerate(trajectories):
        features[stroke_names[i]] = np.array(t)

    return features, stroke_names
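# Sketch of dumping per-partition sequence features with extract_trans_feats().
# The `_demo_` name, batch size and output filename are illustrative
# assumptions; the real constants live at module level in this repo.
def _demo_extract_features(model, DATASET, LABELS, CLASS_IDS, base_name):
    feats, names = extract_trans_feats(model, DATASET, LABELS, CLASS_IDS,
                                       BATCH_SIZE=32, SEQ_SIZE=16, STEP=16,
                                       partition='train', nstrokes=-1,
                                       base_name=base_name)
    # each value is an (n_vectors x feature_dim) numpy array for one stroke
    with open(os.path.join(base_name, "trans_feats_train.pkl"), "wb") as fp:
        pickle.dump(feats, fp)
    return feats, names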
def train_model(features, stroke_names_id, model, dataloaders, criterion,
                optimizer, scheduler, labs_keys, labs_values, num_epochs=25):
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and a validation phase
        for phase in ['train', 'test']:
            if phase == 'train':
                model.train()   # Set model to training mode
            else:
                model.eval()    # Set model to evaluation mode

            running_loss = 0.0
            running_corrects = 0
            count = [0.] * 5

            # Iterate over data.
            for bno, (inputs, vid_path, stroke, labels) in enumerate(dataloaders[phase]):
                # inputs of shape BATCH x SEQ_LEN x FEATURE_DIM
                labels = attn_utils.get_batch_labels(vid_path, stroke, labs_keys,
                                                     labs_values, 1)
                inputs = inputs.float()
                inp_emb = attn_utils.get_long_tensor(inputs)    # comment out for SA
                inputs = inp_emb.to(device)                     # comment out for SA
                labels = labels.to(device)

                # accumulate the per-class label counts for this epoch
                iter_counts = Counter(labels.tolist())
                for k, v in iter_counts.items():
                    count[k] += v

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    hidden = model.init_hidden(inputs.size(0))
                    outputs, hidden = model(inputs, hidden)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            if phase == 'train':
                scheduler.step()

            print("Category Weights : {}".format(count))
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # deep copy the model for best test accuracy
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60,
                                                        time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
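# The GRU train/predict loops in this file assume a model that exposes
# init_hidden(), an n_classes attribute and a decoder layer, and that returns
# (logits, hidden) from forward(). A minimal hypothetical sketch of such an
# interface (an assumption consistent with the calls above, not the repo's
# actual class):
class _DemoGRUClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, n_classes, n_layers=1):
        super(_DemoGRUClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.n_classes = n_classes          # predict() reads this attribute
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, n_layers, batch_first=True)
        self.decoder = nn.Linear(hidden_dim, n_classes)

    def init_hidden(self, batch_size):
        return torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device)

    def forward(self, x, hidden):
        # x : (BATCH, SEQ_LEN) long tensor of cluster-token ids
        emb = self.embedding(x)
        out, hidden = self.gru(emb, hidden)
        # classify from the last time step -> (BATCH, N_CLASSES)
        return self.decoder(out[:, -1, :]), hidden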
def predict(model, dataloaders, seq, phase="val"):
    assert phase == "val" or phase == "test", "Incorrect Phase."
    model = model.eval()
    gt_list, pred_list, stroke_ids = [], [], []
    count = [0.] * cluster_size

    # Iterate over data.
    for bno, (inputs, vid_path, start_pts, labels) in enumerate(dataloaders[phase]):
        # inputs of shape BATCH x SEQ_LEN x FEATURE_DIM
        inputs = inputs.float()
        inp_emb = attn_utils.get_long_tensor(inputs)    # comment out for SA
        inputs = inp_emb.to(device)                     # comment out for SA
        labels = labels.to(device)

        # accumulate the usage counts of the cluster tokens
        iter_counts = Counter(inp_emb.flatten().tolist())
        for k, v in iter_counts.items():
            count[k] += v

        # forward; no gradients are needed in the val/test phases
        with torch.no_grad():
            batch_size = inputs.size(0)
            hidden = model.init_hidden(batch_size)
            outputs, hidden = model(inputs, hidden)

        gt_list.append(labels.tolist())
        pred_list.append((torch.max(outputs, 1)[1]).tolist())
        for i, vid in enumerate(vid_path):
            stroke_ids.append(vid)

    ###########################################################################

    print("Clusters : ")
    print(count)

    confusion_mat = np.zeros((model.n_classes, model.n_classes))
    gt_list = [g for batch_list in gt_list for g in batch_list]
    pred_list = [p for batch_list in pred_list for p in batch_list]

    # Save prediction and ground truth labels
    predictions = {"gt": gt_list, "pred": pred_list}
    with open(os.path.join(log_path, "preds_Seq" + str(seq) + "_C" + str(cluster_size) + ".pkl"), "wb") as fp:
        pickle.dump(predictions, fp)
    with open(os.path.join(log_path, "preds_Seq" + str(seq) + "_C" + str(cluster_size) + ".pkl"), "rb") as fp:
        predictions = pickle.load(fp)

    gt_list = predictions['gt']
    pred_list = predictions['pred']

    # majority vote over the per-clip predictions of each video
    prev_gt = stroke_ids[0]
    val_labels, pred_labels, vid_preds = [], [], []
    for i, pr in enumerate(pred_list):
        if prev_gt != stroke_ids[i]:
            # find the most frequent category predicted for the previous video
            val_labels.append(gt_list[i - 1])
            pred_labels.append(max(set(vid_preds), key=vid_preds.count))
            vid_preds = []
            prev_gt = stroke_ids[i]
        vid_preds.append(pr)
    val_labels.append(gt_list[-1])
    pred_labels.append(max(set(vid_preds), key=vid_preds.count))

    ###########################################################################

    correct = 0
    for i, true_val in enumerate(val_labels):
        if pred_labels[i] == true_val:
            correct += 1
        confusion_mat[pred_labels[i], true_val] += 1

    print('#' * 30)
    print("GRU Sequence Classification Results:")
    print("%d/%d Correct" % (correct, len(pred_labels)))
    print("Accuracy = {}".format(float(correct) / len(pred_labels)))
    print("Confusion matrix")
    print(confusion_mat)
    return float(correct) / len(pred_labels)
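# Hedged example of running the GRU evaluation above; the `_demo_` name and the
# sequence length are placeholders for objects built earlier in the script.
def _demo_evaluate_gru(model, dataloaders, seq_len=16):
    acc = predict(model, dataloaders, seq_len, phase="test")
    print("Video-level majority-vote accuracy : {:.4f}".format(acc))
    return acc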