def train(train_loader, model, optimizer, epoch, logger):
    """Run one training epoch and log frame-level and video-level accuracy.

    Frame predictions are accumulated over the whole epoch; at the end they
    are soft-voted (softmax scores summed per video) to get a per-video
    accuracy in addition to the per-frame one.

    Args:
        train_loader: yields (input, target, video_index) batches.
        model: frame-level classifier returning per-class scores.
        optimizer: optimizer stepped once per batch.
        epoch: current epoch number (used for logging only).
        logger: object exposing a ``print(...)`` method.
    """
    losses = util.AverageMeter()
    topframe = util.AverageMeter()
    topVideoSoft = util.AverageMeter()

    # Epoch-level accumulators for the per-video soft-vote accuracy.
    output_store_soft = []
    target_store = []
    index_vector = []

    model.train()
    for i, (input_var, target_var, index) in enumerate(train_loader):
        target_var = target_var.to(DEVICE)
        input_var = input_var.to(DEVICE)

        pred_score = model(input_var)
        # cross_entropy already reduces to a 0-dim scalar by default;
        # the original trailing .sum() was a no-op and is dropped.
        loss = F.cross_entropy(pred_score, target_var)

        # FIX: detach before storing — keeping the raw softmax output would
        # retain every batch's autograd graph for the whole epoch and leak
        # memory. The stored copy is only used for accuracy bookkeeping.
        output_store_soft.append(F.softmax(pred_score, dim=1).detach())
        target_store.append(target_var)
        index_vector.append(index)

        # measure accuracy and record loss
        acc_iter = util.accuracy(pred_score.data, target_var, topk=(1,))
        losses.update(loss.item(), input_var.size(0))
        topframe.update(acc_iter[0], input_var.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 200 == 0:
            logger.print('Epoch: [{:3d}][{:3d}/{:3d}]\t'
                         'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                         'Acc_Iter@1 {topframe.val:.3f} ({topframe.avg:.3f})\t'
                         .format(epoch, i, len(train_loader),
                                 loss=losses, topframe=topframe))

    # Build a (num_videos, num_frames) 0/1 membership matrix: row v marks
    # which frames belong to video v.
    index_vector = torch.cat(index_vector, dim=0)  # [256] ... [256] ---> [21570]
    index_matrix = []
    for i in range(int(max(index_vector)) + 1):
        index_matrix.append(index_vector == i)
    index_matrix = torch.stack(index_matrix, dim=0).to(DEVICE).float()  # [21570] ---> [380, 21570]

    output_store_soft = torch.cat(output_store_soft, dim=0)
    target_store = torch.cat(target_store, dim=0).float()  # [256] ... [256] ---> [21570]

    # Soft vote: sum frame softmax scores within each video.
    output_store_soft = index_matrix.mm(output_store_soft)
    # Per-video label = mean of its frames' labels (frames of one video share
    # a single label, so the mean recovers it).
    target_vector = index_matrix.mm(target_store.unsqueeze(1)).squeeze(1).div(
        index_matrix.sum(1)).long()  # [380,21570] * [21570,1] -> [380,1] / sum([21570,1]) -> [380]

    prec_video_soft = util.accuracy(output_store_soft, target_vector, topk=(1,))
    # i + 1 here is the number of videos (i is left over from the range loop).
    topVideoSoft.update(prec_video_soft[0].item(), i + 1)
    logger.print(' *Acc@Video_soft {topsoft.avg:.3f} *Acc@Frame {topframe.avg:.3f} '
                 .format(topsoft=topVideoSoft, topframe=topframe))
def val(train_loader, model, logger):
    """Evaluate the frame-level model and report soft-voted video accuracy.

    Collects softmax scores for every frame, sums them per video (soft
    voting), and returns the resulting video-level top-1 accuracy.

    Args:
        train_loader: yields (input, target, video_index) batches.
        model: frame-level classifier returning per-class scores.
        logger: object exposing a ``print(...)`` method.

    Returns:
        The average video-level top-1 accuracy.
    """
    topframe = util.AverageMeter()
    topVideoSoft = util.AverageMeter()

    frame_probs = []
    frame_labels = []
    frame_video_ids = []

    model.eval()
    with torch.no_grad():
        for _, (frames, labels, vid_ids) in enumerate(train_loader):
            labels = labels.to(DEVICE)
            frames = frames.to(DEVICE)

            scores = model(frames)
            frame_probs.append(F.softmax(scores, dim=1))
            frame_labels.append(labels)
            frame_video_ids.append(vid_ids)

            # Per-batch frame-level accuracy.
            batch_acc = util.accuracy(scores.data, labels, topk=(1,))
            topframe.update(batch_acc[0], frames.size(0))

        # Flatten batch lists into epoch-wide tensors.
        frame_video_ids = torch.cat(frame_video_ids, dim=0)
        num_videos = int(max(frame_video_ids)) + 1

        # (num_videos, num_frames) 0/1 membership matrix: row v selects the
        # frames that belong to video v.
        membership = torch.stack(
            [frame_video_ids == v for v in range(num_videos)],
            dim=0).to(DEVICE).float()

        frame_probs = torch.cat(frame_probs, dim=0)
        frame_labels = torch.cat(frame_labels, dim=0).float()

        # Soft vote: sum softmax scores over each video's frames.
        video_scores = membership.mm(frame_probs)
        # Per-video label = mean of its frames' labels.
        video_labels = membership.mm(frame_labels.unsqueeze(1)).squeeze(1).div(
            membership.sum(1)).long()

        video_acc = util.accuracy(video_scores, video_labels, topk=(1,))
        topVideoSoft.update(video_acc[0].item(), num_videos)

        logger.print(
            ' *Acc@Video {topVideo.avg:.3f} '.format(topVideo=topVideoSoft))
    return topVideoSoft.avg
def val(val_loader, model, at_type):
    """Evaluate the attention model and return video-level top-1 accuracy.

    Collects per-frame features and attention weights, aggregates them per
    video (alpha-weighted mean of features), then scores videos with the
    attention head selected by ``at_type``.

    Args:
        val_loader: yields (input, target, video_index) batches.
        model: attention model; ``phrase='eval'`` returns (features, alphas)
            per frame, and later calls produce video-level scores.
        at_type: either ``'self-attention'`` or ``'self_relation-attention'``.

    Returns:
        The average video-level top-1 accuracy.

    Raises:
        ValueError: if ``at_type`` is not one of the two supported modes.
    """
    topVideo = util.AverageMeter()

    # switch to evaluate mode
    model.eval()
    output_store_fc = []
    output_alpha = []
    target_store = []
    index_vector = []
    with torch.no_grad():
        for i, (input_var, target, index) in enumerate(val_loader):
            target = target.to(DEVICE)
            input_var = input_var.to(DEVICE)
            ''' model & full_model'''
            f, alphas = model(input_var, phrase='eval')
            output_store_fc.append(f)
            output_alpha.append(alphas)
            target_store.append(target)
            index_vector.append(index)

        # (num_videos, num_frames) 0/1 membership matrix: row v selects the
        # frames belonging to video v.
        index_vector = torch.cat(index_vector, dim=0)  # [256] ... [256] ---> [21570]
        index_matrix = []
        for i in range(int(max(index_vector)) + 1):
            index_matrix.append(index_vector == i)
        index_matrix = torch.stack(index_matrix, dim=0).to(DEVICE).float()  # [21570] ---> [380, 21570]

        output_store_fc = torch.cat(output_store_fc, dim=0)  # [256,7] ... ---> [21570, 7]
        output_alpha = torch.cat(output_alpha, dim=0)        # [256,1] ... ---> [21570, 1]
        target_store = torch.cat(target_store, dim=0).float()  # [256] ... ---> [21570]

        ''' keywords: mean_fc ; weight_sourcefc; sum_alpha; weightmean_sourcefc '''
        # Alpha-weighted mean of frame features per video.
        weight_sourcefc = output_store_fc.mul(output_alpha)  # [21570,512] * [21570,1] ---> [21570,512]
        sum_alpha = index_matrix.mm(output_alpha)            # [380,21570] * [21570,1] -> [380,1]
        weightmean_sourcefc = index_matrix.mm(weight_sourcefc).div(sum_alpha)
        # Per-video label = mean of its frames' labels.
        target_vector = index_matrix.mm(target_store.unsqueeze(1)).squeeze(1).div(
            index_matrix.sum(1)).long()  # [380,21570] * [21570,1] -> [380,1] / sum([21570,1]) -> [380]

        # FIX: the original used two independent `if`s, so an unrecognized
        # at_type left pred_score unbound and crashed with a NameError far
        # from the cause. Fail fast with a clear message instead.
        if at_type == 'self-attention':
            pred_score = model(vm=weightmean_sourcefc, phrase='eval',
                               AT_level='pred')
        elif at_type == 'self_relation-attention':
            pred_score = model(vectors=output_store_fc,
                               vm=weightmean_sourcefc,
                               alphas_from1=output_alpha,
                               index_matrix=index_matrix,
                               phrase='eval',
                               AT_level='second_level')
        else:
            raise ValueError('unknown at_type: {!r}'.format(at_type))

        acc_video = util.accuracy(pred_score.cpu(), target_vector.cpu(), topk=(1,))
        # i + 1 here is the number of videos (i is left over from the range loop).
        topVideo.update(acc_video[0], i + 1)
        # NOTE(review): `logger` is a global here, unlike the sibling
        # functions that take it as a parameter — presumably a module-level
        # logger; confirm before refactoring.
        logger.print(' *Acc@Video {topVideo.avg:.3f} '.format(topVideo=topVideo))
        return topVideo.avg