def validate(args, val_dataloader, nets, iteration=0, iou_thresh=0.5):
    """
    Test the model on the validation set
    """

    # write results to files for evaluation
    output_files = []
    fouts = []
    for i in range(args.max_iter):
        output_file = args.save_root + 'val_result-' + str(iteration) + '-iter' + str(i + 1) + '.csv'
        output_files.append(output_file)
        f = open(output_file, 'w')
        fouts.append(f)

    gt_file = args.save_root + 'val_gt.csv'
    fout = open(gt_file, 'w')

    with torch.no_grad():    # for evaluation
        for num, (images, targets, tubes, infos) in enumerate(val_dataloader):

            if (num + 1) % 100 == 0:
                print("%d / %d" % (num + 1, len(val_dataloader.dataset) / args.batch_size))

            # write ground truth at the middle frame of each tube
            for b in range(len(infos)):
                for n in range(len(infos[b]['boxes'])):
                    mid = int(len(infos[b]['boxes'][n]) / 2)
                    box = infos[b]['boxes'][n][mid]
                    labels = infos[b]['labels'][n][mid]
                    for label in labels:
                        fout.write('{0},{1:04},{2:.4},{3:.4},{4:.4},{5:.4},{6}\n'.format(
                            infos[b]['video_name'],
                            infos[b]['fid'],
                            box[0], box[1], box[2], box[3],
                            label))

            _, _, channels, height, width = images.size()
            images = images.cuda()

            # get conv features
            conv_feat = nets['base_net'](images)
            context_feat = None
            if not args.no_context:
                context_feat = nets['context_net'](conv_feat)

            ############## Inference ##############
            history, _ = inference(args, conv_feat, context_feat, nets, args.max_iter, tubes)

            #################### Evaluation #################
            # loop over each refinement step
            for i in range(len(history)):
                pred_prob = history[i]['pred_prob'].cpu()
                pred_prob = pred_prob[:, int(pred_prob.shape[1] / 2)]
                pred_tubes = history[i]['pred_loc'].cpu()
                pred_tubes = pred_tubes[:, int(pred_tubes.shape[1] / 2)]
                tubes_nums = history[i]['tubes_nums']

                # loop over each sample in the batch
                tubes_count = 0
                for b in range(len(tubes_nums)):
                    info = infos[b]
                    seq_start = tubes_count
                    tubes_count = tubes_count + tubes_nums[b]

                    cur_pred_prob = pred_prob[seq_start:seq_start + tubes_nums[b]]
                    cur_pred_tubes = pred_tubes[seq_start:seq_start + tubes_nums[b]]

                    # do NMS first
                    all_scores = []
                    all_boxes = []
                    all_idx = []
                    for cl_ind in range(args.num_classes):
                        scores = cur_pred_prob[:, cl_ind].squeeze().reshape(-1)
                        c_mask = scores.gt(args.conf_thresh)    # greater than minimum threshold
                        scores = scores[c_mask]
                        idx = np.where(c_mask.numpy())[0]
                        if len(scores) == 0:
                            all_scores.append([])
                            all_boxes.append([])
                            all_idx.append([])    # keep the per-class lists aligned
                            continue
                        boxes = cur_pred_tubes.clone()
                        l_mask = c_mask.unsqueeze(1).expand_as(boxes)
                        boxes = boxes[l_mask].view(-1, 4)

                        boxes = valid_tubes(boxes.view(-1, 1, 4)).view(-1, 4)
                        keep = nms(boxes, scores, args.nms_thresh)
                        boxes = boxes[keep].numpy()
                        scores = scores[keep].numpy()
                        idx = idx[keep]

                        # normalize coordinates by frame size
                        boxes[:, ::2] /= width
                        boxes[:, 1::2] /= height
                        all_scores.append(scores)
                        all_boxes.append(boxes)
                        all_idx.append(idx)

                    # get the top scores
                    scores_list = [(s, cl_ind, j) for cl_ind, scores in enumerate(all_scores)
                                   for j, s in enumerate(scores)]
                    if args.evaluate_topk > 0:
                        scores_list.sort(key=lambda x: x[0])
                        scores_list = scores_list[::-1]
                        scores_list = scores_list[:args.topk]

                    for s, cl_ind, j in scores_list:
                        # write to files
                        box = all_boxes[cl_ind][j]
                        fouts[i].write('{0},{1:04},{2:.4},{3:.4},{4:.4},{5:.4},{6},{7:.4}\n'.format(
                            info['video_name'],
                            info['fid'],
                            box[0], box[1], box[2], box[3],
                            label_dict[cl_ind],
                            s))

    fout.close()

    all_metrics = []
    for i in range(args.max_iter):
        fouts[i].close()
        metrics = ava_evaluation(os.path.join(args.data_root, 'label/'), output_files[i], gt_file)
        all_metrics.append(metrics)

    return all_metrics

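# A minimal usage sketch (not part of the original script) of how validate()
# might be driven after a training epoch. `build_val_loader` is a hypothetical
# helper standing in for the dataset/DataLoader setup shown in main() below,
# and `nets` is assumed to be built and loaded the same way main() does it.
def run_validation_sketch(args, nets, iteration):
    val_dataloader = build_val_loader(args)    # hypothetical helper (assumption)
    for _, net in nets.items():
        net.eval()    # validate() runs under torch.no_grad() and expects eval mode
    all_metrics = validate(args, val_dataloader, nets, iteration=iteration)
    # validate() returns one metrics dict per refinement step; report the last one
    return all_metrics[-1]['PascalBoxes_Precision/[email protected]']
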
def main():

    ################## Customize your configurations here ###################

    checkpoint_path = 'pretrained/ava_step.pth'
    if os.path.isfile(checkpoint_path):
        print("Loading pretrained model from %s" % checkpoint_path)
        map_location = 'cuda:0'
        checkpoint = torch.load(checkpoint_path, map_location=map_location)
        args = checkpoint['cfg']
    else:
        raise ValueError("Pretrained model not found!", checkpoint_path)

    # TODO: Set data_root to the customized input dataset
    args.data_root = '/datasets/demo/frames/'
    args.save_root = os.path.join(os.path.dirname(args.data_root), 'results/')
    if not os.path.isdir(args.save_root):
        os.makedirs(args.save_root)

    # TODO: modify these settings according to the actual frame rate and file names
    source_fps = 30
    im_format = 'frame%04d.jpg'
    conf_thresh = 0.4
    global_thresh = 0.8    # used for cross-class NMS

    ################ Define models #################

    gpu_count = torch.cuda.device_count()
    nets = OrderedDict()
    # backbone network
    nets['base_net'] = BaseNet(args)
    # ROI pooling
    nets['roi_net'] = ROINet(args.pool_mode, args.pool_size)
    # detection networks, one per refinement step
    for i in range(args.max_iter):
        if args.det_net == "two_branch":
            nets['det_net%d' % i] = TwoBranchNet(args)
        else:
            raise NotImplementedError
    if not args.no_context:
        # context branch
        nets['context_net'] = ContextNet(args)

    for key in nets:
        nets[key] = nets[key].cuda()

    nets['base_net'] = torch.nn.DataParallel(nets['base_net'])
    if not args.no_context:
        nets['context_net'] = torch.nn.DataParallel(nets['context_net'])
    for i in range(args.max_iter):
        nets['det_net%d' % i].to('cuda:%d' % ((i + 1) % gpu_count))
        nets['det_net%d' % i].set_device('cuda:%d' % ((i + 1) % gpu_count))

    # load pretrained weights
    nets['base_net'].load_state_dict(checkpoint['base_net'])
    if not args.no_context and 'context_net' in checkpoint:
        nets['context_net'].load_state_dict(checkpoint['context_net'])
    for i in range(args.max_iter):
        pretrained_dict = checkpoint['det_net%d' % i]
        nets['det_net%d' % i].load_state_dict(pretrained_dict)

    ################ DataLoader setup #################

    dataset = CustomizedDataset(args.data_root, args.T, args.NUM_CHUNKS[args.max_iter],
                                source_fps, args.fps,
                                BaseTransform(args.image_size, args.means,
                                              args.stds, args.scale_norm),
                                anchor_mode=args.anchor_mode,
                                im_format=im_format)
    dataloader = torch.utils.data.DataLoader(dataset, args.batch_size,
                                             num_workers=args.num_workers,
                                             shuffle=False,
                                             collate_fn=detection_collate,
                                             pin_memory=True)

    ################ Inference #################

    for _, net in nets.items():
        net.eval()

    fout = open(os.path.join(args.save_root, 'results.txt'), 'w')
    torch.cuda.synchronize()
    t0 = time.time()

    with torch.no_grad():
        for _, (images, tubes, infos) in enumerate(dataloader):

            _, _, channels, height, width = images.size()
            images = images.cuda()

            # get conv features
            conv_feat = nets['base_net'](images)
            context_feat = None
            if not args.no_context:
                context_feat = nets['context_net'](conv_feat)

            history, _ = inference(args, conv_feat, context_feat, nets,
                                   args.max_iter, tubes)

            # collect the results of the last step
            pred_prob = history[-1]['pred_prob'].cpu()
            pred_prob = pred_prob[:, int(pred_prob.shape[1] / 2)]
            pred_tubes = history[-1]['pred_loc'].cpu()
            pred_tubes = pred_tubes[:, int(pred_tubes.shape[1] / 2)]
            tubes_nums = history[-1]['tubes_nums']

            # loop over each sample in the batch
            tubes_count = 0
            for b in range(len(tubes_nums)):
                info = infos[b]
                seq_start = tubes_count
                tubes_count = tubes_count + tubes_nums[b]

                cur_pred_prob = pred_prob[seq_start:seq_start + tubes_nums[b]]
                cur_pred_tubes = pred_tubes[seq_start:seq_start + tubes_nums[b]]

                # do NMS first
                all_scores = []
                all_boxes = []
                all_idx = []
                for cl_ind in range(args.num_classes):
                    scores = cur_pred_prob[:, cl_ind].squeeze().reshape(-1)
                    c_mask = scores.gt(conf_thresh)    # greater than a threshold
                    scores = scores[c_mask]
                    idx = np.where(c_mask.numpy())[0]
                    if len(scores) == 0:
                        all_scores.append([])
                        all_boxes.append([])
                        all_idx.append([])    # keep the per-class lists aligned
                        continue
                    boxes = cur_pred_tubes.clone()
                    l_mask = c_mask.unsqueeze(1).expand_as(boxes)
                    boxes = boxes[l_mask].view(-1, 4)

                    boxes = valid_tubes(boxes.view(-1, 1, 4)).view(-1, 4)
                    keep = nms(boxes, scores, args.nms_thresh)
                    boxes = boxes[keep].numpy()
                    scores = scores[keep].numpy()
                    idx = idx[keep]

                    # normalize coordinates by frame size
                    boxes[:, ::2] /= width
                    boxes[:, 1::2] /= height
                    all_scores.append(scores)
                    all_boxes.append(boxes)
                    all_idx.append(idx)

                # get the top scores
                scores_list = [(s, cl_ind, j) for cl_ind, scores in enumerate(all_scores)
                               for j, s in enumerate(scores)]
                if args.evaluate_topk > 0:
                    scores_list.sort(key=lambda x: x[0])
                    scores_list = scores_list[::-1]
                    scores_list = scores_list[:args.topk]

                # merge highly overlapping boxes (a simple greedy method)
                merged_result = {}
                flag = [1 for _ in range(len(scores_list))]
                for i in range(len(scores_list)):
                    if flag[i]:
                        s, cl_ind, j = scores_list[i]
                        box = all_boxes[cl_ind][j]
                        temp = ([box], [args.label_dict[cl_ind]], [s])

                        # find all other boxes with high IoU
                        for ii in range(i + 1, len(scores_list)):
                            if flag[ii]:
                                s2, cl_ind2, j2 = scores_list[ii]
                                box2 = all_boxes[cl_ind2][j2]
                                if compute_box_iou(box, box2) > global_thresh:
                                    flag[ii] = 0
                                    temp[0].append(box2)
                                    temp[1].append(args.label_dict[cl_ind2])
                                    temp[2].append(s2)

                        merged_box = np.mean(np.concatenate(temp[0], axis=0).reshape(-1, 4), axis=0)
                        key = ','.join(merged_box.astype(str).tolist())
                        merged_result[key] = [(l, s) for l, s in zip(temp[1], temp[2])]

                # visualize results
                if not os.path.isdir(os.path.join(args.save_root, info['video_name'])):
                    os.makedirs(os.path.join(args.save_root, info['video_name']))
                print(info)
                overlay_image(os.path.join(args.data_root, info['video_name'],
                                           im_format % info['fid']),
                              os.path.join(args.save_root, info['video_name'],
                                           im_format % info['fid']),
                              pred_boxes=merged_result,
                              id2class=args.id2class)

                # write to files
                for key in merged_result:
                    box = np.asarray(key.split(','), dtype=np.float32)
                    for l, s in merged_result[key]:
                        fout.write('{0},{1:04},{2:.4},{3:.4},{4:.4},{5:.4},{6},{7:.4}\n'.format(
                            info['video_name'],
                            info['fid'],
                            box[0], box[1], box[2], box[3],
                            l, s))

            torch.cuda.synchronize()
            t1 = time.time()
            print("Batch time: ", t1 - t0)
            torch.cuda.synchronize()
            t0 = time.time()

    fout.close()

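# The greedy merge above groups boxes whose pairwise IoU exceeds global_thresh.
# For reference, a minimal sketch of that IoU computation, assuming boxes are
# [x1, y1, x2, y2] in the same (normalized) coordinate frame; the repo's own
# compute_box_iou() is the authoritative implementation.
def box_iou_sketch(box_a, box_b):
    # intersection rectangle
    x1 = max(box_a[0], box_b[0])
    y1 = max(box_a[1], box_b[1])
    x2 = min(box_a[2], box_b[2])
    y2 = min(box_a[3], box_b[3])
    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0
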
def train_select(step, history, targets, tubes, args):
    """
    Select candidate samples for model training

    Arguments:
        step: int, the current step
        history: dict, inference output
        targets: list, ground truths
        tubes: np.array, initial proposals
        args: configs
    """

    # adaptively get the start chunk
    chunks = args.NUM_CHUNKS[step]
    max_chunks = args.NUM_CHUNKS[args.max_iter]
    T_start = int((args.NUM_CHUNKS[args.max_iter] - chunks) / 2) * args.T
    T_length = chunks * args.T

    cls_thresh = args.cls_thresh[step - 1]
    reg_thresh = args.reg_thresh[step - 1]

    ######### Collect candidates for training ########

    candidates = []
    if step > 1:    # for step > 1
        pred_prob = history['pred_prob'].cpu()
        pred_tubes = history['pred_loc'].cpu()
        tubes_nums = history['tubes_nums']
        tubes_count = 0

        if args.temporal_mode == "predict":
            pred_first_loc = history['pred_first_loc'].cpu()
            pred_last_loc = history['pred_last_loc'].cpu()

    for b in range(len(targets)):
        if step == 1:    # for the 1st step
            candidates.append((tubes[b], None))
        else:    # for step > 1
            seq_start = tubes_count
            tubes_count = tubes_count + tubes_nums[b]

            cur_pred_prob = pred_prob[seq_start:seq_start + tubes_nums[b]]
            # get the averaged score for each tube
            cur_pred_prob = torch.mean(cur_pred_prob, dim=1)
            cur_pred_tubes = pred_tubes[seq_start:seq_start + tubes_nums[b]]

            # select top-scoring boxes from each class
            all_scores = []
            all_idx = []
            for cl_ind in range(args.num_classes):
                scores = cur_pred_prob[:, cl_ind].squeeze()
                # sort according to the scores
                scores = scores.numpy().reshape(-1)
                ids = np.argsort(scores)[::-1]
                scores = scores[ids]
                idx = ids
                if args.topk > 0:
                    scores = scores[:int(args.topk / args.num_classes) * 2]
                    idx = idx[:int(args.topk / args.num_classes) * 2]
                all_scores.append(scores)
                all_idx.append(idx)

            # get the top scores
            scores_list = [(s, cl_ind, j) for cl_ind, scores in enumerate(all_scores)
                           for j, s in enumerate(scores)]
            scores_list.sort(key=lambda x: x[0])
            scores_list = scores_list[::-1]

            # deduplicate: keep each tube only once (by its original index)
            temp_list = []
            temp = set()
            for s, cl_ind, j in scores_list:
                if not all_idx[cl_ind][j] in temp:
                    temp.add(all_idx[cl_ind][j])
                    temp_list.append((s, cl_ind, j))
            if args.topk > 0:
                scores_list = temp_list[:args.topk]
            else:
                scores_list = temp_list

            cur_tubes = []
            cur_scores = []
            for s, cl_ind, j in scores_list:
                cur_tubes.append(cur_pred_tubes[all_idx[cl_ind][j], :].numpy())
                cur_scores.append(s)
            try:
                cur_tubes = np.stack(cur_tubes, axis=0)
            except:
                pdb.set_trace()
            cur_tubes = valid_tubes(cur_tubes, args.image_size[0], args.image_size[1])
            cur_scores = np.asarray(cur_scores)

            if args.temporal_mode == "predict":
                cur_pred_first_loc = pred_first_loc[seq_start:seq_start + tubes_nums[b]]
                cur_pred_last_loc = pred_last_loc[seq_start:seq_start + tubes_nums[b]]

                cur_first_tubes = []
                cur_last_tubes = []
                for s, cl_ind, j in scores_list:
                    cur_first_tubes.append(cur_pred_first_loc[all_idx[cl_ind][j], :])
                    cur_last_tubes.append(cur_pred_last_loc[all_idx[cl_ind][j], :])
                cur_first_tubes = np.stack(cur_first_tubes, axis=0)
                cur_first_tubes = valid_tubes(cur_first_tubes, args.image_size[0], args.image_size[1])
                cur_last_tubes = np.stack(cur_last_tubes, axis=0)
                cur_last_tubes = valid_tubes(cur_last_tubes, args.image_size[0], args.image_size[1])
            else:
                cur_first_tubes, cur_last_tubes = None, None

            candidates.append((cur_tubes, cur_scores, cur_first_tubes, cur_last_tubes))

    ######### Select training samples ########

    selected_tubes = []
    target_tubes = []
    for b in range(len(targets)):
        cur_tubes = candidates[b][0]
        cur_scores = candidates[b][1]

        selected_pos, selected_neg, ious = select_proposals(
            targets[b][:, int(max_chunks / 2)].reshape(targets[b].shape[0], 1, -1),
            cur_tubes[:, int(cur_tubes.shape[1] / 2)].reshape(cur_tubes.shape[0], 1, -1),
            cur_scores, cls_thresh,
            args.max_pos_num, args.selection_sampling,
            args.neg_ratio)

        cur_selected_tubes = np.zeros(
            (len(selected_pos) + len(selected_neg), cur_tubes.shape[1], 4),
            dtype=np.float32)
        cur_target_tubes = np.zeros(
            (len(selected_pos) + len(selected_neg), 1, 6 + args.num_classes),
            dtype=np.float32)    # only one frame for the loss

        row = 0
        for ii, jj in selected_pos:
            cur_selected_tubes[row] = cur_tubes[jj]
            cur_target_tubes[row, :, :4] = targets[b][ii, int(max_chunks / 2), :4]
            cur_target_tubes[row, :, 6:] = targets[b][ii, int(max_chunks / 2), 4:]
            cur_target_tubes[row, :, 5] = 1    # flag for regression
            cur_target_tubes[row, :, 4] = 1    # flag for classification
            row += 1
        for ii, jj in selected_neg:
            cur_selected_tubes[row] = cur_tubes[jj]
            # regression-only samples
            if ious[ii, jj] >= reg_thresh:
                cur_target_tubes[row, :, :4] = targets[b][ii, int(max_chunks / 2), :4]
                cur_target_tubes[row, :, 6:] = targets[b][ii, int(max_chunks / 2), 4:]
                cur_target_tubes[row, :, 5] = 1    # flag for regression
                # FIXME: cur_target_tubes[row, :, 4] = 1    # flag for classification
            row += 1

        ###### check whether extending tubes is needed ######
        if step - 1 in args.NUM_CHUNKS and args.NUM_CHUNKS[step] == args.NUM_CHUNKS[step - 1] + 2:
            if args.temporal_mode == "predict":
                cur_first_tubes = candidates[b][2]
                cur_last_tubes = candidates[b][3]
                cur_selected_first = np.zeros(
                    (len(selected_pos) + len(selected_neg), args.T, 4),
                    dtype=np.float32)
                cur_selected_last = np.zeros(
                    (len(selected_pos) + len(selected_neg), args.T, 4),
                    dtype=np.float32)

                row = 0
                for ii, jj in selected_pos:
                    cur_selected_first[row] = cur_first_tubes[jj]
                    cur_selected_last[row] = cur_last_tubes[jj]
                    row += 1
                for ii, jj in selected_neg:
                    cur_selected_first[row] = cur_first_tubes[jj]
                    cur_selected_last[row] = cur_last_tubes[jj]
                    row += 1
                cur_selected_tubes = np.concatenate(
                    [cur_selected_first, cur_selected_tubes, cur_selected_last],
                    axis=1)

            elif args.temporal_mode == "extrapolate":
                # linear extrapolation
                cur_selected_tubes = extrapolate_tubes(cur_selected_tubes, args.T)

            else:
                # mean tubes
                mean_tubes = np.mean(cur_selected_tubes, axis=1, keepdims=True)
                mean_tubes = np.tile(mean_tubes, (1, args.T, 1))
                cur_selected_tubes = np.concatenate(
                    (mean_tubes, cur_selected_tubes, mean_tubes), axis=1)

        ###### check whether predicting neighbors is needed ######
        cur_target_first = np.zeros(
            (len(selected_pos) + len(selected_neg), 1, 6 + args.num_classes),
            dtype=np.float32)
        cur_target_last = np.zeros(
            (len(selected_pos) + len(selected_neg), 1, 6 + args.num_classes),
            dtype=np.float32)
        if args.temporal_mode == "predict" and step < args.max_iter \
                and args.NUM_CHUNKS[step + 1] == args.NUM_CHUNKS[step] + 2:
            row = 0
            for ii, jj in selected_pos:
                cur_target_first[row, :, :4] = targets[b][ii, int((T_start - args.T) / args.T), :4]
                if cur_target_first[row, :, :4].sum() > 0:    # valid box
                    cur_target_first[row, :, 5] = 1
                    cur_target_first[row, :, 6:] = targets[b][ii, int((T_start - args.T) / args.T), 4:]

                cur_target_last[row, :, :4] = targets[b][ii, int((T_start + T_length) / args.T), :4]
                if cur_target_last[row, :, :4].sum() > 0:    # valid box
                    cur_target_last[row, :, 5] = 1
                    cur_target_last[row, :, 6:] = targets[b][ii, int((T_start + T_length) / args.T), 4:]
                row += 1
        cur_target_tubes = np.concatenate(
            [cur_target_first, cur_target_tubes, cur_target_last], axis=1)

        selected_tubes.append(cur_selected_tubes)
        target_tubes.append(cur_target_tubes)

    return selected_tubes, target_tubes

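# train_select() above leans on select_proposals() to pick positive and
# negative tubes by IoU at the middle frame. A rough sketch (assumption) of the
# contract it relies on: lists of (gt_idx, tube_idx) pairs plus the full IoU
# matrix used for the regression-only check. The real sampler also handles
# cur_scores, max_pos_num capping, and the selection_sampling strategy, all of
# which are simplified away here; `iou_fn` is a hypothetical stand-in such as
# compute_box_iou.
def select_proposals_sketch(gt_boxes, prop_boxes, iou_fn, cls_thresh, neg_ratio):
    ious = np.asarray([[iou_fn(g, p) for p in prop_boxes] for g in gt_boxes])
    pos, neg = [], []
    for ii in range(ious.shape[0]):
        for jj in range(ious.shape[1]):
            (pos if ious[ii, jj] >= cls_thresh else neg).append((ii, jj))
    # cap negatives relative to positives, as the real sampler presumably does
    neg = neg[:max(1, int(len(pos) * neg_ratio))]
    return pos, neg, ious
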
def main():

    ################## Load pretrained model and configurations ###################

    checkpoint_path = 'pretrained/ava_step.pth'
    if os.path.isfile(checkpoint_path):
        print("Loading pretrained model from %s" % checkpoint_path)
        map_location = 'cuda:0'
        checkpoint = torch.load(checkpoint_path, map_location=map_location)
        args = checkpoint['cfg']
    else:
        raise ValueError("Pretrained model not found!", checkpoint_path)

    if not os.path.isdir(args.save_root):
        os.makedirs(args.save_root)

    label_dict = {}
    if args.num_classes == 60:
        label_map = os.path.join(args.data_root, 'label/ava_action_list_v2.1_for_activitynet_2018.pbtxt')
        categories, class_whitelist = read_labelmap(open(label_map, 'r'))
        classes = [(val['id'], val['name']) for val in categories]
        id2class = {c[0]: c[1] for c in classes}    # gt class id (1~80) --> class name
        for i, c in enumerate(sorted(list(class_whitelist))):
            label_dict[i] = c
    else:
        for i in range(80):
            label_dict[i] = i + 1

    ################ Define models #################

    gpu_count = torch.cuda.device_count()
    nets = OrderedDict()
    # backbone network
    nets['base_net'] = BaseNet(args)
    # ROI pooling
    nets['roi_net'] = ROINet(args.pool_mode, args.pool_size)
    # detection networks, one per refinement step
    for i in range(args.max_iter):
        if args.det_net == "two_branch":
            nets['det_net%d' % i] = TwoBranchNet(args)
        else:
            raise NotImplementedError
    if not args.no_context:
        # context branch
        nets['context_net'] = ContextNet(args)

    for key in nets:
        nets[key] = nets[key].cuda()

    nets['base_net'] = torch.nn.DataParallel(nets['base_net'])
    if not args.no_context:
        nets['context_net'] = torch.nn.DataParallel(nets['context_net'])
    for i in range(args.max_iter):
        nets['det_net%d' % i].to('cuda:%d' % ((i + 1) % gpu_count))
        nets['det_net%d' % i].set_device('cuda:%d' % ((i + 1) % gpu_count))

    # load pretrained weights
    nets['base_net'].load_state_dict(checkpoint['base_net'])
    if not args.no_context and 'context_net' in checkpoint:
        nets['context_net'].load_state_dict(checkpoint['context_net'])
    for i in range(args.max_iter):
        pretrained_dict = checkpoint['det_net%d' % i]
        nets['det_net%d' % i].load_state_dict(pretrained_dict)

    ################ DataLoader setup #################

    dataset = AVADataset(args.data_root, 'test', args.input_type, args.T,
                         args.NUM_CHUNKS[args.max_iter], args.fps,
                         BaseTransform(args.image_size, args.means,
                                       args.stds, args.scale_norm),
                         proposal_path=args.proposal_path_val,
                         stride=1, anchor_mode=args.anchor_mode,
                         num_classes=args.num_classes,
                         foreground_only=False)
    dataloader = torch.utils.data.DataLoader(dataset, args.batch_size,
                                             num_workers=args.num_workers,
                                             shuffle=False,
                                             collate_fn=detection_collate,
                                             pin_memory=True)

    ################ Inference #################

    for _, net in nets.items():
        net.eval()

    # write results to files for evaluation
    output_files = []
    fouts = []
    for i in range(args.max_iter):
        output_file = args.save_root + 'testing_result-iter' + str(i + 1) + '.csv'
        output_files.append(output_file)
        f = open(output_file, 'w')
        fouts.append(f)

    gt_file = args.save_root + 'testing_gt.csv'
    fout = open(gt_file, 'w')

    torch.cuda.synchronize()
    t0 = time.time()

    with torch.no_grad():    # for evaluation
        for num, (images, targets, tubes, infos) in enumerate(dataloader):

            if (num + 1) % 100 == 0:
                print("%d / %d" % (num + 1, len(dataloader.dataset) / args.batch_size))

            # write ground truth at the middle frame of each tube
            for b in range(len(infos)):
                for n in range(len(infos[b]['boxes'])):
                    mid = int(len(infos[b]['boxes'][n]) / 2)
                    box = infos[b]['boxes'][n][mid]
                    labels = infos[b]['labels'][n][mid]
                    for label in labels:
                        fout.write('{0},{1:04},{2:.4},{3:.4},{4:.4},{5:.4},{6}\n'.format(
                            infos[b]['video_name'],
                            infos[b]['fid'],
                            box[0], box[1], box[2], box[3],
                            label))

            _, _, channels, height, width = images.size()
            images = images.cuda()

            # get conv features
            conv_feat = nets['base_net'](images)
            context_feat = None
            if not args.no_context:
                context_feat = nets['context_net'](conv_feat)

            ############## Inference ##############
            history, _ = inference(args, conv_feat, context_feat, nets, args.max_iter, tubes)

            #################### Evaluation #################
            # loop over each refinement step
            for i in range(len(history)):
                pred_prob = history[i]['pred_prob'].cpu()
                pred_prob = pred_prob[:, int(pred_prob.shape[1] / 2)]
                pred_tubes = history[i]['pred_loc'].cpu()
                pred_tubes = pred_tubes[:, int(pred_tubes.shape[1] / 2)]
                tubes_nums = history[i]['tubes_nums']

                # loop over each sample in the batch
                tubes_count = 0
                for b in range(len(tubes_nums)):
                    info = infos[b]
                    seq_start = tubes_count
                    tubes_count = tubes_count + tubes_nums[b]

                    cur_pred_prob = pred_prob[seq_start:seq_start + tubes_nums[b]]
                    cur_pred_tubes = pred_tubes[seq_start:seq_start + tubes_nums[b]]

                    # do NMS first
                    all_scores = []
                    all_boxes = []
                    all_idx = []
                    for cl_ind in range(args.num_classes):
                        scores = cur_pred_prob[:, cl_ind].squeeze().reshape(-1)
                        c_mask = scores.gt(args.conf_thresh)    # greater than minimum threshold
                        scores = scores[c_mask]
                        idx = np.where(c_mask.numpy())[0]
                        if len(scores) == 0:
                            all_scores.append([])
                            all_boxes.append([])
                            all_idx.append([])    # keep the per-class lists aligned
                            continue
                        boxes = cur_pred_tubes.clone()
                        l_mask = c_mask.unsqueeze(1).expand_as(boxes)
                        boxes = boxes[l_mask].view(-1, 4)

                        boxes = valid_tubes(boxes.view(-1, 1, 4)).view(-1, 4)
                        keep = nms(boxes, scores, args.nms_thresh)
                        boxes = boxes[keep].numpy()
                        scores = scores[keep].numpy()
                        idx = idx[keep]

                        # normalize coordinates by frame size
                        boxes[:, ::2] /= width
                        boxes[:, 1::2] /= height
                        all_scores.append(scores)
                        all_boxes.append(boxes)
                        all_idx.append(idx)

                    # get the top scores
                    scores_list = [(s, cl_ind, j) for cl_ind, scores in enumerate(all_scores)
                                   for j, s in enumerate(scores)]
                    if args.evaluate_topk > 0:
                        scores_list.sort(key=lambda x: x[0])
                        scores_list = scores_list[::-1]
                        scores_list = scores_list[:args.topk]

                    for s, cl_ind, j in scores_list:
                        # write to files
                        box = all_boxes[cl_ind][j]
                        fouts[i].write('{0},{1:04},{2:.4},{3:.4},{4:.4},{5:.4},{6},{7:.4}\n'.format(
                            info['video_name'],
                            info['fid'],
                            box[0], box[1], box[2], box[3],
                            label_dict[cl_ind],
                            s))

    fout.close()

    all_metrics = []
    for i in range(args.max_iter):
        fouts[i].close()
        metrics = ava_evaluation(os.path.join(args.data_root, 'label/'), output_files[i], gt_file)
        all_metrics.append(metrics)

    # Logging
    log_name = args.save_root + "testing_results.log"
    log_file = open(log_name, "w", 1)
    prt_str = ''
    for i in range(args.max_iter):
        prt_str += 'Iter ' + str(i + 1) + ': MEANAP =>' + str(all_metrics[i]['PascalBoxes_Precision/[email protected]']) + '\n'
    log_file.write(prt_str)

    # per-class results are only available when the AVA label map was loaded
    if args.num_classes == 60:
        for i in class_whitelist:
            log_file.write("({}) {}: {}\n".format(
                i, id2class[i],
                all_metrics[-1]["PascalBoxes_PerformanceByCategory/[email protected]/{}".format(id2class[i])]))
    log_file.close()

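# The CSV rows written by this script (and by validate() above) follow the AVA
# convention: video_name, zero-padded frame id, normalized x1, y1, x2, y2, a
# class label, and, for prediction files only, a confidence score. A minimal
# reader sketch for downstream analysis; the optional-score handling is an
# assumption based on the two write formats above.
def read_result_csv_sketch(path):
    rows = []
    with open(path) as f:
        for line in f:
            parts = line.strip().split(',')
            video, fid = parts[0], int(parts[1])
            box = [float(v) for v in parts[2:6]]    # normalized [x1, y1, x2, y2]
            label = parts[6]
            score = float(parts[7]) if len(parts) > 7 else None    # absent in gt files
            rows.append((video, fid, box, label, score))
    return rows
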
def inference(args, conv_feat, context_feat, nets, exec_iter, tubes):
    """
    Run the two-branch networks of the different steps progressively.
    In training, it is used to collect all candidate tubes.
    In testing, it is used to get the detection results of each step.

    Arguments:
        conv_feat: conv features from the backbone network
        context_feat: context features from the context network
                      (None if the context network is not used)
        nets: a dict of two-branch networks
        exec_iter: the number of iterations (steps) to execute
        tubes: initial proposal tubes

    Return:
        history: output results collected for each iteration
        trajectory: inputs collected for each iteration
    """

    # flatten the list of tubes
    flat_tubes, tubes_nums = flatten_tubes(tubes, batch_idx=True)    # add batch_idx for ROI pooling
    flat_tubes = torch.FloatTensor(flat_tubes).to(conv_feat)

    history = []
    trajectory = []
    for i in range(1, exec_iter + 1):    # index from 1

        # adaptively get the start chunk
        chunks = args.NUM_CHUNKS[i]
        T_start = int((args.NUM_CHUNKS[args.max_iter] - chunks) / 2) * args.T
        T_length = chunks * args.T
        chunk_idx = [j * args.T + int(args.T / 2) for j in range(chunks)]    # used to index the middle frame of each chunk
        half_T = int(args.T / 2)

        # ROI pooling
        pooled_feat = nets['roi_net'](conv_feat[:, T_start:T_start + T_length].contiguous(), flat_tubes)
        _, C, W, H = pooled_feat.size()
        pooled_feat = pooled_feat.view(-1, T_length, C, W, H)

        # detection head
        temp_context_feat = None
        if not args.no_context:
            temp_context_feat = torch.zeros((pooled_feat.size(0), context_feat.size(1), T_length, 1, 1)).to(context_feat)
            for p in range(pooled_feat.size(0)):
                temp_context_feat[p] = context_feat[int(flat_tubes[p, 0, 0].item() / T_length), :,
                                                    T_start:T_start + T_length].contiguous().clone()

        global_prob, local_loc, first_loc, last_loc, _, _, _ = nets['det_net%d' % (i - 1)](
            pooled_feat,
            context_feat=temp_context_feat,
            tubes=None,
            targets=None)

        ########## prepare data for the next iteration ###########

        pred_prob = global_prob.view(-1, 1, args.num_classes).expand(-1, T_length, -1)

        # decode the regression results into output tubes
        flat_tubes = flat_tubes.to(local_loc)
        pred_loc = decode_coef(flat_tubes.view(-1, 5)[:, 1:], local_loc.view(-1, 4))
        pred_loc = pred_loc.view(local_loc.size())

        if args.temporal_mode == "predict":
            pred_first_loc = decode_coef(
                flat_tubes[:, chunk_idx[0] - half_T:chunk_idx[0] + half_T + 1].contiguous().view(-1, 5)[:, 1:],
                first_loc.view(-1, 4))
            pred_first_loc = pred_first_loc.view(first_loc.size())    # [N*T, 4*C] --> [N, T, 4*C]

            pred_last_loc = decode_coef(
                flat_tubes[:, chunk_idx[-1] - half_T:chunk_idx[-1] + half_T + 1].contiguous().view(-1, 5)[:, 1:],
                last_loc.view(-1, 4))
            pred_last_loc = pred_last_loc.view(last_loc.size())    # [N*T, 4*C] --> [N, T, 4*C]

        history.append({
            'pred_prob': pred_prob.data,
            'pred_loc': pred_loc.data,
            'pred_first_loc': pred_first_loc.data if args.temporal_mode == "predict" else None,
            'pred_last_loc': pred_last_loc.data if args.temporal_mode == "predict" else None,
            'tubes_nums': tubes_nums})

        # loop over each sample in the batch
        cur_trajectory = []
        selected_tubes = []
        tubes_count = 0
        for b in range(len(tubes_nums)):
            seq_start = tubes_count
            tubes_count = tubes_count + tubes_nums[b]

            cur_pred_prob = pred_prob[seq_start:seq_start + tubes_nums[b]]
            cur_pred_tubes = pred_loc[seq_start:seq_start + tubes_nums[b]]
            cur_pred_class = torch.argmax(cur_pred_prob, dim=-1)

            # check whether extending tubes is needed
            if i < args.max_iter and args.NUM_CHUNKS[i + 1] == args.NUM_CHUNKS[i] + 2:
                # check which method to use for extending tubes
                if args.temporal_mode == "predict":
                    cur_first_tubes = pred_first_loc[seq_start:seq_start + tubes_nums[b]]
                    cur_last_tubes = pred_last_loc[seq_start:seq_start + tubes_nums[b]]
                    # concatenate along the time axis
                    cur_proposals = torch.cat([cur_first_tubes, cur_pred_tubes, cur_last_tubes], dim=1)
                    cur_proposals = cur_proposals.cpu().numpy()

                elif args.temporal_mode == "extrapolate":
                    # expand tubes along the temporal axis with linear extrapolation
                    cur_proposals = cur_pred_tubes.cpu().numpy()
                    cur_proposals = extrapolate_tubes(cur_proposals, args.T)

                else:
                    # mean tubes
                    cur_proposals = cur_pred_tubes.cpu().numpy()
                    mean_tubes = np.mean(cur_proposals, axis=1, keepdims=True)
                    mean_tubes = np.tile(mean_tubes, (1, args.T, 1))
                    cur_proposals = np.concatenate((mean_tubes, cur_proposals, mean_tubes), axis=1)
            else:
                cur_proposals = cur_pred_tubes.cpu().numpy()

            cur_proposals = valid_tubes(cur_proposals, width=args.image_size[0], height=args.image_size[1])
            cur_trajectory.append((cur_proposals, cur_pred_class))
            selected_tubes.append(cur_proposals)
        trajectory.append(cur_trajectory)

        # flatten the list of tubes for the next step
        flat_tubes, tubes_nums = flatten_tubes(selected_tubes, batch_idx=True)    # add batch_idx for ROI pooling
        flat_tubes = torch.FloatTensor(flat_tubes).to(conv_feat)

    return history, trajectory

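# decode_coef() above maps regression offsets back to boxes. A minimal sketch
# of the standard R-CNN-style decoding it presumably performs, with reference
# boxes given as [x1, y1, x2, y2] and deltas as (dx, dy, dw, dh). This is an
# assumption: the exact parameterization, variance scaling, and clipping in the
# repo's decode_coef() may differ.
def decode_coef_sketch(ref_boxes, deltas):
    widths = ref_boxes[:, 2] - ref_boxes[:, 0]
    heights = ref_boxes[:, 3] - ref_boxes[:, 1]
    ctr_x = ref_boxes[:, 0] + 0.5 * widths
    ctr_y = ref_boxes[:, 1] + 0.5 * heights
    # shift the center and rescale the size
    pred_ctr_x = deltas[:, 0] * widths + ctr_x
    pred_ctr_y = deltas[:, 1] * heights + ctr_y
    pred_w = torch.exp(deltas[:, 2]) * widths
    pred_h = torch.exp(deltas[:, 3]) * heights
    # convert back to corner coordinates
    return torch.stack([pred_ctr_x - 0.5 * pred_w,
                        pred_ctr_y - 0.5 * pred_h,
                        pred_ctr_x + 0.5 * pred_w,
                        pred_ctr_y + 0.5 * pred_h], dim=1)
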