def stoa_track(idx, frame_counter, img, gt_bbox, tracker1, template_dir=None, img_names=None):
    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
    lost_number = 0
    if idx == frame_counter:
        init_gt = gt_bbox_
        if template_dir is not None:
            img = cv2.imread(template_dir)
        tracker1.init(img, gt_bbox_)
        # pred_bboxes.append(1)
        if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
            append = 1
        else:
            append = gt_bbox_
    elif idx > frame_counter:
        if img_names is not None:
            img = cv2.imread(img_names[idx])
        outputs = tracker1.track(img, idx=idx)
        # print('****************** state of the art tracking ******************')
        append = outputs['bbox']
        overlap = vot_overlap(outputs['bbox'], gt_bbox, (img.shape[1], img.shape[0]))
        if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
            if overlap > 0:
                # not lost
                lost = False
            else:
                # lost object
                append = 2
                frame_counter = idx + 5  # skip 5 frames
                lost_number = 1
                lost = True
        else:
            if overlap <= 0:
                lost_number = 1
    else:
        append = 0
    return append, lost_number, frame_counter
def stoa_track(idx, frame_counter, img, gt_bbox, tracker1):
    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
    lost_number = 0
    if idx == frame_counter:
        init_gt = gt_bbox_
        tracker1.init(img, gt_bbox_)
        # pred_bboxes.append(1)
        append = 1
    elif idx > frame_counter:
        outputs = tracker1.track(img, idx=idx)
        # print('****************** state of the art tracking ******************')
        append = outputs['bbox']
        overlap = vot_overlap(outputs['bbox'], gt_bbox, (img.shape[1], img.shape[0]))
        if args.dataset != 'OTB100':
            if overlap > 0:
                # not lost
                lost = False
            else:
                # lost object
                append = 2
                frame_counter = idx + 5  # skip 5 frames
                lost_number = 1
                lost = True
        else:
            if overlap <= 0:
                lost_number = 1
    else:
        append = 0
    return append, lost_number, frame_counter
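# A minimal sketch of how stoa_track is meant to be driven, assuming a
# pysot-style `video` iterable of (img, gt_bbox) pairs and a built `tracker1`;
# the function and loop names here are illustrative, not part of the original
# scripts, but the per-frame protocol matches the mains below.
def run_stoa_on_video(video, tracker1):
    frame_counter = 0
    lost_total = 0
    pred_bboxes = []
    for idx, (img, gt_bbox) in enumerate(video):
        append, lost_number, frame_counter = stoa_track(
            idx, frame_counter, img, gt_bbox, tracker1)
        pred_bboxes.append(append)  # 1 = init, 2 = lost, 0 = skipped, else a bbox
        lost_total += lost_number
    return pred_bboxes, lost_total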
def test_update(cfg, dirmanager):
    print(cfg["TEST"]["DATASET"] + ' stage' + str(cfg["TEMPLATE"]["STEP"]))
    # load config
    # if cfg["TEST"]["DATASET"] == 'UAV123':
    #     dataset_root = '/home/lyuyu/dataset/UAV123/data_seq/UAV123/'
    # else:
    dataset_root = '/home/lyuyu/dataset/' + cfg["TEST"]["DATASET"]
    # tracker
    model_path = cfg["MODEL"]["CHECKPOINT_PATH"]
    torch.cuda.set_device(cfg["TEST"]["GPU_ID"])
    # load tracker and updatenet
    tracker = tracker_builder.build_tracker(cfg)
    # update_path = './updatenet/checkpoint/checkpoint40.pth.tar'
    update_path = cfg["UPDATE"]["CHECKPOINT_PATH"]
    step = cfg["TEST"]["TYPE"]
    gpu_id = cfg["TEST"]["GPU_ID"]
    if cfg["UPDATE"]["MODEL"][:8] == "AAUNetv2":
        tracker = SiamTrackerAAUNetv2(cfg, tracker, update_path, gpu_id, step)
    elif cfg["UPDATE"]["MODEL"] == "UpdateNet":
        # step: 1 = dasiamrpn; 2 = linear; 3 = updatenet
        tracker = SiamTrackerUpdateNet(cfg, tracker, update_path, gpu_id, step)
    else:
        raise NotImplementedError

    # create dataset
    dataset = DatasetFactory.create_dataset(name=cfg["TEST"]["DATASET"],
                                            dataset_root=dataset_root,
                                            load_img=False)
    # model_name = tracker.name
    model_name = update_path[63:-7].replace('/', '').replace('.', '')
    if step == 4:
        model_name = 'updatenet2016'
    elif step == 1:
        model_name = 'dasiamrpn'

    if cfg["TEST"]["DATASET"] in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        total_lost = 0
        # for v_idx, video in enumerate(dataset):
        if cfg["TEST"]["CLS_TYPE"] != 0:
            total_success_list = []
            total_iou_list = []
        for video in tqdm(dataset):
            # if args.video != '':
            #     # test one special video
            #     if video.name != args.video:
            #         continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            if cfg["TEST"]["CLS_TYPE"] != 0:
                iou_list = []
                success_list = []
            state = dict()
            for idx, (img, gt_bbox) in enumerate(video):
                # print(idx)
                if len(gt_bbox) == 4:
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                               gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    state = tracker.init(img, np.array(gt_bbox))
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    pred_bbox = [cx - w / 2, cy - h / 2, w, h]
                    pred_bboxes.append(1)
                    if cfg["TEST"]["CLS_TYPE"] != 0:
                        iou_list.append(1)
                        success_list.append(1)
                elif idx > frame_counter:
                    # state = tracker.update(img, np.array(gt_bbox))
                    state = tracker.update(img)
                    pos = state['target_pos']  # cx, cy
                    sz = state['target_sz']    # w, h
                    pred_bbox = np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])
                    # pred_bbox = np.array([pos[0]+1-(sz[0]-1)/2, pos[1]+1-(sz[1]-1)/2, sz[0], sz[1]])
                    overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0]))
                    # iou = overlap_ratio(gt_bbox, pred_bbox)
                    if cfg["TEST"]["CLS_TYPE"] != 0:
                        if cfg["TEST"]["CLS_TYPE"] == 1:
                            if overlap > cfg["UPDATE"]["IOU_THRES"]:
                                iou = 1
                            else:
                                iou = 0
                        iou_list.append(iou)
                        success_list.append(state['success'])
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    if cfg["TEST"]["CLS_TYPE"] != 0:
                        iou_list.append(0)
                        success_list.append(0)
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if cfg["TEST"]["VISUALIZATION"] and idx > frame_counter:
                    cv2.polylines(img, [np.array(gt_bbox, np.int).reshape((-1, 1, 2))],
                                  True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(img, [np.array(pred_bbox, np.int).reshape((-1, 1, 2))],
                                      True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            if cfg["TEST"]["CLS_TYPE"] != 0:
                total_success_list = total_success_list + success_list
                total_iou_list = total_iou_list + iou_list
                success_list = np.array(success_list)
                iou_list = np.array(iou_list)
                # total accuracy & detect-failure accuracy
                accuracy = np.mean(success_list == iou_list)
                index0 = np.argwhere(iou_list == 0)
                accuracy0 = np.mean(success_list[index0] == iou_list[index0])
                print(video.name, accuracy, accuracy0)
            toc /= cv2.getTickFrequency()
            # save results
            if cfg["SOLVER"]["LR_POLICY"] == 'epochwise_step_group':
                lr_type = cfg["UPDATE"]["CHECKPOINT_PATH"].split('/')[-2]
            elif cfg["SOLVER"]["LR_POLICY"] == 'cosine':
                lr_type = 'cosine'
            else:
                lr_type = 'undefined'
            if cfg["TEST"]["TYPE"] == 1:
                lr_type = 'base_dasiamrpn'
            video_path = os.path.join(dirmanager.updmod_res_dir, cfg["TEST"]["DATASET"],
                                      lr_type, model_name, 'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
            # print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
            #     v_idx+1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        if cfg["TEST"]["CLS_TYPE"] == 1:
            total_success_list = np.array(total_success_list)
            total_iou_list = np.array(total_iou_list)
            # total accuracy & detect-failure accuracy
            accuracy = np.mean(total_success_list == total_iou_list)
            index0 = np.argwhere(total_iou_list == 0)
            accuracy0 = np.mean(total_success_list[index0] == total_iou_list[index0])
            print('total accuracy', accuracy, accuracy0)
        # print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        # for v_idx, video in enumerate(dataset):
        if cfg["TEST"]["CLS_TYPE"] != 0:
            total_success_list = []
            total_iou_list = []
        for video in tqdm(dataset):
            # if args.video != '':
            #     # test one special video
            #     if video.name != args.video:
            #         continue
            toc = 0
            pred_bboxes = []
            if cfg["TEST"]["CLS_TYPE"] != 0:
                iou_list = []
                success_list = []
            scores = []
            track_times = []
            state = dict()
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    # note the difference between gt_bbox and gt_bbox_
                    state = tracker.init(img, np.array(gt_bbox))
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    pred_bbox = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    scores.append(None)
                    if 'VOT2018-LT' == cfg["TEST"]["DATASET"]:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                    if cfg["TEST"]["CLS_TYPE"] != 0:
                        iou_list.append(1)
                        success_list.append(1)
                    # if video.name == 'Jogging-1':
                    #     template_vis(state['z_f_cur'], 0, 'template_vis_' + str(idx))
                else:
                    state = tracker.update(img)
                    # if video.name == 'Jogging-1':
                    #     template_vis(state['z_f_cur'], 0, 'template_vis_' + str(idx))
                    pos = state['target_pos']
                    sz = state['target_sz']
                    pred_bbox = np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])
                    pred_bboxes.append(pred_bbox)
                    # scores.append(outputs['best_score'])
                    overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0]))
                    if cfg["TEST"]["CLS_TYPE"] != 0:
                        if cfg["TEST"]["CLS_TYPE"] == 1:
                            if overlap > 0.1:
                                iou = 1
                            else:
                                iou = 0
                        if cfg["TEST"]["CLS_TYPE"] == 2:
                            iou = overlap
                        iou_list.append(iou)
                        success_list.append(state['success'].cpu().numpy())
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if cfg["TEST"]["VISUALIZATION"] and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]),
                                  (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                                  (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]),
                                  (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            if cfg["TEST"]["CLS_TYPE"] != 0:
                total_success_list = total_success_list + success_list
                total_iou_list = total_iou_list + iou_list
                success_list = np.array(success_list)
                iou_list = np.array(iou_list)
                if cfg["TEST"]["CLS_TYPE"] == 1:
                    # total accuracy & detect-failure accuracy
                    accuracy = np.mean(success_list == iou_list)
                    index0 = np.argwhere(iou_list == 0)
                    index1 = np.argwhere(iou_list == 1)
                    accuracy0 = np.mean(success_list[index0] == iou_list[index0])
                    accuracy1 = np.mean(success_list[index1] == iou_list[index1])
                if cfg["TEST"]["CLS_TYPE"] == 2:
                    # total accuracy & detect-failure accuracy
                    comp_list = abs(success_list - iou_list) < 0.2
                    accuracy = np.mean(comp_list)
                    index0 = np.argwhere((success_list - iou_list) > 0)
                    index1 = np.argwhere((iou_list - success_list) > 0)
                    accuracy0 = np.mean(comp_list[index0])
                    accuracy1 = np.mean(comp_list[index1])
                print(video.name, accuracy, accuracy0, accuracy1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == cfg["TEST"]["DATASET"]:
                video_path = os.path.join('results', cfg["TEST"]["DATASET"], model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == cfg["TEST"]["DATASET"]:
                video_path = os.path.join('results', cfg["TEST"]["DATASET"],
                                          model_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                lr_type = 'cosine'
                video_path = os.path.join(dirmanager.updmod_res_dir,
                                          cfg["TEST"]["DATASET"], lr_type, model_name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                class_path = os.path.join(video_path, '{}_cls.txt'.format(video.name))
                with open(class_path, 'w') as f:
                    for x in success_list:
                        f.write(str(x) + '\n')
            # print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.format(
            #     v_idx+1, video.name, toc, idx / toc))
        total_success_list = np.array(total_success_list)
        total_iou_list = np.array(total_iou_list)
        # total accuracy & detect-failure accuracy
        if cfg["TEST"]["CLS_TYPE"] == 1:
            accuracy = np.mean(total_success_list == total_iou_list)
            index0 = np.argwhere(total_iou_list == 0)
            accuracy0 = np.mean(total_success_list[index0] == total_iou_list[index0])
        if cfg["TEST"]["CLS_TYPE"] == 2:
            comp_list = abs(total_success_list - total_iou_list) < 0.2
            accuracy = np.mean(comp_list)
            index0 = np.argwhere((total_success_list - total_iou_list) > 0)
            index1 = np.argwhere((total_iou_list - total_success_list) > 0)
            accuracy0 = np.mean(comp_list[index0])
            accuracy1 = np.mean(comp_list[index1])
        print('total accuracy', accuracy, accuracy0)
        # evaluation(cfg["TEST"]["DATASET"], model_name, dirmanager.updmod_res_dir)
        evaluation(cfg["TEST"]["DATASET"], model_name,
                   os.path.join(dirmanager.updmod_res_dir, cfg["TEST"]["DATASET"], lr_type))
    return
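# For reference: every script in this file leans on the pysot toolkit helper
# get_axis_aligned_bbox, which fits an axis-aligned (cx, cy, w, h) box to a
# rotated 8-point VOT polygon (or passes a plain rect through). A sketch
# approximating the toolkit implementation; the toolkit version is
# authoritative:
def get_axis_aligned_bbox_sketch(region):
    if region.size == 8:
        cx = np.mean(region[0::2])
        cy = np.mean(region[1::2])
        x1, x2 = min(region[0::2]), max(region[0::2])
        y1, y2 = min(region[1::2]), max(region[1::2])
        # scale the bounding rect so its area matches the polygon's area
        A1 = np.linalg.norm(region[0:2] - region[2:4]) * \
             np.linalg.norm(region[2:4] - region[4:6])
        A2 = (x2 - x1) * (y2 - y1)
        s = np.sqrt(A1 / A2)
        w = s * (x2 - x1) + 1
        h = s * (y2 - y1) + 1
    else:
        x, y, w, h = region
        cx, cy = x + w / 2, y + h / 2
    return cx, cy, w, h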
def track(video, tracker, visualize=False, data_collector=None):
    num_frames = len(video)
    frame_counter = 0
    frame_reset = 0  # used to indicate how many times the tracker was re-initialized
    lost_times = 0
    pred_bboxes = []  # filled according to the VOT protocol & used for metric calculation
    total_time = 0
    zero_tensor = torch.zeros(cfg.REFINE_TEMPLATE.FEATURE_SIZE,
                              dtype=torch.float32).cpu().data
    for f, (im, gt) in enumerate(video):
        if len(gt) == 4:
            gt = bbox_to_polygon(gt)
        cx, cy, w, h = get_axis_aligned_bbox(np.array(gt))
        start_time = cv2.getTickCount()
        if f == frame_counter:
            # init or reset after a lost frame
            gt_bbox = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
            tracker.init(im, gt_bbox)
            pred_bbox = gt_bbox
            pred_bboxes.append(1)
            total_time += cv2.getTickCount() - start_time
            frame_reset = 0
            if data_collector:
                data_collector.add_init(f, num_frames, init_feat=tracker.zf_init)
        elif f > frame_counter:
            # tracking
            frame_reset += 1
            outputs = tracker.track(im)
            pred_bbox = outputs['bbox']
            if data_collector:
                # extract ground-truth template features
                gt_rect = np.array([cx, cy, w, h])
                gt_zf = tracker.extract_template(im, gt_rect) if w * h != 0 else zero_tensor
                data_collector.add_tracking(f, num_frames, frame_reset,
                                            cur_feat=outputs['zf'],
                                            pre_feat=tracker.zf,
                                            gt_feat=gt_zf)
            overlap = vot_overlap(pred_bbox, gt, (im.shape[1], im.shape[0]))
            if overlap > 0:
                pred_bboxes.append(pred_bbox)
            else:
                pred_bboxes.append(2)
                # skip 5 frames after the object is lost (as suggested by VOT)
                frame_counter = f + 5
                lost_times += 1
            total_time += cv2.getTickCount() - start_time
        elif f < frame_counter or w * h == 0:
            # skipping
            pred_bboxes.append(0)
            total_time += cv2.getTickCount() - start_time
            frame_reset = 0
            if data_collector:
                data_collector.add_init(f, num_frames, zero_tensor)
        if visualize:
            cv2.polylines(im, [np.array(gt, np.int).reshape((-1, 1, 2))],
                          True, (0, 255, 0), 3)
            bbox = list(map(int, pred_bbox))
            cv2.rectangle(im, (bbox[0], bbox[1]),
                          (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3)
            cv2.putText(im, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
            cv2.putText(im, str(lost_times), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            window_name = 'test'
            cv2.imshow(window_name, im)
            cv2.moveWindow(window_name, 100, 10)
            cv2.waitKey(1)
    total_time /= cv2.getTickFrequency()
    cv2.destroyAllWindows()
    return {
        'pred_bboxes': pred_bboxes,
        'lost_times': lost_times,
        'total_time': total_time,
        'fps': f / total_time,
    }
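# track() above relies on bbox_to_polygon to expand a 4-value [x, y, w, h]
# rect into the 8-point corner polygon that vot_overlap expects. A sketch,
# mirroring the inline expansions used elsewhere in this file (the helper's
# real definition lives outside this excerpt):
def bbox_to_polygon_sketch(bbox):
    x, y, w, h = bbox
    return [x, y,
            x, y + h - 1,
            x + w - 1, y + h - 1,
            x + w - 1, y]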
def main():
    is_gpu_cuda_available = torch.cuda.is_available()
    if not is_gpu_cuda_available:
        raise RuntimeError('Failed to locate a CUDA GPU. Program cannot continue.')
    num_gpus = torch.cuda.device_count()
    gpu_type = torch.cuda.get_device_name(0)
    print(f"You have {num_gpus} GPU(s) available of type: {gpu_type}")
    print("This might take a few minutes... Grab a cup of coffee\n")

    # load config
    cfg.merge_from_file(args.config)
    dataset_root = os.path.join(args.dataset_directory, args.dataset)
    print(f"dataset root --> {dataset_root}")

    # create model
    model = ModelBuilder()
    # load model
    model = load_pretrain(model, args.snapshot).cuda().eval()
    # build tracker
    tracker = build_tracker(model)

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    model_name = args.model_name
    print(f"Model name is {model_name}")

    total_lost = 0
    if args.dataset in vot_like_dataset:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                               gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0]))
                    if overlap > 0.85:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + args.skip_frames  # skip args.skip_frames frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(img, [np.array(gt_bbox, np.int).reshape((-1, 1, 2))],
                                  True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(img, [np.array(pred_bbox, np.int).reshape((-1, 1, 2))],
                                      True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            save_path = os.path.join(args.results_path, args.dataset, model_name,
                                     args.experiment_name, video.name)
            if not os.path.isdir(save_path):
                os.makedirs(save_path)
            result_path = os.path.join(save_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
            with open(os.path.join(save_path, '..', 'lost.txt'), 'a+') as f:
                f.write(f"{v_idx+1} Class: {video.name} | Time: {toc}s | "
                        f"Speed: {idx/toc}fps | Lost: {lost_number}\n")
            print('({:3d}) Class: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                  .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
        with open(os.path.join(save_path, '..', 'lost.txt'), 'a+') as f:
            f.write(f"Model architecture used --> {model_name}\ntotal lost: {total_lost}\n")
            f.write(f"SKIP FRAMES USED --> {args.skip_frames}")
    else:
        # OPE tracking
        # will be implemented if needed in the future
        pass
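# The VOT "baseline" result files written by these scripts hold one line per
# frame: "1" marks (re)initialization, "2" a lost frame, "0" a skipped frame,
# and anything else a comma-separated bbox. A small reader sketch for such a
# file, assuming the format written above (the function name is illustrative):
def read_vot_result_sketch(result_path):
    entries = []
    with open(result_path) as f:
        for line in f:
            vals = line.strip().split(',')
            if len(vals) == 1:
                entries.append(int(float(vals[0])))       # 0 / 1 / 2 flag
            else:
                entries.append([float(v) for v in vals])  # predicted bbox
    lost = sum(1 for e in entries if e == 2)
    return entries, lost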
def main():
    # load config
    dataset_root = '/home/ubuntu/pytorch/pytorch-tracking/DaSiamRPN/datasets/' + args.dataset
    # tracker
    model_path = './models/SiamRPNBIG.model'
    name = 'DaSiamRPN'
    gpu_id = 0  # setting this to 1 does not run properly
    tracker = SiamRPNTracker(model_path, gpu_id)
    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    # tracker name
    model_name = name
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        total_lost = 0
        # for v_idx, video in enumerate(dataset):
        for video in tqdm(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            state = dict()
            for idx, (img, gt_bbox) in enumerate(video):
                # print(idx)
                if len(gt_bbox) == 4:
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                               gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    # note the difference between gt_bbox and gt_bbox_
                    state = tracker.init(img, np.array(gt_bbox))
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))  # 1-based
                    pred_bbox = [cx - w / 2, cy - h / 2, w, h]  # 1-based
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    state = tracker.update(img)
                    pos = state['target_pos']
                    sz = state['target_sz']
                    pred_bbox = np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])
                    # pred_bbox = np.array([pos[0]+1-(sz[0]-1)/2, pos[1]+1-(sz[1]-1)/2, sz[0], sz[1]])
                    overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(img, [np.array(gt_bbox, np.int).reshape((-1, 1, 2))],
                                  True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(img, [np.array(pred_bbox, np.int).reshape((-1, 1, 2))],
                                      True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
            # print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
            #     v_idx+1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        # print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        # for v_idx, video in enumerate(dataset):
        for video in tqdm(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            state = dict()
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    # note the difference between gt_bbox and gt_bbox_
                    state = tracker.init(img, np.array(gt_bbox))
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    pred_bbox = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    state = tracker.update(img)
                    pos = state['target_pos']
                    sz = state['target_sz']
                    pred_bbox = np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])
                    pred_bboxes.append(pred_bbox)
                    # scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]),
                                  (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                                  (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]),
                                  (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
def main():
    # load config
    cfg.merge_from_file(args.config)
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_root = os.path.join(args.dataset_dir, args.dataset)
    epsilon = args.epsilon
    # create model
    track_model1 = ModelBuilder()
    track_model2 = ModelBuilder()
    lr = args.lr
    # load model
    track_model1 = load_pretrain(track_model1, args.snapshot).cuda().eval()
    track_model2 = load_pretrain(track_model2, args.snapshot).cuda().eval()
    # build tracker
    tracker1 = build_tracker(track_model1)
    tracker2 = build_tracker(track_model2)
    attacker = ModelAttacker().cuda().train()
    optimizer = optim.Adam(attacker.parameters(), lr=lr)
    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False,
                                            dataset_toolkit='oneshot',
                                            config=cfg)
    # vid.name = {'ants1','ants3',....}
    # img, bbox, cls, delta, delta_weight
    # vid[0][0], vid[0][1], vid[0][2], vid[0][3], vid[0][4]
    model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    n_epochs = args.epochs
    for name, param in tracker1.model.named_parameters():
        param.requires_grad_(False)
    for name, param in tracker2.model.named_parameters():
        param.requires_grad_(False)
    # for name, param in tracker2.model.named_parameters():
    #     if 'backbone' in name or 'neck' in name or 'rpn_head' in name:
    #         param.requires_grad_(False)
    #     elif param.requires_grad:
    #         param.requires_grad_(True)
    #         # print(name, param.data)
    #         print('grad true ', name)
    #     else:
    #         print('grad false ', name)
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019', 'OTB100']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            else:
                if not os.path.exists(os.path.join(args.savedir, video.name)):
                    os.mkdir(os.path.join(args.savedir, video.name))
            # set video-writer parameters
            height, width, channels = video[0][0].shape
            out = cv2.VideoWriter(os.path.join(args.savedir, video.name + '.avi'),
                                  cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 15,
                                  (width, height))
            frame_counter = 0
            frame_counter_adv = 0
            lost_number = 0
            lost_number_adv = 0
            toc = 0
            total_toc = 0
            pred_bboxes = []
            pred_bboxes_adv = []
            lost = False
            lost_adv = False
            for i in range(0, args.epochs):
                for idx, (img, gt_bbox) in enumerate(video):
                    # if len(gt_bbox) == 4:
                    #     gt_bbox = [gt_bbox[0], gt_bbox[1],
                    #                gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                    #                gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                    #                gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    ##########################################
                    #      state-of-the-art tracking         #
                    ##########################################
                    if i == 0:
                        if idx == frame_counter:
                            init_gt = gt_bbox_
                            tracker1.init(img, gt_bbox_)
                            pred_bboxes.append(1)
                            zf = tracker1.zf
                            img1 = img.copy()
                        elif idx > frame_counter:
                            outputs = tracker1.track(img, idx=idx)
                            # print('****************** state of the art tracking ******************')
                            pred_bbox = outputs['bbox']
                            if args.dataset != 'OTB100':
                                overlap = vot_overlap(pred_bbox, gt_bbox,
                                                      (img.shape[1], img.shape[0]))
                                if overlap > 0:
                                    # not lost
                                    pred_bboxes.append(pred_bbox)
                                    lost = False
                                else:
                                    # lost object
                                    pred_bboxes.append(2)
                                    frame_counter = idx + 5  # skip 5 frames
                                    lost_number += 1
                                    lost = True
                            else:
                                pred_bboxes.append(pred_bbox)
                        else:
                            pred_bboxes.append(0)
                    tic = cv2.getTickCount()
                    ##########################################
                    #         adversarial tracking           #
                    ##########################################
                    if idx == frame_counter_adv:
                        zimg = img.copy()
                        sz, bbox, pad = tracker2.init(img, gt_bbox_,
                                                      attacker=attacker,
                                                      epsilon=args.epsilon)
                        pred_bboxes_adv.append(1)
                        zf2 = tracker2.zf
                        # cv2.imwrite(os.path.join(args.savedir, video.name,
                        #                          str(idx).zfill(6) + '.jpg'), img)
                    elif idx > frame_counter_adv:
                        _outputs = tracker2.track(img, attacker=attacker,
                                                  epsilon=args.epsilon, zf=zf2,
                                                  idx=idx, iter=i)
                        # print(_outputs['best_score'], outputs['target_score'])
                        ad_bbox = _outputs['bbox']  # needed below for saving and overlap
                        # filename = os.path.join(args.savedir, video.name,
                        #                         str(idx).zfill(6) + '.jpg')
                        # save_2bb(img, filename, ad_bbox, pred_bbox, gt_bbox)
                        filename = os.path.join(args.savedir, video.name,
                                                'bb' + str(idx).zfill(6) + '.jpg')
                        save_2bb(img, filename, ad_bbox, pred_bbox, gt_bbox)
                        # _zimg = save(zimg, tracker2.z_crop_adv, sz, init_gt, pad,
                        #              os.path.join(args.savedir, video.name,
                        #                           str(idx).zfill(6) + '.jpg'), save=True)
                        # update state
                        tracker2.center_pos = _outputs['center_pos']
                        tracker2.size = _outputs['size']
                        ad_overlap = vot_overlap(ad_bbox, gt_bbox,
                                                 (img.shape[1], img.shape[0]))
                        if args.dataset != 'OTB100':
                            if ad_overlap > 0:
                                # not lost
                                pred_bboxes_adv.append(ad_bbox)
                                lost_adv = False
                            else:
                                # lost object
                                pred_bboxes_adv.append(2)
                                frame_counter_adv = idx + 5  # skip 5 frames
                                lost_number_adv += 1
                                lost_adv = True
                        else:
                            if ad_overlap <= 0:
                                lost_number_adv += 1
                            pred_bboxes_adv.append(ad_bbox)
                    else:
                        pred_bboxes_adv.append(0)
                    toc += cv2.getTickCount() - tic
                    # if idx > frame_counter_adv and not lost_adv:
                    #     ad_bbox = list(map(int, ad_bbox))
                    #     cv2.rectangle(img, (ad_bbox[0], ad_bbox[1]),
                    #                   (ad_bbox[0] + ad_bbox[2], ad_bbox[1] + ad_bbox[3]),
                    #                   (0, 0, 255), 3)
                    # if idx > frame_counter and not lost:
                    #     bbox = list(map(int, pred_bbox))
                    #     cv2.rectangle(img, (bbox[0], bbox[1]),
                    #                   (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3)
                    # __gt_bbox = list(map(int, gt_bbox_))
                    # cv2.rectangle(img, (__gt_bbox[0], __gt_bbox[1]),
                    #               (__gt_bbox[0] + __gt_bbox[2], __gt_bbox[1] + __gt_bbox[3]),
                    #               (0, 0, 0), 3)
                    # cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
                    # cv2.putText(img, str(lost_number), (40, 80),
                    #             cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    # cv2.putText(img, "," + str(lost_number_adv), (80, 80),
                    #             cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    # out.write(img)
                    # print('frame {}/{} Lost: {:d}'.format(idx, len(video), lost_number_adv))
                toc /= cv2.getTickFrequency()
                print('({:3d}) Video: {:12s} train{}/{} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                      .format(v_idx + 1, video.name, i, args.epochs, toc,
                              idx / toc, lost_number_adv))
                total_toc += toc
                l1 = _outputs['l1']
                l2 = _outputs['l2']
                l3 = _outputs['l3']
                # total_loss = 0.8 * l1 + 0.4 * l2 + 1.2 * l3
                total_loss = l1 + 0.4 * l2
                # print(idx, i, total_loss.item(), _outputs['center_pos'], _outputs['size'])
                # if ad_overlap < 0.5:
                if _outputs['best_score'] < outputs['target_score']:
                    total_loss_val = 0
                    # print(idx, i, ad_overlap)
                    # print(ad_bbox)
                    # print(pred_bbox)
                    # print('------------------------')
                    # filename = os.path.join(args.savedir, video.name,
                    #                         'bb' + str(idx).zfill(6) + '.jpg')
                    # save_2bb(img, filename, ad_bbox, pred_bbox, gt_bbox)
                    # _zimg = save(zimg, tracker2.z_crop_adv, sz, init_gt, pad,
                    #              os.path.join(args.savedir, video.name,
                    #                           str(idx).zfill(6) + '.jpg'), save=True)
                    # pdb.set_trace()
                    break
                else:
                    # print(_outputs['bbox'])
                    optimizer.zero_grad()
                    total_loss.backward(retain_graph=True)
                    optimizer.step()
            total_toc /= (i + 1)
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time Average: {:4.1f}s Speed Average: {:3.1f}fps Lost: {:d}'
                  .format(v_idx + 1, video.name, total_toc, idx / total_toc, lost_number_adv))
            total_lost += lost_number_adv
        print("{:s} total lost: {:d}".format(model_name, total_lost))
def objective(trial):
    # parameters to tune
    cfg.TRACK.WINDOW_INFLUENCE = trial.suggest_uniform('window_influence', 0.050, 0.650)
    cfg.TRACK.PENALTY_K = trial.suggest_uniform('penalty_k', 0.000, 0.600)
    cfg.TRACK.LR = trial.suggest_uniform('scale_lr', 0.100, 0.800)
    cfg.TRACK.COEE_CLASS = trial.suggest_uniform('coee_class', 0.01, 0.999)
    # rebuild tracker
    info = edict()
    info.arch = args.arch
    info.cls_type = args.cls_type
    info.dataset = args.dataset
    tracker = SiamRPN(info)
    model_name = args.snapshot.split('/')[-1].split('.')[0]
    tracker_name = os.path.join('tune_results', args.dataset, model_name,
                                model_name +
                                '_wi-{:.3f}'.format(cfg.TRACK.WINDOW_INFLUENCE) +
                                '_pk-{:.3f}'.format(cfg.TRACK.PENALTY_K) +
                                '_lr-{:.3f}'.format(cfg.TRACK.LR) +
                                '_ce-{:.3f}'.format(cfg.TRACK.COEE_CLASS))
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                # if len(gt_bbox) == 4:
                #     gt_bbox = [gt_bbox[0], gt_bbox[1],
                #                gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                #                gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                #                gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    # gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = tracker.init(img, target_pos, target_sz, net)
                    state["arch"] = args.arch
                    # pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    state = tracker.track(state, img)  # track
                    location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
                    pred_bbox = location
                    overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join(tracker_name, 'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                  .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
        eao = eval(dataset=dataset_eval, tracker_name=tracker_name)
        info = "{:s} window_influence: {:1.17f}, penalty_k: {:1.17f}, scale_lr: {:1.17f}, EAO: {:1.3f}".format(
            model_name, cfg.TRACK.WINDOW_INFLUENCE, cfg.TRACK.PENALTY_K, cfg.TRACK.LR, eao)
        logging.getLogger().info(info)
        print(info)
        return eao
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = tracker.init(img, target_pos, target_sz, net)  # init tracker
                    state["arch"] = args.arch
                    # tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    state = tracker.track(state, img)  # track
                    location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
                    pred_bbox = location
                    # outputs = tracker.track(img)
                    # pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(state['score'])
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                if not os.path.isdir(tracker_name):
                    os.makedirs(tracker_name)
                result_path = os.path.join(tracker_name, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'
                  .format(v_idx + 1, video.name, toc, idx / toc))
        auc = eval(dataset=dataset_eval, tracker_name=tracker_name)
        info = "{:s} window_influence: {:1.17f}, penalty_k: {:1.17f}, scale_lr: {:1.17f}, AUC: {:1.3f}".format(
            model_name, cfg.TRACK.WINDOW_INFLUENCE, cfg.TRACK.PENALTY_K, cfg.TRACK.LR, auc)
        logging.getLogger().info(info)
        print(info)
        return auc
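# objective(trial) above follows the standard Optuna objective signature
# (trial.suggest_uniform). A minimal driver sketch, assuming the surrounding
# script has already set up the `dataset`, `dataset_eval`, and `net` globals
# that objective() reads; the function name and trial count are illustrative:
import optuna

def tune(n_trials=100):
    study = optuna.create_study(direction='maximize')  # maximize EAO / AUC
    study.optimize(objective, n_trials=n_trials)
    print('best params:', study.best_params)
    print('best value:', study.best_value)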
def main():
    # load config
    cfg_from_file(args.config)
    dataset_root = os.path.join('dataset', args.dataset)
    # create model
    net = ModelBuilder()
    checkpoint = torch.load(args.model)
    if 'state_dict' in checkpoint:
        net.load_state_dict(checkpoint['state_dict'])
    else:
        net.load_state_dict(checkpoint)
    net.cuda().eval()
    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    model_name = args.save_name
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    target_pos, target_sz = np.array([cx, cy]), np.array([w, h])
                    state = CGACD_init(img, target_pos, target_sz, net)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    state = CGACD_track(state, img)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
                    pred_polygon = [pred_bbox[0], pred_bbox[1],
                                    pred_bbox[0] + pred_bbox[2], pred_bbox[1],
                                    pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3],
                                    pred_bbox[0], pred_bbox[1] + pred_bbox[3]]
                    overlap = vot_overlap(gt_bbox, pred_polygon, (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    target_pos = state['target_pos']
                    target_sz = state['target_sz']
                    cv2.rectangle(img,
                                  (int(target_pos[0] - target_sz[0] / 2),
                                   int(target_pos[1] - target_sz[1] / 2)),
                                  (int(target_pos[0] + target_sz[0] / 2),
                                   int(target_pos[1] + target_sz[1] / 2)),
                                  (0, 255, 0), 3)
                    cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1,
                                (0, 255, 255), 2, cv2.LINE_AA)
                    cv2.imshow(video.name, img)
                    cv2.moveWindow(video.name, 100, 100)
                    key = cv2.waitKey(1)
                    if key == 27:
                        break
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('result', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                  .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    if 'OTB' in args.dataset:
                        target_pos, target_sz = rect1_2_cxy_wh(gt_bbox)
                    else:
                        cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                        target_pos, target_sz = np.array([cx, cy]), np.array([w, h])
                    state = CGACD_init(img, target_pos, target_sz, net)
                    if 'OTB' in args.dataset:
                        pred_bbox = cxy_wh_2_rect1(state['target_pos'], state['target_sz'])
                    else:
                        pred_bbox = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
                    pred_bboxes.append(pred_bbox)
                else:
                    state = CGACD_track(state, img)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
                    pred_bboxes.append(pred_bbox)
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    target_pos = state['target_pos']
                    target_sz = state['target_sz']
                    cv2.rectangle(img,
                                  (int(target_pos[0] - target_sz[0] / 2),
                                   int(target_pos[1] - target_sz[1] / 2)),
                                  (int(target_pos[0] + target_sz[0] / 2),
                                   int(target_pos[1] + target_sz[1] / 2)),
                                  (0, 255, 0), 3)
                    cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1,
                                (0, 255, 255), 2, cv2.LINE_AA)
                    cv2.imshow(video.name, img)
                    cv2.moveWindow(video.name, 100, 100)
                    key = cv2.waitKey(1)
                    if key == 27:
                        break
            toc /= cv2.getTickFrequency()
            if 'GOT-10k' == args.dataset:
                video_path = os.path.join('result', args.dataset, model_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('result', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'
                  .format(v_idx + 1, video.name, toc, idx / toc))
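# The CGACD main above mixes cxy_wh_2_rect / cxy_wh_2_rect1 / rect1_2_cxy_wh,
# DaSiamRPN-style conversions between (center, size) pairs and 0- or 1-indexed
# [x, y, w, h] rects. Sketches consistent with how they are used here (the
# real helpers are defined in the tracker's utils, not in this excerpt):
def cxy_wh_2_rect_sketch(pos, sz):
    # (cx, cy), (w, h) -> 0-indexed [x, y, w, h]
    return np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])

def cxy_wh_2_rect1_sketch(pos, sz):
    # (cx, cy), (w, h) -> 1-indexed [x, y, w, h] (OTB convention)
    return np.array([pos[0] - sz[0] / 2 + 1, pos[1] - sz[1] / 2 + 1, sz[0], sz[1]])

def rect1_2_cxy_wh_sketch(rect):
    # 1-indexed [x, y, w, h] -> (cx, cy), (w, h); inverse of the conversion above
    return (np.array([rect[0] + rect[2] / 2 - 1, rect[1] + rect[3] / 2 - 1]),
            np.array([rect[2], rect[3]]))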
def main():
    # load config
    cfg.merge_from_file(args.config)
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_root = os.path.join(args.dataset_dir, args.dataset)
    epsilon = args.epsilon
    # create model
    model = Steath([1, 3, 255, 255])
    track_model = ModelBuilder()
    lr = args.lr
    # load model
    model = load_pretrain(model, args.snapshot).cuda()
    track_model = load_pretrain(track_model, args.snapshot).cuda().eval()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    model.train()
    # model.dx.requires_grad_(True)
    # model.backbone.eval()
    # if cfg.ADJUST.ADJUST:
    #     model.neck.eval()
    # model.rpn_head.eval()
    for name, param in model.named_parameters():
        if 'backbone' in name or 'neck' in name or 'rpn_head' in name:
            param.requires_grad_(False)
        elif param.requires_grad:
            param.requires_grad_(True)
            print(name, param.data)
        else:
            print(name)
    clipper = WeightClipper(5)
    # build tracker
    tracker1 = build_tracker(track_model)
    tracker2 = build_tracker(track_model)
    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False,
                                            config=cfg)
    # vid.name = {'ants1','ants3',....}
    # img, bbox, cls, delta, delta_weight
    # vid[0][0], vid[0][1], vid[0][2], vid[0][3], vid[0][4]
    model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    n_epochs = args.epochs
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            else:
                if not os.path.exists(os.path.join(args.savedir, video.name)):
                    os.mkdir(os.path.join(args.savedir, video.name))
            # set video-writer parameters
            height, width, channels = video[0][0].shape
            out = cv2.VideoWriter(os.path.join(args.savedir, video.name + '.avi'),
                                  cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 15,
                                  (width, height))
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            data = {'template': None, 'search': None}
            for idx, (img, gt_bbox, z, x, szx, boxx, padx, cls, delta, delta_w,
                      overlap, _bbox, _bbox_p) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                               gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                gt_bbox_ = [cx - w // 2, cy - h // 2, w, h]
                if idx == frame_counter:
                    tracker1.init(img, gt_bbox_)
                    tracker2.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                    data['template'] = torch.autograd.Variable(z, requires_grad=True).cuda()
                elif idx > frame_counter:
                    prim_img = np.copy(img)
                    data['search'] = torch.autograd.Variable(x, requires_grad=True).cuda()
                    data['label_cls'] = torch.Tensor(cls).type(torch.LongTensor).cuda()
                    data['label_loc'] = torch.Tensor(delta).type(torch.FloatTensor).cuda()
                    data['label_loc_weight'] = torch.Tensor(delta_w).cuda()
                    diff = data['search']
                    for epoch in range(n_epochs):
                        outputs = model(data, epsilon)
                        cls_loss = outputs['cls_loss']
                        # print(idx, epoch, cls_loss.item())
                        loc_loss = outputs['loc_loss']
                        total_loss = outputs['total_loss']
                        print('{}/{} cls={}, loc={}, total={}'.format(
                            idx, len(video), cls_loss.item(), loc_loss.item(),
                            total_loss.item()))
                        optimizer.zero_grad()
                        # cls_loss.backward()
                        total_loss.backward()
                        # model.apply(clipper)
                        optimizer.step()
                        # print('loss ', loss(diff, outputs['search']).item())
                        # diff = outputs['search']
                        # print(epoch, cls_loss, loc_loss, total_loss)
                    perturb_data = outputs['search']
                    # cv2.rectangle(img, (int(cx-w/2+1), int(cy-h/2+1)),
                    #               (int(cx+w/2+1), int(cy+h/2+1)), (0, 0, 0), 3)
                    # cv2.imwrite(os.path.join(args.savedir, video.name,
                    #                          'original_' + str(idx).zfill(7) + '.jpg'), img)
                    # _img = perturb_data.data.cpu().numpy().squeeze().transpose([1, 2, 0])
                    # cv2.imwrite(os.path.join(args.savedir, 'perturb_' + str(idx) + '.jpg'), _img)
                    szx = int(szx)
                    if not np.array_equal(cfg.TRACK.INSTANCE_SIZE, szx):
                        perturb_data = F.interpolate(perturb_data, size=szx)
                        __bbox = (np.array(_bbox_p) * szx / cfg.TRACK.INSTANCE_SIZE).astype(np.int)
                    _img = cv2.UMat(perturb_data.data.cpu().numpy()
                                    .squeeze().transpose([1, 2, 0])).get()
                    cv2.rectangle(_img, (__bbox[0], __bbox[1]), (__bbox[2], __bbox[3]),
                                  (0, 0, 0), 3)
                    cv2.imwrite(os.path.join(args.savedir, video.name,
                                             'crop_full_' + str(idx) + '.jpg'), _img)
                    nh, nw, _ = _img.shape
                    __bbox0 = np.zeros_like(__bbox)
                    __bbox0[:4:2] = __bbox[:4:2] - padx[0]
                    __bbox0[1:4:2] = __bbox[1:4:2] - padx[2]
                    img[boxx[0]:boxx[1] + 1, boxx[2]:boxx[3] + 1, :] = \
                        _img[boxx[0] + padx[0]:boxx[1] + padx[0] + 1,
                             0 + padx[2]:boxx[3] - boxx[2] + padx[2] + 1, :]
                    # cv2.imwrite(os.path.join(args.savedir, video.name,
                    #                          'perturb_full_' + str(idx) + '.jpg'), img)
                    # if not np.array_equal(cfg.TRACK.INSTANCE_SIZE, sz):
                    #     perturb_data = F.interpolate(perturb_data, size=sz)
                    #     __bbox = (np.array(_bbox)*sz/cfg.TRACK.INSTANCE_SIZE).astype(np.uint8)
                    # _img = cv2.UMat(perturb_data.data.cpu().numpy().squeeze().transpose([1, 2, 0])).get()
                    # cv2.rectangle(_img, (__bbox[0], __bbox[1]), (__bbox[2], __bbox[3]), (0, 0, 0), 3)
                    # cv2.imwrite(os.path.join(args.savedir, video.name,
                    #                          'crop_full_' + str(idx) + '.jpg'), _img)
                    # nh, nw, _ = _img.shape
                    # img[bT:bB+1, bL:bR+1, :] = _img[pad[0]:nh - pad[1], pad[2]:nw - pad[3], :]
                    # cv2.imwrite(os.path.join(args.savedir, video.name,
                    #                          'perturb_full_' + str(idx) + '.jpg'), img)
                    # nimg, sz, box, pad = tracker2.crop(img, bbox=gt_bbox_,
                    #                                    im_name='search' + str(idx))
                    outputs = tracker1.track(img)
                    prim_outputs = tracker2.track(prim_img)
                    pred_bbox = outputs['bbox']
                    prim_box = prim_outputs['bbox']
                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                # cv2.imwrite(os.path.join(args.savedir, video.name,
                #                          str(idx).zfill(7) + '.jpg'), img)
                toc += cv2.getTickCount() - tic
                # draw ground-truth bbox
                cv2.polylines(img, [np.array(gt_bbox, np.int).reshape((-1, 1, 2))],
                              True, (255, 255, 255), 3)
                if idx != frame_counter:
                    bbox = list(map(int, pred_bbox))
                    prim_bbox = list(map(int, prim_box))
                    cv2.rectangle(img, (bbox[0], bbox[1]),
                                  (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3)
                    cv2.rectangle(img, (prim_bbox[0], prim_bbox[1]),
                                  (prim_bbox[0] + prim_bbox[2], prim_bbox[1] + prim_bbox[3]),
                                  (0, 0, 255), 3)
                cv2.putText(img, str(idx), (40, 40),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                cv2.putText(img, str(lost_number), (40, 80),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                out.write(img)
                cv2.imwrite(os.path.join(args.savedir, video.name,
                                         str(idx).zfill(7) + '.jpg'), img)
                # import pdb
                # pdb.set_trace()
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
                v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker1.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker1.track(img)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]),
                                  (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                                  (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]),
                                  (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.format(
                v_idx + 1, video.name, toc, idx / toc))
def vot_evaluate(dataset, tracker):
    tracker_name = args.tracker
    backbone_name = args.cfg.split('/')[-1].split('_')[0]
    snapshot_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    for v_idx, video in enumerate(dataset):
        if args.video != '':
            # if testing a special video
            if video.name != args.video:
                continue
        frame_count = 0
        lost_number = 0
        pred_bboxes = []
        toc = 0
        for idx, (frame, gt_bbox) in enumerate(video):
            tic = cv2.getTickCount()
            if idx == frame_count:
                tracker.init(frame, gt_bbox)  # cx, cy, w, h
                pred_bboxes.append(1)
            elif idx > frame_count:
                track_result = tracker.track(frame)
                bbox = track_result['bbox']  # cx, cy, w, h
                score = track_result['score']
                bbox_ = [bbox[0] - bbox[2] / 2, bbox[1] - bbox[3] / 2,
                         bbox[2], bbox[3]]  # x, y, w, h
                gt_bbox_ = [gt_bbox[0] - (gt_bbox[2] - 1) / 2,
                            gt_bbox[1] - (gt_bbox[3] - 1) / 2,
                            gt_bbox[2], gt_bbox[3]]
                overlap = vot_overlap(bbox_, gt_bbox_, (frame.shape[1], frame.shape[0]))
                # print('idx: {}\n pred: {}\n gt: {}\n overlap: {}\n'.format(
                #     idx, bbox_, gt_bbox_, overlap))
                if overlap > 0:
                    pred_bboxes.append(bbox_)
                else:
                    # print('lost idx: {}'.format(idx))
                    pred_bboxes.append(2)
                    frame_count = idx + 5
                    lost_number += 1
            else:
                pred_bboxes.append(0)
            toc += cv2.getTickCount() - tic
            if args.vis and idx > frame_count:
                show_double_bbox(frame, bbox, score, gt_bbox, idx, lost_number)
        toc /= cv2.getTickFrequency()
        result_dir = os.path.join(cfg.TRACK.RESULT_DIR, args.dataset, tracker_name,
                                  backbone_name, snapshot_name)
        if not os.path.isdir(result_dir):
            os.makedirs(result_dir)
        result_path = '{}/{}.txt'.format(result_dir, video.name)
        with open(result_path, 'w') as f:
            for x in pred_bboxes:
                if isinstance(x, int):
                    f.write('{:d}\n'.format(x))
                else:
                    f.write(','.join(['{:.4f}'.format(i) for i in x]) + '\n')
        # log
        total_lost += lost_number
        print('[{:d}/{:d}] | video: {:12s} | time: {:4.1f}s | speed: {:3.1f}fps | lost_number: {:d}'
              .format(v_idx + 1, len(dataset), video.name, toc, idx / toc, lost_number))
    print('total_lost: {}'.format(total_lost))
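# vot_evaluate above calls show_double_bbox, which is not defined in this
# excerpt. A hypothetical minimal version, following the cv2 drawing pattern
# of the other scripts here; it assumes bbox and gt_bbox are center-format
# (cx, cy, w, h), matching the comments in vot_evaluate:
def show_double_bbox_sketch(frame, bbox, score, gt_bbox, idx, lost_number):
    px, py = int(bbox[0] - bbox[2] / 2), int(bbox[1] - bbox[3] / 2)
    gx, gy = int(gt_bbox[0] - gt_bbox[2] / 2), int(gt_bbox[1] - gt_bbox[3] / 2)
    cv2.rectangle(frame, (px, py),
                  (px + int(bbox[2]), py + int(bbox[3])), (0, 255, 255), 3)
    cv2.rectangle(frame, (gx, gy),
                  (gx + int(gt_bbox[2]), gy + int(gt_bbox[3])), (0, 255, 0), 3)
    cv2.putText(frame, '{} score:{:.2f} lost:{}'.format(idx, score, lost_number),
                (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
    cv2.imshow('double_bbox', frame)
    cv2.waitKey(1)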
def main(): os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(args.gpuid) snap_shot = './checkpoints/model0_e19.pth' # cur_dir = os.path.dirname(os.path.realpath(__file__)) dataset_root = os.path.join(cur_dir, '../../pysot/testing_dataset', args.dataset) #'LaSOT')# # create model model = ManModelBuilder(out_ch=1024, relu=True).cuda() # load model model = load_pretrain(model, snap_shot) torch.set_grad_enabled(False) # build tracker tracker = ManTracker(model) # create dataset dataset = DatasetFactory.create_dataset(name=args.dataset, dataset_root=dataset_root, load_img=True) model_name = snap_shot.split('/')[-1].split('.')[0] total_lost = 0 if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']: # restart tracking for v_idx, video in enumerate(dataset): if args.video != '': # test one special video if video.name != args.video and args.video != '%d' % v_idx: continue frame_counter = 0 lost_number = 0 toc = 0 pred_bboxes = [] for idx, (img, gt_bbox) in enumerate(video): if len(gt_bbox) == 4: gt_bbox = [gt_bbox[0], gt_bbox[1], gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1, gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1, gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]] tic = cv2.getTickCount() if idx == frame_counter: cx, cy, w, h = get_min_max_bbox(np.array(gt_bbox)) # get_axis_aligned_bbox gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h] tracker.init(img, gt_bbox_) pred_bbox = gt_bbox_ pred_bboxes.append(1) elif idx > frame_counter: outputs = tracker.track(img) pred_bbox = outputs['bbox'] overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0])) if overlap > 0: # not lost pred_bboxes.append(pred_bbox) else: # lost object pred_bboxes.append(2) frame_counter = idx + 5 # skip 5 frames lost_number += 1 else: pred_bboxes.append(0) toc += cv2.getTickCount() - tic if idx == 0: cv2.destroyAllWindows() if args.vis and idx > frame_counter: cv2.polylines(img, [np.array(gt_bbox, np.int).reshape((-1, 1, 2))], True, (0, 255, 0), 3) bbox = list(map(int, pred_bbox)) cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3) cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2) cv2.putText(img, str(lost_number), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2) cv2.imshow(video.name, img) cv2.waitKey(1) toc /= cv2.getTickFrequency() # save results video_path = os.path.join('results', args.dataset, model_name, 'baseline', video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: if isinstance(x, int): f.write("{:d}\n".format(x)) else: f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n') print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format( v_idx + 1, video.name, toc, idx / toc, lost_number)) total_lost += lost_number print("{:s} total lost: {:d}".format(model_name, total_lost)) else: # OPE tracking for v_idx, video in enumerate(dataset): if args.video != '': # test one special video if video.name != args.video and args.video != '%d' % v_idx: continue if 'LaSOT' in args.dataset: model_path = os.path.join('results', args.dataset, model_name) result_path = os.path.join(model_path, '{}.txt'.format(video.name)) if os.path.exists(result_path): print("pass " + video.name) continue toc = 0 pred_bboxes = [] scores = [] track_times = [] video.load_img() for idx, (img, gt_bbox) in enumerate(video): tic = cv2.getTickCount() if idx == 0: cx, cy, w, h = 
get_axis_aligned_bbox(np.array(gt_bbox)) gt_bbox_ = [cx - w / 2, cy - h/ 2, w, h] tracker.init(img, gt_bbox_) pred_bbox = gt_bbox_ scores.append(None) if 'VOT2018-LT' == args.dataset: pred_bboxes.append([1]) else: pred_bboxes.append(pred_bbox) else: outputs = tracker.track(img) pred_bbox = outputs['bbox'] pred_bboxes.append(pred_bbox) scores.append(outputs['best_score']) toc += cv2.getTickCount() - tic track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency()) if idx == 0: cv2.destroyAllWindows() if args.vis and idx > 0: if not np.isnan(gt_bbox).any(): gt_bbox = list(map(int, gt_bbox)) cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]), (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]), (0, 255, 0), 3) pred_bbox = list(map(int, pred_bbox)) cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]), (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]), (0, 0, 255), 3) cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2) cv2.imshow(video.name, cv2.resize(img, (480, 360))) cv2.waitKey(1) toc /= cv2.getTickFrequency() video.free_img() # save results if 'VOT2018-LT' == args.dataset: video_path = os.path.join('results', args.dataset, model_name, 'longterm', video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') result_path = os.path.join(video_path, '{}_001_confidence.value'.format(video.name)) with open(result_path, 'w') as f: for x in scores: f.write('\n') if x is None else f.write("{:.6f}\n".format(x)) result_path = os.path.join(video_path, '{}_time.txt'.format(video.name)) with open(result_path, 'w') as f: for x in track_times: f.write("{:.6f}\n".format(x)) elif 'GOT-10k' == args.dataset: video_path = os.path.join('results', args.dataset, model_name, video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') result_path = os.path.join(video_path, '{}_time.txt'.format(video.name)) with open(result_path, 'w') as f: for x in track_times: f.write("{:.6f}\n".format(x)) else: model_path = os.path.join('results', args.dataset, model_name)#, '{}'.format(video.name)) if not os.path.isdir(model_path): os.makedirs(model_path) result_path = os.path.join(model_path, '{}.txt'.format(video.name)) # result_path = os.path.join(model_path, '{}_001.txt'.format(video.name)) # model_path2 = os.path.join('results', args.dataset, model_name) # shutil.copyfile(result_path, result_path2) # if os.path.exists(result_path2): # print("success copy" + video.name) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.format( v_idx + 1, video.name, toc, idx / toc))
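# Several scripts above convert the 8-point VOT polygon to a center-form box
# via get_axis_aligned_bbox. A simplified sketch of that conversion (the pysot
# version additionally rescales w/h by the polygon-to-rectangle area ratio):
def axis_aligned_from_polygon(region):
    region = np.asarray(region, dtype=np.float32)
    if region.size == 4:                      # already x, y, w, h
        x, y, w, h = region
        return x + (w - 1) / 2, y + (h - 1) / 2, w, h
    xs, ys = region[0::2], region[1::2]
    x1, x2 = xs.min(), xs.max()
    y1, y2 = ys.min(), ys.max()
    cx, cy = (x1 + x2) / 2, (y1 + y2) / 2     # center of the bounding rect
    return cx, cy, x2 - x1, y2 - y1           # cx, cy, w, h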
def run_tracker(tracker, gt, video_name, restart=True):
    # NOTE: iterates the module-level `video`; the `gt` argument is unused here.
    frame_count = 0
    lost_number = 0
    pred_bboxes = []
    toc = 0
    if restart:
        for idx, (frame, gt_bbox) in enumerate(video):
            tic = cv2.getTickCount()
            if idx == frame_count:
                tracker.init(frame, gt_bbox)  # cx,cy,w,h
                pred_bboxes.append(1)
            elif idx > frame_count:
                track_result = tracker.track(frame)
                bbox = track_result['bbox']  # cx,cy,w,h
                score = track_result['score']
                bbox_ = [bbox[0] - bbox[2] / 2, bbox[1] - bbox[3] / 2,
                         bbox[2], bbox[3]]  # x,y,w,h
                gt_bbox_ = [gt_bbox[0] - gt_bbox[2] / 2, gt_bbox[1] - gt_bbox[3] / 2,
                            gt_bbox[2], gt_bbox[3]]
                if vot_overlap(bbox_, gt_bbox_, (frame.shape[1], frame.shape[0])) > 0:
                    pred_bboxes.append(bbox_)
                else:
                    pred_bboxes.append(2)
                    frame_count = idx + 5
                    lost_number += 1
            else:
                pred_bboxes.append(0)
            toc += cv2.getTickCount() - tic
            if args.vis and idx > frame_count:
                show_double_bbox(frame, bbox, score, gt_bbox, idx, lost_number)
        toc /= cv2.getTickFrequency()
        # log
        print('video: {}, time: {:.1f}s, speed: {:.1f}fps, lost_number: {:d} '.format(
            video_name, toc, idx / toc, lost_number))
        return pred_bboxes
    else:
        # The no-restart (OPE) branch is not implemented; the commented-out
        # draft here duplicated the OPE loops used elsewhere in this file.
        pass
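# vot_overlap is called throughout with (pred, gt, (img_w, img_h)); it clips
# regions to the image and also supports polygons. For plain x,y,w,h
# rectangles the core quantity is ordinary IoU; a minimal sketch under that
# assumption:
def rect_iou(a, b):
    """IoU of two [x, y, w, h] rectangles."""
    ax2, ay2 = a[0] + a[2], a[1] + a[3]
    bx2, by2 = b[0] + b[2], b[1] + b[3]
    iw = max(0.0, min(ax2, bx2) - max(a[0], b[0]))
    ih = max(0.0, min(ay2, by2) - max(a[1], b[1]))
    inter = iw * ih
    union = a[2] * a[3] + b[2] * b[3] - inter
    return inter / union if union > 0 else 0.0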
def main(): # refine_method = args.refine_method model_name = 'siamrpn_' + refine_method model_path = '/' snapshot_path = os.path.join( project_path_, 'experiments/%s/model.pth' % args.tracker_name) config_path = os.path.join( project_path_, 'experiments/%s/config.yaml' % args.tracker_name) cfg.merge_from_file(config_path) dataset_root = dataset_root_ # create model '''a model is a Neural Network.(a torch.nn.Module)''' model = ModelBuilder() # load model model = load_pretrain(model, snapshot_path).cuda().eval() # build tracker '''a tracker is a object, which consists of not only a NN but also some post-processing''' tracker = build_tracker(model) # create dataset dataset = DatasetFactory.create_dataset(name=args.dataset, dataset_root=dataset_root, load_img=False) '''##### build a refinement module #####''' if 'RF' in refine_method: RF_module = RefineModule(refine_path, selector_path, branches=branches, search_factor=sr, input_sz=input_sz) elif refine_method == 'iou_net': RF_info = Tracker('iou_net', 'iou_net_dimp', None) RF_params = RF_info.get_parameters() RF_params.visualization = False RF_params.debug = False RF_params.visdom_info = { 'use_visdom': False, 'server': '127.0.0.1', 'port': 8097 } RF_module = RF_info.tracker_class(RF_params) elif refine_method == 'mask': RF_module = siammask() else: raise ValueError("refine_method should be 'RF' or 'iou' or 'mask' ") # model_name = args.snapshot.split('/')[-1].split('.')[0] total_lost = 0 if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']: # restart tracking for v_idx, video in enumerate(dataset): if args.video != '': # test one special video if video.name != args.video: continue frame_counter = 0 lost_number = 0 toc = 0 pred_bboxes = [] for idx, (img, gt_bbox) in enumerate(video): if len(gt_bbox) == 4: gt_bbox = [ gt_bbox[0], gt_bbox[1], gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1, gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1, gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] ] tic = cv2.getTickCount() if idx == frame_counter: H, W, _ = img.shape cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox)) gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h] tracker.init(img, gt_bbox_) '''##### initilize refinement module for specific video''' if 'RF' in refine_method: RF_module.initialize( cv2.cvtColor(img, cv2.COLOR_BGR2RGB), np.array(gt_bbox_)) elif refine_method == 'iou_net': gt_bbox_np = np.array(gt_bbox_) gt_bbox_torch = torch.from_numpy( gt_bbox_np.astype(np.float32)) init_info = {} init_info['init_bbox'] = gt_bbox_torch RF_module.initialize( cv2.cvtColor(img, cv2.COLOR_BGR2RGB), init_info) elif refine_method == 'mask': RF_module.initialize(img, np.array(gt_bbox_)) else: raise ValueError( "refine_method should be 'RF' or 'RF_mask' or 'iou_net' or 'mask' " ) pred_bbox = gt_bbox_ pred_bboxes.append(1) elif idx > frame_counter: outputs = tracker.track(img) pred_bbox = outputs['bbox'] '''##### refine tracking results #####''' if 'RF' in refine_method or refine_method == 'iou_net': pred_bbox = RF_module.refine( cv2.cvtColor(img, cv2.COLOR_BGR2RGB), np.array(pred_bbox)) x1, y1, w, h = pred_bbox.tolist() '''add boundary and min size limit''' x1, y1, x2, y2 = bbox_clip(x1, y1, x1 + w, y1 + h, (H, W)) w = x2 - x1 h = y2 - y1 pred_bbox = np.array([x1, y1, w, h]) '''pass new state back to base tracker''' tracker.center_pos = np.array([x1 + w / 2, y1 + h / 2]) tracker.size = np.array([w, h]) elif refine_method == 'mask': pred_bbox, center_pos, size = RF_module.refine( img, np.array(pred_bbox), VOT=True) # boundary and min size limit have been 
included in "refine" '''pass new state back to base tracker''' '''pred_bbox is a list with 8 elements''' tracker.center_pos = center_pos tracker.size = size else: raise ValueError( 'refine_method should be RF or iou or mask') overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0])) if overlap > 0: # not lost pred_bboxes.append(pred_bbox) else: # lost object pred_bboxes.append(2) frame_counter = idx + 5 # skip 5 frames lost_number += 1 else: pred_bboxes.append(0) toc += cv2.getTickCount() - tic if idx == 0: cv2.destroyAllWindows() if args.vis and idx > frame_counter: cv2.polylines( img, [np.array(gt_bbox, np.int).reshape( (-1, 1, 2))], True, (0, 255, 0), 3) if refine_method == 'mask': cv2.polylines( img, [np.array(pred_bbox, np.int).reshape( (-1, 1, 2))], True, (0, 255, 255), 3) else: bbox = list(map(int, pred_bbox)) cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3) cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2) cv2.putText(img, str(lost_number), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2) cv2.imshow(video.name, img) cv2.waitKey(1) toc /= cv2.getTickFrequency() # save results video_path = os.path.join(save_dir, args.dataset, model_name, 'baseline', video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: if isinstance(x, int): f.write("{:d}\n".format(x)) else: f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n') print( '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}' .format(v_idx + 1, video.name, toc, idx / toc, lost_number)) total_lost += lost_number print("{:s} total lost: {:d}".format(model_name, total_lost)) else: # OPE tracking for v_idx, video in enumerate(dataset): if video.name + '.txt' in os.listdir(model_path): continue if args.video != '': # test one special video if video.name != args.video: continue toc = 0 pred_bboxes = [] scores = [] track_times = [] for idx, (img, gt_bbox) in enumerate(video): tic = cv2.getTickCount() if idx == 0: H, W, _ = img.shape cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox)) gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h] tracker.init(img, gt_bbox_) '''##### initilize refinement module for specific video''' if 'RF' in refine_method: RF_module.initialize( cv2.cvtColor(img, cv2.COLOR_BGR2RGB), np.array(gt_bbox_)) elif refine_method == 'iou_net': gt_bbox_np = np.array(gt_bbox_) gt_bbox_torch = torch.from_numpy( gt_bbox_np.astype(np.float32)) init_info = {} init_info['init_bbox'] = gt_bbox_torch RF_module.initialize( cv2.cvtColor(img, cv2.COLOR_BGR2RGB), init_info) elif refine_method == 'mask': RF_module.initialize(img, np.array(gt_bbox_)) else: raise ValueError( "refine_method should be 'RF' or 'iou' or 'mask' ") pred_bbox = gt_bbox_ scores.append(None) if 'VOT2018-LT' == args.dataset: pred_bboxes.append([1]) else: pred_bboxes.append(pred_bbox) else: outputs = tracker.track(img) pred_bbox = outputs['bbox'] '''##### refine tracking results #####''' if 'RF' in refine_method or refine_method == 'iou_net': pred_bbox = RF_module.refine( cv2.cvtColor(img, cv2.COLOR_BGR2RGB), np.array(pred_bbox)) elif refine_method == 'mask': pred_bbox = RF_module.refine(img, np.array(pred_bbox), VOT=False) else: raise ValueError( "refine_method should be 'RF' or 'iou' or 'mask' ") x1, y1, w, h = pred_bbox.tolist() '''add boundary and min size limit''' x1, y1, x2, y2 = bbox_clip(x1, y1, 
x1 + w, y1 + h, (H, W)) w = x2 - x1 h = y2 - y1 pred_bbox = np.array([x1, y1, w, h]) tracker.center_pos = np.array([x1 + w / 2, y1 + h / 2]) tracker.size = np.array([w, h]) pred_bboxes.append(pred_bbox) scores.append(outputs['best_score']) toc += cv2.getTickCount() - tic track_times.append( (cv2.getTickCount() - tic) / cv2.getTickFrequency()) if idx == 0: cv2.destroyAllWindows() if args.vis and idx > 0: gt_bbox = list(map(int, gt_bbox)) pred_bbox = list(map(int, pred_bbox)) cv2.rectangle( img, (gt_bbox[0], gt_bbox[1]), (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]), (0, 255, 0), 3) cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]), (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]), (0, 255, 255), 3) cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2) cv2.imshow(video.name, img) cv2.waitKey(1) toc /= cv2.getTickFrequency() # save results if 'VOT2018-LT' == args.dataset: video_path = os.path.join(save_dir, args.dataset, model_name, 'longterm', video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') result_path = os.path.join( video_path, '{}_001_confidence.value'.format(video.name)) with open(result_path, 'w') as f: for x in scores: f.write('\n') if x is None else f.write( "{:.6f}\n".format(x)) result_path = os.path.join(video_path, '{}_time.txt'.format(video.name)) with open(result_path, 'w') as f: for x in track_times: f.write("{:.6f}\n".format(x)) elif 'GOT-10k' == args.dataset: video_path = os.path.join(save_dir, args.dataset, model_name, video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') result_path = os.path.join(video_path, '{}_time.txt'.format(video.name)) with open(result_path, 'w') as f: for x in track_times: f.write("{:.6f}\n".format(x)) else: model_path = os.path.join( save_dir, args.dataset, model_name + '_' + str(selector_path)) if not os.path.isdir(model_path): os.makedirs(model_path) result_path = os.path.join(model_path, '{}.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'. format(v_idx + 1, video.name, toc, idx / toc))
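# The refinement branch above clips refined boxes with
# bbox_clip(x1, y1, x2, y2, (H, W)) to enforce "boundary and min size limit".
# A minimal sketch consistent with that call site (the 10 px minimum-size
# margin is an assumption, not taken from the original module):
def bbox_clip(x1, y1, x2, y2, img_shape, min_sz=10):
    H, W = img_shape
    x1 = min(max(0, x1), W - min_sz)
    y1 = min(max(0, y1), H - min_sz)
    x2 = min(max(min_sz, x2), W)
    y2 = min(max(min_sz, y2), H)
    return x1, y1, x2, y2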
def main():
    # load config
    cfg.merge_from_file(args.config)
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_root = os.path.join(cur_dir, '../testing_dataset', args.dataset)
    # create model
    model = Model2021()
    # load model
    model = load_pretrain(model, args.snapshot).cuda().eval()
    # build tracker
    tracker = build_tracker(model)
    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            overlaps1 = []
            vars1 = []
            vars0 = []
            occl1 = []
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            frame_width = 960   # img.shape[1]
            frame_height = 540  # img.shape[0]
            video_loc = os.path.join('../results', model_name, video.name)
            out = cv2.VideoWriter(video_loc + '.avi',
                                  cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                                  10, (frame_width, frame_height), True)
            if video.tags['occlusion'] == [] or (np.array(video.tags['occlusion']) == 1).sum() == 0:
                print("\t\tdiscard occlusion")
                continue
            video.tags['occlusion'] = video.tags['all']
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                               gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    box1 = gt_bbox_
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                    if idx == 0:
                        print(img.shape)
                elif idx > frame_counter:
                    outputs = tracker.track(img, mode)  # `mode` is assumed to be defined at module scope
                    pred_bbox = outputs['bbox']
                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0]))
                    # per-frame attribute statistics w.r.t. the init box
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    box2 = gt_bbox_
                    w1, h1 = box1[2], box1[3]
                    w2, h2 = box2[2], box2[3]
                    cx1, cy1 = (img.shape[1] // 2, img.shape[0] // 2)
                    cx2, cy2 = (box2[2] / 2 + box2[0], box2[3] / 2 + box2[1])
                    # box1 = box2
                    # scale variation
                    s1 = np.sqrt(w1 * h1)
                    s2 = np.sqrt(w2 * h2)
                    sv = max(s1 / s2, s2 / s1)
                    # aspect ratio variation
                    r1, r2 = h1 / w1, h2 / w2
                    arv = max(r1 / r2, r2 / r1)
                    # fast motion
                    fm = np.sqrt((cx2 - cx1) ** 2 + (cy2 - cy1) ** 2) / np.sqrt(s1 * s2)
                    vars0.append(np.array([sv, arv, fm, outputs['cls2']]))  # occlusion
                    # print(idx, outputs['var'], np.array([sv, arv, fm]))
                    overlaps1.append(overlap)
                    vars1.append(outputs['cls2'])
                    if idx <= len(video.tags['occlusion']):
                        occl1.append(video.tags['occlusion'][idx])
                    else:
                        # NOTE: appends an array here; a scalar 0 placeholder was probably intended
                        occl1.append(np.zeros(idx - len(video.tags['occlusion'])))
                    if overlap > 0.0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        # print("-------loss---------")
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                        for l in range(0, 5):
                            vars1.append(-0.2)
                            occl1.append(-0.2)
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(img, [np.array(gt_bbox, int).reshape((-1, 1, 2))],
                                  True, (255, 0, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(img, [np.array(pred_bbox, int).reshape((-1, 1, 2))],
                                      True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    print(idx)
                    cv2.putText(img, 'occl_gt:' + str(video.tags['occlusion'][idx - 1]), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
                    cv2.putText(img, 'proposed_TL:' + str(lost_number), (40, 160),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, 'occl_pred:' + str(vars1[idx - 1]), (40, 120),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    out.write(img)
                    cv2.imwrite(video_loc + str(idx) + '.png', img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            out.release()
            video_path = os.path.join(args.results, args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}, mIOU: {:0.4f}'.format(
                v_idx + 1, video.name, toc, idx / toc, lost_number, np.array(overlaps1).mean()))
            # plt.plot(overlaps1)
            # plt.plot(np.array(vars0)[:, 3])
            # plt.plot(np.array(occl1))
            # plt.plot(np.array(vars1))
            # print(np.correlate(overlaps1, np.array(vars1)[:, 2]))
            overlaps2.append(np.array(overlaps1).mean())
            # occl2.append(np.array(occl1))
            # vars2.append(np.array(vars1))
            # if args.video != '':
            #     v_idx = 0
            #     print(100 * (confusion_matrix(occl2[v_idx], vars2[v_idx]).ravel()))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
        # cv2.destroyAllWindows()
        # print("Total Mean IOU is %0.4f" % np.array(overlaps2).mean())
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]),
                                  (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                                  (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]),
                                  (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('../results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path, '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('../results', args.dataset, model_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('../results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.format(
                v_idx + 1, video.name, toc, idx / toc))
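# The per-frame attribute statistics computed in the restart loop above,
# factored into a standalone helper for reference (same formulas: scale
# variation, aspect-ratio variation, and fast motion normalized by the
# geometric mean of the two scales):
def frame_attributes(box1, box2, img_shape):
    """box1/box2 are [x, y, w, h]; img_shape is (H, W, C)."""
    w1, h1 = box1[2], box1[3]
    w2, h2 = box2[2], box2[3]
    cx1, cy1 = img_shape[1] // 2, img_shape[0] // 2          # image center
    cx2, cy2 = box2[0] + box2[2] / 2, box2[1] + box2[3] / 2  # gt center
    s1, s2 = np.sqrt(w1 * h1), np.sqrt(w2 * h2)
    sv = max(s1 / s2, s2 / s1)                               # scale variation
    r1, r2 = h1 / w1, h2 / w2
    arv = max(r1 / r2, r2 / r1)                              # aspect ratio variation
    fm = np.hypot(cx2 - cx1, cy2 - cy1) / np.sqrt(s1 * s2)   # fast motion
    return sv, arv, fm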
def main(frame_interval, interpolation_rate): # load config cfg.merge_from_file(args.config) cur_dir = os.path.dirname(os.path.realpath(__file__)) dataset_root = os.path.join(cur_dir, '../testing_dataset', args.dataset) # create model model = ModelBuilder() # load model model = load_pretrain(model, args.snapshot).cuda().eval() # build tracker tracker = build_tracker(model) # create dataset dataset = DatasetFactory.create_dataset(name=args.dataset, dataset_root=dataset_root, load_img=False) model_name = args.snapshot.split('/')[-1].split('.')[0] total_lost = 0 if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']: # restart tracking for v_idx, video in enumerate(dataset): if args.video != '': # test one special video if video.name != args.video: continue frame_counter = 0 lost_number = 0 toc = 0 pred_bboxes = [] for idx, (img, gt_bbox) in enumerate(video): if len(gt_bbox) == 4: gt_bbox = [ gt_bbox[0], gt_bbox[1], gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1, gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1, gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] ] tic = cv2.getTickCount() if idx == frame_counter: cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox)) gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h] tracker.init(img, gt_bbox_) pred_bbox = gt_bbox_ pred_bboxes.append(1) elif idx > frame_counter: outputs = tracker.track(img) pred_bbox = outputs['bbox'] if cfg.MASK.MASK: pred_bbox = outputs['polygon'] overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0])) if overlap > 0: # not lost pred_bboxes.append(pred_bbox) else: # lost object pred_bboxes.append(2) frame_counter = idx + 5 # skip 5 frames lost_number += 1 else: pred_bboxes.append(0) toc += cv2.getTickCount() - tic if idx == 0: cv2.destroyAllWindows() if args.vis and idx > frame_counter: cv2.polylines( img, [np.array(gt_bbox, np.int).reshape( (-1, 1, 2))], True, (0, 0, 255), 3) if cfg.MASK.MASK: cv2.polylines( img, [np.array(pred_bbox, np.int).reshape( (-1, 1, 2))], True, (255, 0, 0), 3) else: bbox = list(map(int, pred_bbox)) cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3) cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2) cv2.putText(img, str(lost_number), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2) cv2.imshow(video.name, img) window_name = "Result" cv2.moveWindow(window_name, 100, 100) cv2.waitKey(1) toc /= cv2.getTickFrequency() # # save results # video_path = os.path.join('results', args.dataset, model_name, # 'baseline', video.name) # if not os.path.isdir(video_path): # os.makedirs(video_path) # result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) # with open(result_path, 'w') as f: # for x in pred_bboxes: # if isinstance(x, int): # f.write("{:d}\n".format(x)) # else: # f.write(','.join([vot_float2str("%.4f", i) for i in x])+'\n') # print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format( # v_idx+1, video.name, toc, idx / toc, lost_number)) # total_lost += lost_number print("{:s} total lost: {:d}".format(model_name, total_lost)) else: # FPS List fps_list = [] # OPE tracking for v_idx, video in enumerate(dataset): if args.video != '': # test one special video if video.name != args.video: continue toc = 0 pred_bboxes = [] scores = [] track_times = [] # PARAMETERS # frame_interval = 2 # interpolation_rate = 0.005 for idx, (img, gt_bbox) in enumerate(video): tic = cv2.getTickCount() if idx == 0: cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox)) gt_bbox_ = [cx - (w - 1) / 2, 
cy - (h - 1) / 2, w, h] # (left-top width height) tracker.init(img, gt_bbox_) pred_bbox = gt_bbox_ scores.append(None) if 'VOT2018-LT' == args.dataset: pred_bboxes.append([1]) else: pred_bboxes.append(pred_bbox) else: outputs = tracker.track(img) pred_bbox = outputs['bbox'] # (left-top width height) pred_bboxes.append(pred_bbox) scores.append(outputs['best_score']) ###################################### # Adaptive Template(exemplar update) # ###################################### if idx % frame_interval == 0: tracker.update_z(img, pred_bbox, interpolation_rate=interpolation_rate) toc += cv2.getTickCount() - tic track_times.append( (cv2.getTickCount() - tic) / cv2.getTickFrequency()) if idx == 0: cv2.destroyAllWindows() if args.vis and idx > 0: gt_bbox = list(map(int, gt_bbox)) pred_bbox = list(map(int, pred_bbox)) cv2.rectangle( img, (gt_bbox[0], gt_bbox[1]), (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]), (0, 255, 0), 3) cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]), (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]), (255, 0, 0), 3) cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2) cv2.imshow(video.name, img) window_name = "Result" cv2.moveWindow(window_name, 20, 20) cv2.waitKey(1) toc /= cv2.getTickFrequency() # save results if 'VOT2018-LT' == args.dataset: video_path = os.path.join('results', args.dataset, model_name, 'longterm', video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') result_path = os.path.join( video_path, '{}_001_confidence.value'.format(video.name)) with open(result_path, 'w') as f: for x in scores: f.write('\n') if x is None else f.write( "{:.6f}\n".format(x)) result_path = os.path.join(video_path, '{}_time.txt'.format(video.name)) with open(result_path, 'w') as f: for x in track_times: f.write("{:.6f}\n".format(x)) elif 'GOT-10k' == args.dataset: video_path = os.path.join('results', args.dataset, model_name, video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') result_path = os.path.join(video_path, '{}_time.txt'.format(video.name)) with open(result_path, 'w') as f: for x in track_times: f.write("{:.6f}\n".format(x)) # OTB-100 HERE!!!!!!!!!!!!!! else: result_folder_name = "results_{0:d}frame_exemplar_update_rate_{1:s}".format( frame_interval, str(interpolation_rate)) model_path = os.path.join(result_save_base_path, result_folder_name, args.dataset, model_name) # model_path = os.path.join(result_save_base_path, 'results', args.dataset, model_name) if not os.path.isdir(model_path): os.makedirs(model_path) result_path = os.path.join(model_path, '{}.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'. format(v_idx + 1, video.name, toc, idx / toc)) # FPS Result fps = idx / toc fps_list.append(fps) # Make FPS Result Path fps_array = np.asarray(fps_list).reshape(-1, 1) fps_file_name = "model_fps__[{:3.1f}].txt".format( np.average(fps_array)) model_fps_file = os.path.join(os.path.dirname(model_path), "../", fps_file_name) np.savetxt(model_fps_file, fps_array)
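# update_z above refreshes the exemplar every frame_interval frames. Assuming
# it linearly interpolates between the stored template feature and a freshly
# cropped one (a common "adaptive template" scheme; the internals of update_z
# are not shown in this file), the update amounts to
#     z = (1 - interpolation_rate) * z + interpolation_rate * z_new
# i.e., with tensor-like operands:
def interpolate_template(z_old, z_new, rate):
    """Blend the stored exemplar feature with the feature of a new crop."""
    return (1.0 - rate) * z_old + rate * z_new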
def main(): # load config cfg.merge_from_file(args.config) #!!! input your dataset path dataset_root = os.path.join(your_dataset_path, args.dataset) # create model model = ModelBuilder() # load model model = load_pretrain(model, args.snapshot).cuda().eval() # build tracker tracker = build_tracker(model) # create dataset dataset = DatasetFactory.create_dataset(name=args.dataset, dataset_root=dataset_root, load_img=False) model_name = args.snapshot.split('/')[-1].split('.')[0] model_name = model_name + '_pk-{:.3f}'.format( cfg.TRACK.PENALTY_K) + '_wi-{:.3f}'.format( cfg.TRACK.WINDOW_INFLUENCE) + '_lr-{:.3f}'.format(cfg.TRACK.LR) total_lost = 0 if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']: # restart tracking for v_idx, video in enumerate(dataset): if args.video != '': # test one special video if video.name != args.video: continue frame_counter = 0 lost_number = 0 toc = 0 pred_bboxes = [] for idx, (img, gt_bbox) in enumerate(video): if len(gt_bbox) == 4: gt_bbox = [ gt_bbox[0], gt_bbox[1], gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1, gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1, gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] ] tic = cv2.getTickCount() if idx == frame_counter: cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox)) gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h] tracker.init(img, gt_bbox_) pred_bbox = gt_bbox_ pred_bboxes.append(1) elif idx > frame_counter: outputs = tracker.track(img) pred_bbox = outputs['bbox'] overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0])) if overlap > 0: # not lost pred_bboxes.append(pred_bbox) else: # lost object pred_bboxes.append(2) frame_counter = idx + 5 # skip 5 frames lost_number += 1 else: pred_bboxes.append(0) toc += cv2.getTickCount() - tic toc /= cv2.getTickFrequency() # save results video_path = os.path.join('results', args.dataset, model_name, 'baseline', video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: if isinstance(x, int): f.write("{:d}\n".format(x)) else: f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n') print( '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}' .format(v_idx + 1, video.name, toc, idx / toc, lost_number)) total_lost += lost_number print("{:s} total lost: {:d}".format(model_name, total_lost)) else: # OPE tracking for v_idx, video in enumerate(dataset): if args.video != '': # test one special video if video.name != args.video: continue toc = 0 pred_bboxes = [] scores = [] track_times = [] for idx, (img, gt_bbox) in enumerate(video): tic = cv2.getTickCount() if idx == 0: cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox)) gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h] tracker.init(img, gt_bbox_) pred_bbox = gt_bbox_ scores.append(None) if 'VOT2018-LT' == args.dataset: pred_bboxes.append([1]) else: pred_bboxes.append(pred_bbox) else: outputs = tracker.track(img) pred_bbox = outputs['bbox'] pred_bboxes.append(pred_bbox) scores.append(outputs['best_score']) toc += cv2.getTickCount() - tic track_times.append( (cv2.getTickCount() - tic) / cv2.getTickFrequency()) toc /= cv2.getTickFrequency() # save results if 'VOT2018-LT' == args.dataset: video_path = os.path.join('results', args.dataset, model_name, 'longterm', video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: 
f.write(','.join([str(i) for i in x]) + '\n') result_path = os.path.join( video_path, '{}_001_confidence.value'.format(video.name)) with open(result_path, 'w') as f: for x in scores: f.write('\n') if x is None else f.write( "{:.6f}\n".format(x)) result_path = os.path.join(video_path, '{}_time.txt'.format(video.name)) with open(result_path, 'w') as f: for x in track_times: f.write("{:.6f}\n".format(x)) elif 'GOT-10k' == args.dataset: video_path = os.path.join('results', args.dataset, model_name, video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') result_path = os.path.join(video_path, '{}_time.txt'.format(video.name)) with open(result_path, 'w') as f: for x in track_times: f.write("{:.6f}\n".format(x)) else: model_path = os.path.join('results', args.dataset, model_name) if not os.path.isdir(model_path): os.makedirs(model_path) result_path = os.path.join(model_path, '{}.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'. format(v_idx + 1, video.name, toc, idx / toc))
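# The model_name above encodes the tracking hyper-parameters (penalty_k,
# window_influence, lr), which suggests this script is run once per parameter
# combination. A minimal sweep driver under that assumption (the value grids
# are illustrative, not from the original; assumes the yacs config is not frozen):
import itertools

def sweep(run_once):
    for pk, wi, lr in itertools.product([0.04, 0.10, 0.16],
                                        [0.40, 0.44, 0.48],
                                        [0.30, 0.35, 0.40]):
        cfg.TRACK.PENALTY_K = pk
        cfg.TRACK.WINDOW_INFLUENCE = wi
        cfg.TRACK.LR = lr
        run_once()  # e.g. the main() defined above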
def main(): '''change save_path to yours''' save_path = '/home/masterbin-iiau/Desktop/AdvTrack-project/supplementary/%s' % args.video if not os.path.exists(save_path): os.mkdir(save_path) # load config cfg.merge_from_file(args.config) dataset_root = os.path.join(dataset_root_, args.dataset) # create model '''a model is a Neural Network.(a torch.nn.Module)''' model = ModelBuilder() # load model model = load_pretrain(model, args.snapshot).cuda().eval() # build tracker '''a tracker is a object, which consists of not only a NN but also some post-processing''' tracker = build_tracker(model) # create dataset dataset = DatasetFactory.create_dataset(name=args.dataset, dataset_root=dataset_root, load_img=False) # model_name = args.snapshot.split('/')[-1].split('.')[0] total_lost = 0 if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']: # restart tracking for v_idx, video in enumerate(dataset): if args.video != '': # test one special video if video.name != args.video: continue frame_counter = 0 lost_number = 0 toc = 0 pred_bboxes = [] for idx, (img, gt_bbox) in enumerate(video): if len(gt_bbox) == 4: gt_bbox = [ gt_bbox[0], gt_bbox[1], gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1, gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1, gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] ] tic = cv2.getTickCount() if idx == frame_counter: cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox)) gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h] tracker.init(img, gt_bbox_) pred_bbox = gt_bbox_ pred_bboxes.append(1) elif idx > frame_counter: '''GAN''' outputs = tracker.track_supp(img, GAN, save_path, idx) pred_bbox = outputs['bbox'] if cfg.MASK.MASK: pred_bbox = outputs['polygon'] overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0])) if overlap > 0: # not lost pred_bboxes.append(pred_bbox) else: # lost object pred_bboxes.append(2) frame_counter = idx + 5 # skip 5 frames lost_number += 1 else: pred_bboxes.append(0) toc += cv2.getTickCount() - tic if idx == 0: cv2.destroyAllWindows() if args.vis and idx > frame_counter: cv2.polylines( img, [np.array(gt_bbox, np.int).reshape( (-1, 1, 2))], True, (0, 255, 0), 3) if cfg.MASK.MASK: cv2.polylines( img, [np.array(pred_bbox, np.int).reshape( (-1, 1, 2))], True, (0, 255, 255), 3) else: bbox = list(map(int, pred_bbox)) cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3) cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2) cv2.putText(img, str(lost_number), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2) cv2.imshow(video.name, img) cv2.waitKey(1) toc /= cv2.getTickFrequency() else: # OPE tracking for v_idx, video in enumerate(dataset): if args.video != '': # test one special video if video.name != args.video: continue toc = 0 pred_bboxes = [] scores = [] track_times = [] for idx, (img, gt_bbox) in enumerate(video): tic = cv2.getTickCount() if idx == 0: cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox)) gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h] tracker.init(img, gt_bbox_) pred_bbox = gt_bbox_ scores.append(None) if 'VOT2018-LT' == args.dataset: pred_bboxes.append([1]) else: pred_bboxes.append(pred_bbox) else: outputs = tracker.track_supp(img, GAN, save_path, idx) pred_bbox = outputs['bbox'] pred_bboxes.append(pred_bbox) scores.append(outputs['best_score']) toc += cv2.getTickCount() - tic track_times.append( (cv2.getTickCount() - tic) / cv2.getTickFrequency()) if idx == 0: cv2.destroyAllWindows() if args.vis and idx > 0: gt_bbox = list(map(int, gt_bbox)) 
pred_bbox = list(map(int, pred_bbox)) cv2.rectangle( img, (gt_bbox[0], gt_bbox[1]), (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]), (0, 255, 0), 3) cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]), (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]), (0, 255, 255), 3) cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2) cv2.imshow(video.name, img) cv2.waitKey(1) toc /= cv2.getTickFrequency()
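# The OPE visualization block above recurs in nearly every script in this
# file. Factored into a helper for reference (same drawing calls; colors
# follow the gt-green / prediction-yellow convention used above):
def draw_ope_frame(img, gt_bbox, pred_bbox, idx):
    gt = list(map(int, gt_bbox))
    pr = list(map(int, pred_bbox))
    cv2.rectangle(img, (gt[0], gt[1]), (gt[0] + gt[2], gt[1] + gt[3]),
                  (0, 255, 0), 3)
    cv2.rectangle(img, (pr[0], pr[1]), (pr[0] + pr[2], pr[1] + pr[3]),
                  (0, 255, 255), 3)
    cv2.putText(img, str(idx), (40, 40),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
    return img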
def main(): net = models.__dict__[args.arch](anchors_nums=args.anchor_nums, cls_type=args.cls_type) net = load_pretrain(net, args.resume) net.eval() net = net.cuda() # prepare tracker info = edict() info.arch = args.arch info.cls_type = args.cls_type info.dataset = args.dataset info.epoch_test = args.epoch_test tracker = SiamRPN(info) dataset_root = os.path.join("/ssd", args.dataset) dataset = DatasetFactory.create_dataset(name=args.dataset, dataset_root=dataset_root, load_img=False) model_name = args.resume.split('/')[-1].split('.')[0] total_lost = 0 """ eao will lower than origin version(0.393->0.390) due to the number of digits after the decimal point """ if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']: # restart tracking for v_idx, video in enumerate(dataset): if args.video != '': # test one special video if video.name != args.video: continue frame_counter = 0 lost_number = 0 toc = 0 pred_bboxes = [] for idx, (img, gt_bbox) in enumerate(video): # if len(gt_bbox) == 4: # gt_bbox = [gt_bbox[0], gt_bbox[1], # gt_bbox[0], gt_bbox[1]+gt_bbox[3]-1, # gt_bbox[0]+gt_bbox[2]-1, gt_bbox[1]+gt_bbox[3]-1, # gt_bbox[0]+gt_bbox[2]-1, gt_bbox[1]] tic = cv2.getTickCount() if idx == frame_counter: cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox)) #gt_bbox_ = [cx-(w-1)/2, cy-(h-1)/2, w, h] target_pos = np.array([cx, cy]) target_sz = np.array([w, h]) state = tracker.init(img, target_pos, target_sz, net) # init tracker state["arch"] = args.arch #tracker.init(img, gt_bbox_) #pred_bbox = gt_bbox_ pred_bboxes.append(1) elif idx > frame_counter: state = tracker.track(state, img) # track location = cxy_wh_2_rect(state['target_pos'], state['target_sz']) #outputs = tracker.track(img) pred_bbox = location #overlap=poly_iou(gt_bbox,location) overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0])) if overlap > 0: # not lost pred_bboxes.append(pred_bbox) else: # lost object pred_bboxes.append(2) frame_counter = idx + 5 # skip 5 frames lost_number += 1 else: pred_bboxes.append(0) toc += cv2.getTickCount() - tic if idx == 0: cv2.destroyAllWindows() if args.vis and idx > frame_counter: cv2.polylines( img, [np.array(gt_bbox, np.int).reshape( (-1, 1, 2))], True, (0, 255, 0), 3) if cfg.MASK.MASK: cv2.polylines( img, [np.array(pred_bbox, np.int).reshape( (-1, 1, 2))], True, (0, 255, 255), 3) else: bbox = list(map(int, pred_bbox)) cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3) cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2) cv2.putText(img, str(lost_number), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2) cv2.imshow(video.name, img) cv2.waitKey(1) toc /= cv2.getTickFrequency() # save results video_path = os.path.join('results', args.dataset, model_name, 'baseline', video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: if isinstance(x, int): f.write("{:d}\n".format(x)) else: f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n') print( '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}' .format(v_idx + 1, video.name, toc, idx / toc, lost_number)) total_lost += lost_number print("{:s} total lost: {:d}".format(model_name, total_lost)) else: # OPE tracking for v_idx, video in enumerate(dataset): if args.video != '': # test one special video if video.name != args.video: continue toc = 0 pred_bboxes = [] scores = [] track_times = [] for idx, 
(img, gt_bbox) in enumerate(video): tic = cv2.getTickCount() if idx == 0: cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox)) gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h] target_pos = np.array([cx, cy]) target_sz = np.array([w, h]) state = tracker.init(img, target_pos, target_sz, net) # init tracker state["arch"] = args.arch #tracker.init(img, gt_bbox_) pred_bbox = gt_bbox_ scores.append(None) if 'VOT2018-LT' == args.dataset: pred_bboxes.append([1]) else: pred_bboxes.append(pred_bbox) else: state = tracker.track(state, img) # track location = cxy_wh_2_rect(state['target_pos'], state['target_sz']) pred_bbox = location #outputs = tracker.track(img) #pred_bbox = outputs['bbox'] pred_bboxes.append(pred_bbox) scores.append(state['score']) toc += cv2.getTickCount() - tic track_times.append( (cv2.getTickCount() - tic) / cv2.getTickFrequency()) if idx == 0: cv2.destroyAllWindows() if args.vis and idx > 0: gt_bbox = list(map(int, gt_bbox)) pred_bbox = list(map(int, pred_bbox)) cv2.rectangle( img, (gt_bbox[0], gt_bbox[1]), (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]), (0, 255, 0), 3) cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]), (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]), (0, 255, 255), 3) cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2) cv2.imshow(video.name, img) cv2.waitKey(1) toc /= cv2.getTickFrequency() # save results if 'VOT2018-LT' == args.dataset: video_path = os.path.join('results', args.dataset, model_name, 'longterm', video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') result_path = os.path.join( video_path, '{}_001_confidence.value'.format(video.name)) with open(result_path, 'w') as f: for x in scores: f.write('\n') if x is None else f.write( "{:.6f}\n".format(x)) result_path = os.path.join(video_path, '{}_time.txt'.format(video.name)) with open(result_path, 'w') as f: for x in track_times: f.write("{:.6f}\n".format(x)) elif 'GOT-10k' == args.dataset: video_path = os.path.join('results', args.dataset, model_name, video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') result_path = os.path.join(video_path, '{}_time.txt'.format(video.name)) with open(result_path, 'w') as f: for x in track_times: f.write("{:.6f}\n".format(x)) else: model_path = os.path.join('result', args.dataset, model_name) if not os.path.isdir(model_path): os.makedirs(model_path) result_path = os.path.join(model_path, '{}.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'. format(v_idx + 1, video.name, toc, idx / toc))
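# cxy_wh_2_rect above converts the tracker state (center position, size) into
# a top-left x,y,w,h rectangle. A minimal sketch of that conversion (0-based
# variant; some codebases use a 1-pixel offset convention instead):
def cxy_wh_to_rect(pos, sz):
    return np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])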
def main(): # load config cfg.merge_from_file(args.config) cur_dir = os.path.dirname(os.path.realpath(__file__)) dataset_root = os.path.join(args.dataset_dir, args.dataset) epsilon = args.epsilon # create model model = ModelBuilder() # load model model = load_pretrain(model, args.snapshot).cuda().train() # build tracker tracker = build_tracker(model) # create dataset dataset = DatasetFactory.create_dataset(name=args.dataset, dataset_root=dataset_root, load_img=False, config=cfg) # # vid.name = {'ants1','ants3',....} # img, bbox, cls, delta, delta_weight # vid[0][0],vid[0][1],vid[0][2],vid[0][3],vid[0][4] model_name = args.snapshot.split('/')[-1].split('.')[0] total_lost = 0 if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']: # restart tracking for v_idx, video in enumerate(dataset): if args.video != '': # test one special video if video.name != args.video: continue # set writing video parameters height, width, channels = video[0][0].shape out = cv2.VideoWriter( os.path.join(args.savedir, video.name + '.avi'), cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 15, (width, height)) frame_counter = 0 lost_number = 0 toc = 0 pred_bboxes = [] data = {'template': None, 'search': None} for idx, (img, gt_bbox, cls, delta_cls, delta_w, _bbox, cls_s, delta_cls_s, delta_w_s, _bbox_s) \ in enumerate(video): if len(gt_bbox) == 4: gt_bbox = [ gt_bbox[0], gt_bbox[1], gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1, gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1, gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] ] tic = cv2.getTickCount() if idx == frame_counter: cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox)) gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h] tracker.init(img, gt_bbox_) pred_bbox = gt_bbox_ pred_bboxes.append(1) nimg, sz, box, _ = tracker.crop(img, bbox=gt_bbox_, im_name='exemplar') data['template'] = torch.autograd.Variable( nimg, requires_grad=True).cuda() elif idx > frame_counter: cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox)) gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h] nimg, sz, box, pad = tracker.crop(img, bbox=gt_bbox_, is_template=False, im_name='search' + str(idx)) [bT, bB, bL, bR] = box sz = int(sz) data['search'] = torch.autograd.Variable( nimg, requires_grad=True).cuda() data['label_cls'] = torch.Tensor(cls_s).type( torch.LongTensor).cuda() data['label_loc'] = torch.Tensor(delta_cls_s).type( torch.FloatTensor).cuda() data['label_loc_weight'] = torch.Tensor(delta_w_s).cuda() outputs = model(data) cls_loss = outputs['cls_loss'] loc_loss = outputs['loc_loss'] total_loss = outputs['total_loss'] total_loss.backward() data_grad = data['search'].grad # torch.Tensor(img.transpose([2, 0, 1])).unsqueeze(dim=0) perturb_data = fgsm_attack(data['search'], epsilon, data_grad) # cv2.imwrite(os.path.join(args.savedir, 'original_' + str(idx) + '.jpg'), img) # _img = perturb_data.data.cpu().numpy().squeeze().transpose([1, 2, 0]) # cv2.imwrite(os.path.join(args.savedir, 'perturb_' + str(idx) + '.jpg'), _img) if not np.array_equal(cfg.TRACK.INSTANCE_SIZE, sz): perturb_data = F.interpolate(perturb_data, size=sz) _img = perturb_data.data.cpu().numpy().squeeze().transpose( [1, 2, 0]) # cv2.imwrite(os.path.join(args.savedir, 'crop_full_' + str(idx) + '.jpg'), _img) nh, nw, _ = _img.shape img[bT:bB + 1, bL:bR + 1, :] = _img[pad[0]:nh - pad[1], pad[2]:nw - pad[3], :] # cv2.imwrite(os.path.join(args.savedir, 'perturb_full_' + str(idx) + '.jpg'), img) outputs = tracker.track(img) pred_bbox = outputs['bbox'] if cfg.MASK.MASK: pred_bbox = outputs['polygon'] overlap = vot_overlap(pred_bbox, 
gt_bbox, (img.shape[1], img.shape[0])) if overlap > 0: # not lost pred_bboxes.append(pred_bbox) else: print('*************** lost ***************') import pdb pdb.set_trace() # lost object pred_bboxes.append(2) frame_counter = idx + 5 # skip 5 frames lost_number += 1 print(idx, torch.sum(data_grad, (2, 3))) print( idx, torch.sum(torch.abs(torch.sum(data_grad, (2, 3))), (0, 1))) else: pred_bboxes.append(0) toc += cv2.getTickCount() - tic if idx == 0: cv2.destroyAllWindows() if args.vis and idx > frame_counter: cv2.polylines( img, [np.array(gt_bbox, np.int).reshape( (-1, 1, 2))], True, (0, 255, 0), 3) if cfg.MASK.MASK: cv2.polylines( img, [np.array(pred_bbox, np.int).reshape( (-1, 1, 2))], True, (0, 255, 255), 3) else: bbox = list(map(int, pred_bbox)) cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3) cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2) cv2.putText(img, str(lost_number), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2) cv2.imshow(video.name, img) cv2.waitKey(1) # save tracking image bbox = list(map(int, pred_bbox)) cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3) cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2) cv2.putText(img, str(lost_number), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2) # cv2.imwrite(os.path.join(args.savedir, 'track_' + str(idx) + '.jpg'), img) out.write(img) toc /= cv2.getTickFrequency() # save results video_path = os.path.join('results', args.dataset, model_name, 'baseline', video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: if isinstance(x, int): f.write("{:d}\n".format(x)) else: f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n') print( '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}' .format(v_idx + 1, video.name, toc, idx / toc, lost_number)) total_lost += lost_number print("{:s} total lost: {:d}".format(model_name, total_lost)) else: # OPE tracking for v_idx, video in enumerate(dataset): if args.video != '': # test one special video if video.name != args.video: continue toc = 0 pred_bboxes = [] scores = [] track_times = [] for idx, (img, gt_bbox) in enumerate(video): tic = cv2.getTickCount() if idx == 0: cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox)) gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h] tracker.init(img, gt_bbox_) pred_bbox = gt_bbox_ scores.append(None) if 'VOT2018-LT' == args.dataset: pred_bboxes.append([1]) else: pred_bboxes.append(pred_bbox) else: outputs = tracker.track(img) pred_bbox = outputs['bbox'] pred_bboxes.append(pred_bbox) scores.append(outputs['best_score']) toc += cv2.getTickCount() - tic track_times.append( (cv2.getTickCount() - tic) / cv2.getTickFrequency()) if idx == 0: cv2.destroyAllWindows() if args.vis and idx > 0: gt_bbox = list(map(int, gt_bbox)) pred_bbox = list(map(int, pred_bbox)) cv2.rectangle( img, (gt_bbox[0], gt_bbox[1]), (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]), (0, 255, 0), 3) cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]), (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]), (0, 255, 255), 3) cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2) cv2.imshow(video.name, img) cv2.waitKey(1) toc /= cv2.getTickFrequency() # save results if 'VOT2018-LT' == args.dataset: video_path = 
os.path.join('results', args.dataset, model_name, 'longterm', video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') result_path = os.path.join( video_path, '{}_001_confidence.value'.format(video.name)) with open(result_path, 'w') as f: for x in scores: f.write('\n') if x is None else f.write( "{:.6f}\n".format(x)) result_path = os.path.join(video_path, '{}_time.txt'.format(video.name)) with open(result_path, 'w') as f: for x in track_times: f.write("{:.6f}\n".format(x)) elif 'GOT-10k' == args.dataset: video_path = os.path.join('results', args.dataset, model_name, video.name) if not os.path.isdir(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, '{}_001.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') result_path = os.path.join(video_path, '{}_time.txt'.format(video.name)) with open(result_path, 'w') as f: for x in track_times: f.write("{:.6f}\n".format(x)) else: model_path = os.path.join('results', args.dataset, model_name) if not os.path.isdir(model_path): os.makedirs(model_path) result_path = os.path.join(model_path, '{}.txt'.format(video.name)) with open(result_path, 'w') as f: for x in pred_bboxes: f.write(','.join([str(i) for i in x]) + '\n') print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'. format(v_idx + 1, video.name, toc, idx / toc))
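# fgsm_attack above perturbs the cropped search region along the sign of the
# loss gradient. A minimal FGSM sketch consistent with the call site
# fgsm_attack(data['search'], epsilon, data_grad); the [0, 255] clamping range
# is an assumption about the unnormalized pixel tensors used here:
def fgsm_attack(image, epsilon, data_grad):
    """Fast Gradient Sign Method: x' = clamp(x + eps * sign(dL/dx))."""
    perturbed = image + epsilon * data_grad.sign()
    return torch.clamp(perturbed, 0, 255)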
def main():
    # load config
    # save_siamese_rpn()
    cfg.merge_from_file(args.config)
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    # dataset_root = os.path.join(cur_dir, '../testing_dataset', args.dataset)
    dataset_root = datasets_root + args.dataset
    # create model
    model = ModelBuilder()
    # load model
    model = load_pretrain(model, args.snapshot).cuda().eval()
    # save_backbone(model)
    # build tracker
    tracker = build_tracker(model)
    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    # multi-pass tracking: evaluation protocol that re-initializes the tracker
    # after a tracking failure
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            # pred_bboxes holds two kinds of entries.
            # Type 1: the integers 1, 2 and 0, marking tracking start, tracking
            #   lost, and the placeholder for the skipped frames after a loss.
            # Type 2: float bboxes, i.e. the actual tracking results.
            pred_bboxes = []
            gru_seq_len = tracker.model.grus.seq_in_len
            video_len = len(video)
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    # if gt is given as [x, y, w, h], convert it to the 8 corner
                    # coordinates (x1, y1, x2, y2, x3, y3, x4, y4)
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                               gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                # tracker initialization
                if idx == frame_counter:
                    # initialize on the first frame of this pass
                    idxs = list(map(lambda x, y: x + y,
                                    [idx] * gru_seq_len,
                                    list(range(0, gru_seq_len))))  # indices of the gru_seq_len frames starting at idx
                    idxs = list(map(lambda x: min(x, video_len - 1), idxs))  # clamp indices to the video length
                    tracker.template_idx = 0  # first frame of the template initialization
                    for k in idxs:
                        init_img, init_gt_bbox = video[k]  # initialize with gru_seq_len consecutive frames
                        # init_img, init_gt_bbox = video[idxs[0]]  # (alternative) initialize from a single frame only
                        cx, cy, w, h = get_axis_aligned_bbox(np.array(init_gt_bbox))  # rotated corners -> center form (cx, cy, w, h)
                        init_gt_bbox = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]  # center form -> top-left form
                        tracker.init_gru(init_img, init_gt_bbox, k)
                        if k == 0:
                            pred_bbox = init_gt_bbox
                    pred_bboxes.append(1)
                # subsequent tracking
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    # update the template only when the output confidence is high
                    if outputs['best_score'] > 0.95:
                        tracker.init_gru(img, pred_bbox, idx)
                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    # inspect the relation between IoU and score on the first
                    # frame after initialization
                    # if tracker.template_idx == 4:
                    #     print("{:3.2f}\t{:3.2f}".format(overlap, outputs['best_score']))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                # draw outputs: gt and mask are drawn as polygons, the tracked
                # bbox as a rectangle
                if args.vis and idx > frame_counter:
                    # draw the polygonal gt
                    cv2.polylines(img, [np.array(gt_bbox, int).reshape((-1, 1, 2))],
                                  True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        # draw the polygon predicted by the mask branch
                        cv2.polylines(img, [np.array(pred_bbox, int).reshape((-1, 1, 2))],
                                      True, (0, 255, 255), 3)
                    else:
                        # draw the predicted rectangle
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    # annotate the image with the frame index and the loss count
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            # result path layout: ./results/VOT2018/model/baseline/ants1/ants1_001.txt
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            # pred_bboxes holds integer markers (start / lost / skip) and float
            # bboxes (the tracking results), see above
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):  # integers mark start or a loss
                        f.write("{:d}\n".format(x))
                    else:  # floats are the actual bboxes
                        f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                  .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    # OPE tracking: the protocol does not re-initialize after a loss
    else:
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]),
                                  (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                                  (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]),
                                  (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'
                  .format(v_idx + 1, video.name, toc, idx / toc))
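

# The map/lambda index arithmetic in the GRU initialization above is hard to
# read; this is a minimal equivalent (hypothetical helper, not part of the
# original script): the indices of the `seq_len` frames starting at `idx`,
# clamped to the last frame of the video.
def template_window(idx, seq_len, video_len):
    return [min(idx + k, video_len - 1) for k in range(seq_len)]

# e.g. for a 12-frame video the window starting at frame 10 saturates at 11:
assert template_window(10, 4, 12) == [10, 11, 11, 11]
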
def main():
    # load config
    cfg.merge_from_file(args.config)
    # ------------------------------- hp_search -------------------------------
    params = [0.0, 0.0, 0.0]
    # Interpolation learning rate
    params[0] = cfg.TRACK.LR
    # Scale penalty
    params[1] = cfg.TRACK.PENALTY_K
    # Window influence
    params[2] = cfg.TRACK.WINDOW_INFLUENCE
    params_name = (args.snapshot.split('/')[-1] + ' ' + args.dataset +
                   ' lr-' + str(params[0]) +
                   ' pk-' + str(params[1]) +
                   ' win-' + str(params[2]))
    # ------------------------------- hp_search -------------------------------
    # cur_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_root = os.path.join('./datasets', args.dataset)
    # create model
    model = ModelBuilder()
    # load model
    model = load_pretrain(model, args.snapshot).cuda().eval()
    # build the SiamCAR tracker
    tracker = SiamCARTracker(model, cfg.TRACK)
    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        total_lost = 0
        avg_speed = 0
        for v_idx, video in tqdm(enumerate(dataset)):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                               gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]  # [topx, topy, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(img, [np.array(gt_bbox, int).reshape((-1, 1, 2))],
                                  True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(img, [np.array(pred_bbox, int).reshape((-1, 1, 2))],
                                      True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join(args.save_path, args.dataset,
                                      args.tracker_name, 'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
            # print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
            #     v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
            avg_speed += idx / toc
        print('Speed: {:3.1f}fps'.format(avg_speed / 60))  # 60 sequences in VOT2016/2018/2019
        print(params_name)
        # print(" stage:{:d} model:{:s} epoch:{:s} update_lr:{:f}".format(
        #     args.update_stage, args.update_path,
        #     args.update_path.split('/')[-1], update_lr[args.update_lr]))
    else:
        # OPE tracking
        for v_idx, video in tqdm(enumerate(dataset)):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]  # [topx, topy, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    # scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]),
                                  (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                                  (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]),
                                  (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join(args.save_path, args.dataset,
                                          args.tracker_name, 'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join(args.save_path, args.dataset,
                                          args.tracker_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join(args.save_path, args.dataset, args.tracker_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            # print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.format(
            #     v_idx + 1, video.name, toc, idx / toc))
        print(params_name)
    # os.chdir(model_path)
    # save_file = '../%s' % dataset
    # shutil.make_archive(save_file, 'zip')
    # print('Records saved at', save_file + '.zip')
    evaluate(args)
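

# A minimal sketch (hypothetical, not in the original script) of enumerating
# the hyper-parameter triples that `params_name` above records. The value
# grids are illustrative only; the original script reads a single triple
# from the config file.
import itertools

for lr, pk, win in itertools.product([0.30, 0.35], [0.04, 0.10], [0.40, 0.44]):
    name = 'snapshot {} lr-{} pk-{} win-{}'.format('VOT2018', lr, pk, win)
    print(name)  # one results folder / log label per triple
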
def main(args, tracker):
    # create dataset
    if not args.dataset_path:
        args.dataset_path = os.path.dirname(os.path.realpath(__file__))
    dataset_root = os.path.join(args.dataset_path, 'dataset', args.dataset)
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False,
                                            single_video=args.video)
    model_name = args.model_name
    if args.debug_vis:
        args.vis = True
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019', 'VOT2020']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            video_total_lost = 0
            for cnt in range(args.repetition):
                frame_counter = 0
                lost_number = 0
                toc = 0
                init_toc = 0
                valid_frames = 0
                pred_bboxes = []
                template_image = None
                search_image = None
                raw_heatmap = None
                post_heatmap = None
                for idx, (img, gt_bbox) in enumerate(video):
                    if len(gt_bbox) == 4:
                        gt_bbox = [gt_bbox[0], gt_bbox[1],
                                   gt_bbox[0], gt_bbox[1] + gt_bbox[3],
                                   gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3],
                                   gt_bbox[0] + gt_bbox[2], gt_bbox[1]]
                    tic = cv2.getTickCount()
                    if idx == frame_counter:
                        cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                        gt_bbox_ = [cx - w / 2, cy - h / 2, w, h]
                        tracker.init(img, gt_bbox_)
                        init_toc += cv2.getTickCount() - tic
                        pred_bbox = gt_bbox_
                        pred_bboxes.append(1)
                    elif idx > frame_counter:
                        outputs = tracker.track(img)
                        pred_bbox = outputs['bbox']
                        # tracker returns [x1, y1, x2, y2]; convert to [x, y, w, h]
                        pred_bbox = [pred_bbox[0], pred_bbox[1],
                                     pred_bbox[2] - pred_bbox[0],
                                     pred_bbox[3] - pred_bbox[1]]
                        valid_frames += 1
                        toc += cv2.getTickCount() - tic
                        overlap = vot_overlap(pred_bbox, gt_bbox,
                                              (img.shape[1], img.shape[0]))
                        if overlap > 0:
                            # not lost
                            pred_bboxes.append(pred_bbox)
                        else:
                            # lost object
                            pred_bboxes.append(2)
                            frame_counter = idx + 5  # skip 5 frames
                            lost_number += 1
                            if args.vis and args.debug_vis:
                                cv2.polylines(img, [np.array(gt_bbox, int).reshape((-1, 2))],
                                              True, (0, 255, 0), 3)
                                bbox = list(map(int, pred_bbox))
                                cv2.rectangle(img, (bbox[0], bbox[1]),
                                              (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                              (0, 255, 255), 3)
                                cv2.putText(img, str(idx), (40, 40),
                                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                                cv2.putText(img, 'lost', (40, 80),
                                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                                cv2.imshow(video.name, img)
                                for key, value in outputs.items():
                                    if isinstance(value, np.ndarray):
                                        if len(value.shape) == 3 or len(value.shape) == 2:
                                            cv2.imshow(key, value)
                                k = cv2.waitKey(0)
                                if k == 27:  # wait for ESC key to exit
                                    sys.exit()
                    else:
                        pred_bboxes.append(0)
                    if idx == 0:
                        if args.vis:
                            cv2.destroyAllWindows()
                    if args.vis and idx > frame_counter:
                        cv2.polylines(img, [np.array(gt_bbox, int).reshape((-1, 2))],
                                      True, (0, 255, 0), 3)
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                        cv2.putText(img, str(idx), (40, 40),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                        cv2.putText(img, str(lost_number), (40, 80),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                        cv2.imshow(video.name, img)
                        if args.debug_vis:
                            for key, value in outputs.items():
                                if isinstance(value, np.ndarray):
                                    if len(value.shape) == 3 or len(value.shape) == 2:
                                        cv2.imshow(key, value)
                            k = cv2.waitKey(0)
                            if k == 27:  # wait for ESC key to exit
                                break
                        else:
                            k = cv2.waitKey(1)
                            if k == 27:  # wait for ESC key to exit
                                break
                    sys.stderr.write("inference on {}: {} / {}\r".format(
                        video.name, idx + 1, len(video)))
                toc /= cv2.getTickFrequency()
                init_toc /= cv2.getTickFrequency()
                # save results
                video_path = os.path.join(args.result_path, args.dataset,
                                          model_name, 'baseline', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_{:03d}.txt'.format(video.name, cnt + 1))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        if isinstance(x, int):
                            f.write("{:d}\n".format(x))
                        else:
                            f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
                log = '({:3d}) Video ({:2d}): {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
                    v_idx + 1, cnt + 1, video.name, init_toc + toc,
                    valid_frames / toc, lost_number)
                print(log)
                with open(os.path.join(args.result_path, args.dataset,
                                       model_name, 'log.txt'), 'a') as f:
                    f.write(log + '\n')
                video_total_lost += lost_number
            total_lost += video_total_lost
            if args.repetition > 1:
                log = '({:3d}) Video: {:12s} Avg Lost: {:.3f}'.format(
                    v_idx + 1, video.name, video_total_lost / args.repetition)
                print(log)
                with open(os.path.join(args.result_path, args.dataset,
                                       model_name, 'log.txt'), 'a') as f:
                    f.write(log + '\n')
        log = "{:s} total (avg) lost: {:.3f}".format(model_name,
                                                     total_lost / args.repetition)
        print(log)
        with open(os.path.join(args.result_path, args.dataset,
                               model_name, 'log.txt'), 'a') as f:
            f.write(log + '\n')
    else:
        # OPE tracking
        find_best = True
        if not dataset.has_ground_truth:
            find_best = False
        # if GOT-10k is repeated 3 times, use the official benchmark mode
        # (no find-best selection)
        if args.dataset == 'GOT-10k':
            if args.repetition == 3:
                find_best = False
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            best_pred_bboxes = []
            min_lost_number = 1e6
            for cnt in range(args.repetition):
                toc = 0
                init_toc = 0
                pred_bboxes = []
                track_times = []
                template_image = None
                search_image = None
                raw_heatmap = None
                post_heatmap = None
                lost_number = 0
                if find_best and min_lost_number < args.min_lost_rate_for_repeat * len(video):
                    print("Skip the remaining trials ({}~) because the min lost number is already small enough: {} / {}".format(
                        cnt + 1, min_lost_number,
                        args.min_lost_rate_for_repeat * len(video)))
                    break
                save_image_offset = 0
                if args.save_image_num_per_video > 1:
                    save_image_offset = len(video) // (args.save_image_num_per_video - 1)
                if args.save_image_num_per_video == 0:
                    save_image_offset = 1
                for idx, (img, gt_bbox) in enumerate(video):
                    tic = cv2.getTickCount()
                    if idx == 0:
                        outputs = tracker.init(img, gt_bbox)
                        init_toc += cv2.getTickCount() - tic
                        pred_bbox = gt_bbox
                        pred_bboxes.append(pred_bbox)
                    else:
                        outputs = tracker.track(img)
                        toc += cv2.getTickCount() - tic
                        pred_bbox_ = outputs['bbox']
                        # tracker returns [x1, y1, x2, y2]; convert to [x, y, w, h]
                        pred_bbox = [pred_bbox_[0], pred_bbox_[1],
                                     pred_bbox_[2] - pred_bbox_[0],
                                     pred_bbox_[3] - pred_bbox_[1]]
                        pred_bboxes.append(pred_bbox)
                    track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
                    gt_bbox_int = list(map(lambda x: int(x) if not np.isnan(x) else 0, gt_bbox))
                    pred_bbox_int = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox_int[0], gt_bbox_int[1]),
                                  (gt_bbox_int[0] + gt_bbox_int[2],
                                   gt_bbox_int[1] + gt_bbox_int[3]),
                                  (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox_int[0], pred_bbox_int[1]),
                                  (pred_bbox_int[0] + pred_bbox_int[2],
                                   pred_bbox_int[1] + pred_bbox_int[3]),
                                  (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    if save_image_offset > 0:
                        image_path = os.path.join(args.result_path, args.dataset,
                                                  model_name, 'images', video.name)
                        if not os.path.isdir(image_path):
                            os.makedirs(image_path)
                        if idx % save_image_offset == 0:
                            imagename = os.path.join(image_path,
                                                     'image{:03d}.jpg'.format(idx))
                            cv2.imwrite(imagename, img)
                    if idx == 0:
                        if args.vis:
                            cv2.destroyAllWindows()
                        if args.debug_vis and isinstance(outputs, dict):
                            for key, value in outputs.items():
                                if isinstance(value, np.ndarray):
                                    if len(value.shape) == 3 or len(value.shape) == 2:
                                        cv2.imshow(key, value)
                    else:
                        if not gt_bbox == [0, 0, 0, 0] and not np.isnan(np.array(gt_bbox)).any():
                            # count a loss when the prediction and the gt do not overlap
                            if (pred_bbox[0] + pred_bbox[2] < gt_bbox[0]
                                    or pred_bbox[0] > gt_bbox[0] + gt_bbox[2]
                                    or pred_bbox[1] + pred_bbox[3] < gt_bbox[1]
                                    or pred_bbox[1] > gt_bbox[1] + gt_bbox[3]):
                                lost_number += 1
                                if find_best and lost_number > min_lost_number:
                                    break
                        if args.vis or args.debug_vis:
                            cv2.imshow(video.name, img)
                            if args.debug_vis:
                                for key, value in outputs.items():
                                    if isinstance(value, np.ndarray):
                                        if len(value.shape) == 3 or len(value.shape) == 2:
                                            cv2.imshow(key, value)
                                k = cv2.waitKey(0)
                                if k == 27:  # wait for ESC key to exit
                                    min_lost_number = 1e6  # allows retrying args.repetition times while debugging
                                    lost_number = 1e6
                                    break
                            else:
                                k = cv2.waitKey(1)
                                if k == 27:  # wait for ESC key to exit
                                    min_lost_number = 1e6  # allows retrying args.repetition times while debugging
                                    lost_number = 1e6
                                    break
                    sys.stderr.write("inference on {}: {} / {}\r".format(
                        video.name, idx + 1, len(video)))
                if find_best and lost_number > min_lost_number:
                    print('Stop trial No.{} because the lost number already exceeds the min lost number: {} > {}'.format(
                        cnt + 1, lost_number, min_lost_number))
                    continue
                if lost_number == 1e6:
                    continue
                if lost_number < min_lost_number:
                    min_lost_number = lost_number
                toc /= cv2.getTickFrequency()
                init_toc /= cv2.getTickFrequency()
                # save results
                if 'GOT-10k' == args.dataset:
                    video_path = os.path.join(args.result_path, args.dataset,
                                              model_name, video.name)
                    if not os.path.isdir(video_path):
                        os.makedirs(video_path)
                    id = cnt + 1
                    if find_best:
                        id = 1
                    result_path = os.path.join(video_path,
                                               '{}_{:03d}.txt'.format(video.name, id))
                    with open(result_path, 'w') as f:
                        for x in pred_bboxes:
                            f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
                    result_path = os.path.join(video_path,
                                               '{}_time.txt'.format(video.name))
                    with open(result_path, 'w') as f:
                        for x in track_times:
                            f.write("{:.6f}\n".format(x))
                else:
                    model_path = os.path.join(args.result_path, args.dataset, model_name)
                    if not os.path.isdir(model_path):
                        os.makedirs(model_path)
                    result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                    with open(result_path, 'w') as f:
                        for x in pred_bboxes:
                            f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
                log = '({:3d}) Video: {:12s} Trial: {:2d} Time: {:5.1f}s Speed: {:3.1f}fps Lost: {:d}/{:d}'.format(
                    v_idx + 1, video.name, cnt + 1, init_toc + toc,
                    idx / toc, lost_number, len(video))
                print(log)
                with open(os.path.join(args.result_path, args.dataset,
                                       model_name, 'log.txt'), 'a') as f:
                    f.write(log + '\n')
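

# The OPE branch above implements a best-of-N policy for stochastic trackers.
# This is a minimal sketch of the same idea (hypothetical `run_once` closure,
# not part of the original): rerun up to `repetition` times, keep the run
# with the fewest losses, and let a run abort early once it cannot beat the
# current best.
def best_of_runs(run_once, repetition):
    best_losses, best_result = float('inf'), None
    for _ in range(repetition):
        # run_once is assumed to return (loss_count, result) and to stop
        # early once loss_count exceeds abort_above
        losses, result = run_once(abort_above=best_losses)
        if losses < best_losses:
            best_losses, best_result = losses, result
    return best_result
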
def run_tracker(tracker, img, gt, video_name, video, restart=True):
    frame_counter = 0
    lost_number = 0
    toc = 0
    pred_bboxes = []
    if restart:
        # VOT2016 and VOT2018
        for idx, (img, gt_bbox) in enumerate(video):
            if len(gt_bbox) == 4:
                gt_bbox = [gt_bbox[0], gt_bbox[1],
                           gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                           gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                           gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
            tic = cv2.getTickCount()
            if idx == frame_counter:
                cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                tracker.init(img, gt_bbox_)
                pred_bbox = gt_bbox_
                pred_bboxes.append([1])
            elif idx > frame_counter:
                outputs = tracker.track(img)
                pred_bbox = outputs['bbox']
                overlap = vot_overlap(pred_bbox, gt_bbox,
                                      (img.shape[1], img.shape[0]))
                if overlap > 0:
                    # not lost
                    pred_bboxes.append(pred_bbox)
                else:
                    # lost object
                    pred_bboxes.append([2])
                    frame_counter = idx + 5  # skip 5 frames
                    lost_number += 1
            else:
                pred_bboxes.append([0])
            toc += cv2.getTickCount() - tic
        toc /= cv2.getTickFrequency()
        # print('Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
        #     video_name, toc, idx / toc, lost_number))
        return pred_bboxes
    else:
        toc = 0
        pred_bboxes = []
        scores = []
        track_times = []
        for idx, (img, gt_bbox) in enumerate(video):
            tic = cv2.getTickCount()
            if idx == 0:
                cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                tracker.init(img, gt_bbox_)
                pred_bbox = gt_bbox_
                scores.append(None)
                pred_bboxes.append(pred_bbox)
            else:
                outputs = tracker.track(img)
                pred_bbox = outputs['bbox']
                pred_bboxes.append(pred_bbox)
                scores.append(outputs['best_score'])
            toc += cv2.getTickCount() - tic
            track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
        toc /= cv2.getTickFrequency()
        # print('Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.format(
        #     video_name, toc, idx / toc))
        return pred_bboxes, scores, track_times
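

# Usage sketch for run_tracker. The `img` and `gt` parameters are unused by
# the function body (it iterates the video itself), so callers may pass None.
# `dataset`, `tracker` and `args` are the objects built elsewhere in this file;
# the sketch is commented out because it would run on import.
# for video in dataset:
#     if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
#         pred_bboxes = run_tracker(tracker, None, None,
#                                   video.name, video, restart=True)
#     else:
#         pred_bboxes, scores, track_times = run_tracker(tracker, None, None,
#                                                        video.name, video,
#                                                        restart=False)
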
def main():
    # load config
    model_name = 'RT_MDNet_refine'
    MASK = False
    dataset_root = os.path.join(dataset_root_, args.dataset)
    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    # ##### build a scale estimator #####
    SE_module = Scale_Estimator_bcm(refine_checkpoint_dir_)
    # model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                               gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    H, W, _ = img.shape
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    # ##### initialize the scale estimator for this video #####
                    # SE_module.initialize(cv2.cvtColor(img, cv2.COLOR_BGR2RGB),
                    #                      np.array(gt_bbox_))
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    # ##### refine tracking results #####
                    # output_dict = SE_module.refine_all(cv2.cvtColor(img, cv2.COLOR_BGR2RGB),
                    #                                    np.array(pred_bbox))
                    # pred_bbox = 0.5 * (output_dict['bbox'] + output_dict['corner'])
                    lr = outputs['lr']
                    pred_bbox = tracker.smooth_bbox(pred_bbox, lr, H, W)
                    x1, y1, w, h = pred_bbox
                    tracker.center_pos = np.array([x1 + w / 2, y1 + h / 2])
                    tracker.size = np.array([w, h])
                    # if cfg.MASK.MASK:
                    #     pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(img, [np.array(gt_bbox, int).reshape((-1, 1, 2))],
                                  True, (0, 255, 0), 3)
                    if MASK:
                        cv2.polylines(img, [np.array(pred_bbox, int).reshape((-1, 1, 2))],
                                      True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                  .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if v_idx < 60:  # skip the first 60 videos (resume point)
                continue
            tracker = RT_MDNet()
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                img_RGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # RGB format
                tic = cv2.getTickCount()
                if idx == 0:
                    H, W, _ = img.shape
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.initialize_seq(img_RGB, np.array(gt_bbox_))
                    # ##### initialize the scale estimator for this video #####
                    SE_module.initialize(img_RGB, np.array(gt_bbox_))
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    ori_bbox = tracker.track(img_RGB)
                    # ##### refine tracking results #####
                    output_dict = SE_module.refine_all(img_RGB, np.array(ori_bbox))
                    pred_bbox = 0.5 * (output_dict['bbox'] + output_dict['corner'])
                    pred_bbox = bbox_clip(pred_bbox, (H, W))
                    tracker.target_bbox = pred_bbox.copy()
                    pred_bboxes.append(pred_bbox)
                    # scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    ori_bbox = list(map(int, ori_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]),
                                  (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                                  (0, 0, 255), 3)
                    cv2.rectangle(img, (ori_bbox[0], ori_bbox[1]),
                                  (ori_bbox[0] + ori_bbox[2], ori_bbox[1] + ori_bbox[3]),
                                  (255, 0, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]),
                                  (0, 255, 0), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'
                  .format(v_idx + 1, video.name, toc, idx / toc))
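

# bbox_clip is used above but not defined in this excerpt. A minimal sketch
# under the obvious assumption that it clips an [x, y, w, h] box to an
# H x W image (the real helper may differ):
import numpy as np

def bbox_clip(bbox, img_shape):
    H, W = img_shape
    x, y, w, h = bbox
    x = np.clip(x, 0, W - 1)   # keep the top-left corner inside the image
    y = np.clip(y, 0, H - 1)
    w = np.clip(w, 1, W - x)   # keep the box at least 1 px and inside the border
    h = np.clip(h, 1, H - y)
    return np.array([x, y, w, h])
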
def main():
    # load config
    cfg.merge_from_file(args.config)
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_root = os.path.join(cur_dir, '../testing_dataset', args.dataset)
    # create model
    model = ModelBuilder(cfg)
    # load model
    model = load_pretrain(model, args.snapshot).cuda().eval()
    # build tracker
    tracker = build_tracker(model)
    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                               gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(img, [np.array(gt_bbox, int).reshape((-1, 1, 2))],
                                  True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(img, [np.array(pred_bbox, int).reshape((-1, 1, 2))],
                                      True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                  .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]),
                                  (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                                  (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]),
                                  (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'
                  .format(v_idx + 1, video.name, toc, idx / toc))
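

# The three result-saving branches above repeat one pattern: create the
# directory, then write one formatted line per entry. A hypothetical helper
# that captures it (names are illustrative, not from the original):
import os

def write_lines(path, rows, fmt):
    parent = os.path.dirname(path)
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)
    with open(path, 'w') as f:
        for row in rows:
            f.write(fmt(row) + '\n')

# e.g. write_lines(result_path, pred_bboxes, lambda b: ','.join(map(str, b)))
#      write_lines(time_path, track_times, lambda t: "{:.6f}".format(t))
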
def main():
    # load config
    cfg.merge_from_file(args.config)
    dataset_root = os.path.join('./datasets', args.dataset)
    params = [0.0, 0.0, 0.0]
    params[0] = cfg.TRACK.LR
    params[1] = cfg.TRACK.PENALTY_K
    params[2] = cfg.TRACK.WINDOW_INFLUENCE
    params_name = (args.snapshot.split('/')[-1] + ' ' + args.dataset +
                   ' lr-' + str(params[0]) +
                   ' pk-' + str(params[1]) +
                   ' win-' + str(params[2]))
    # create model
    model = ModelBuilder()
    # load model
    model = load_pretrain(model, args.snapshot).cuda().eval()
    # build tracker
    tracker = build_tracker(model)
    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        total_lost = 0
        avg_speed = 0
        for v_idx, video in tqdm(enumerate(dataset)):
            if args.video != '':
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                               gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]  # [topx, topy, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        pred_bboxes.append(pred_bbox)
                    else:
                        pred_bboxes.append(2)
                        frame_counter = idx + 5
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(img, [np.array(gt_bbox, int).reshape((-1, 1, 2))],
                                  True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(img, [np.array(pred_bbox, int).reshape((-1, 1, 2))],
                                      True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join(args.save_path, args.dataset,
                                      args.tracker_name, 'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
            total_lost += lost_number
            avg_speed += idx / toc
        print('Speed: {:3.1f}fps'.format(avg_speed / 60))  # 60 sequences in VOT2016/2018/2019
        print(params_name)
    else:
        # OPE tracking
        for v_idx, video in tqdm(enumerate(dataset)):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]  # [topx, topy, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    # scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]),
                                  (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                                  (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]),
                                  (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join(args.save_path, args.dataset,
                                          args.tracker_name, 'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join(args.save_path, args.dataset,
                                          args.tracker_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path, '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join(args.save_path, args.dataset, args.tracker_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
    eval(args)
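

# For reference, the directory layout the save branches in these scripts
# produce (paths assembled exactly as above; the root is 'results' or
# args.save_path / args.result_path depending on the variant):
#
#   <root>/<dataset>/<model_name>/baseline/<video>/<video>_001.txt    # VOT restart protocol
#   <root>/<dataset>/<model_name>/longterm/<video>/<video>_001.txt    # VOT2018-LT (+ confidence/time files)
#   <root>/<dataset>/<model_name>/<video>/<video>_001.txt             # GOT-10k (+ time file)
#   <root>/<dataset>/<model_name>/<video>.txt                         # OPE (OTB, UAV123, ...)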