def track_tune(tracker, net, video, config):
    """Run *tracker* on one *video* during hyper-parameter tuning and save results.

    config keys: 'arch', 'benchmark', 'resume' (checkpoint path) and 'hp'
    (scale_step, scale_penalty, scale_lr, window_influence). Returns the result
    directory; if this video's result file already exists the video is skipped
    (parallel-run guard).
    """
    # NOTE(review): `arch` is read but never used in this function — confirm intent.
    arch = config['arch']
    benchmark_name = config['benchmark']
    resume = config['resume']
    hp = config['hp']  # scale_step, scale_penalty, scale_lr, window_influence

    # Encode the hyper-parameter combination into the result directory name;
    # '.' is replaced by '_' so the path contains no dots.
    tracker_path = join('test', (benchmark_name +
                                 resume.split('/')[-1].split('.')[0] +
                                 '_step_{:.4f}'.format(hp['scale_step']) +
                                 '_penalty_s_{:.4f}'.format(hp['scale_penalty']) +
                                 '_w_influence_{:.4f}'.format(hp['w_influence']) +
                                 '_scale_lr_{:.4f}'.format(hp['scale_lr'])).replace('.', '_'))  # no .
    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    # occ for parallel running: create the result file to claim this video;
    # if it already exists another worker handles (or handled) it.
    if not os.path.exists(result_path):
        fin = open(result_path, 'w')
        fin.close()
    else:
        return tracker_path

    start_frame, lost_times, toc = 0, 0, 0
    regions = []  # result and states[1 init / 2 lost / 0 skip]
    image_files, gt = video['image_files'], video['gt']

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz, net, hp=hp)  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append([float(1)] if 'VOT' in benchmark_name else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append(location)

    # Write results OTB-style: the first two values (x, y) are shifted to
    # 1-based coordinates. NOTE(review): the +1 is also applied to the
    # [1.0] init marker row when benchmark is VOT — confirm that is intended.
    with open(result_path, "w") as fin:
        for x in regions:
            p_bbox = x.copy()
            fin.write(
                ','.join([str(i + 1) if idx == 0 or idx == 1 else str(i)
                          for idx, i in enumerate(p_bbox)]) + '\n')

    return tracker_path
def track(self, image_file):
    """Advance the tracker by one frame loaded from *image_file*.

    Updates ``self.state`` in place and returns the predicted box in
    [x, y, w, h] rect format.
    """
    frame = cv2.imread(image_file)
    # Expand grayscale frames to 3 channels to align with training.
    if frame.ndim == 2:
        frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)
    self.state = self.siam_tracker.track(self.state, frame)
    return cxy_wh_2_rect(self.state["target_pos"], self.state["target_sz"])
def track_video(tracker, model, video_path, init_box=None):
    """Interactively track a target through a video file.

    If *init_box* ([x, y, w, h]) is given it initializes the tracker;
    otherwise the user selects a ROI. Press 'r' to reselect the target,
    'q' to quit.
    """
    assert os.path.isfile(video_path), "please provide a valid video file"

    cap = cv2.VideoCapture(video_path)
    display_name = 'Video: {}'.format(video_path.split('/')[-1])
    cv2.namedWindow(display_name, cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
    cv2.resizeWindow(display_name, 960, 720)
    success, frame = cap.read()
    if success is not True:
        print("Read failed.")
        exit(-1)

    # init: either from the provided box or an interactive ROI selection
    if init_box is not None:
        lx, ly, w, h = init_box
        target_pos = np.array([lx + w / 2, ly + h / 2])
        target_sz = np.array([w, h])
        state = tracker.init(frame, target_pos, target_sz, model)  # init tracker
    else:
        frame_disp = frame.copy()
        cv2.putText(frame_disp, 'Select target ROI and press ENTER', (20, 30),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 1)
        lx, ly, w, h = cv2.selectROI(display_name, frame_disp, fromCenter=False)
        target_pos = np.array([lx + w / 2, ly + h / 2])
        target_sz = np.array([w, h])
        state = tracker.init(frame_disp, target_pos, target_sz, model)  # init tracker

    while True:
        ret, frame = cap.read()
        if frame is None:
            # BUGFIX: was `return`, which skipped cap.release() below (leak).
            break

        frame_disp = frame.copy()

        # Track and draw the predicted box.
        state = tracker.track(state, frame_disp)  # track
        location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
        x1, y1 = int(location[0]), int(location[1])
        x2, y2 = int(location[0] + location[2]), int(location[1] + location[3])
        cv2.rectangle(frame_disp, (x1, y1), (x2, y2), (0, 255, 0), 5)

        font_color = (0, 0, 0)
        cv2.putText(frame_disp, 'Tracking!', (20, 30),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press r to reset', (20, 55),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press q to quit', (20, 80),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)

        # Display the resulting frame
        cv2.imshow(display_name, frame_disp)
        key = cv2.waitKey(1)
        if key == ord('q'):
            break
        elif key == ord('r'):
            ret, frame = cap.read()
            # BUGFIX: guard against end-of-video so frame.copy() cannot
            # raise AttributeError on None.
            if frame is None:
                break
            frame_disp = frame.copy()
            cv2.putText(frame_disp, 'Select target ROI and press ENTER', (20, 30),
                        cv2.FONT_HERSHEY_COMPLEX_SMALL, 1.5, (0, 0, 0), 1)
            cv2.imshow(display_name, frame_disp)
            lx, ly, w, h = cv2.selectROI(display_name, frame_disp, fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = tracker.init(frame_disp, target_pos, target_sz, model)

    # When everything done, release the capture
    cap.release()
    cv2.destroyAllWindows()
def track_images(tracker, model, images_path, init_box=None):
    """Interactively track a target through a folder of .jpg frames.

    If *init_box* ([x, y, w, h]) is given it initializes the tracker;
    otherwise the user selects a ROI. Press 'r' to reselect the target,
    'q' to quit.
    """
    assert os.path.isdir(images_path), "please provide a valid folder name"

    display_name = 'Video: {}'.format(images_path.split('/')[-1])
    cv2.namedWindow(display_name, cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
    cv2.resizeWindow(display_name, 960, 720)

    # BUGFIX: sort the listing — os.listdir() order is arbitrary, which
    # previously could feed frames to the tracker out of sequence.
    im_paths = sorted(images_path + '/' + f
                      for f in os.listdir(images_path) if '.jpg' in f)
    if len(im_paths) == 0:
        print("no jpg images found in dir")
        exit(-1)

    frame = cv2.imread(im_paths[0])

    # init: either from the provided box or an interactive ROI selection
    if init_box is not None:
        lx, ly, w, h = init_box
        target_pos = np.array([lx + w / 2, ly + h / 2])
        target_sz = np.array([w, h])
        state = tracker.init(frame, target_pos, target_sz, model)  # init tracker
    else:
        frame_disp = frame.copy()
        cv2.putText(frame_disp, 'Select target ROI and press ENTER', (20, 30),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 1)
        lx, ly, w, h = cv2.selectROI(display_name, frame_disp, fromCenter=False)
        target_pos = np.array([lx + w / 2, ly + h / 2])
        target_sz = np.array([w, h])
        state = tracker.init(frame_disp, target_pos, target_sz, model)  # init tracker

    # Frame 0 was consumed by initialization; track from frame 1 on.
    # BUGFIX: read BEFORE incrementing — the old order incremented first and
    # indexed one past the end of im_paths on the last iteration (IndexError).
    path_idx = 1
    while path_idx < len(im_paths):
        time.sleep(2)
        frame = cv2.imread(im_paths[path_idx])
        path_idx += 1
        if frame is None:
            return

        frame_disp = frame.copy()

        # Track and draw the predicted box.
        state = tracker.track(state, frame_disp)  # track
        location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
        x1, y1 = int(location[0]), int(location[1])
        x2, y2 = int(location[0] + location[2]), int(location[1] + location[3])
        cv2.rectangle(frame_disp, (x1, y1), (x2, y2), (0, 255, 0), 5)

        font_color = (0, 0, 0)
        cv2.putText(frame_disp, 'Tracking!', (20, 30),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press r to reset', (20, 55),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press q to quit', (20, 80),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)

        # Display the resulting frame
        cv2.imshow(display_name, frame_disp)
        key = cv2.waitKey(1)
        if key == ord('q'):
            break
        elif key == ord('r'):
            # BUGFIX: bounds-check before consuming the next frame.
            if path_idx >= len(im_paths):
                break
            frame = cv2.imread(im_paths[path_idx])
            path_idx += 1
            frame_disp = frame.copy()
            cv2.putText(frame_disp, 'Select target ROI and press ENTER', (20, 30),
                        cv2.FONT_HERSHEY_COMPLEX_SMALL, 1.5, (0, 0, 0), 1)
            cv2.imshow(display_name, frame_disp)
            lx, ly, w, h = cv2.selectROI(display_name, frame_disp, fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = tracker.init(frame_disp, target_pos, target_sz, model)

    # When everything done, release the capture
    cv2.destroyAllWindows()
def track_tune(tracker, net, video, config):
    """Run *tracker* on one *video* during hyper-parameter tuning (multi-benchmark).

    Supports OTB/LASOT, VOT (restart protocol), VISDRONE and GOT10K result
    layouts. Returns the result directory for OTB-like benchmarks, 0 when the
    video was already claimed under VOT/GOT10K, or None for unsupported names.
    """
    # NOTE(review): `arch` is read but never used in this function — confirm intent.
    arch = config['arch']
    benchmark_name = config['benchmark']
    resume = config['resume']
    hp = config['hp']  # scale_step, scale_penalty, scale_lr, window_influence

    # Encode the hyper-parameter combination into the result directory name.
    tracker_path = join('test', (benchmark_name +
                                 resume.split('/')[-1].split('.')[0] +
                                 '_step_{:.4f}'.format(hp['scale_step']) +
                                 '_penalty_s_{:.4f}'.format(hp['scale_penalty']) +
                                 '_w_influence_{:.4f}'.format(hp['w_influence']) +
                                 '_scale_lr_{:.4f}'.format(hp['scale_lr'])).replace(
                                     '.', '_'))  # no .
    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    # Benchmark-specific result file layout.
    if 'VOT' in benchmark_name:
        baseline_path = join(tracker_path, 'baseline')
        video_path = join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = join(video_path, video['name'] + '_001.txt')
    elif 'GOT10K' in benchmark_name:
        re_video_path = os.path.join(tracker_path, video['name'])
        if not exists(re_video_path):
            os.makedirs(re_video_path)
        result_path = os.path.join(re_video_path, '{:s}.txt'.format(video['name']))
    else:
        result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    # occ for parallel running: claim the result file; if it exists another
    # worker handles (or handled) this video.
    if not os.path.exists(result_path):
        fin = open(result_path, 'w')
        fin.close()
    else:
        if benchmark_name.startswith('OTB'):
            return tracker_path
        elif benchmark_name.startswith('VOT') or benchmark_name.startswith('GOT10K'):
            return 0
        else:
            print('benchmark not supported now')
            return

    start_frame, lost_times, toc = 0, 0, 0
    regions = []  # result and states[1 init / 2 lost / 0 skip]
    image_files, gt = video['image_files'], video['gt']

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz, net, hp=hp)  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append([float(1)] if 'VOT' in benchmark_name else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            # VOT restart protocol: a zero-overlap frame counts as lost and
            # re-initialization happens 5 frames later.
            b_overlap = poly_iou(gt[f], location) if 'VOT' in benchmark_name else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append([float(2)])
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append([float(0)])

    # save results for OTB
    if 'OTB' in benchmark_name or 'LASOT' in benchmark_name:
        # OTB-style: first two values shifted to 1-based coordinates.
        with open(result_path, "w") as fin:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([
                    str(i + 1) if idx == 0 or idx == 1 else str(i)
                    for idx, i in enumerate(p_bbox)
                ]) + '\n')
    elif 'VISDRONE' in benchmark_name or 'GOT10K' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([str(i) for idx, i in enumerate(p_bbox)]) + '\n')
    elif 'VOT' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')

    if 'OTB' in benchmark_name or 'VIS' in benchmark_name or 'VOT' in benchmark_name or 'GOT10K' in benchmark_name:
        return tracker_path
    else:
        print('benchmark not supported now')
def track(siam_tracker, online_tracker, siam_net, video, args):
    """Evaluate the siamese (optionally online-refined) tracker on one video.

    Results are written under result/<dataset>/<arch>[<suffix>] in the layout
    expected by the benchmark named in args.dataset (VOT restart protocol or
    OTB/LASOT/VISDRONE/GOT10K/TN one-pass formats).
    """
    start_frame, toc = 0, 0
    # save result to evaluate
    if args.epoch_test:
        # Per-epoch testing: append the checkpoint stem to the arch name.
        suffix = args.resume.split('/')[-1]
        suffix = suffix.split('.')[0]
        tracker_path = os.path.join('result', args.dataset, args.arch + suffix)
    else:
        tracker_path = os.path.join('result', args.dataset, args.arch)
    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    if 'VOT' in args.dataset:
        baseline_path = os.path.join(tracker_path, 'baseline')
        video_path = os.path.join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = os.path.join(video_path, video['name'] + '_001.txt')
    else:
        result_path = os.path.join(tracker_path, '{:s}.txt'.format(video['name']))
    if os.path.exists(result_path):
        return  # for mult-gputesting

    regions = []
    lost = 0
    image_files, gt = video['image_files'], video['gt']

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        # NOTE(review): BGR2RGB is applied before the grayscale check; a
        # 2-channel read would fail here — presumably imread always returns
        # 3 channels in this pipeline. Confirm.
        rgb_im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)  # align with training
        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = siam_tracker.init(im, target_pos, target_sz, siam_net)  # init tracker
            if args.online:
                online_tracker.init(im, rgb_im, siam_net, target_pos, target_sz, True,
                                    dataname=args.dataset, resume=args.resume)
            # location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        elif f > start_frame:  # tracking
            if args.online:
                state = online_tracker.track(im, rgb_im, siam_tracker, state)
            else:
                state = siam_tracker.track(state, im)
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            # VOT restart protocol: zero overlap counts as lost; re-init 5 frames later.
            b_overlap = poly_iou(gt[f], location) if 'VOT' in args.dataset else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append(2)
                start_frame = f + 5
                lost += 1
        else:
            regions.append(0)
        toc += cv2.getTickCount() - tic

    with open(result_path, "w") as fin:
        if 'VOT' in args.dataset:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')
        elif 'OTB' in args.dataset or 'LASOT' in args.dataset:
            # OTB-style: first two values shifted to 1-based coordinates.
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([
                    str(i + 1) if idx == 0 or idx == 1 else str(i)
                    for idx, i in enumerate(p_bbox)
                ]) + '\n')
        elif 'VISDRONE' in args.dataset or 'GOT10K' in args.dataset or 'TN' in args.dataset:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([str(i) for idx, i in enumerate(p_bbox)]) + '\n')

    toc /= cv2.getTickFrequency()
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps Lost {}'.format(
        video['name'], toc, f / toc, lost))
def track(video, args): start_frame, toc = 0, 0 # save result to evaluate tracker_path = os.path.join('result', args.dataset) if not os.path.exists(tracker_path): os.makedirs(tracker_path) if 'VOT' in args.dataset: baseline_path = os.path.join(tracker_path, 'baseline') video_path = os.path.join(baseline_path, video['name']) if not os.path.exists(video_path): os.makedirs(video_path) result_path = os.path.join(video_path, video['name'] + '_001.txt') else: result_path = os.path.join(tracker_path, '{:s}.txt'.format(video['name'])) if os.path.exists(result_path): return # for mult-gputesting regions = [] lost = 0 # for rgbt splited test in_image_files, rgb_image_files, gt = video['infrared_imgs'], video[ 'visiable_imgs'], video['gt'] for f, in_f in enumerate(in_image_files): in_im = cv2.imread(in_f) in_im = cv2.cvtColor(in_im, cv2.COLOR_BGR2RGB) # align with training rgb_im = cv2.imread(rgb_image_files[f]) rgb_im = cv2.cvtColor(rgb_im, cv2.COLOR_BGR2RGB) # align with training tic = cv2.getTickCount() if f == start_frame: # init print('===============> init tracker') tracker = AASTracker(rgb_im, in_im, gt[f]) regions.append(1) elif f > start_frame: # tracking state = tracker.track(rgb_im, in_im) pos = np.array([state[0], state[1]]) sz = np.array([state[2], state[3]]) location = cxy_wh_2_rect(pos, sz) b_overlap = poly_iou(gt[f], location) if 'VOT' in args.dataset else 1 if b_overlap > 0: regions.append(location) else: regions.append(2) start_frame = f + 5 lost += 1 else: regions.append(0) toc += cv2.getTickCount() - tic with open(result_path, "w") as fin: if 'VOT' in args.dataset: for x in regions: if isinstance(x, int): fin.write("{:d}\n".format(x)) else: p_bbox = x.copy() fin.write(','.join([str(i) for i in p_bbox]) + '\n') toc /= cv2.getTickFrequency() print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps Lost {}'.format( video['name'], toc, f / toc, lost))
def track_video(tracker, model, video_path, init_box=None):
    """Interactively track a target through a video file, logging results.

    Every box is appended (tab-separated x, y, w, h) to ./bbox/SiamRPN.txt,
    which is renamed to SiamRPN_<restarts>.txt on exit; every annotated frame
    is written to ./output/<idx>.png. Press 'r' to reselect the target
    (toggles the box color), 'q' to quit.
    """
    cap = cv2.VideoCapture(video_path)
    display_name = 'Video: {}'.format(video_path.split('/')[-1])
    cv2.namedWindow(display_name, cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
    cv2.resizeWindow(display_name, 960, 720)
    success, frame = cap.read()
    cv2.imshow(display_name, frame)
    if success is not True:
        print("Read failed.")
        exit(-1)

    # BUGFIX: open the log file before the initial box is recorded — it was
    # previously written to before `f` was defined (NameError).
    assert os.path.exists(
        './bbox'
    ), "Please create a directory called bbox to store the values of the bounding boxes"
    f = open('./bbox/SiamRPN.txt', '+a')

    # init: either from the provided box or an interactive ROI selection
    if init_box is not None:
        lx, ly, w, h = init_box
        # BUGFIX: stringify before concatenation (values are numbers).
        f.write(str(lx) + "\t" + str(ly) + "\t" + str(w) + "\t" + str(h) + "\n")
        target_pos = np.array([lx + w / 2, ly + h / 2])
        target_sz = np.array([w, h])
        state = tracker.init(frame, target_pos, target_sz, model)  # init tracker
    else:
        frame_disp = frame.copy()
        cv2.putText(frame_disp, 'Select target ROI and press ENTER', (20, 30),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 1)
        lx, ly, w, h = cv2.selectROI(display_name, frame_disp, fromCenter=False)
        f.write(str(lx) + "\t" + str(ly) + "\t" + str(w) + "\t" + str(h) + "\n")
        target_pos = np.array([lx + w / 2, ly + h / 2])
        target_sz = np.array([w, h])
        state = tracker.init(frame_disp, target_pos, target_sz, model)  # init tracker

    i = 0
    restarts = 0
    box_color = (0, 255, 0)
    while True:
        ret, frame = cap.read()
        if frame is None:
            # BUGFIX: was `return`, which skipped the rename and left the log
            # file and capture open.
            break

        frame_disp = frame.copy()
        timer = cv2.getTickCount()

        # Track, draw the box and overlay status text.
        state = tracker.track(state, frame_disp)  # track
        fps = cv2.getTickFrequency() / (cv2.getTickCount() - timer)
        location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
        x1, y1 = int(location[0]), int(location[1])
        x2, y2 = int(location[0] + location[2]), int(location[1] + location[3])

        font_color = (0, 0, 255)
        cv2.rectangle(frame_disp, (x1, y1), (x2, y2), box_color, 5)
        cv2.putText(frame_disp, 'Restarts: ' + str(restarts), (20, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, font_color, 2)
        cv2.putText(frame_disp, 'Frame Number: ' + '{:04d}'.format(i), (20, 55),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, font_color, 2)
        cv2.putText(frame_disp, 'FPS: ' + str(fps), (20, 80),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, font_color, 2)

        frame_name = '{:04d}'.format(i)
        assert os.path.exists(
            "./output"), "Please create a directory to store the output files"
        cv2.imwrite("./output/" + str(frame_name) + ".png", frame_disp)

        # Log the tracked box as tab-separated x, y, w, h.
        bbox = [x1, y1, x2 - x1, y2 - y1]
        box = [str(v) for v in bbox]
        f.write(box[0] + "\t" + box[1] + "\t" + box[2] + "\t" + box[3] + "\n")

        # Display the resulting frame
        cv2.imshow(display_name, frame_disp)
        key = cv2.waitKey(1)
        if key == ord('q'):
            break
        elif key == ord('r'):
            restarts = restarts + 1
            # Alternate the box color so restarts are visible in the output.
            box_color = (255, 0, 0) if restarts % 2 == 1 else (0, 255, 0)
            cv2.putText(frame_disp, 'Select target ROI and press ENTER', (20, 30),
                        cv2.FONT_HERSHEY_COMPLEX_SMALL, 1.5, (0, 0, 0), 1)
            cv2.imshow(display_name, frame_disp)
            lx, ly, w, h = cv2.selectROI(display_name, frame_disp, fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = tracker.init(frame_disp, target_pos, target_sz, model)
        i = i + 1

    # When everything done, release the capture.
    # BUGFIX: close before renaming — renaming an open file fails on Windows.
    f.close()
    os.rename('./bbox/SiamRPN.txt', './bbox/SiamRPN_' + str(restarts) + '.txt')
    cap.release()
    cv2.destroyAllWindows()
def track_tune(tracker, net, video, config):
    """Run *tracker* on one *video* during hyper-parameter tuning (VOT only).

    Returns the raw `regions` list (1 init / 2 lost / 0 skip markers or boxes)
    for in-process scoring, or 0 when another worker already claimed the video.
    Raises ValueError for non-VOT benchmarks.
    """
    # NOTE(review): `arch` is read but never used in this function — confirm intent.
    arch = config['arch']
    benchmark_name = config['benchmark']
    resume = config['resume']
    hp = config[
        'hp']  # penalty_k, scale_lr, window_influence, adaptive size (for vot2017 or later)

    # Encode the hyper-parameter combination into the result directory name.
    tracker_path = join('test', (benchmark_name +
                                 resume.split('/')[-1].split('.')[0] +
                                 '_small_size_{:.4f}'.format(hp['small_sz']) +
                                 '_big_size_{:.4f}'.format(hp['big_sz']) +
                                 '_penalty_k_{:.4f}'.format(hp['penalty_k']) +
                                 '_w_influence_{:.4f}'.format(hp['window_influence']) +
                                 '_scale_lr_{:.4f}'.format(hp['lr'])).replace(
                                     '.', '_'))  # no .
    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    if 'VOT' in benchmark_name:
        baseline_path = join(tracker_path, 'baseline')
        video_path = join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = join(video_path, video['name'] + '_001.txt')
    else:
        raise ValueError('Only VOT is supported')

    # occ for parallel running: claim the result file; if it exists another
    # worker handles (or handled) this video.
    if not os.path.exists(result_path):
        fin = open(result_path, 'w')
        fin.close()
    else:
        if benchmark_name.startswith('VOT'):
            return 0
        else:
            raise ValueError('Only VOT is supported')

    start_frame, lost_times, toc = 0, 0, 0
    regions = []  # result and states[1 init / 2 lost / 0 skip]
    image_files, gt = video['image_files'], video['gt']

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz, net, hp=hp)  # init tracker
            regions.append([float(1)] if 'VOT' in benchmark_name else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            # VOT restart protocol: zero overlap counts as lost; re-init 5 frames later.
            b_overlap = poly_iou(gt[f], location) if 'VOT' in benchmark_name else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append([float(2)])
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append([float(0)])

    # save results for OTB
    if benchmark_name.startswith('VOT'):
        return regions
    else:
        raise ValueError('Only VOT is supported')
def main():
    """Evaluate the CGACD model on a benchmark dataset.

    For VOT2016/2018/2019 it runs the restart (supervised) protocol; for all
    other datasets it runs one-pass evaluation (OPE). Results are written under
    result/<dataset>/<save_name> in the benchmark's expected layout.
    Relies on module-level `args` (config, model, dataset, video, vis, save_name).
    """
    # load config
    cfg_from_file(args.config)
    dataset_root = os.path.join('dataset', args.dataset)

    # create model: accept checkpoints saved either as {'state_dict': ...}
    # or as a bare state dict.
    net = ModelBuilder()
    checkpoint = torch.load(args.model)
    if 'state_dict' in checkpoint:
        net.load_state_dict(checkpoint['state_dict'])
    else:
        net.load_state_dict(checkpoint)
    net.cuda().eval()

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    model_name = args.save_name
    total_lost = 0

    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking (VOT supervised protocol)
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    # (re)initialize from ground truth on the restart frame
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    target_pos, target_sz = np.array([cx, cy]), np.array([w, h])
                    state = CGACD_init(img, target_pos, target_sz, net)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    state = CGACD_track(state, img)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    # Convert the rect to a 4-corner polygon for VOT overlap.
                    pred_polygon = [
                        pred_bbox[0], pred_bbox[1], pred_bbox[0] + pred_bbox[2],
                        pred_bbox[1], pred_bbox[0] + pred_bbox[2],
                        pred_bbox[1] + pred_bbox[3], pred_bbox[0],
                        pred_bbox[1] + pred_bbox[3]
                    ]
                    overlap = vot_overlap(gt_bbox, pred_polygon,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    # Visualize the current prediction.
                    target_pos = state['target_pos']
                    target_sz = state['target_sz']
                    cv2.rectangle(img,
                                  (int(target_pos[0] - target_sz[0] / 2),
                                   int(target_pos[1] - target_sz[1] / 2)),
                                  (int(target_pos[0] + target_sz[0] / 2),
                                   int(target_pos[1] + target_sz[1] / 2)),
                                  (0, 255, 0), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2,
                                cv2.LINE_AA)
                    cv2.imshow(video.name, img)
                    cv2.moveWindow(video.name, 100, 100)
                    key = cv2.waitKey(1)
                    if key == 27:  # ESC aborts this video
                        break
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('result', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking (one-pass evaluation)
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    # OTB ground truth is 1-based; other datasets are 0-based.
                    if 'OTB' in args.dataset:
                        target_pos, target_sz = rect1_2_cxy_wh(gt_bbox)
                    else:
                        cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                        target_pos, target_sz = np.array([cx, cy
                                                          ]), np.array([w, h])
                    state = CGACD_init(img, target_pos, target_sz, net)
                    if 'OTB' in args.dataset:
                        pred_bbox = cxy_wh_2_rect1(state['target_pos'],
                                                   state['target_sz'])
                    else:
                        pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                                  state['target_sz'])
                    pred_bboxes.append(pred_bbox)
                else:
                    state = CGACD_track(state, img)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    pred_bboxes.append(pred_bbox)
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    target_pos = state['target_pos']
                    target_sz = state['target_sz']
                    cv2.rectangle(img,
                                  (int(target_pos[0] - target_sz[0] / 2),
                                   int(target_pos[1] - target_sz[1] / 2)),
                                  (int(target_pos[0] + target_sz[0] / 2),
                                   int(target_pos[1] + target_sz[1] / 2)),
                                  (0, 255, 0), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2,
                                cv2.LINE_AA)
                    cv2.imshow(video.name, img)
                    cv2.moveWindow(video.name, 100, 100)
                    key = cv2.waitKey(1)
                    if key == 27:  # ESC aborts this video
                        break
            toc /= cv2.getTickFrequency()
            if 'GOT-10k' == args.dataset:
                # GOT-10k expects per-video directories plus a timing file.
                video_path = os.path.join('result', args.dataset, model_name,
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('result', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))
def track_tune(tracker, net, video, config):
    """Run the polygon-capable tracker on one *video* during hyper-parameter tuning.

    Like the box-only variant but, when the tracker returns a 4-point polygon,
    records either the polygon (VOT) or its axis-aligned bounding box
    (other benchmarks), depending on its IoU with the rect prediction versus
    state['choose_thr'].
    """
    # NOTE(review): `arch` is read but never used in this function — confirm intent.
    arch = config['arch']
    benchmark_name = config['benchmark']
    resume = config['resume']
    hp = config['hp']  # scale_step, scale_penalty, scale_lr, window_influence

    # Encode the hyper-parameter combination into the result directory name.
    # NOTE(review): lambda_u / lambda_s / cyclic_thr all format
    # hp['choose_thr'] — looks like copy-paste; confirm against the tuner.
    tracker_path = join('test', (benchmark_name +
                                 resume.split('/')[-1].split('.')[0] +
                                 '_small_size_{:.4f}'.format(hp['small_sz']) +
                                 '_big_size_{:.4f}'.format(hp['big_sz']) +
                                 '_lambda_u_{:.4f}'.format(hp['choose_thr']) +
                                 '_lambda_s_{:.4f}'.format(hp['choose_thr']) +
                                 '_cyclic_thr_{:.4f}'.format(hp['choose_thr']) +
                                 '_choose_thr_{:.4f}'.format(hp['choose_thr']) +
                                 '_penalty_k_{:.4f}'.format(hp['penalty_k']) +
                                 '_w_influence_{:.4f}'.format(hp['window_influence']) +
                                 '_scale_lr_{:.4f}'.format(hp['lr'])).replace('.', '_'))  # no .
    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    # Benchmark-specific result file layout.
    if 'VOT' in benchmark_name:
        baseline_path = join(tracker_path, 'baseline')
        video_path = join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = join(video_path, video['name'] + '_001.txt')
    elif 'GOT10K' in benchmark_name:
        re_video_path = os.path.join(tracker_path, video['name'])
        if not exists(re_video_path):
            os.makedirs(re_video_path)
        result_path = os.path.join(re_video_path, '{:s}.txt'.format(video['name']))
    else:
        result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    # occ for parallel running: claim the result file; if it exists another
    # worker handles (or handled) this video.
    if not os.path.exists(result_path):
        fin = open(result_path, 'w')
        fin.close()
    else:
        if benchmark_name.startswith('OTB'):
            return tracker_path
        elif benchmark_name.startswith('VOT') or benchmark_name.startswith('GOT10K'):
            return 0
        else:
            print('benchmark not supported now')
            return

    start_frame, lost_times, toc = 0, 0, 0
    regions = []  # result and states[1 init / 2 lost / 0 skip]
    # for rgbt splited test
    image_files, gt = video['image_files'], video['gt']

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            mask_gt = None
            state = tracker.init(im, target_pos, target_sz, net, online=False,
                                 mask=mask_gt, debug=False, hp=hp)  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append([float(1)] if 'VOT' in benchmark_name else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f], location) if 'VOT' in benchmark_name else 1
            # Flatten the tracker's 4x2 polygon to [x1,y1,...,x4,y4]; when no
            # polygon was produced, fall back to the rect's corner polygon.
            polygon = state['polygon']
            if not polygon is None:
                polygon = [polygon[0][0], polygon[0][1], polygon[1][0], polygon[1][1],
                           polygon[2][0], polygon[2][1], polygon[3][0], polygon[3][1]]
                polygon = np.array(polygon)
            else:
                x1, y1, w, h = location
                x2, y2 = x1 + w, y1 + h
                polygon = np.array([x1, y1, x2, y1, x2, y2, x1, y2])
            # Keep the polygon only when it agrees with the rect prediction
            # (IoU above choose_thr); otherwise record the rect's corners.
            if poly_iou(np.array(location), np.array(polygon)) > state['choose_thr']:
                record = polygon
            else:
                x1, y1, w, h = location
                x2, y2 = x1 + w, y1 + h
                polygon = np.array([x1, y1, x2, y1, x2, y2, x1, y2])
                record = polygon
            if not 'VOT' in benchmark_name:
                # change polygon to [x, y, w, h]
                x1, y1, x2, y2 = record[0], record[1], record[4], record[5]
                record = np.array([x1, y1, x2 - x1 + 1, y2 - y1 + 1])
            if b_overlap > 0:
                regions.append(record)
            else:
                regions.append([float(2)])
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append([float(0)])

    # save results for OTB
    if 'OTB' in benchmark_name or 'LASOT' in benchmark_name:
        # OTB-style: first two values shifted to 1-based coordinates.
        with open(result_path, "w") as fin:
            for x in regions:
                p_bbox = x.copy()
                fin.write(
                    ','.join([str(i + 1) if idx == 0 or idx == 1 else str(i)
                              for idx, i in enumerate(p_bbox)]) + '\n')
    elif 'VISDRONE' in benchmark_name or 'GOT10K' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([str(i) for idx, i in enumerate(p_bbox)]) + '\n')
    elif 'VOT' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')

    if 'OTB' in benchmark_name or 'VIS' in benchmark_name or 'VOT' in benchmark_name or 'GOT10K' in benchmark_name:
        return tracker_path
    else:
        print('benchmark not supported now')
def track_box(siam_tracker, online_tracker, siam_net, video, args):
    """
    Track one benchmark video with only box annotations.
    Attention: not for benchmark evaluation, just a demo.

    Writes per-frame results to a text file under result/<dataset>/<arch>,
    using the VOT directory layout ('baseline/<name>/<name>_001.txt') for
    VOT datasets and a flat '<name>.txt' otherwise.

    Args:
        siam_tracker: siamese tracker object (provides .init / .track).
        online_tracker: optional online-update tracker, used when args.online.
        siam_net: the siamese network passed into the trackers.
        video: dict with 'name', 'image_files', 'gt' (ground-truth boxes).
        args: parsed CLI namespace (dataset, arch, online, debug, resume, vis).
    """
    tracker_path = os.path.join('result', args.dataset, args.arch)

    if 'VOT' in args.dataset:
        baseline_path = os.path.join(tracker_path, 'baseline')
        video_path = os.path.join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = os.path.join(video_path, video['name'] + '_001.txt')
    else:
        result_path = os.path.join(tracker_path, '{:s}.txt'.format(video['name']))

    if os.path.exists(result_path):
        return  # result already written: skip for multi-GPU testing

    regions = []  # per-frame records: box/polygon, or int state code (2 = lost)
    lost = 0
    start_frame, toc = 0, 0

    image_files, gt = video['image_files'], video['gt']

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if args.online:
            rgb_im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)  # align with training

        tic = cv2.getTickCount()
        if f == start_frame:  # init (or re-init after a loss)
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            mask_gt = None
            # init siamese tracker
            state = siam_tracker.init(im, target_pos, target_sz, siam_net,
                                      online=args.online, mask=mask_gt,
                                      debug=args.debug)
            if args.online:
                online_tracker.init(im, rgb_im, siam_net, target_pos, target_sz,
                                    True, dataname=args.dataset, resume=args.resume)
        elif f > start_frame:  # tracking
            if args.online:
                state = online_tracker.track(im, rgb_im, siam_tracker, state)
            else:
                state = siam_tracker.track(state, im, name=image_file)
            mask = state['mask']
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            # Non-VOT datasets never restart, so overlap is forced positive.
            b_overlap = poly_iou(gt[f], location) if 'VOT' in args.dataset else 1

            polygon = state['polygon']
            if polygon is not None:
                # Flatten the 4x2 corner array to [x1,y1,...,x4,y4].
                polygon = [polygon[0][0], polygon[0][1], polygon[1][0], polygon[1][1],
                           polygon[2][0], polygon[2][1], polygon[3][0], polygon[3][1]]
                polygon = np.array(polygon)
            else:
                # Fall back to the axis-aligned box expressed as a polygon.
                x1, y1, w, h = location
                x2, y2 = x1 + w, y1 + h
                polygon = np.array([x1, y1, x2, y1, x2, y2, x1, y2])

            # Keep the polygon only when it agrees well enough with the box;
            # otherwise record the axis-aligned box as a polygon.
            if poly_iou(np.array(location), np.array(polygon)) > state['choose_thr']:
                record = polygon
            else:
                x1, y1, w, h = location
                x2, y2 = x1 + w, y1 + h
                polygon = np.array([x1, y1, x2, y1, x2, y2, x1, y2])
                record = polygon

            # BUGFIX: was `benchmark_name` (undefined here -> NameError);
            # every other branch in this function keys off args.dataset.
            if 'VOT' not in args.dataset:
                # change polygon to [x, y, w, h]
                x1, y1, x2, y2 = record[0], record[1], record[4], record[5]
                record = np.array([x1, y1, x2 - x1 + 1, y2 - y1 + 1])

            if b_overlap > 0:
                regions.append(record)
            else:
                regions.append(2)  # VOT "lost" code
                start_frame = f + 5  # restart 5 frames later
                lost += 1

            # Visualization lives in the tracking branch because `mask`
            # only exists after a .track() call.
            if args.vis:
                COLORS = np.random.randint(128, 255, size=(1, 3), dtype="uint8")
                COLORS = np.vstack([[0, 0, 0], COLORS]).astype("uint8")
                mask = COLORS[mask]
                output = ((0.4 * im) + (0.6 * mask)).astype("uint8")
                cv2.imshow("mask", output)
                cv2.waitKey(1)
        toc += cv2.getTickCount() - tic

    with open(result_path, "w") as fin:
        if 'VOT' in args.dataset:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')
        elif 'OTB' in args.dataset or 'LASOT' in args.dataset:
            # OTB/LaSOT results are 1-based in x and y.
            for x in regions:
                p_bbox = x.copy()
                fin.write(
                    ','.join([str(i + 1) if idx == 0 or idx == 1 else str(i)
                              for idx, i in enumerate(p_bbox)]) + '\n')
        elif 'VISDRONE' in args.dataset or 'GOT10K' in args.dataset:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([str(i) for i in p_bbox]) + '\n')

    toc /= cv2.getTickFrequency()
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps'.format(video['name'], toc, f / toc))
def track_video(tracker, model, video_path, init_box=None):
    """
    Demo: track a target through a folder of image frames, drawing results.

    Args:
        tracker: tracker object providing .init / .track.
        model: network passed to the tracker.
        video_path: directory containing the frame images (sorted by name).
        init_box: optional [lx, ly, w, h] initial box; if None the user
            selects a ROI interactively on the first frame.
    """
    files = os.listdir(video_path)
    files = sorted(files)
    init_image_file = os.path.join(video_path, files[0])
    tracking_image_files = [os.path.join(video_path, f) for f in files[1:]]
    display_name = 'demo'
    frame = cv2.imread(init_image_file)
    cv2.imshow(display_name, frame)

    # init: either from the provided box or from an interactive ROI selection
    if init_box is not None:
        lx, ly, w, h = init_box
        target_pos = np.array([lx + w / 2, ly + h / 2])
        target_sz = np.array([w, h])
        state = tracker.init(frame, target_pos, target_sz, model)  # init tracker
    else:
        while True:
            frame_disp = frame.copy()
            cv2.putText(frame_disp, 'Select target ROI and press ENTER', (20, 30),
                        cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 1)
            lx, ly, w, h = cv2.selectROI(display_name, frame_disp, fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = tracker.init(frame_disp, target_pos, target_sz, model)  # init tracker
            break

    for file in tracking_image_files:
        frame = cv2.imread(file)
        frame_disp = frame.copy()

        # Track and draw the predicted box
        state = tracker.track(state, frame_disp)
        location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
        x1, y1, x2, y2 = int(location[0]), int(location[1]), \
            int(location[0] + location[2]), int(location[1] + location[3])
        cv2.rectangle(frame_disp, (x1, y1), (x2, y2), (0, 255, 0), 5)

        font_color = (0, 0, 0)
        cv2.putText(frame_disp, 'Tracking!', (20, 30),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press r to reset', (20, 55),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press q to quit', (20, 80),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)

        # Display the resulting frame
        cv2.imshow(display_name, frame_disp)
        key = cv2.waitKey(5)
        if key == ord('q'):
            break
        elif key == ord('r'):
            # BUGFIX: the original called cap.read() here, but this function
            # reads from image files and has no capture object (NameError).
            # Re-select the ROI on the current frame instead.
            frame_disp = frame.copy()
            cv2.putText(frame_disp, 'Select target ROI and press ENTER', (20, 30),
                        cv2.FONT_HERSHEY_COMPLEX_SMALL, 1.5, (0, 0, 0), 1)
            cv2.imshow(display_name, frame_disp)
            lx, ly, w, h = cv2.selectROI(display_name, frame_disp, fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = tracker.init(frame_disp, target_pos, target_sz, model)

    # When everything is done, close the windows
    cv2.destroyAllWindows()
def track_webcam(tracker, model, updatenet):
    """Run tracker with webcam.

    Opens camera 0 and lets the user drag a box with the mouse (click to set
    the top-left corner, move, click again to confirm), then tracks it.
    Keys: 'r' resets the selection, 'q' quits.
    """

    class UIControl:
        """Small mouse state machine: init -> select -> track."""

        def __init__(self):
            self.mode = 'init'  # init, select, track
            self.target_tl = (-1, -1)
            self.target_br = (-1, -1)
            self.mode_switch = False  # set when the mode just changed

        def mouse_callback(self, event, x, y, flags, param):
            if event == cv2.EVENT_LBUTTONDOWN and self.mode == 'init':
                self.target_tl = (x, y)
                self.target_br = (x, y)
                self.mode = 'select'
                self.mode_switch = True
            elif event == cv2.EVENT_MOUSEMOVE and self.mode == 'select':
                self.target_br = (x, y)
            elif event == cv2.EVENT_LBUTTONDOWN and self.mode == 'select':
                self.target_br = (x, y)
                self.mode = 'track'
                self.mode_switch = True

        def get_tl(self):
            # Normalize so "tl" is the leftmost of the two stored corners.
            return self.target_tl if self.target_tl[0] < self.target_br[0] else self.target_br

        def get_br(self):
            return self.target_br if self.target_tl[0] < self.target_br[0] else self.target_tl

        def get_bb(self):
            tl = self.get_tl()
            br = self.get_br()
            bb = [min(tl[0], br[0]), min(tl[1], br[1]),
                  abs(br[0] - tl[0]), abs(br[1] - tl[1])]
            return bb  # [lx, ly, w, h]

    ui_control = UIControl()
    cap = cv2.VideoCapture(0)
    display_name = 'SiamDW on webcam'
    cv2.namedWindow(display_name, cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
    cv2.resizeWindow(display_name, 960, 720)
    cv2.setMouseCallback(display_name, ui_control.mouse_callback)

    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()
        if not ret:
            # BUGFIX: the original ignored the read flag; a failed grab
            # (camera unplugged/busy) left frame=None and crashed on .copy().
            break
        frame_disp = frame.copy()

        if ui_control.mode == 'track' and ui_control.mode_switch:
            # Selection just finished: (re)initialize the tracker once.
            ui_control.mode_switch = False
            lx, ly, w, h = ui_control.get_bb()
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = tracker.init(frame_disp, target_pos, target_sz, model)  # init tracker

        # Draw box
        if ui_control.mode == 'select':
            cv2.rectangle(frame_disp, ui_control.get_tl(), ui_control.get_br(),
                          (255, 0, 0), 2)
        elif ui_control.mode == 'track':
            state = tracker.track(state, frame_disp, updatenet)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            x1, y1, x2, y2 = int(location[0]), int(location[1]), \
                int(location[0] + location[2]), int(location[1] + location[3])
            cv2.rectangle(frame_disp, (x1, y1), (x2, y2), (0, 255, 0), 5)

        # Put text
        font_color = (0, 0, 0)
        if ui_control.mode == 'init' or ui_control.mode == 'select':
            cv2.putText(frame_disp, 'Select target', (20, 30),
                        cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
            cv2.putText(frame_disp, 'Press q to quit', (20, 55),
                        cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        elif ui_control.mode == 'track':
            cv2.putText(frame_disp, 'Tracking!', (20, 30),
                        cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
            cv2.putText(frame_disp, 'Press r to reset', (20, 55),
                        cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
            cv2.putText(frame_disp, 'Press q to quit', (20, 80),
                        cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)

        # Display the resulting frame
        cv2.imshow(display_name, frame_disp)
        key = cv2.waitKey(1)
        if key == ord('q'):
            break
        elif key == ord('r'):
            ui_control.mode = 'init'

    # When everything done, release the capture
    cap.release()
    cv2.destroyAllWindows()
def main():
    """
    Evaluate a SiamRPN model over a benchmark dataset.

    For VOT2016/2018/2019 it runs the restart (supervised) protocol:
    re-initialize 5 frames after each loss, write 1/2/0 state codes plus
    boxes to results/<dataset>/<model>/baseline/<video>/<video>_001.txt.
    For all other datasets it runs one-pass evaluation (OPE) and writes
    per-video box (and for VOT2018-LT confidence/time) files.
    """
    net = models.__dict__[args.arch](anchors_nums=args.anchor_nums,
                                     cls_type=args.cls_type)
    net = load_pretrain(net, args.resume)
    net.eval()
    net = net.cuda()

    # prepare tracker
    info = edict()
    info.arch = args.arch
    info.cls_type = args.cls_type
    info.dataset = args.dataset
    info.epoch_test = args.epoch_test
    tracker = SiamRPN(info)

    dataset_root = os.path.join("/ssd", args.dataset)
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    model_name = args.resume.split('/')[-1].split('.')[0]
    total_lost = 0
    # NOTE: EAO will be lower than the original version (0.393 -> 0.390)
    # due to the number of digits kept after the decimal point.
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking (VOT supervised protocol)
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    # (re)initialize from the axis-aligned ground truth
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = tracker.init(img, target_pos, target_sz, net)  # init tracker
                    state["arch"] = args.arch
                    pred_bboxes.append(1)  # VOT code: init frame
                elif idx > frame_counter:
                    state = tracker.track(state, img)  # track
                    location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
                    pred_bbox = location
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)  # VOT code: lost
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)  # VOT code: skipped frame
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    # BUGFIX: np.int was a deprecated alias removed in
                    # NumPy >= 1.24; the builtin int is the documented drop-in.
                    cv2.polylines(
                        img,
                        [np.array(gt_bbox, int).reshape((-1, 1, 2))],
                        True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(
                            img,
                            [np.array(pred_bbox, int).reshape((-1, 1, 2))],
                            True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking (single initialization, no restarts)
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = tracker.init(img, target_pos, target_sz, net)  # init tracker
                    state["arch"] = args.arch
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    state = tracker.track(state, img)  # track
                    location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
                    pred_bbox = location
                    pred_bboxes.append(pred_bbox)
                    scores.append(state['score'])
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(
                        img, (gt_bbox[0], gt_bbox[1]),
                        (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                        (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2],
                                   pred_bbox[1] + pred_bbox[3]), (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results (layout depends on the dataset's toolkit conventions)
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(
                    video_path, '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write(
                            "{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('result', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))
def track_video(siam_tracker, online_tracker, siam_net, video_path, init_box=None, args=None):
    """
    Demo: track a target through a video file, optionally saving frames.

    Args:
        siam_tracker: siamese tracker object (.init / .track).
        online_tracker: optional online-update tracker, used when args.online.
        siam_net: network passed into the trackers.
        video_path: path to a video file readable by cv2.VideoCapture.
        init_box: optional [lx, ly, w, h] initial box; if None the user
            selects a ROI interactively on the first frame.
        args: CLI namespace (online, resume, save flags).
    """
    assert os.path.isfile(video_path), "please provide a valid video file"

    video_name = video_path.split('/')[-1]
    video_name = video_name.split('.')[0]
    save_path = os.path.join('vis', video_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    cap = cv2.VideoCapture(video_path)
    display_name = 'Video: {}'.format(video_path.split('/')[-1])
    cv2.namedWindow(display_name, cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
    cv2.resizeWindow(display_name, 960, 720)

    success, frame = cap.read()
    # BUGFIX: check the read result BEFORE imshow — the original showed the
    # frame first, so a failed read crashed in cv2.imshow(None) instead of
    # reaching this error path.
    if success is not True:
        print("Read failed.")
        exit(-1)
    cv2.imshow(display_name, frame)

    # init: either from the provided box or from an interactive ROI selection
    count = 0
    if init_box is not None:
        lx, ly, w, h = init_box
        target_pos = np.array([lx + w / 2, ly + h / 2])
        target_sz = np.array([w, h])
        state = siam_tracker.init(frame, target_pos, target_sz, siam_net)  # init tracker
        rgb_im = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        if args.online:
            online_tracker.init(frame, rgb_im, siam_net, target_pos, target_sz,
                                True, dataname='VOT2019', resume=args.resume)
    else:
        while True:
            frame_disp = frame.copy()
            cv2.putText(frame_disp, 'Select target ROI and press ENTER', (20, 30),
                        cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 1)
            lx, ly, w, h = cv2.selectROI(display_name, frame_disp, fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = siam_tracker.init(frame_disp, target_pos, target_sz, siam_net)  # init tracker
            rgb_im = cv2.cvtColor(frame_disp, cv2.COLOR_BGR2RGB)
            if args.online:
                online_tracker.init(frame_disp, rgb_im, siam_net, target_pos,
                                    target_sz, True, dataname='VOT2019',
                                    resume=args.resume)
            break

    while True:
        ret, frame = cap.read()
        if frame is None:
            return  # end of video

        frame_disp = frame.copy()
        rgb_im = cv2.cvtColor(frame_disp, cv2.COLOR_BGR2RGB)

        # Track and draw box
        if args.online:
            state = online_tracker.track(frame_disp, rgb_im, siam_tracker, state)
        else:
            state = siam_tracker.track(state, frame_disp)
        location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
        x1, y1, x2, y2 = int(location[0]), int(location[1]), \
            int(location[0] + location[2]), int(location[1] + location[3])
        cv2.rectangle(frame_disp, (x1, y1), (x2, y2), (0, 255, 0), 5)

        font_color = (0, 0, 0)
        cv2.putText(frame_disp, 'Tracking!', (20, 30),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press r to reset', (20, 55),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press q to quit', (20, 80),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)

        # Display the resulting frame
        cv2.imshow(display_name, frame_disp)

        if args.save:
            save_name = os.path.join(save_path, '{:04d}.jpg'.format(count))
            cv2.imwrite(save_name, frame_disp)
            count += 1

        key = cv2.waitKey(1)
        if key == ord('q'):
            break
        elif key == ord('r'):
            # Re-select ROI on the next frame and re-initialize everything.
            ret, frame = cap.read()
            frame_disp = frame.copy()
            cv2.putText(frame_disp, 'Select target ROI and press ENTER', (20, 30),
                        cv2.FONT_HERSHEY_COMPLEX_SMALL, 1.5, (0, 0, 0), 1)
            cv2.imshow(display_name, frame_disp)
            lx, ly, w, h = cv2.selectROI(display_name, frame_disp, fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = siam_tracker.init(frame_disp, target_pos, target_sz, siam_net)  # init tracker
            rgb_im = cv2.cvtColor(frame_disp, cv2.COLOR_BGR2RGB)
            if args.online:
                online_tracker.init(frame_disp, rgb_im, siam_net, target_pos,
                                    target_sz, True, dataname='VOT2019',
                                    resume=args.resume)

    # When everything done, release the capture
    cap.release()
    cv2.destroyAllWindows()
# --- VOT toolkit integration: read the init region, then report per frame ---
if mask_vot:
    print('the input is a binary mask')
    selection = handle.region()
    # Pad/crop the toolkit mask to full image size, then derive a box.
    mask = make_full_size(selection, (im.shape[1], im.shape[0]))
    bbox = rect_from_mask(mask)  # [cx,cy,w,h] TODO: use cv.minmaxRect here
    cx, cy, w, h = bbox
else:
    print('the input is a rect box')
    selection = handle.region()  # selection in ncc_mask
    lx, ly, w, h = selection.x, selection.y, selection.width, selection.height
    cx, cy = lx + w / 2, ly + h / 2
    # BUGFIX: mask was never assigned on this path, so the tracker.init call
    # below raised NameError whenever the toolkit supplied a rect region.
    mask = None
target_pos = np.array([cx, cy])
target_sz = np.array([w, h])
state = tracker.init(im, target_pos, target_sz, net, mask=mask)
count = 0
while True:
    image_file = handle.frame()
    if not image_file:
        break
    im = cv2.imread(image_file)  # HxWxC
    state = tracker.track(state, im)
    mask = state['mask']
    # Fall back to a rectangular mask when segmentation is missing/degenerate.
    if mask is None or mask.sum() < 10:
        rect = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
        mask = mask_from_rect(rect, (im.shape[1], im.shape[0]))
    handle.report(mask, state['cls_score'])
    count += 1
def track(tracker, net, video, args):
    """Run the tracker over one benchmark video and write a result file.

    Follows the VOT restart protocol for VOT datasets (re-init 5 frames
    after each loss, 1/2/0 state codes) and plain OPE otherwise.
    Returns the number of losses (0 if the result file already existed).
    """
    start_frame, lost_times, toc = 0, 0, 0

    # save result to evaluate
    if args.epoch_test:
        # epoch testing: disambiguate output dir with the checkpoint name
        suffix = args.resume.split('/')[-1]
        suffix = suffix.split('.')[0]
        tracker_path = os.path.join('result', args.dataset, args.arch + suffix)
    else:
        tracker_path = os.path.join('result', args.dataset, args.arch)

    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    if 'VOT' in args.dataset:
        # VOT toolkit layout: baseline/<video>/<video>_001.txt
        baseline_path = join(tracker_path, 'baseline')
        video_path = join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = join(video_path, video['name'] + '_001.txt')
    else:
        result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    # Skip videos already processed by another worker (multi-GPU testing).
    if os.path.exists(result_path):
        return 0  # for mult-gputesting

    regions = []  # result and states[1 init / 2 lost / 0 skip]
    image_files, gt = video['image_files'], video['gt']

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            # grayscale frame: replicate channels to match training input
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)

        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz, net)  # init tracker
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            # non-VOT datasets never restart: force the overlap positive
            b_overlap = poly_iou(gt[f], location) if 'VOT' in args.dataset else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append(2)
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append(0)
        toc += cv2.getTickCount() - tic

    toc /= cv2.getTickFrequency()

    with open(result_path, "w") as fin:
        if 'VOT' in args.dataset:
            # ints are protocol state codes; arrays are boxes
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')
        else:
            # OTB-style results are 1-based in x and y
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([
                    str(i + 1) if idx == 0 or idx == 1 else str(i)
                    for idx, i in enumerate(p_bbox)
                ]) + '\n')

    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
        video['name'], toc, f / toc, lost_times))

    return lost_times
def track_video(model, video):
    """Run the SiamFC tracker over one video.

    With args.vis set, draws ground truth / predictions live; otherwise
    writes an OTB/VOT-format result file under test/<dataset>/... .
    Uses module-level `args` for configuration.
    """
    start_frame, toc = 0, 0

    # vis or save OTB result to evaluate
    if not args.vis:
        tracker_path = os.path.join(
            'test', args.dataset,
            args.arch.split('.')[0] + args.resume.split('/')[-1].split('.')[0])
        if not os.path.exists(tracker_path):
            os.makedirs(tracker_path)

        if 'VOT' in args.dataset:
            # VOT toolkit layout: baseline/<video>/<video>_001.txt
            baseline_path = os.path.join(tracker_path, 'baseline')
            video_path = os.path.join(baseline_path, video['name'])
            if not os.path.exists(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, video['name'] + '_001.txt')
        else:
            result_path = os.path.join(tracker_path, '{:s}.txt'.format(video['name']))

        # Claim the result file up-front so parallel workers skip this video.
        if not os.path.exists(result_path):  # for multi-gpu test
            fin = open(result_path, "w")
            fin.close()
        else:
            return

    regions = []  # per-frame record: box, or int state code (1 init / 2 lost / 0 skip)
    image_files, gt = video['image_files'], video['gt']

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            # grayscale frame: replicate channels
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)

        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_min_max_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = SiamFC_init(im, target_pos, target_sz, model)  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        elif f > start_frame:  # tracking
            state = SiamFC_track(state, im)
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            # non-VOT datasets never restart: force the overlap check true
            b_overlap = judge_overlap(gt[f], location) if 'VOT' in args.dataset else 1
            if b_overlap:
                regions.append(location)
            else:
                regions.append(2)
                start_frame = f + 5  # VOT restart: skip 5 frames
        else:
            regions.append(0)
        toc += cv2.getTickCount() - tic

        if bool(args.vis) and f >= start_frame:  # visualization (skip lost frame)
            if f == 0:
                cv2.destroyAllWindows()
                # first frame: show ground truth only
                cv2.rectangle(im, (int(gt[f, 0]), int(gt[f, 1])),
                              (int(gt[f, 0] + gt[f, 2]), int(gt[f, 1] + gt[f, 3])),
                              (0, 255, 0), 3)
            else:
                location = [int(l) for l in location]  #
                cv2.rectangle(
                    im, (location[0], location[1]),
                    (location[0] + location[2], location[1] + location[3]),
                    (0, 255, 255), 3)
            cv2.putText(im, '#' + str(f), (20, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
            cv2.imshow(video['name'], im)
            cv2.waitKey(1)
    else:
        # NOTE(review): this `else` belongs to the `for` loop (runs after a
        # non-broken loop), so results are written in the vis case too —
        # confirm `result_path` is always defined when args.vis is set.
        with open(result_path, "w") as fin:
            if 'VOT' in args.dataset:
                # ints are protocol state codes; arrays are boxes
                for x in regions:
                    if isinstance(x, int):
                        fin.write("{:d}\n".format(x))
                    else:
                        p_bbox = x.copy()
                        # clamp negative coordinates to the image edge
                        if p_bbox[0] < 0:
                            p_bbox[0] = 0
                        if p_bbox[1] < 0:
                            p_bbox[1] = 0
                        fin.write(','.join([str(i) for i in p_bbox]) + '\n')
            else:
                # OTB-style results are 1-based in x and y
                for x in regions:
                    p_bbox = x.copy()
                    if p_bbox[0] < 0:
                        p_bbox[0] = 1
                    if p_bbox[1] < 0:
                        p_bbox[1] = 1
                    fin.write(','.join([
                        str(i + 1) if idx == 0 or idx == 1 else str(i)
                        for idx, i in enumerate(p_bbox)
                    ]) + '\n')

    toc /= cv2.getTickFrequency()
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps'.format(
        video['name'], toc, f / toc))