def initialize(self, image_file, box):
    """Read the first frame from disk and start the Siamese tracker on it.

    Args:
        image_file: Path handed to ``cv2.imread``.
        box: Ground-truth annotation of the target; it is converted to a
            centre/size pair with ``get_axis_aligned_bbox``.

    The state returned by ``self.siam_tracker.init`` is stored on
    ``self.state``; nothing is returned.
    """
    frame = cv2.imread(image_file)
    single_channel = len(frame.shape) == 2
    if single_channel:
        # align with training
        frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)

    center_x, center_y, width, height = get_axis_aligned_bbox(box)
    # init tracker
    self.state = self.siam_tracker.init(
        frame,
        np.array([center_x, center_y]),
        np.array([width, height]),
        self.siam_net,
    )
def track_tune(tracker, net, video, config):
    """Track one video for a hyper-parameter trial and save the boxes.

    Args:
        tracker: Object exposing ``init(im, pos, sz, net, hp=...)`` and
            ``track(state, im)``.
        net: Network forwarded to ``tracker.init``.
        video: Mapping with ``'name'``, ``'image_files'`` and ``'gt'``.
        config: Mapping with ``'benchmark'``, ``'resume'`` and ``'hp'``
            (``scale_step``, ``scale_penalty``, ``scale_lr``,
            ``w_influence``).

    Returns:
        The trial directory (``test/<trial name>``).  When another worker
        has already claimed this video the directory is returned
        immediately without tracking.
    """
    benchmark_name = config['benchmark']
    resume = config['resume']
    hp = config['hp']  # scale_step, scale_penalty, scale_lr, window_influence

    # The directory name encodes the trial; '.' is replaced by '_'.
    trial_name = (benchmark_name + resume.split('/')[-1].split('.')[0] +
                  '_step_{:.4f}'.format(hp['scale_step']) +
                  '_penalty_s_{:.4f}'.format(hp['scale_penalty']) +
                  '_w_influence_{:.4f}'.format(hp['w_influence']) +
                  '_scale_lr_{:.4f}'.format(hp['scale_lr'])).replace('.', '_')
    tracker_path = join('test', trial_name)

    # exist_ok avoids the exists()/makedirs() race between parallel workers.
    os.makedirs(tracker_path, exist_ok=True)
    result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    # Claim the video atomically: exclusive creation fails if another worker
    # (or an earlier run) already created the result file.
    try:
        open(result_path, 'x').close()
    except FileExistsError:
        return tracker_path

    regions = []
    image_files, gt = video['image_files'], video['gt']
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)

        if f == 0:  # init on the first frame
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz, net, hp=hp)
            regions.append([float(1)] if 'VOT' in benchmark_name else gt[f])
        else:  # tracking
            state = tracker.track(state, im)
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append(location)

    with open(result_path, "w") as fin:
        for region in regions:
            # The first two fields are written incremented by one.
            fin.write(','.join(
                str(value + 1) if idx < 2 else str(value)
                for idx, value in enumerate(region)) + '\n')

    return tracker_path
def track(tracker, net, video, args):
    """Track one video and write its result file for later evaluation.

    For datasets whose name contains ``'VOT'`` the tracker is re-initialised
    five frames after every failure (zero overlap with the ground truth) and
    the result file uses the markers ``1`` (init), ``2`` (lost) and ``0``
    (skipped).  For other datasets the tracker runs once through the video
    and the first two fields of each box are written incremented by one.

    Args:
        tracker: Object exposing ``init(im, pos, sz, net)`` and
            ``track(state, im)``.
        net: Network forwarded to ``tracker.init``.
        video: Mapping with ``'name'``, ``'image_files'`` and ``'gt'``.
        args: Namespace with ``epoch_test``, ``resume``, ``dataset`` and
            ``arch``.

    Returns:
        The number of failures, or ``0`` right away when the result file
        already exists.
    """
    start_frame, lost_times, toc = 0, 0, 0

    # save result to evaluate
    if args.epoch_test:
        suffix = args.resume.split('/')[-1].split('.')[0]
        tracker_path = os.path.join('result', args.dataset, args.arch + suffix)
    else:
        tracker_path = os.path.join('result', args.dataset, args.arch)

    # exist_ok avoids the exists()/makedirs() race between parallel workers.
    os.makedirs(tracker_path, exist_ok=True)

    is_vot = 'VOT' in args.dataset
    if is_vot:
        video_path = join(tracker_path, 'baseline', video['name'])
        os.makedirs(video_path, exist_ok=True)
        result_path = join(video_path, video['name'] + '_001.txt')
    else:
        result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    if os.path.exists(result_path):
        return 0  # for multi-GPU testing

    regions = []  # result and states [1 init / 2 lost / 0 skip]
    image_files, gt = video['image_files'], video['gt']
    f = 0  # keeps the speed report valid when the video has no frames
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)

        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz, net)
            regions.append(1 if is_vot else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f], location) if is_vot else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append(2)
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append(0)
        toc += cv2.getTickCount() - tic

    toc /= cv2.getTickFrequency()

    with open(result_path, "w") as fin:
        if is_vot:
            for region in regions:
                if isinstance(region, int):
                    fin.write("{:d}\n".format(region))
                else:
                    fin.write(','.join(str(v) for v in region) + '\n')
        else:
            for region in regions:
                fin.write(','.join(
                    str(v + 1) if idx < 2 else str(v)
                    for idx, v in enumerate(region)) + '\n')

    # Speed uses the last frame index, as before; guard against zero time.
    speed = f / toc if toc > 0 else 0.0
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
        video['name'], toc, speed, lost_times))

    return lost_times
def track_tune(tracker, net, video, config):
    """Track one VOT video for a hyper-parameter trial and return the regions.

    The tracker is re-initialised five frames after every failure (zero
    overlap with the ground truth).  Nothing but an empty placeholder file is
    written; the placeholder marks the video as taken so that parallel
    workers do not process it twice.

    Args:
        tracker: Object exposing ``init(im, pos, sz, net, hp=...)`` and
            ``track(state, im)``.
        net: Network forwarded to ``tracker.init``.
        video: Mapping with ``'name'``, ``'image_files'`` and ``'gt'``.
        config: Mapping with ``'benchmark'``, ``'resume'`` and ``'hp'``
            (``small_sz``, ``big_sz``, ``penalty_k``, ``window_influence``,
            ``lr``).

    Returns:
        A list with one entry per frame: ``[1.0]`` for init, ``[2.0]`` for
        lost, ``[0.0]`` for skipped, otherwise the predicted box.  ``0`` is
        returned when the video was already claimed.

    Raises:
        ValueError: If the benchmark name does not start with ``'VOT'``.
    """
    benchmark_name = config['benchmark']
    resume = config['resume']
    # penalty_k, scale_lr, window_influence, adaptive size (for vot2017 or later)
    hp = config['hp']

    # Fail fast: previously a name that merely contained 'VOT' was tracked
    # to the end and only then rejected.
    if not benchmark_name.startswith('VOT'):
        raise ValueError('Only VOT is supported')

    # The directory name encodes the trial; '.' is replaced by '_'.
    trial_name = (benchmark_name + resume.split('/')[-1].split('.')[0] +
                  '_small_size_{:.4f}'.format(hp['small_sz']) +
                  '_big_size_{:.4f}'.format(hp['big_sz']) +
                  '_penalty_k_{:.4f}'.format(hp['penalty_k']) +
                  '_w_influence_{:.4f}'.format(hp['window_influence']) +
                  '_scale_lr_{:.4f}'.format(hp['lr'])).replace('.', '_')
    tracker_path = join('test', trial_name)

    # exist_ok avoids the exists()/makedirs() race between parallel workers.
    video_path = join(tracker_path, 'baseline', video['name'])
    os.makedirs(video_path, exist_ok=True)
    result_path = join(video_path, video['name'] + '_001.txt')

    # Claim the video atomically: exclusive creation fails if the
    # placeholder already exists.
    try:
        open(result_path, 'x').close()
    except FileExistsError:
        return 0

    start_frame, lost_times = 0, 0
    regions = []  # result and states [1 init / 2 lost / 0 skip]
    image_files, gt = video['image_files'], video['gt']
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)

        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz, net, hp=hp)
            regions.append([float(1)])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            if poly_iou(gt[f], location) > 0:
                regions.append(location)
            else:
                regions.append([float(2)])
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append([float(0)])

    return regions
def track_tune(tracker, net, video, config):
    """Track one video for a hyper-parameter trial and save the result file.

    Benchmarks whose name contains ``'VOT'`` use restart-on-failure: the
    tracker is re-initialised five frames after the overlap with the ground
    truth drops to zero.  All other benchmarks never trigger a restart.

    Args:
        tracker: Object exposing ``init(im, pos, sz, net, hp=...)`` and
            ``track(state, im)``.
        net: Network forwarded to ``tracker.init``.
        video: Mapping with ``'name'``, ``'image_files'`` and ``'gt'``.
        config: Mapping with ``'benchmark'``, ``'resume'`` and ``'hp'``
            (``scale_step``, ``scale_penalty``, ``scale_lr``,
            ``w_influence``).

    Returns:
        The trial directory after tracking when the benchmark name contains
        ``'OTB'``, ``'VIS'``, ``'VOT'`` or ``'GOT10K'``; otherwise ``None``
        after printing a notice.  When the video was already claimed:
        the trial directory for names starting with ``'OTB'``, ``0`` for
        names starting with ``'VOT'``/``'GOT10K'``, else ``None``.
    """
    benchmark_name = config['benchmark']
    resume = config['resume']
    hp = config['hp']  # scale_step, scale_penalty, scale_lr, window_influence

    # The directory name encodes the trial; '.' is replaced by '_'.
    trial_name = (benchmark_name + resume.split('/')[-1].split('.')[0] +
                  '_step_{:.4f}'.format(hp['scale_step']) +
                  '_penalty_s_{:.4f}'.format(hp['scale_penalty']) +
                  '_w_influence_{:.4f}'.format(hp['w_influence']) +
                  '_scale_lr_{:.4f}'.format(hp['scale_lr'])).replace('.', '_')
    tracker_path = join('test', trial_name)

    # exist_ok avoids the exists()/makedirs() race between parallel workers.
    os.makedirs(tracker_path, exist_ok=True)

    is_vot = 'VOT' in benchmark_name
    if is_vot:
        video_path = join(tracker_path, 'baseline', video['name'])
        os.makedirs(video_path, exist_ok=True)
        result_path = join(video_path, video['name'] + '_001.txt')
    elif 'GOT10K' in benchmark_name:
        re_video_path = os.path.join(tracker_path, video['name'])
        os.makedirs(re_video_path, exist_ok=True)
        result_path = os.path.join(re_video_path,
                                   '{:s}.txt'.format(video['name']))
    else:
        result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    # Claim the video atomically: exclusive creation fails if the result
    # file already exists.
    try:
        open(result_path, 'x').close()
    except FileExistsError:
        if benchmark_name.startswith('OTB'):
            return tracker_path
        if benchmark_name.startswith(('VOT', 'GOT10K')):
            return 0
        print('benchmark not supported now')
        return None

    start_frame, lost_times = 0, 0
    regions = []  # result and states [1 init / 2 lost / 0 skip]
    image_files, gt = video['image_files'], video['gt']
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)

        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz, net, hp=hp)
            regions.append([float(1)] if is_vot else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f], location) if is_vot else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append([float(2)])
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append([float(0)])

    # save results
    if 'OTB' in benchmark_name or 'LASOT' in benchmark_name:
        with open(result_path, "w") as fin:
            for region in regions:
                # The first two fields are written incremented by one.
                fin.write(','.join(
                    str(v + 1) if idx < 2 else str(v)
                    for idx, v in enumerate(region)) + '\n')
    elif ('VISDRONE' in benchmark_name or 'GOT10K' in benchmark_name
          or is_vot):
        # Every region is a sequence here (markers are one-element lists),
        # so all three benchmarks share the plain comma-separated format.
        with open(result_path, "w") as fin:
            for region in regions:
                fin.write(','.join(str(v) for v in region) + '\n')

    if ('OTB' in benchmark_name or 'VIS' in benchmark_name
            or is_vot or 'GOT10K' in benchmark_name):
        return tracker_path

    # NOTE(review): LASOT results are written above but still end up here
    # and return None -- confirm whether that is intended.
    print('benchmark not supported now')
    return None
def track(siam_tracker, online_tracker, siam_net, video, args):
    """Track one video, optionally with the online tracker, and save results.

    For datasets whose name contains ``'VOT'`` the tracker is re-initialised
    five frames after every failure (zero overlap with the ground truth) and
    the result file uses the markers ``1`` (init), ``2`` (lost) and ``0``
    (skipped).

    Args:
        siam_tracker: Object exposing ``init(im, pos, sz, net)`` and
            ``track(state, im)``.
        online_tracker: Used only when ``args.online`` is truthy; must expose
            ``init(...)`` and ``track(im, rgb_im, siam_tracker, state)``.
        siam_net: Network forwarded to both trackers.
        video: Mapping with ``'name'``, ``'image_files'`` and ``'gt'``.
        args: Namespace with ``epoch_test``, ``resume``, ``dataset``,
            ``arch`` and ``online``.

    Returns:
        ``None``.  Returns immediately when the result file already exists.
    """
    start_frame, toc = 0, 0

    # save result to evaluate
    if args.epoch_test:
        suffix = args.resume.split('/')[-1].split('.')[0]
        tracker_path = os.path.join('result', args.dataset, args.arch + suffix)
    else:
        tracker_path = os.path.join('result', args.dataset, args.arch)

    # exist_ok avoids the exists()/makedirs() race between parallel workers.
    os.makedirs(tracker_path, exist_ok=True)

    is_vot = 'VOT' in args.dataset
    if is_vot:
        video_path = os.path.join(tracker_path, 'baseline', video['name'])
        os.makedirs(video_path, exist_ok=True)
        result_path = os.path.join(video_path, video['name'] + '_001.txt')
    else:
        result_path = os.path.join(tracker_path,
                                   '{:s}.txt'.format(video['name']))

    if os.path.exists(result_path):
        return  # for multi-GPU testing

    regions = []
    lost = 0
    image_files, gt = video['image_files'], video['gt']
    f = 0  # keeps the speed report valid when the video has no frames
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        # Promote grayscale first: BGR2RGB needs a three-channel image.
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)  # align with training
        # The RGB copy is only consumed by the online tracker.
        rgb_im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) if args.online else None

        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = siam_tracker.init(im, target_pos, target_sz, siam_net)
            if args.online:
                online_tracker.init(im, rgb_im, siam_net, target_pos,
                                    target_sz, True, dataname=args.dataset,
                                    resume=args.resume)
            regions.append(1 if is_vot else gt[f])
        elif f > start_frame:  # tracking
            if args.online:
                state = online_tracker.track(im, rgb_im, siam_tracker, state)
            else:
                state = siam_tracker.track(state, im)
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f], location) if is_vot else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append(2)
                start_frame = f + 5  # skip 5 frames
                lost += 1
        else:  # skip
            regions.append(0)
        toc += cv2.getTickCount() - tic

    with open(result_path, "w") as fin:
        if is_vot:
            for region in regions:
                if isinstance(region, int):
                    fin.write("{:d}\n".format(region))
                else:
                    fin.write(','.join(str(v) for v in region) + '\n')
        elif 'OTB' in args.dataset or 'LASOT' in args.dataset:
            for region in regions:
                # The first two fields are written incremented by one.
                fin.write(','.join(
                    str(v + 1) if idx < 2 else str(v)
                    for idx, v in enumerate(region)) + '\n')
        elif ('VISDRONE' in args.dataset or 'GOT10K' in args.dataset
              or 'TN' in args.dataset):
            for region in regions:
                fin.write(','.join(str(v) for v in region) + '\n')

    toc /= cv2.getTickFrequency()
    # Speed uses the last frame index, as before; guard against zero time.
    speed = f / toc if toc > 0 else 0.0
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps Lost {}'.format(
        video['name'], toc, speed, lost))
def main():
    """Evaluate a SiamRPN checkpoint on ``args.dataset`` and save the results.

    Reads the module-level ``args``.  Datasets ``VOT2016``/``VOT2018``/
    ``VOT2019`` are run with restart-on-failure (re-initialisation five
    frames after a zero-overlap frame); every other dataset is run once
    through each video (OPE).  Result files are written under ``results/``
    for VOT, VOT2018-LT and GOT-10k and under ``result/`` otherwise.
    """
    net = models.__dict__[args.arch](anchors_nums=args.anchor_nums,
                                     cls_type=args.cls_type)
    net = load_pretrain(net, args.resume)
    net.eval()
    net = net.cuda()

    # prepare tracker
    info = edict()
    info.arch = args.arch
    info.cls_type = args.cls_type
    info.dataset = args.dataset
    info.epoch_test = args.epoch_test
    tracker = SiamRPN(info)

    dataset_root = os.path.join("/ssd", args.dataset)
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    model_name = args.resume.split('/')[-1].split('.')[0]
    total_lost = 0

    # EAO will be lower than the original version (0.393 -> 0.390) due to
    # the number of digits kept after the decimal point.
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = tracker.init(img, target_pos, target_sz, net)
                    state["arch"] = args.arch
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    state = tracker.track(state, img)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic

                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    # np.int was removed in NumPy 1.24; int32 is the integer
                    # type OpenCV drawing functions work with.
                    cv2.polylines(
                        img,
                        [np.array(gt_bbox, np.int32).reshape((-1, 1, 2))],
                        True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(
                            img,
                            [np.array(pred_bbox, np.int32).reshape(
                                (-1, 1, 2))],
                            True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()

            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            os.makedirs(video_path, exist_ok=True)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join(
                            [vot_float2str("%.4f", i) for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = tracker.init(img, target_pos, target_sz, net)
                    state["arch"] = args.arch
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    state = tracker.track(state, img)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    pred_bboxes.append(pred_bbox)
                    scores.append(state['score'])
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())

                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(
                        img, (gt_bbox[0], gt_bbox[1]),
                        (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                        (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2],
                                   pred_bbox[1] + pred_bbox[3]),
                                  (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()

            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                os.makedirs(video_path, exist_ok=True)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(
                    video_path, '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        # The first frame has no score: write an empty line.
                        if x is None:
                            f.write('\n')
                        else:
                            f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          video.name)
                os.makedirs(video_path, exist_ok=True)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('result', args.dataset, model_name)
                os.makedirs(model_path, exist_ok=True)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))
def main():
    """Evaluate a CGACD checkpoint on ``args.dataset`` and save the results.

    Reads the module-level ``args``.  Datasets ``VOT2016``/``VOT2018``/
    ``VOT2019`` are run with restart-on-failure (re-initialisation five
    frames after a zero-overlap frame); every other dataset is run once
    through each video (OPE).  Pressing ESC in the visualisation window
    stops the current video early.
    """
    # load config
    cfg_from_file(args.config)
    dataset_root = os.path.join('dataset', args.dataset)

    # create model
    net = ModelBuilder()
    checkpoint = torch.load(args.model)
    weights = (checkpoint['state_dict']
               if 'state_dict' in checkpoint else checkpoint)
    net.load_state_dict(weights)
    net.cuda().eval()

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    model_name = args.save_name
    total_lost = 0

    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            # test one special video
            if args.video != '' and video.name != args.video:
                continue

            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx < frame_counter:
                    # still inside the skip window after a failure
                    pred_bboxes.append(0)
                elif idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = CGACD_init(img, target_pos, target_sz, net)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    pred_bboxes.append(1)
                else:
                    state = CGACD_track(state, img)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    left, top = pred_bbox[0], pred_bbox[1]
                    right = pred_bbox[0] + pred_bbox[2]
                    bottom = pred_bbox[1] + pred_bbox[3]
                    pred_polygon = [left, top, right, top,
                                    right, bottom, left, bottom]
                    frame_size = (img.shape[1], img.shape[0])
                    overlap = vot_overlap(gt_bbox, pred_polygon, frame_size)
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                toc += cv2.getTickCount() - tic

                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    center = state['target_pos']
                    size = state['target_sz']
                    top_left = (int(center[0] - size[0] / 2),
                                int(center[1] - size[1] / 2))
                    bottom_right = (int(center[0] + size[0] / 2),
                                    int(center[1] + size[1] / 2))
                    cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2,
                                cv2.LINE_AA)
                    cv2.imshow(video.name, img)
                    cv2.moveWindow(video.name, 100, 100)
                    if cv2.waitKey(1) == 27:
                        break
            toc /= cv2.getTickFrequency()

            # save results
            video_path = os.path.join('result', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for entry in pred_bboxes:
                    if isinstance(entry, int):
                        line = "{:d}\n".format(entry)
                    else:
                        fields = [vot_float2str("%.4f", v) for v in entry]
                        line = ','.join(fields) + '\n'
                    f.write(line)
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
        return

    # OPE tracking
    for v_idx, video in enumerate(dataset):
        # test one special video
        if args.video != '' and video.name != args.video:
            continue

        toc = 0
        pred_bboxes = []
        track_times = []
        for idx, (img, gt_bbox) in enumerate(video):
            tic = cv2.getTickCount()
            if idx != 0:
                state = CGACD_track(state, img)
                pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                          state['target_sz'])
            else:
                is_otb = 'OTB' in args.dataset
                if is_otb:
                    target_pos, target_sz = rect1_2_cxy_wh(gt_bbox)
                else:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                state = CGACD_init(img, target_pos, target_sz, net)
                to_rect = cxy_wh_2_rect1 if is_otb else cxy_wh_2_rect
                pred_bbox = to_rect(state['target_pos'], state['target_sz'])
            pred_bboxes.append(pred_bbox)

            toc += cv2.getTickCount() - tic
            track_times.append(
                (cv2.getTickCount() - tic) / cv2.getTickFrequency())

            if idx == 0:
                cv2.destroyAllWindows()
            if args.vis and idx > 0:
                center = state['target_pos']
                size = state['target_sz']
                top_left = (int(center[0] - size[0] / 2),
                            int(center[1] - size[1] / 2))
                bottom_right = (int(center[0] + size[0] / 2),
                                int(center[1] + size[1] / 2))
                cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 3)
                cv2.putText(img, str(idx), (40, 40),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2,
                            cv2.LINE_AA)
                cv2.imshow(video.name, img)
                cv2.moveWindow(video.name, 100, 100)
                if cv2.waitKey(1) == 27:
                    break
        toc /= cv2.getTickFrequency()

        if 'GOT-10k' == args.dataset:
            video_path = os.path.join('result', args.dataset, model_name,
                                      video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for box in pred_bboxes:
                    f.write(','.join([str(v) for v in box]) + '\n')
            result_path = os.path.join(video_path,
                                       '{}_time.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for seconds in track_times:
                    f.write("{:.6f}\n".format(seconds))
        else:
            model_path = os.path.join('result', args.dataset, model_name)
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            result_path = os.path.join(model_path,
                                       '{}.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for box in pred_bboxes:
                    f.write(','.join([str(v) for v in box]) + '\n')
        print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
              format(v_idx + 1, video.name, toc, idx / toc))
def track_tune(tracker, net, video, config):
    """Track one video for a hyper-parameter trial, preferring mask polygons.

    For every tracked frame the recorded region is the polygon from
    ``state['polygon']`` when its IoU with the tracked box exceeds
    ``state['choose_thr']``; otherwise the box itself (as a four-corner
    polygon) is recorded.  For benchmarks whose name does not contain
    ``'VOT'`` the record is converted back to ``[x, y, w, h]``.

    Args:
        tracker: Object exposing ``init(...)`` and ``track(state, im)``.
        net: Network forwarded to ``tracker.init``.
        video: Mapping with ``'name'``, ``'image_files'`` and ``'gt'``.
        config: Mapping with ``'benchmark'``, ``'resume'`` and ``'hp'``
            (``small_sz``, ``big_sz``, ``choose_thr``, ``penalty_k``,
            ``window_influence``, ``lr``).

    Returns:
        The trial directory after tracking when the benchmark name contains
        ``'OTB'``, ``'VIS'``, ``'VOT'`` or ``'GOT10K'``; otherwise ``None``
        after printing a notice.  When the video was already claimed:
        the trial directory for names starting with ``'OTB'``, ``0`` for
        names starting with ``'VOT'``/``'GOT10K'``, else ``None``.
    """
    benchmark_name = config['benchmark']
    resume = config['resume']
    hp = config['hp']

    # The directory name encodes the trial; '.' is replaced by '_'.
    # NOTE(review): lambda_u, lambda_s and cyclic_thr are all formatted from
    # hp['choose_thr'], so trials differing only in those values would share
    # a directory -- looks like a copy-paste slip; confirm the hp key names
    # before changing it.
    trial_name = (benchmark_name + resume.split('/')[-1].split('.')[0] +
                  '_small_size_{:.4f}'.format(hp['small_sz']) +
                  '_big_size_{:.4f}'.format(hp['big_sz']) +
                  '_lambda_u_{:.4f}'.format(hp['choose_thr']) +
                  '_lambda_s_{:.4f}'.format(hp['choose_thr']) +
                  '_cyclic_thr_{:.4f}'.format(hp['choose_thr']) +
                  '_choose_thr_{:.4f}'.format(hp['choose_thr']) +
                  '_penalty_k_{:.4f}'.format(hp['penalty_k']) +
                  '_w_influence_{:.4f}'.format(hp['window_influence']) +
                  '_scale_lr_{:.4f}'.format(hp['lr'])).replace('.', '_')
    tracker_path = join('test', trial_name)

    # exist_ok avoids the exists()/makedirs() race between parallel workers.
    os.makedirs(tracker_path, exist_ok=True)

    is_vot = 'VOT' in benchmark_name
    if is_vot:
        video_path = join(tracker_path, 'baseline', video['name'])
        os.makedirs(video_path, exist_ok=True)
        result_path = join(video_path, video['name'] + '_001.txt')
    elif 'GOT10K' in benchmark_name:
        re_video_path = os.path.join(tracker_path, video['name'])
        os.makedirs(re_video_path, exist_ok=True)
        result_path = os.path.join(re_video_path,
                                   '{:s}.txt'.format(video['name']))
    else:
        result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    # Claim the video atomically: exclusive creation fails if the result
    # file already exists.
    try:
        open(result_path, 'x').close()
    except FileExistsError:
        if benchmark_name.startswith('OTB'):
            return tracker_path
        if benchmark_name.startswith(('VOT', 'GOT10K')):
            return 0
        print('benchmark not supported now')
        return None

    start_frame, lost_times = 0, 0
    regions = []  # result and states [1 init / 2 lost / 0 skip]
    image_files, gt = video['image_files'], video['gt']
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)

        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz, net, online=False,
                                 mask=None, debug=False, hp=hp)
            regions.append([float(1)] if is_vot else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f], location) if is_vot else 1

            # Tracked box expressed as a four-corner polygon.
            x1, y1, w, h = location
            x2, y2 = x1 + w, y1 + h
            box_polygon = np.array([x1, y1, x2, y1, x2, y2, x1, y2])

            polygon = state['polygon']
            if polygon is None:
                polygon = box_polygon
            else:
                polygon = np.array([polygon[i][j]
                                    for i in range(4) for j in range(2)])

            # Keep the polygon only when it agrees well enough with the box.
            if poly_iou(np.array(location),
                        np.array(polygon)) > state['choose_thr']:
                record = polygon
            else:
                record = box_polygon

            if not is_vot:
                # change polygon to [x, y, w, h]
                x1, y1, x2, y2 = record[0], record[1], record[4], record[5]
                record = np.array([x1, y1, x2 - x1 + 1, y2 - y1 + 1])

            if b_overlap > 0:
                regions.append(record)
            else:
                regions.append([float(2)])
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append([float(0)])

    # save results
    if 'OTB' in benchmark_name or 'LASOT' in benchmark_name:
        with open(result_path, "w") as fin:
            for region in regions:
                # The first two fields are written incremented by one.
                fin.write(','.join(
                    str(v + 1) if idx < 2 else str(v)
                    for idx, v in enumerate(region)) + '\n')
    elif ('VISDRONE' in benchmark_name or 'GOT10K' in benchmark_name
          or is_vot):
        # Every region is a sequence here (markers are one-element lists),
        # so all three benchmarks share the plain comma-separated format.
        with open(result_path, "w") as fin:
            for region in regions:
                fin.write(','.join(str(v) for v in region) + '\n')

    if ('OTB' in benchmark_name or 'VIS' in benchmark_name
            or is_vot or 'GOT10K' in benchmark_name):
        return tracker_path

    print('benchmark not supported now')
    return None
def track_box(siam_tracker, online_tracker, siam_net, video, args):
    """Track a benchmark that only has box annotations.

    Attention: not for benchmark evaluation, just a demo.

    For every tracked frame the recorded region is the polygon from
    ``state['polygon']`` when its IoU with the tracked box exceeds
    ``state['choose_thr']``; otherwise the box itself (as a four-corner
    polygon) is recorded.  For datasets whose name does not contain
    ``'VOT'`` the record is converted back to ``[x, y, w, h]``.  Init and
    skipped frames add no entry to the result file.

    Args:
        siam_tracker: Object exposing ``init(...)`` and ``track(...)``.
        online_tracker: Used only when ``args.online`` is truthy.
        siam_net: Network forwarded to both trackers.
        video: Mapping with ``'name'``, ``'image_files'`` and ``'gt'``.
        args: Namespace with ``dataset``, ``arch``, ``online``, ``debug``,
            ``vis`` and ``resume``.

    Returns:
        ``None``.  Returns immediately when the result file already exists.
    """
    tracker_path = os.path.join('result', args.dataset, args.arch)
    is_vot = 'VOT' in args.dataset

    if is_vot:
        video_path = os.path.join(tracker_path, 'baseline', video['name'])
        os.makedirs(video_path, exist_ok=True)
        result_path = os.path.join(video_path, video['name'] + '_001.txt')
    else:
        # The result directory was never created on this path, so writing
        # the result file failed whenever it did not already exist.
        os.makedirs(tracker_path, exist_ok=True)
        result_path = os.path.join(tracker_path,
                                   '{:s}.txt'.format(video['name']))

    if os.path.exists(result_path):
        return  # for multi-GPU testing

    regions = []
    lost = 0
    start_frame, toc = 0, 0
    image_files, gt = video['image_files'], video['gt']
    f = 0  # keeps the speed report valid when the video has no frames
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        # Promote grayscale first: BGR2RGB needs a three-channel image.
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)  # align with training
        if args.online:
            rgb_im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)

        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            # init siamese tracker
            state = siam_tracker.init(im, target_pos, target_sz, siam_net,
                                      online=args.online, mask=None,
                                      debug=args.debug)
            if args.online:
                online_tracker.init(im, rgb_im, siam_net, target_pos,
                                    target_sz, True, dataname=args.dataset,
                                    resume=args.resume)
        elif f > start_frame:  # tracking
            if args.online:
                state = online_tracker.track(im, rgb_im, siam_tracker, state)
            else:
                state = siam_tracker.track(state, im, name=image_file)

            mask = state['mask']
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f], location) if is_vot else 1

            # Tracked box expressed as a four-corner polygon.
            x1, y1, w, h = location
            x2, y2 = x1 + w, y1 + h
            box_polygon = np.array([x1, y1, x2, y1, x2, y2, x1, y2])

            polygon = state['polygon']
            if polygon is None:
                polygon = box_polygon
            else:
                polygon = np.array([polygon[i][j]
                                    for i in range(4) for j in range(2)])

            # Keep the polygon only when it agrees well enough with the box.
            if poly_iou(np.array(location),
                        np.array(polygon)) > state['choose_thr']:
                record = polygon
            else:
                record = box_polygon

            # ``benchmark_name`` is not defined in this function; the
            # dataset name lives on ``args``.
            if not is_vot:
                # change polygon to [x, y, w, h]
                x1, y1, x2, y2 = record[0], record[1], record[4], record[5]
                record = np.array([x1, y1, x2 - x1 + 1, y2 - y1 + 1])

            if b_overlap > 0:
                regions.append(record)
            else:
                regions.append(2)
                start_frame = f + 5  # skip 5 frames
                lost += 1

            if args.vis:
                # Row 0 is black; row 1 is a random colour that is used to
                # index the mask into an overlay image.
                COLORS = np.random.randint(128, 255, size=(1, 3),
                                           dtype="uint8")
                COLORS = np.vstack([[0, 0, 0], COLORS]).astype("uint8")
                mask = COLORS[mask]
                output = ((0.4 * im) + (0.6 * mask)).astype("uint8")
                cv2.imshow("mask", output)
                cv2.waitKey(1)
        toc += cv2.getTickCount() - tic

    with open(result_path, "w") as fin:
        if is_vot:
            for region in regions:
                if isinstance(region, int):
                    fin.write("{:d}\n".format(region))
                else:
                    fin.write(','.join(str(v) for v in region) + '\n')
        elif 'OTB' in args.dataset or 'LASOT' in args.dataset:
            for region in regions:
                # The first two fields are written incremented by one.
                fin.write(','.join(
                    str(v + 1) if idx < 2 else str(v)
                    for idx, v in enumerate(region)) + '\n')
        elif 'VISDRONE' in args.dataset or 'GOT10K' in args.dataset:
            for region in regions:
                fin.write(','.join(str(v) for v in region) + '\n')

    toc /= cv2.getTickFrequency()
    # Speed uses the last frame index, as before; guard against zero time.
    speed = f / toc if toc > 0 else 0.0
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps'.format(
        video['name'], toc, speed))