def track(self, im):
    """Locate the target in frame ``im`` and update the tracker model.

    Steps:
      1. Crop one search patch per configured scale around the current
         target position into ``self.patch_crop``.
      2. Run the network on the mean-subtracted patches and pick the scale
         whose (penalty-weighted) peak response is highest.
      3. Convert the peak location into a displacement and update
         ``self.target_pos`` / ``self.target_sz`` (size is clamped to
         ``[self.min_sz, self.max_sz]``).
      4. Crop a patch at the new location and pass it to ``self.net.update``
         with ``lr=self.config.interp_factor``.

    :param im: input frame, passed unchanged to ``crop_chw``
    :return: result of ``cxy_wh_2_rect1(self.target_pos, self.target_sz)``
             (1-indexed according to the original code comment)
    """
    cfg = self.config
    pad_ratio = 1 + cfg.padding

    # crop multi-scale search region
    for i in range(cfg.num_scale):
        window_sz = self.target_sz * (cfg.scale_factor[i] * pad_ratio)
        bbox = cxy_wh_2_bbox(self.target_pos, window_sz)
        self.patch_crop[i, :] = crop_chw(im, bbox, cfg.crop_sz)

    search = torch.Tensor(self.patch_crop - cfg.net_average_image)
    if self.gpu:
        search = search.cuda()
    response = self.net(search)

    # best response per scale, weighted by the per-scale penalty
    peak, idx = torch.max(response.view(cfg.num_scale, -1), 1)
    peak = peak.data.cpu().numpy() * cfg.scale_penalties
    best_scale = np.argmax(peak)
    r_max, c_max = np.unravel_index(idx[best_scale], cfg.net_input_size)

    # Peaks past the midpoint are mapped to negative offsets
    # (presumably because the response map is circular -- confirm).
    if r_max > cfg.net_input_size[0] / 2:
        r_max = r_max - cfg.net_input_size[0]
    if c_max > cfg.net_input_size[1] / 2:
        c_max = c_max - cfg.net_input_size[1]

    window_sz = self.target_sz * (cfg.scale_factor[best_scale] * pad_ratio)
    self.target_pos = self.target_pos + np.array([c_max, r_max]) * window_sz / cfg.net_input_size
    self.target_sz = np.minimum(np.maximum(window_sz / pad_ratio, self.min_sz), self.max_sz)

    # model update
    window_sz = self.target_sz * pad_ratio
    bbox = cxy_wh_2_bbox(self.target_pos, window_sz)
    patch = crop_chw(im, bbox, cfg.crop_sz)
    target = torch.Tensor(np.expand_dims(patch - cfg.net_average_image, axis=0))
    # Only move the tensor to the GPU when the tracker was built with
    # gpu=True; an unconditional .cuda() breaks CPU-only tracking.
    if self.gpu:
        target = target.cuda()
    self.net.update(target, lr=cfg.interp_factor)

    return cxy_wh_2_rect1(self.target_pos, self.target_sz)  # 1-index
def mouse_opt(event, x, y, flags, param):
    """OpenCV mouse callback that lets the user draw the target box with two left clicks.

    * First left click: stores the (clamped) anchor corner in ``tmp_p_1`` and
      enters labeling mode.
    * Mouse move while labeling: previews the box spanned by the anchor and
      the current cursor position and redraws the window.
    * Second left click: commits the box, increments ``n_reset``, writes the
      record into ``res`` (appended when ``res`` is empty, otherwise stored at
      index ``n_frame - 1``), calls ``update_template()`` and leaves
      labeling mode.

    :param event: OpenCV mouse event code
    :param x: cursor x coordinate reported by OpenCV
    :param y: cursor y coordinate reported by OpenCV
    :param flags: unused (required by the OpenCV callback signature)
    :param param: unused (required by the OpenCV callback signature)
    """
    global tmp_p_1, tmp_p_2, target_pos, target_sz, n_reset, is_visible, res, is_labeling

    def clamp_to_frame(px, py):
        # Clamp the cursor into [0, limit]. The half-size limits are used for
        # both axes when image_w > 1080 (the height limit is also selected by
        # the width test, exactly as in the original expression).
        if image_w > 1080:
            limit_w, limit_h = half_image_w, half_image_h
        else:
            limit_w, limit_h = image_w, image_h
        return (min(max(0, px), limit_w), min(max(0, py), limit_h))

    def box_from_corners(p1, p2):
        # Convert two opposite corners into (center, size) arrays.
        center = np.array([(p1[0] + p2[0]) / 2.0, (p1[1] + p2[1]) / 2.0])
        size = np.array([float(abs(p2[0] - p1[0])), float(abs(p2[1] - p1[1]))])
        return center, size

    if event == cv2.EVENT_LBUTTONDOWN:
        if not is_labeling:
            # first click: remember the anchor corner
            tmp_p_1 = clamp_to_frame(x, y)
            print("mouse on click at (%d, %d)" % (x, y))
            is_labeling = True
        else:
            # second click: commit the box
            tmp_p_2 = clamp_to_frame(x, y)
            n_reset += 1

            # update target bounding box
            target_pos, target_sz = box_from_corners(tmp_p_1, tmp_p_2)

            # The record uses the value of is_visible from *before* it is
            # set to 1 below, as in the original.
            record = np.concatenate((cxy_wh_2_bbox(target_pos, target_sz), np.array([is_visible])))
            if not res:
                print("no record found, append data")
                res.append(record)
            else:
                res[n_frame - 1] = record

            # update template
            update_template()
            is_visible = 1
            is_labeling = False
    elif event == cv2.EVENT_MOUSEMOVE and is_labeling:
        # dragging: preview the box between the anchor and the cursor
        tmp_p_2 = clamp_to_frame(x, y)
        target_pos, target_sz = box_from_corners(tmp_p_1, tmp_p_2)
        is_visible = 1
        # NOTE(review): the original indentation was lost; the redraw is
        # assumed to belong to the drag branch -- confirm against the caller.
        draw()
        cv2.imshow('video', im_show)
        cv2.waitKey(1)
def __init__(self, im, init_rect, config=TrackerConfig(), gpu=True):
    """Build the tracker and initialise it on the first frame.

    Loads the network weights from ``config.feature_path``, switches the
    network to eval mode, derives the allowed target size range, and feeds
    the initial template patch to ``self.net.update``.

    :param im: first frame; ``im.shape[:2]`` is used as an upper size bound
    :param init_rect: initial target rectangle, converted with ``rect1_2_cxy_wh``
    :param config: tracker configuration (the default instance is created
                   once at definition time and shared between trackers)
    :param gpu: when True the network and its inputs are moved to CUDA
    """
    self.gpu = gpu
    self.config = config
    self.net = DCFNet(config)
    self.net.load_param(config.feature_path)
    self.net.eval()
    if gpu:
        self.net.cuda()

    # confine results
    target_pos, target_sz = rect1_2_cxy_wh(init_rect)
    self.min_sz = np.maximum(config.min_scale_factor * target_sz, 4)
    # NOTE(review): im.shape[:2] is (rows, cols); confirm that this matches
    # the axis order of target_sz.
    self.max_sz = np.minimum(im.shape[:2], config.max_scale_factor * target_sz)

    # crop template
    window_sz = target_sz * (1 + config.padding)
    bbox = cxy_wh_2_bbox(target_pos, window_sz)
    patch = crop_chw(im, bbox, config.crop_sz)
    target = torch.Tensor(np.expand_dims(patch - config.net_average_image, axis=0))
    # Respect the gpu flag: an unconditional .cuda() fails (or mixes devices)
    # when the tracker is constructed with gpu=False.
    if gpu:
        target = target.cuda()
    self.net.update(target)

    self.target_pos, self.target_sz = target_pos, target_sz
    # reusable buffer holding one cropped patch per scale
    self.patch_crop = np.zeros(
        (config.num_scale, patch.shape[0], patch.shape[1], patch.shape[2]), np.float32)
def update_template():
    """Re-initialise the tracking template from the current global target box.

    Reads the module-level ``im``, ``target_pos``, ``target_sz``, ``config``
    and ``net``; recomputes the size bounds, feeds the mean-subtracted
    template patch to ``net.update`` and resets the multi-scale crop buffer.
    """
    # Without this declaration the three assignments below create locals that
    # are thrown away on return, so the script-level size bounds and buffer
    # were never refreshed after the user re-labelled the target.
    global min_sz, max_sz, patch_crop

    # confine results
    min_sz = np.maximum(config.min_scale_factor * target_sz, 4)
    max_sz = np.minimum(im.shape[:2], config.max_scale_factor * target_sz)

    # crop template
    window_sz = target_sz * (1 + config.padding)
    bbox = cxy_wh_2_bbox(target_pos, window_sz)
    patch = crop_chw(im, bbox, config.crop_sz)
    target = patch - config.net_average_image
    net.update(torch.Tensor(np.expand_dims(target, axis=0)).cuda())

    # buffer holding one cropped patch per scale
    patch_crop = np.zeros((config.num_scale, patch.shape[0], patch.shape[1], patch.shape[2]), np.float32)
def preprocess_patch(im, target_pos, target_sz, config):
    """
    Crop and normalise the network input patch around a target box.

    :param im: input image (h, w, 3)
    :param target_pos: center coordinate of the target box, tuple (cx, cy)
    :param target_sz: width and height of the target box, tuple (w, h)
    :param config: TrackConfig
    :return: patch: (3, config.crop_sz, config.crop_sz)
    """
    # grow the box by the configured padding before cropping
    padded_sz = target_sz * (1 + config.padding)
    # center/size -> bbox, then crop and resize to the network input
    crop = crop_chw(im, cxy_wh_2_bbox(target_pos, padded_sz), config.crop_sz)
    # subtract the network's mean image
    return crop - config.net_average_image
n_images = len(image_files) tic = time.time() # time start target_pos, target_sz = rect1_2_cxy_wh( init_rect) # OTB label is 1-indexed im = cv2.imread(image_files[0]) # HxWxC # confine results min_sz = np.maximum(config.min_scale_factor * target_sz, 4) max_sz = np.minimum(im.shape[:2], config.max_scale_factor * target_sz) # crop template window_sz = target_sz * (1 + config.padding) bbox = cxy_wh_2_bbox(target_pos, window_sz) patch = crop_chw(im, bbox, config.crop_sz) target = patch - config.net_average_image net.update(torch.Tensor(np.expand_dims(target, axis=0)).cuda()) res = [cxy_wh_2_rect1(target_pos, target_sz)] # save in .txt patch_crop = np.zeros( (config.num_scale, patch.shape[0], patch.shape[1], patch.shape[2]), np.float32) for f in range(1, n_images): # track im = cv2.imread(image_files[f]) for i in range(config.num_scale): # crop multi-scale search region window_sz = target_sz * (config.scale_factor[i] * (1 + config.padding))
cv2.setMouseCallback('video', mouse_opt) key_first_frame = cv2.waitKey(1) # if cv2.getWindowProperty('video', 1) == -1: # can't work in MobaXterm # sys.exit() # press space and first frame bounding box is choosed by user, then continue the video loop if key_first_frame == KEY_PAUSE: if target_pos is not None: is_visible = 1 # confine results min_sz = np.maximum(config.min_scale_factor * target_sz, 4) max_sz = np.minimum(im.shape[:2], config.max_scale_factor * target_sz) # crop template window_sz = target_sz * (1 + config.padding) bbox = cxy_wh_2_bbox(target_pos, window_sz) patch = crop_chw(im, bbox, config.crop_sz) target = patch - config.net_average_image net.update(torch.Tensor(np.expand_dims(target, axis=0)).cuda()) patch_crop = np.zeros((config.num_scale, patch.shape[0], patch.shape[1], patch.shape[2]), np.float32) else: is_visible = 0 if len(res) > 0: res = res[:n_frame] res.append(np.concatenate((cxy_wh_2_bbox(target_pos, target_sz), np.array([is_visible])))) # save in .txt start_from_half = False break # previous frame elif key_first_frame == KEY_LEFT: