def draw_data(self):
    disp_image = self.raw_data[0].copy()

    resize_factor = 1
    if max(disp_image.shape) > 480:
        resize_factor = 480.0 / float(max(disp_image.shape))
        disp_image = cv2.resize(disp_image, None, fx=resize_factor, fy=resize_factor)

    for i, mask in enumerate(self.raw_data[2]):
        self.raw_data[2][i] = cv2.resize(mask, None, fx=resize_factor, fy=resize_factor)

    boxes = [resize_factor * b.clone() for b in self.raw_data[1]]

    for i, disp_rect in enumerate(boxes):
        color = ((255 * ((i % 3) > 0)), 255 * ((i + 1) % 2), (255 * (i % 5)) // 4)
        cv2.rectangle(disp_image,
                      (int(disp_rect[0]), int(disp_rect[1])),
                      (int(disp_rect[0] + disp_rect[2]), int(disp_rect[1] + disp_rect[3])),
                      color, 2)

    for i, mask in enumerate(self.raw_data[2], 1):
        disp_image = overlay_mask(disp_image, mask * i)

    disp_image = numpy_to_torch(disp_image).squeeze(0)
    disp_image = disp_image.float()
    self.visdom.image(disp_image, opts={'title': self.title}, win=self.title)
def track(self, image):
    self.frame_num += 1

    # Convert image
    im = numpy_to_torch(image)

    # ------- LOCALIZATION ------- #

    # Get sample
    sample_pos = self.pos.round()
    sample_scales = self.target_scale * self.params.scale_factors
    test_xf = self.extract_fourier_sample(im, self.pos, sample_scales, self.img_sample_sz)

    # Compute scores
    sf = self.apply_filter(test_xf)
    translation_vec, scale_ind, s = self.localize_target(sf)
    scale_change_factor = self.params.scale_factors[scale_ind]

    # Update position and scale
    self.update_state(sample_pos + translation_vec, self.target_scale * scale_change_factor)
    self.predict_target_box(sample_pos, sample_scales[scale_ind], scale_ind)

    if self.params.debug >= 2:
        show_tensor(s[scale_ind, ...], 5)
    if self.params.debug >= 3:
        for i, hf in enumerate(self.filter):
            show_tensor(fourier.sample_fs(hf).abs().mean(1), 6 + i)

    # ------- UPDATE ------- #

    # Get train sample
    train_xf = TensorList([xf[scale_ind:scale_ind + 1, ...] for xf in test_xf])

    # Shift the sample so that the target is centered
    shift_samp = 2 * math.pi * (self.pos - sample_pos) / (sample_scales[scale_ind] * self.img_support_sz)
    train_xf = fourier.shift_fs(train_xf, shift=shift_samp)

    # Update memory
    self.update_memory(train_xf)

    # Train filter
    if self.frame_num % self.params.train_skipping == 1:
        self.filter_optimizer.run(self.params.CG_iter, train_xf)
        self.symmetrize_filter()

    # Return new state
    new_state = torch.cat((self.pos[[1, 0]] - (self.target_sz[[1, 0]] - 1) / 2,
                           self.target_sz[[1, 0]]))
    return new_state.tolist()
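# --- Illustrative aside (not part of the tracker) ---------------------------
# A minimal, self-contained sketch of the Fourier shift theorem that
# `fourier.shift_fs` relies on in `track` above: multiplying a spectrum by a
# linear phase ramp exp(-2*pi*j*f*delta) translates the signal by `delta`
# samples. This is how the training sample is re-centered on the estimated
# target position without re-extracting the image patch. Names and values
# here are assumptions for the demo only.
import numpy as np

def fourier_shift_1d(x, delta):
    """Circularly translate a 1-D signal by `delta` (possibly fractional) samples."""
    freqs = np.fft.fftfreq(len(x))                 # frequency of each bin, in cycles/sample
    phase = np.exp(-2j * np.pi * freqs * delta)    # linear phase ramp
    return np.real(np.fft.ifft(np.fft.fft(x) * phase))

impulse = np.array([0., 0., 1., 0., 0., 0., 0., 0.])
print(np.round(fourier_shift_1d(impulse, 2), 3))   # peak moves from index 2 to index 4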
def initialize(self, image, info: dict) -> dict:
    # Initialize some stuff
    self.frame_num = 1
    if not hasattr(self.params, 'device'):
        self.params.device = 'cuda' if self.params.use_gpu else 'cpu'

    # Initialize network
    self.initialize_features()

    # The DiMP network
    self.net = self.params.net

    # Time initialization
    tic = time.time()

    # Get target position and size
    state = info['init_bbox']
    self.pos = torch.Tensor([state[1] + (state[3] - 1) / 2, state[0] + (state[2] - 1) / 2])
    self.target_sz = torch.Tensor([state[3], state[2]])

    # Set sizes
    sz = self.params.image_sample_size
    self.img_sample_sz = torch.Tensor([sz, sz] if isinstance(sz, int) else sz)
    self.img_support_sz = self.img_sample_sz

    # Set search area
    search_area = torch.prod(self.target_sz * self.params.search_area_scale).item()
    self.target_scale = math.sqrt(search_area) / self.img_sample_sz.prod().sqrt()

    # Target size in base scale
    self.base_target_sz = self.target_sz / self.target_scale

    # Convert image
    im = numpy_to_torch(image)

    # Setup scale factors
    if not hasattr(self.params, 'scale_factors'):
        self.params.scale_factors = torch.ones(1)
    elif isinstance(self.params.scale_factors, (list, tuple)):
        self.params.scale_factors = torch.Tensor(self.params.scale_factors)

    # Setup scale bounds
    self.image_sz = torch.Tensor([im.shape[2], im.shape[3]])
    self.min_scale_factor = torch.max(10 / self.base_target_sz)
    self.max_scale_factor = torch.min(self.image_sz / self.base_target_sz)

    # Extract and transform sample
    init_backbone_feat = self.generate_init_samples(im)

    # Initialize classifier
    self.init_classifier(init_backbone_feat)

    # Initialize IoUNet
    if getattr(self.params, 'use_iou_net', True):
        self.init_iou_net(init_backbone_feat)

    out = {'time': time.time() - tic}
    return out
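# --- Illustrative aside (not part of the tracker) ---------------------------
# A worked example, with assumed parameter values, of the search-area scale
# computed in `initialize` above: the crop around the target covers
# search_area_scale**2 times the target area, and `target_scale` is the
# factor that maps this crop onto the fixed network input `img_sample_sz`.
import math
import torch

target_sz = torch.Tensor([100., 50.])        # target is 100 x 50 pixels (assumed)
search_area_scale = 5                        # assumed parameter value
img_sample_sz = torch.Tensor([288., 288.])   # assumed network input size

search_area = torch.prod(target_sz * search_area_scale).item()       # 500 * 250
target_scale = math.sqrt(search_area) / img_sample_sz.prod().sqrt()
print(target_scale)  # ~1.23: the sampled region is ~23% larger per side than the input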
def __call__(self, image, is_mask=False):
    if isinstance(image, torch.Tensor):
        return self.crop_to_output(numpy_to_torch(self(torch_to_numpy(image))))
    else:
        c = (np.expand_dims(np.array(image.shape[:2]), 1) - 1) / 2
        R = np.array([[math.cos(self.angle), math.sin(self.angle)],
                      [-math.sin(self.angle), math.cos(self.angle)]])
        H = np.concatenate([R, c - R @ c], 1)
        return cv.warpAffine(image, H, image.shape[1::-1], borderMode=cv.BORDER_REPLICATE)
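# --- Illustrative aside (not part of the tracker) ---------------------------
# A standalone sketch of the rotate-about-center construction used in
# __call__ above: the affine map H = [R | c - R @ c] applies the rotation R
# while keeping the point c fixed, since H @ [c; 1] = R @ c + (c - R @ c) = c.
# Note that c is built from shape[:2] in (row, col) order, exactly as in the
# code above, while cv.warpAffine works in (x, y) coordinates; the two orders
# coincide for square patches like the one used here.
import math
import numpy as np
import cv2 as cv

angle = math.radians(15)
image = np.random.randint(0, 255, (200, 200, 3), dtype=np.uint8)  # square patch
c = (np.expand_dims(np.array(image.shape[:2]), 1) - 1) / 2
R = np.array([[math.cos(angle), math.sin(angle)],
              [-math.sin(angle), math.cos(angle)]])
H = np.concatenate([R, c - R @ c], 1)
rotated = cv.warpAffine(image, H, image.shape[1::-1], borderMode=cv.BORDER_REPLICATE)
assert rotated.shape == image.shape  # same size, content rotated about the center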
def setting_adaptive_search_region_using_speed(self, im):
    """Reinitialize the search region scale for the next frame."""
    self.atom.target_scale = 1.0
    search_area = torch.prod(self.atom.target_sz * self.atom.params.search_area_scale).item()
    if search_area > self.atom.params.max_image_sample_size:
        self.atom.target_scale = math.sqrt(search_area / self.atom.params.max_image_sample_size)
    elif search_area < self.atom.params.min_image_sample_size:
        self.atom.target_scale = math.sqrt(search_area / self.atom.params.min_image_sample_size)

    # Target size in base scale
    self.atom.base_target_sz = self.atom.target_sz / self.atom.target_scale

    # Use odd square search area and set sizes
    feat_max_stride = max(self.atom.params.features.stride())
    if getattr(self.atom.params, 'search_area_shape', 'square') == 'square':
        self.atom.img_sample_sz = torch.round(
            torch.sqrt(torch.prod(self.atom.base_target_sz * self.atom.params.search_area_scale))) * torch.ones(2)
    elif self.atom.params.search_area_shape == 'initrect':
        # Non-square search area based on the initial rectangle
        self.atom.img_sample_sz = torch.round(self.atom.base_target_sz * self.atom.params.search_area_scale)
    else:
        raise ValueError('Unknown search area shape')

    if self.atom.params.feature_size_odd:
        self.atom.img_sample_sz += feat_max_stride - self.atom.img_sample_sz % (2 * feat_max_stride)
    else:
        self.atom.img_sample_sz += feat_max_stride - (self.atom.img_sample_sz + feat_max_stride) % (2 * feat_max_stride)

    # Set sizes
    self.atom.img_support_sz = self.atom.img_sample_sz
    self.atom.feature_sz = self.atom.params.features.size(self.atom.img_sample_sz)
    self.atom.output_sz = self.atom.params.score_upsample_factor * self.atom.img_support_sz  # Interpolated size of the output
    self.atom.iou_img_sample_sz = self.atom.img_sample_sz

    # Setup scale bounds
    im = numpy_to_torch(im)
    self.atom.image_sz = torch.Tensor([im.shape[2], im.shape[3]])
    self.atom.min_scale_factor = torch.max(10 / self.atom.base_target_sz)
    self.atom.max_scale_factor = torch.min(self.atom.image_sz / self.atom.base_target_sz)

    self.atom.output_window = None
    if getattr(self.params, 'window_output', False):
        if getattr(self.params, 'use_clipped_window', False):
            self.atom.output_window = dcf.hann2d_clipped(
                self.atom.output_sz.long(),
                self.atom.output_sz.long() * self.params.effective_search_area / self.params.search_area_scale,
                centered=False).to(self.params.device)
        else:
            self.atom.output_window = dcf.hann2d(self.atom.output_sz.long(),
                                                 centered=False).to(self.params.device)
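# --- Illustrative aside (not part of the tracker) ---------------------------
# A tiny numeric check of the stride rounding used above: the update
# sz += stride - sz % (2 * stride) makes sz an odd multiple of the feature
# stride (the `feature_size_odd` branch), so the feature map has odd size and
# a well-defined center pixel. The stride value here is an assumption.
stride = 16
for sz in [250.0, 256.0, 270.0]:
    rounded = sz + stride - sz % (2 * stride)
    print(sz, '->', rounded, '=', rounded / stride, 'strides')
# 250 -> 240 (15 strides), 256 -> 272 (17 strides), 270 -> 272 (17 strides)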
def locate(self, image):
    # Convert image
    im = numpy_to_torch(image)
    self.local_Tracker.im = im  # For debugging only

    # ------- LOCALIZATION ------- #

    # Get sample
    sample_pos = self.local_Tracker.pos.round()
    sample_scales = self.local_Tracker.target_scale * self.local_Tracker.params.scale_factors
    test_x = self.local_Tracker.extract_processed_sample(im, self.local_Tracker.pos,
                                                         sample_scales, self.local_Tracker.img_sample_sz)

    # Compute scores
    scores_raw = self.local_Tracker.apply_filter(test_x)
    translation_vec, scale_ind, s, flag = self.local_Tracker.localize_target(scores_raw)

    return translation_vec, scale_ind, s, flag, sample_pos, sample_scales, test_x
def track(self, image, update=False) -> dict:
    self.debug_info = {}
    self.frame_num += 1
    self.debug_info['frame_num'] = self.frame_num

    # Convert image
    im = numpy_to_torch(image)

    # ------- LOCALIZATION ------- #

    # Get sample
    test_xf = self.extract_fourier_sample(im)

    # Compute scores
    sfs = self.apply_filters(test_xf)

    out = TensorList([self.localize_and_update_target(sfs[i], i)
                      for i in range(len(self.points))])
    return out
def __call__(self, image, is_mask=False):
    input_tensor = torch.is_tensor(image)
    if input_tensor:
        image = torch_to_numpy(image)

    do_flip, theta, shear_values, scale_factors = self.roll_values
    t_mat = self._construct_t_mat(image.shape[:2], do_flip, theta, shear_values, scale_factors)
    output_sz = (image.shape[1] + 2 * self.pad_amount, image.shape[0] + 2 * self.pad_amount)

    if not is_mask:
        image_t = cv.warpAffine(image, t_mat, output_sz, flags=cv.INTER_LINEAR,
                                borderMode=self.border_flag)
    else:
        image_t = cv.warpAffine(image, t_mat, output_sz, flags=cv.INTER_NEAREST,
                                borderMode=self.border_flag)

    image_t = image_t.reshape(image.shape)

    if input_tensor:
        image_t = numpy_to_torch(image_t)

    return self.crop_to_output(image_t)
def draw_data(self):
    disp_image = self.raw_data[0].copy()
    box = self.raw_data[1].clone()

    if max(disp_image.shape) > 480:
        resize_factor = 480.0 / float(max(disp_image.shape))
        disp_image = cv2.resize(disp_image, None, fx=resize_factor, fy=resize_factor)
        disp_rect = box * resize_factor
    else:
        disp_rect = box

    cv2.rectangle(disp_image,
                  (int(disp_rect[0]), int(disp_rect[1])),
                  (int(disp_rect[0] + disp_rect[2]), int(disp_rect[1] + disp_rect[3])),
                  (0, 255, 0), 2)

    disp_image = numpy_to_torch(disp_image).squeeze(0)
    disp_image = disp_image.float()
    self.visdom.image(disp_image, opts={'title': self.title}, win=self.title)
def initialize(self, image, state, *args, **kwargs):
    self.frame_num = 1

    if self.params.output_image:
        # Make the output directory
        if not os.path.exists(self.params.output_image_path):
            os.mkdir(self.params.output_image_path)
        NUM = len(os.listdir(self.params.output_image_path))
        self.params.output_image_path = os.path.join(self.params.output_image_path, "%d" % (NUM + 1))
        os.mkdir(self.params.output_image_path)

    # For debugging and display only
    image_show = image.copy()

    # For debugging
    torch.set_printoptions(threshold=20000)

    # Fix random seed
    np.random.seed(1024)
    torch.manual_seed(1024)
    torch.cuda.manual_seed_all(1024)

    # HEIGHT and WIDTH
    self.IMG_HEIGHT, self.IMG_WIDTH = image.shape[0], image.shape[1]

    # Initialize tracking model
    self.params.model.initialize()

    # Get target position and target size (y, x, h, w) (state = [xt, yt, w, h])
    self.target_pos = torch.Tensor([state[1] + (state[3] - 1) / 2, state[0] + (state[2] - 1) / 2])
    self.target_sz = torch.Tensor([state[3], state[2]])
    self.initial_target_sz = self.target_sz.clone()

    # Set sample size and target area of search region (N)
    self.img_sample_sz = torch.Tensor([math.sqrt(self.params.img_sample_area)]) * torch.ones(2)
    self.target_sample_area = self.params.img_sample_area / self.params.search_padding**2

    # Get sampling area, sampling ratio and target size
    self.search_area = torch.prod(self.target_sz * self.params.search_padding)
    self.sample_scale = torch.sqrt(self.search_area / self.params.img_sample_area)
    self.target_sample_sz = self.target_sz / self.sample_scale

    # Initialize centers of proposals for locator (N)
    self.locator_proposals_xc, self.locator_proposals_yc, self.locator_labels = \
        self.init_locator_proposals_center_function()
    self.locator_proposals = torch.zeros(1, self.locator_labels.shape[0], 4, device=self.params.device)
    assert self.locator_labels.max().item() == 1.0

    # Create output score window (N)
    self.output_window = None
    if getattr(self.params, 'window_output', True):
        self.output_window = self.init_output_window_function()

    # Extract transform samples
    im_tensor = numpy_to_torch(image)
    train_samples = self.generate_init_samples(im_tensor, self.target_pos, self.sample_scale)
    train_samples = train_samples.cuda()

    # Setup scale bounds
    self.image_sz = torch.Tensor([self.IMG_HEIGHT, self.IMG_WIDTH])
    self.min_scale_factor = torch.max(10 / self.target_sz)
    self.max_scale_factor = torch.min(self.image_sz / self.target_sz)

    # Generate locator proposals
    batch_size = train_samples.shape[0]
    locator_proposals = self.get_locator_proposals(self.target_sample_sz)
    locator_proposals = locator_proposals.repeat(batch_size, 1, 1)

    # Extract backbone features
    backbone_features = self.params.model.extract_backbone_features(train_samples)

    # Extract target iounet features
    self.target_iou_feat = self.init_iou_net(self.target_pos, self.target_sz,
                                             self.sample_scale, backbone_features)

    # Extract locator features and train the locator model
    train_locator_features = self.params.model.extract_locator_features(backbone_features, locator_proposals)
    self.locator_XTX = torch.matmul(train_locator_features.permute(0, 2, 1), train_locator_features).mean(dim=0)
    self.locator_XTY = torch.matmul(train_locator_features.permute(0, 2, 1), self.locator_labels).mean(dim=0)
    self.locator_regularization = self.params.regularization * torch.eye(self.locator_XTX.shape[1],
                                                                         device=self.params.device)
    self.locator_model = self.train_locator_model(self.locator_XTX + self.locator_regularization,
                                                  self.locator_XTY)

    # Save the initial locator feature model
    self.locator_XTX_initial = self.locator_XTX.clone()
    self.locator_XTY_initial = self.locator_XTY.clone()

    # Initialize the detection region of hard negative samples
    self.hard_negative_region_mask = self.init_hard_negative_region_function()

    # Initialize the weight of the first frame
    self.current_initial_frame_weight = 1.0

    # Output result image
    if self.params.output_image:
        self.output_result_image(image_show, state)
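# --- Illustrative aside (not part of the tracker) ---------------------------
# A minimal sketch of the locator update above, assuming `train_locator_model`
# solves the regularized normal equations of a ridge regression: the weights w
# satisfy (X^T X + lambda * I) w = X^T y. Shapes and values are illustrative.
import torch

X = torch.randn(500, 64)     # locator features, one row per proposal (assumed shapes)
y = torch.rand(500, 1)       # soft labels in [0, 1]
lam = 0.1                    # regularization strength

XTX = X.t() @ X
XTY = X.t() @ y
w = torch.linalg.solve(XTX + lam * torch.eye(64), XTY)
scores = X @ w               # locator score for each proposal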
def initialize(self, image, info: dict) -> dict:
    # Initialize some stuff
    self.frame_num = 1
    if not self.params.has('device'):
        self.params.device = 'cuda' if self.params.use_gpu else 'cpu'

    # Initialize network
    self.initialize_features()

    # The DiMP network
    self.net = self.params.net

    # Time initialization
    tic = time.time()

    state = info['init_bbox']

    # Get the target depth layer
    target_depth = get_target_depth(image, state)
    print(target_depth)
    target_layer = get_layered_image_by_depth(image, target_depth)
    self.layer_id = int(target_depth // 2000)
    print('layer id : ', self.layer_id)

    # Convert image (the depth-layered image, not the raw input)
    # im = numpy_to_torch(image)  # HxWx6 -> 6 x H x W
    im = numpy_to_torch(target_layer)

    # Get target position and size
    self.pos = torch.Tensor([state[1] + (state[3] - 1) / 2, state[0] + (state[2] - 1) / 2])
    self.target_sz = torch.Tensor([state[3], state[2]])

    # Get object id
    self.object_id = info.get('object_ids', [None])[0]
    self.id_str = '' if self.object_id is None else ' {}'.format(self.object_id)

    # Set sizes
    self.image_sz = torch.Tensor([im.shape[2], im.shape[3]])
    sz = self.params.image_sample_size
    sz = torch.Tensor([sz, sz] if isinstance(sz, int) else sz)
    if self.params.get('use_image_aspect_ratio', False):
        sz = self.image_sz * sz.prod().sqrt() / self.image_sz.prod().sqrt()
        stride = self.params.get('feature_stride', 32)
        sz = torch.round(sz / stride) * stride
    self.img_sample_sz = sz
    self.img_support_sz = self.img_sample_sz

    # Set search area
    search_area = torch.prod(self.target_sz * self.params.search_area_scale).item()
    self.target_scale = math.sqrt(search_area) / self.img_sample_sz.prod().sqrt()

    # Target size in base scale
    self.base_target_sz = self.target_sz / self.target_scale

    # Setup scale factors
    if not self.params.has('scale_factors'):
        self.params.scale_factors = torch.ones(1)
    elif isinstance(self.params.scale_factors, (list, tuple)):
        self.params.scale_factors = torch.Tensor(self.params.scale_factors)

    # Setup scale bounds
    self.min_scale_factor = torch.max(10 / self.base_target_sz)
    self.max_scale_factor = torch.min(self.image_sz / self.base_target_sz)

    # Extract and transform sample
    init_backbone_feat = self.generate_init_samples(im)

    # Initialize classifier
    self.init_classifier(init_backbone_feat)

    # Initialize IoUNet
    if self.params.get('use_iou_net', True):
        self.init_iou_net(init_backbone_feat)

    out = {'time': time.time() - tic}
    return out
def track(self, image, info: dict = None) -> dict:
    self.debug_info = {}
    self.frame_num += 1
    self.debug_info['frame_num'] = self.frame_num

    # Track on each depth layer: 0-2 m, 2-4 m, 4-6 m, 6-8 m, 8-10 m, 10 m-inf
    max_score = 0
    flag = 'not_found'
    new_pos = [-1, -1, -1, -1]
    scale_ind = None
    backbone_feat = None
    test_x = None
    sample_pos = None
    s = None
    sample_coords = None
    target_dist = 0
    final_layer = image

    print(np.max(image))

    start = max(0, self.layer_id - 1)
    end = min(11, self.layer_id + 2)
    for z_dist in range(start, end):
        if z_dist == 10:
            lower = 10000  # 10 meters
            upper = np.max(image)
        else:
            lower = z_dist * 1000
            upper = (z_dist + 2) * 1000

        layer = image.copy()
        layer[layer > upper] = 0
        layer[layer < lower] = 0
        print(lower, upper, np.median(np.nonzero(layer)))
        layer = (layer - lower) / (upper - lower)
        layer = np.asarray(layer * 255, dtype=np.uint8)
        layer = cv2.applyColorMap(layer, cv2.COLORMAP_JET)
        layer = numpy_to_torch(layer)

        # Extract backbone features
        backbone_feat_layer, sample_coords_layer, im_patches_layer = self.extract_backbone_features(
            layer, self.get_centered_sample_pos(),
            self.target_scale * self.params.scale_factors, self.img_sample_sz)

        # Extract classification features
        test_x_layer = self.get_classification_features(backbone_feat_layer)

        # Location of sample
        sample_pos_layer, sample_scales_layer = self.get_sample_location(sample_coords_layer)

        # Compute classification scores
        scores_raw_layer = self.classify_target(test_x_layer)

        # Localize the target
        translation_vec_layer, scale_ind_layer, s_layer, flag_layer = self.localize_target(
            scores_raw_layer, sample_pos_layer, sample_scales_layer)
        # Song: depth cues could be added here
        new_pos_layer = sample_pos_layer[scale_ind_layer, :] + translation_vec_layer

        score_map_layer = s_layer[scale_ind_layer, ...]
        max_score_layer = torch.max(score_map_layer).item()

        # Keep the best-scoring layer
        if flag_layer != 'not_found' and max_score_layer > max_score:
            flag = flag_layer
            max_score = max_score_layer
            new_pos = new_pos_layer
            scale_ind = scale_ind_layer
            sample_pos = sample_pos_layer
            backbone_feat = backbone_feat_layer
            test_x = test_x_layer
            sample_scales = sample_scales_layer
            s = s_layer
            target_dist = z_dist
            sample_coords = sample_coords_layer
            final_layer = layer

    # if max_score > 0.8:
    #     self.layer_id = target_dist
    print('Choose %d meter ... ' % target_dist, flag, max_score)

    # Update position and scale
    if flag != 'not_found':
        if self.params.get('use_iou_net', True):
            update_scale_flag = self.params.get('update_scale_when_uncertain', True) or flag != 'uncertain'
            if self.params.get('use_classifier', True):
                self.update_state(new_pos)
            self.refine_target_box(backbone_feat, sample_pos[scale_ind, :],
                                   sample_scales[scale_ind], scale_ind, update_scale_flag)
        elif self.params.get('use_classifier', True):
            self.update_state(new_pos, sample_scales[scale_ind])

    # ------- UPDATE ------- #

    update_flag = flag not in ['not_found', 'uncertain']
    hard_negative = (flag == 'hard_negative')
    learning_rate = self.params.get('hard_negative_learning_rate', None) if hard_negative else None

    if update_flag and self.params.get('update_classifier', False):
        # Get train sample
        train_x = test_x[scale_ind:scale_ind + 1, ...]

        # Create target_box and label for spatial sample
        target_box = self.get_iounet_box(self.pos, self.target_sz,
                                         sample_pos[scale_ind, :], sample_scales[scale_ind])

        # Update the classifier model
        self.update_classifier(train_x, target_box, learning_rate, s[scale_ind, ...])

    # Set the pos of the tracker to iounet pos
    if self.params.get('use_iou_net', True) and flag != 'not_found' and hasattr(self, 'pos_iounet'):
        self.pos = self.pos_iounet.clone()

    if flag != 'not_found':
        score_map = s[scale_ind, ...]
        max_score = torch.max(score_map).item()

        # Visualize and set debug info
        self.search_area_box = torch.cat(
            (sample_coords[scale_ind, [1, 0]],
             sample_coords[scale_ind, [3, 2]] - sample_coords[scale_ind, [1, 0]] - 1))
        self.debug_info['flag' + self.id_str] = flag
        self.debug_info['max_score' + self.id_str] = max_score
        if self.visdom is not None:
            self.visdom.register(score_map, 'heatmap', 2, 'Score Map' + self.id_str)
            self.visdom.register(self.debug_info, 'info_dict', 1, 'Status')
        elif self.params.debug >= 2:
            show_tensor(score_map, 5, title='Max score = {:.2f}'.format(max_score))
    else:
        max_score = 0
        final_layer = image

    # Compute output bounding box
    new_state = torch.cat((self.pos[[1, 0]] - (self.target_sz[[1, 0]] - 1) / 2,
                           self.target_sz[[1, 0]]))

    if self.params.get('output_not_found_box', False) and flag == 'not_found':
        output_state = [-1, -1, -1, -1]
    else:
        output_state = new_state.tolist()

    target_depth = get_target_depth(image, output_state)
    self.layer_id = int(target_depth // 2000)

    out = {'target_bbox': output_state,
           'confidence': max_score,
           'image': torch_to_numpy(final_layer)}
    return out
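# --- Illustrative aside (not part of the tracker) ---------------------------
# A compact sketch, on synthetic data, of the depth slicing performed in
# `track` above: keep only pixels whose depth (in mm) falls in a 2 m slab,
# normalize to uint8 and colorize so an RGB-trained backbone can consume it.
# Unlike the loop above, this sketch clips before the uint8 cast so that the
# zeroed out-of-slab pixels map to 0 instead of wrapping around.
import numpy as np
import cv2

depth = np.random.randint(0, 12000, (240, 320)).astype(np.float32)  # depth in mm (synthetic)
lower, upper = 4000, 6000                                           # the 4-6 m slab

layer = depth.copy()
layer[(layer < lower) | (layer > upper)] = 0
layer = np.clip((layer - lower) / (upper - lower), 0.0, 1.0)
layer = (layer * 255).astype(np.uint8)
layer = cv2.applyColorMap(layer, cv2.COLORMAP_JET)                  # H x W x 3 uint8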
def track(self, image):
    self.frame_num += 1

    # Convert image
    im = numpy_to_torch(image)
    self.im = im  # For debugging only

    # ------- LOCALIZATION ------- #

    # Get sample
    sample_pos = self.pos.round()
    sample_scales = self.target_scale * self.params.scale_factors
    test_x = self.extract_processed_sample(im, self.pos, sample_scales, self.img_sample_sz)

    # Compute scores
    scores_raw = self.apply_filter(test_x)
    translation_vec, scale_ind, s, flag = self.localize_target(scores_raw)

    # Save the (windowed) response, used to adapt the search region to the target speed
    self.response = s[scale_ind]

    # Update position and scale
    if flag != 'not_found':
        if self.use_iou_net:
            update_scale_flag = getattr(self.params, 'update_scale_when_uncertain', True) or flag != 'uncertain'
            if getattr(self.params, 'use_classifier', True):
                self.update_state(sample_pos + translation_vec)
            self.refine_target_box(sample_pos, sample_scales[scale_ind], scale_ind, update_scale_flag)
        elif getattr(self.params, 'use_classifier', True):
            self.update_state(sample_pos + translation_vec, sample_scales[scale_ind])

    if self.params.debug >= 2:
        show_tensor(s[scale_ind, ...], 5,
                    title='Max score = {:.2f}'.format(torch.max(s[scale_ind, ...]).item()))

    # ------- UPDATE ------- #

    # Check flags and set learning rate if hard negative
    update_flag = flag not in ['not_found', 'uncertain']
    hard_negative = (flag == 'hard_negative')
    learning_rate = self.params.hard_negative_learning_rate if hard_negative else None

    if update_flag:
        # Get train sample
        train_x = TensorList([x[scale_ind:scale_ind + 1, ...] for x in test_x])

        # Create label for sample
        train_y = self.get_label_function(sample_pos, sample_scales[scale_ind])

        # Update memory
        self.update_memory(train_x, train_y, learning_rate)

    # Train filter
    if hard_negative:
        self.filter_optimizer.run(self.params.hard_negative_CG_iter)
    elif (self.frame_num - 1) % self.params.train_skipping == 0:
        self.filter_optimizer.run(self.params.CG_iter)

    # Set the pos of the tracker to iounet pos
    if self.use_iou_net and flag != 'not_found':
        self.pos = self.pos_iounet.clone()

    # Return new state
    new_state = torch.cat((self.pos[[1, 0]] - (self.target_sz[[1, 0]] - 1) / 2,
                           self.target_sz[[1, 0]]))
    return new_state.tolist()
def track(self, image) -> dict:
    # print('track', torch.rand(2))
    self.debug_info = {}
    self.frame_num += 1
    self.debug_info['frame_num'] = self.frame_num

    # Convert image
    im = numpy_to_torch(image)
    self.im = im  # For debugging only

    # ------- LOCALIZATION ------- #

    # Get sample
    sample_pos = self.pos.round()
    sample_scales = self.target_scale * self.params.scale_factors
    test_x = self.extract_processed_sample(im, self.pos, sample_scales, self.img_sample_sz)

    # Compute scores
    scores_raw = self.apply_filter(test_x)
    translation_vec, scale_ind, s, flag = self.localize_target(scores_raw)

    # Update position and scale
    if flag != 'not_found':
        if self.use_iou_net:
            update_scale_flag = getattr(self.params, 'update_scale_when_uncertain', True) or flag != 'uncertain'
            if getattr(self.params, 'use_classifier', True):
                self.update_state(sample_pos + translation_vec)
            self.refine_target_box(sample_pos, sample_scales[scale_ind], scale_ind, update_scale_flag)
        elif getattr(self.params, 'use_classifier', True):
            self.update_state(sample_pos + translation_vec, sample_scales[scale_ind])

    score_map = s[scale_ind, ...]
    max_score = torch.max(score_map).item()
    self.debug_info['max_score'] = max_score
    self.debug_info['flag'] = flag

    if self.visdom is not None:
        self.visdom.register(score_map, 'heatmap', 2, 'Score Map')
        self.visdom.register(self.debug_info, 'info_dict', 1, 'Status')
    elif self.params.debug >= 2:
        show_tensor(score_map, 5, title='Max score = {:.2f}'.format(max_score))

    # MetricNet: verify the prediction with a local outlier factor (LOF) test
    if self.use_iou_net and flag != 'not_found':
        pos_tmp = self.pos_iounet.clone()
    else:
        pos_tmp = self.pos
    state_tmp = torch.cat((pos_tmp[[1, 0]] - (self.target_sz[[1, 0]] - 1) / 2,
                           self.target_sz[[1, 0]]))
    state_tmp = state_tmp.numpy()

    with torch.no_grad():
        current_target_metric_feature0 = get_target_feature(self.metric_model, state_tmp, np.array(image))
    current_target_metric_feature = current_target_metric_feature0.cpu().detach().numpy()

    # success, target_dist = judge_success(self.metric_model, current_target_metric_feature,
    #                                      self.target_metric_feature, self.params)
    lof_score, success = lof(current_target_metric_feature, self.clf, k=5, thresh=self.lof_thresh)

    # Warm up the LOF threshold during the first frames
    if self.frame_num <= self.params.train_skipping:
        self.lof_thresh = (self.lof_thresh * (self.frame_num - 2)
                           + lof_score * self.params.lof_rate) / (self.frame_num - 1)
    if self.frame_num == self.params.train_skipping:
        print(self.frame_num, lof_score, self.lof_thresh)
    # print(self.frame_num, ': lof:', lof_score, ' ', success)

    # if success:
    #     for ii in range(len(self.target_features_all) - 1, -1, -1):
    #         dist = torch.norm(self.target_features_all[ii] - current_target_metric_feature0, 2, dim=1).view(-1)
    #         if dist < self.similar:
    #             success = 0
    #             continue
    # if success:
    #     self.target_features_all.append(current_target_metric_feature0)

    # ------- UPDATE ------- #

    # Check flags and set learning rate if hard negative
    update_flag = flag not in ['not_found', 'uncertain']
    if self.frame_num > self.params.train_skipping:
        update_flag = update_flag and success
    hard_negative = (flag == 'hard_negative')
    learning_rate = self.params.hard_negative_learning_rate if hard_negative else None

    if update_flag:
        # Get train sample
        train_x = TensorList([x[scale_ind:scale_ind + 1, ...] for x in test_x])

        # Create label for sample
        train_y = self.get_label_function(sample_pos, sample_scales[scale_ind])

        # Update memory
        self.update_memory(train_x, train_y, learning_rate)

    # Train filter
    if hard_negative:
        self.filter_optimizer.run(self.params.hard_negative_CG_iter)
    elif (self.frame_num - 1) % self.params.train_skipping == 0:
        self.filter_optimizer.run(self.params.CG_iter)

    # Set the pos of the tracker to iounet pos
    if self.use_iou_net and flag != 'not_found':
        self.pos = self.pos_iounet.clone()

    # Return new state
    new_state = torch.cat((self.pos[[1, 0]] - (self.target_sz[[1, 0]] - 1) / 2,
                           self.target_sz[[1, 0]]))
    out = {'target_bbox': new_state.tolist()}
    return out
def initialize(self, image, info: dict) -> dict:
    # Learn the initial target model. Initialize memory etc.
    self.frame_num = 1
    if not self.params.has('device'):
        self.params.device = 'cuda' if self.params.use_gpu else 'cpu'

    # Initialize network
    self.initialize_features()

    # The segmentation network
    self.net = self.params.net

    # Convert image
    im = numpy_to_torch(image)

    # Time initialization
    tic = time.time()

    # Output
    out = {}

    # Get target position and size
    state = info['init_bbox']
    init_mask = info.get('init_mask', None)

    if init_mask is not None and not self.params.get('init_with_box', False):
        # Shape (1, 1, h, w), i.e. (frames, sequences, h, w)
        init_mask = torch.tensor(init_mask).unsqueeze(0).unsqueeze(0).float()
    elif hasattr(self.net, 'box_label_encoder'):
        # Generate the initial mask from the box
        with torch.no_grad():
            init_backbone_feat = self.net.extract_backbone(im)
            init_feat_clf = self.net.extract_target_model_features(init_backbone_feat)
            init_box = torch.tensor(state).unsqueeze(dim=0).to(init_feat_clf.device)
            init_mask_enc = self.net.box_label_encoder(init_box, init_feat_clf, im.shape[-2:])
            if isinstance(init_mask_enc, (list, tuple)):
                init_mask_enc = init_mask_enc[0]
            init_mask_raw, _ = self.net.decoder(init_mask_enc, init_backbone_feat, im.shape[-2:])
            init_mask = torch.sigmoid(init_mask_raw)

        out['init_mask'] = init_mask.squeeze().cpu()
        out['segmentation_raw'] = init_mask_raw.squeeze().cpu().numpy()
        out['segmentation'] = init_mask.squeeze().cpu().numpy()
    else:
        raise Exception('No mask provided')

    # Set target center and target size
    self.pos = torch.Tensor([state[1] + (state[3] - 1) / 2, state[0] + (state[2] - 1) / 2])
    self.target_sz = torch.Tensor([state[3], state[2]])

    # Get object ids
    self.object_id = info.get('object_ids', [None])[0]
    self.id_str = '' if self.object_id is None else ' {}'.format(self.object_id)

    # Set sizes
    sz = self.params.image_sample_size
    self.img_sample_sz = torch.Tensor([sz, sz] if isinstance(sz, int) else sz)
    self.img_support_sz = self.img_sample_sz

    # Set search area
    search_area = torch.prod(self.target_sz * self.params.search_area_scale).item()
    self.target_scale = math.sqrt(search_area) / self.img_sample_sz.prod().sqrt()

    # Target size in base scale
    self.base_target_sz = self.target_sz / self.target_scale

    # Extract and transform sample
    init_backbone_feat, init_masks = self.generate_init_samples(im, init_mask)

    # Initialize target model
    self.init_target_model(init_backbone_feat, init_masks)

    out['time'] = time.time() - tic
    return out
def initialize(self, image1, image2, state, *args, **kwargs):
    # Initialize some stuff
    self.frame_num = 1
    if not hasattr(self.params, 'device'):
        self.params.device = 'cuda' if self.params.use_gpu else 'cpu'

    # Initialize features
    self.initialize_features()

    # Check if images are color
    self.params.features.set_is_color(image1.shape[2] == 3)
    self.params.features.set_is_color(image2.shape[2] == 3)

    # Get feature specific params
    self.fparams = self.params.features.get_fparams('feature_params')

    self.time = 0
    tic = time.time()

    # Get position and size
    self.pos = torch.Tensor([state[1] + (state[3] - 1) / 2, state[0] + (state[2] - 1) / 2])
    self.target_sz = torch.Tensor([state[3], state[2]])

    # Set search area
    search_area = torch.prod(self.target_sz * self.params.search_area_scale).item()
    self.target_scale = math.sqrt(search_area) / self.params.image_sample_size

    # Check if IoUNet is used
    self.use_iou_net = getattr(self.params, 'use_iou_net', True)

    # Target size in base scale
    self.base_target_sz = self.target_sz / self.target_scale

    # Set sizes
    self.img_sample_sz = torch.Tensor([self.params.image_sample_size, self.params.image_sample_size])
    self.img_support_sz = self.img_sample_sz
    self.feature_sz = self.params.features.size(self.img_sample_sz)
    if getattr(self.params, 'score_upsample_factor', None) is None:
        self.output_sz = self.feature_sz[0]
    else:
        self.output_sz = self.params.score_upsample_factor * self.img_support_sz  # Interpolated size of the output
    self.kernel_size = self.fparams.attribute('kernel_size')
    self.iou_img_sample_sz = self.img_sample_sz

    self.params.score_fusion_strategy = getattr(self.params, 'score_fusion_strategy', 'default')

    self.output_window = None
    if getattr(self.params, 'window_output', False):
        if getattr(self.params, 'use_clipped_window', False):
            self.output_window = dcf.hann2d_clipped(
                self.output_sz.long(),
                self.output_sz.long() * self.params.effective_search_area / self.params.search_area_scale,
                centered=False).to(self.params.device)
        else:
            self.output_window = dcf.hann2d(self.output_sz.long(), centered=True).to(self.params.device)
        self.output_window = self.output_window.squeeze(0)

    # Convert images
    im1 = numpy_to_torch(image1)
    im2 = numpy_to_torch(image2)
    # self.im = im

    # Setup bounds
    self.image_sz = torch.Tensor([im1.shape[2], im1.shape[3]])
    self.min_scale_factor = torch.max(10 / self.base_target_sz)
    self.max_scale_factor = torch.min(self.image_sz / self.base_target_sz)

    # Extract and transform samples; concatenate the two modalities along the channel dimension
    x1 = self.generate_init_samples(im1)
    x2 = self.generate_init_samples(im2)
    x = TensorList([torch.cat((v, i), 1) for v, i in zip(x1, x2)])

    self.init_classifier(x)

    if self.use_iou_net:
        self.init_iou_net()

    # Init memory
    # self.init_memory(x)

    self.time += time.time() - tic
def track(self, image1, image2):
    self.frame_num += 1

    # Convert images
    im1 = numpy_to_torch(image1)
    im2 = numpy_to_torch(image2)
    # self.im = im

    # ------- LOCALIZATION ------- #

    # Get samples from both modalities and concatenate along the channel dimension
    sample_pos = self.pos.round()
    sample_scales = self.target_scale * self.params.scale_factors
    test_x1 = self.extract_sample(im1, self.pos, sample_scales, self.img_sample_sz)
    test_x2 = self.extract_sample(im2, self.pos, sample_scales, self.img_sample_sz)
    test_x = TensorList([torch.cat((v, i), 1) for v, i in zip(test_x1, test_x2)])

    # Compute scores
    scores_raw = self.apply_filter(test_x)
    translation_vec, scale_ind, s, flag = self.localize_target(scores_raw)

    # Update position and scale
    if flag != 'not_found':
        if self.use_iou_net:
            update_scale_flag = getattr(self.params, 'update_scale_when_uncertain', True) or flag != 'uncertain'
            if getattr(self.params, 'use_classifier', True):
                self.update_state(sample_pos + translation_vec)
            self.refine_target_box(sample_pos, sample_scales[scale_ind], scale_ind, update_scale_flag)
        elif getattr(self.params, 'use_classifier', True):
            self.update_state(sample_pos + translation_vec, sample_scales[scale_ind])

    if self.params.debug >= 2:
        show_tensor(s[scale_ind, ...], 5,
                    title='Max score = {:.2f}'.format(torch.max(s[scale_ind, ...]).item()))

    # ------- UPDATE ------- #

    update_flag = flag not in ['not_found', 'uncertain']
    hard_negative = (flag == 'hard_negative')
    learning_rate = getattr(self.params, 'hard_negative_learning_rate', None) if hard_negative else None

    if getattr(self.params, 'update_classifier', False) and update_flag:
        # Get train sample
        train_x = TensorList([x[scale_ind:scale_ind + 1, ...] for x in test_x])

        # Create target_box and label for spatial sample
        target_box = self.get_iounet_box(self.pos, self.target_sz, sample_pos, sample_scales[scale_ind])
        train_y = self.get_label_function(sample_pos, sample_scales[scale_ind]).to(self.params.device)

        # Update the classifier model
        self.update_classifier(train_x, train_y, target_box, learning_rate, s[scale_ind, ...])

        # Update memory
        # self.update_memory(train_x, train_y, learning_rate)

    # Set the pos of the tracker to iounet pos
    if self.use_iou_net and flag != 'not_found' and hasattr(self, 'pos_iounet'):
        self.pos = self.pos_iounet.clone()

    # Return new state
    new_state = torch.cat((self.pos[[1, 0]] - (self.target_sz[[1, 0]] - 1) / 2,
                           self.target_sz[[1, 0]]))
    return new_state.tolist()
def initialize(self, image, state, *args, **kwargs):
    # Initialize some stuff
    self.frame_num = 1
    if not hasattr(self.params, 'device'):
        self.params.device = 'cuda' if self.params.use_gpu else 'cpu'

    # Initialize features
    self.initialize_features()

    # Check if image is color
    self.params.features.set_is_color(image.shape[2] == 3)

    # Get feature specific params
    self.fparams = self.params.features.get_fparams('feature_params')

    # Get position and size
    self.pos = torch.Tensor([state[1] + (state[3] - 1) / 2, state[0] + (state[2] - 1) / 2])
    self.target_sz = torch.Tensor([state[3], state[2]])

    # Set search area
    self.target_scale = 1.0
    search_area = torch.prod(self.target_sz * self.params.search_area_scale).item()
    if search_area > self.params.max_image_sample_size:
        self.target_scale = math.sqrt(search_area / self.params.max_image_sample_size)
    elif search_area < self.params.min_image_sample_size:
        self.target_scale = math.sqrt(search_area / self.params.min_image_sample_size)

    # Target size in base scale
    self.base_target_sz = self.target_sz / self.target_scale

    # Use odd square search area and set sizes
    feat_max_stride = max(self.params.features.stride())
    self.img_sample_sz = torch.round(
        torch.sqrt(torch.prod(self.base_target_sz * self.params.search_area_scale))) * torch.ones(2)
    self.img_sample_sz += feat_max_stride - self.img_sample_sz % (2 * feat_max_stride)

    # Set other sizes (corresponds to ECO code)
    self.img_support_sz = self.img_sample_sz
    self.feature_sz = self.params.features.size(self.img_sample_sz)
    self.filter_sz = self.feature_sz + (self.feature_sz + 1) % 2
    self.output_sz = self.params.score_upsample_factor * self.img_support_sz  # Interpolated size of the output
    self.compressed_dim = self.fparams.attribute('compressed_dim')

    # Number of filters
    self.num_filters = len(self.filter_sz)

    # Get window function
    self.window = TensorList([dcf.hann2d(sz).to(self.params.device) for sz in self.feature_sz])

    # Get interpolation function
    self.interp_fs = TensorList([dcf.get_interp_fourier(sz, self.params.interpolation_method,
                                                        self.params.interpolation_bicubic_a,
                                                        self.params.interpolation_centering,
                                                        self.params.interpolation_windowing,
                                                        self.params.device) for sz in self.filter_sz])

    # Get regularization filter
    self.reg_filter = TensorList([dcf.get_reg_filter(self.img_support_sz, self.base_target_sz,
                                                     fparams).to(self.params.device)
                                  for fparams in self.fparams])
    self.reg_energy = self.reg_filter.view(-1) @ self.reg_filter.view(-1)

    # Get label function
    output_sigma_factor = self.fparams.attribute('output_sigma_factor')
    sigma = (self.filter_sz / self.img_support_sz) * torch.sqrt(self.base_target_sz.prod()) * output_sigma_factor
    self.yf = TensorList([dcf.label_function(sz, sig).to(self.params.device)
                          for sz, sig in zip(self.filter_sz, sigma)])

    # Optimization options
    self.params.precond_learning_rate = self.fparams.attribute('learning_rate')
    if self.params.CG_forgetting_rate is None or max(self.params.precond_learning_rate) >= 1:
        self.params.direction_forget_factor = 0
    else:
        self.params.direction_forget_factor = \
            (1 - max(self.params.precond_learning_rate))**self.params.CG_forgetting_rate

    # Convert image
    im = numpy_to_torch(image)

    # Setup bounds
    self.image_sz = torch.Tensor([im.shape[2], im.shape[3]])
    self.min_scale_factor = torch.max(10 / self.base_target_sz)
    self.max_scale_factor = torch.min(self.image_sz / self.base_target_sz)

    # Extract and transform sample
    x = self.generate_init_samples(im)

    # Initialize projection matrix via PCA on the mean-subtracted features
    x_mat = TensorList([e.permute(1, 0, 2, 3).reshape(e.shape[1], -1).clone() for e in x])
    x_mat -= x_mat.mean(dim=1, keepdim=True)
    cov_x = x_mat @ x_mat.t()
    self.projection_matrix = TensorList([torch.svd(C)[0][:, :cdim].clone()
                                         for C, cdim in zip(cov_x, self.compressed_dim)])

    # Transform to get the training sample
    train_xf = self.preprocess_sample(x)

    # Shift the samples back
    if 'shift' in self.params.augmentation:
        for xf in train_xf:
            if xf.shape[0] == 1:
                continue
            for i, shift in enumerate(self.params.augmentation['shift']):
                shift_samp = 2 * math.pi * torch.Tensor(shift) / self.img_support_sz
                xf[1 + i:2 + i, ...] = fourier.shift_fs(xf[1 + i:2 + i, ...], shift=shift_samp)

    # Shift sample
    shift_samp = 2 * math.pi * (self.pos - self.pos.round()) / (self.target_scale * self.img_support_sz)
    train_xf = fourier.shift_fs(train_xf, shift=shift_samp)

    # Initialize first-frame training samples
    num_init_samples = train_xf.size(0)
    self.init_sample_weights = TensorList([xf.new_ones(1) / xf.shape[0] for xf in train_xf])
    self.init_training_samples = train_xf.permute(2, 3, 0, 1, 4)

    # Sample counters and weights
    self.num_stored_samples = num_init_samples
    self.previous_replace_ind = [None] * len(self.num_stored_samples)
    self.sample_weights = TensorList([xf.new_zeros(self.params.sample_memory_size) for xf in train_xf])
    for sw, init_sw, num in zip(self.sample_weights, self.init_sample_weights, num_init_samples):
        sw[:num] = init_sw

    # Initialize memory
    self.training_samples = TensorList(
        [xf.new_zeros(xf.shape[2], xf.shape[3], self.params.sample_memory_size, cdim, 2)
         for xf, cdim in zip(train_xf, self.compressed_dim)])

    # Initialize filter
    self.filter = TensorList(
        [xf.new_zeros(1, cdim, xf.shape[2], xf.shape[3], 2)
         for xf, cdim in zip(train_xf, self.compressed_dim)])

    # Do joint optimization
    self.joint_problem = FactorizedConvProblem(self.init_training_samples, self.yf, self.reg_filter,
                                               self.projection_matrix, self.params, self.init_sample_weights)
    joint_var = self.filter.concat(self.projection_matrix)
    self.joint_optimizer = GaussNewtonCG(self.joint_problem, joint_var, debug=(self.params.debug >= 3))

    if self.params.update_projection_matrix:
        self.joint_optimizer.run(self.params.init_CG_iter // self.params.init_GN_iter, self.params.init_GN_iter)

    # Re-project samples with the new projection matrix
    compressed_samples = complex.mtimes(self.init_training_samples, self.projection_matrix)
    for train_samp, init_samp in zip(self.training_samples, compressed_samples):
        train_samp[:, :, :init_samp.shape[2], :, :] = init_samp

    # Initialize optimizer
    self.filter_optimizer = FilterOptim(self.params, self.reg_energy)
    self.filter_optimizer.register(self.filter, self.training_samples, self.yf,
                                   self.sample_weights, self.reg_filter)
    self.filter_optimizer.sample_energy = self.joint_problem.sample_energy
    self.filter_optimizer.residuals = self.joint_optimizer.residuals.clone()

    if not self.params.update_projection_matrix:
        self.filter_optimizer.run(self.params.init_CG_iter)

    # Post optimization
    self.filter_optimizer.run(self.params.post_init_CG_iter)

    self.symmetrize_filter()
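# --- Illustrative aside (not part of the tracker) ---------------------------
# A standalone sketch, with assumed shapes, of the projection-matrix
# initialization above: the leading left singular vectors of the channel
# covariance form a PCA basis that compresses D feature channels down to cdim.
import torch

D, N, cdim = 64, 1000, 16
feat = torch.randn(D, N)                       # channels x spatial positions
feat = feat - feat.mean(dim=1, keepdim=True)   # zero mean per channel
cov = feat @ feat.t()                          # D x D channel covariance
U, S, V = torch.svd(cov)
P = U[:, :cdim]                                # D x cdim projection matrix
compressed = P.t() @ feat                      # cdim x N compressed features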
def initialize(self, image, state, gt, *args, **kwargs):
    # Get the ground-truth width and height (4- or 8-value annotation)
    if len(gt) == 8:
        ww = gt[2] - gt[0]
        hh = gt[7] - gt[1]
    else:
        ww = gt[2]
        hh = gt[3]

    # Initialize some stuff
    self.frame_num = 1
    if not hasattr(self.params, 'device'):
        self.params.device = 'cuda' if self.params.use_gpu else 'cpu'

    # Select the feature layer based on the target size
    if ww < 25 and hh < 25:
        self.feature_sz = TensorList([torch.Tensor([28., 28.])])
        self.output_layer = TensorList(['layer2'])
    else:
        self.feature_sz = TensorList([torch.Tensor([14., 14.])])
        self.output_layer = TensorList(['layer3'])

    # Initialize features
    self.initialize_features(self.output_layer)

    # Check if image is color
    self.params.features.set_is_color(image.shape[2] == 3)

    # Get feature specific params
    self.fparams = self.params.features.get_fparams('feature_params')

    self.time = 0
    tic = time.time()

    # Get position and size
    self.pos = torch.Tensor([state[1] + (state[3] - 1) / 2, state[0] + (state[2] - 1) / 2])
    if state[3] > 50 or state[2] > 50:
        # Shrink the nominal target size for large targets
        self.target_sz = torch.Tensor([state[3] - state[3] / 8, state[2] - state[2] / 4])
    else:
        self.target_sz = torch.Tensor([state[3], state[2]])

    # Set search area
    self.target_scale = 1.0
    search_area = torch.prod(self.target_sz * self.params.search_area_scale).item()
    if search_area > self.params.max_image_sample_size:
        self.target_scale = math.sqrt(search_area / self.params.max_image_sample_size)
    elif search_area < self.params.min_image_sample_size:
        self.target_scale = math.sqrt(search_area / self.params.min_image_sample_size)

    # Check if IoUNet is used
    self.use_iou_net = getattr(self.params, 'use_iou_net', True)

    # Target size in base scale
    self.base_target_sz = self.target_sz / self.target_scale

    # Use odd square search area and set sizes
    feat_max_stride = max(self.params.features.stride())
    if getattr(self.params, 'search_area_shape', 'square') == 'square':
        self.img_sample_sz = torch.round(
            torch.sqrt(torch.prod(self.base_target_sz * self.params.search_area_scale))) * torch.ones(2)
    elif self.params.search_area_shape == 'initrect':
        self.img_sample_sz = torch.round(self.base_target_sz * self.params.search_area_scale)
    else:
        raise ValueError('Unknown search area shape')

    if self.params.feature_size_odd:
        self.img_sample_sz += feat_max_stride - self.img_sample_sz % (2 * feat_max_stride)
    else:
        self.img_sample_sz += feat_max_stride - (self.img_sample_sz + feat_max_stride) % (2 * feat_max_stride)

    # Set sizes
    self.img_support_sz = self.img_sample_sz
    self.feature_sz = self.params.features.size(self.img_sample_sz)
    self.output_sz = self.params.score_upsample_factor * self.img_support_sz  # Interpolated size of the output
    self.kernel_size = self.fparams.attribute('kernel_size')
    self.iou_img_sample_sz = self.img_sample_sz

    # Optimization options
    self.params.precond_learning_rate = self.fparams.attribute('learning_rate')
    if self.params.CG_forgetting_rate is None or max(self.params.precond_learning_rate) >= 1:
        self.params.direction_forget_factor = 0
    else:
        self.params.direction_forget_factor = \
            (1 - max(self.params.precond_learning_rate))**self.params.CG_forgetting_rate

    self.output_window = None
    if getattr(self.params, 'window_output', False):
        if getattr(self.params, 'use_clipped_window', False):
            self.output_window = dcf.hann2d_clipped(
                self.output_sz.long(),
                self.output_sz.long() * self.params.effective_search_area / self.params.search_area_scale,
                centered=False).to(self.params.device)
        else:
            self.output_window = dcf.hann2d(self.output_sz.long(), centered=False).to(self.params.device)

    # Initialize some learning things
    self.init_learning()

    # Convert image
    im = numpy_to_torch(image)
    self.im = im  # For debugging only

    # Setup scale bounds
    self.image_sz = torch.Tensor([im.shape[2], im.shape[3]])
    self.min_scale_factor = torch.max(10 / self.base_target_sz)
    self.max_scale_factor = torch.min(self.image_sz / self.base_target_sz)

    # Extract and transform sample
    x = self.generate_init_samples(im)

    # Initialize iounet
    if self.use_iou_net:
        self.init_iou_net()

    # Initialize projection matrix
    self.init_projection_matrix(x)

    # Transform to get the training sample
    train_x = self.preprocess_sample(x)

    # Generate label function
    init_y = self.init_label_function(train_x)

    # Init memory
    self.init_memory(train_x)

    # Init optimizer and do initial optimization
    self.init_optimization(train_x, init_y)

    self.pos_iounet = self.pos.clone()

    self.time += time.time() - tic

    self.pool1 = torch.nn.AdaptiveMaxPool2d((1, 224))
    self.pool2 = torch.nn.AdaptiveMaxPool2d((224, 1))
def __call__(self, image, is_mask=False):
    if isinstance(image, torch.Tensor):
        return self.crop_to_output(numpy_to_torch(self(torch_to_numpy(image))))
    else:
        return cv.warpAffine(image, self.transform_matrix, image.shape[1::-1],
                             borderMode=cv.BORDER_REPLICATE)
def track(self, image, info: dict = None) -> dict:
    self.debug_info = {}
    self.frame_num += 1
    self.debug_info['frame_num'] = self.frame_num

    # Convert image
    im = numpy_to_torch(image)
    self.im = im  # For debugging only

    # ------- LOCALIZATION ------- #

    # Get sample
    sample_pos = self.pos.round()
    sample_scales = self.target_scale * self.params.scale_factors
    test_x = self.extract_processed_sample(im, self.pos, sample_scales, self.img_sample_sz)

    # Compute scores
    scores_raw = self.apply_filter(test_x)
    translation_vec, scale_ind, s, flag = self.localize_target(scores_raw)

    # Update position and scale
    if flag != 'not_found':
        if self.use_iou_net:
            update_scale_flag = self.params.get('update_scale_when_uncertain', True) or flag != 'uncertain'
            if self.params.get('use_classifier', True):
                self.update_state(sample_pos + translation_vec)
            self.refine_target_box(sample_pos, sample_scales[scale_ind], scale_ind, update_scale_flag)
        elif self.params.get('use_classifier', True):
            self.update_state(sample_pos + translation_vec, sample_scales[scale_ind])

    score_map = s[scale_ind, ...]
    max_score = torch.max(score_map).item()
    self.debug_info['max_score'] = max_score
    self.debug_info['flag'] = flag

    if self.visdom is not None:
        self.visdom.register(score_map, 'heatmap', 2, 'Score Map')
        self.visdom.register(self.debug_info, 'info_dict', 1, 'Status')
    elif self.params.debug >= 2:
        show_tensor(score_map, 5, title='Max score = {:.2f}'.format(max_score))

    # ------- UPDATE ------- #

    # Check flags and set learning rate if hard negative
    update_flag = flag not in ['not_found', 'uncertain']
    hard_negative = (flag == 'hard_negative')
    learning_rate = self.params.hard_negative_learning_rate if hard_negative else None

    if update_flag:
        # Get train sample
        train_x = TensorList([x[scale_ind:scale_ind + 1, ...] for x in test_x])

        # Create label for sample
        train_y = self.get_label_function(sample_pos, sample_scales[scale_ind])

        # Update memory
        self.update_memory(train_x, train_y, learning_rate)

    # Train filter
    if hard_negative:
        self.filter_optimizer.run(self.params.hard_negative_CG_iter)
    elif (self.frame_num - 1) % self.params.train_skipping == 0:
        self.filter_optimizer.run(self.params.CG_iter)

    # Set the pos of the tracker to iounet pos
    if self.use_iou_net and flag != 'not_found':
        self.pos = self.pos_iounet.clone()

    # Return new state
    new_state = torch.cat((self.pos[[1, 0]] - (self.target_sz[[1, 0]] - 1) / 2,
                           self.target_sz[[1, 0]]))
    out = {'target_bbox': new_state.tolist()}
    return out
def initialize(self, image, info: dict, gpu_device) -> dict:
    # Initialize some stuff
    self.frame_num = 1
    self.params.device = 'cuda:{0}'.format(gpu_device) if self.params.use_gpu else 'cpu'

    # Convert image
    im = numpy_to_torch(image)
    self.image_sz = torch.Tensor([im.shape[2], im.shape[3]])

    # Initialize features
    self.initialize_features(im)

    # Check if image is color
    self.params.features.set_is_color(image.shape[2] == 3)

    # Get feature specific params
    self.fparams = self.params.features.get_fparams('feature_params')

    # Get positions and size
    self.points = TensorList([torch.Tensor([p[0], p[1]]) for p in info['points']])
    self.org_points = self.points.clone()
    self.target_sz = torch.Tensor([info['target_sz'][0], info['target_sz'][1]])

    # Use odd square search area and set sizes
    feat_max_stride = max(self.params.features.stride())
    self.img_sample_sz = self.image_sz.clone()
    self.img_sample_sz += feat_max_stride - self.img_sample_sz % (2 * feat_max_stride)

    # Set other sizes (corresponds to ECO code)
    self.img_support_sz = self.img_sample_sz
    self.mid_point = self.img_support_sz // 2
    self.feature_sz = self.params.features.size(self.img_sample_sz)
    self.filter_sz = self.feature_sz + (self.feature_sz + 1) % 2
    self.output_sz = self.img_support_sz  # Interpolated size of the output

    # Number of filters
    self.num_filters = len(self.filter_sz)

    # Get window function (a flat window is used here)
    # self.window = TensorList([dcf.hann2d(sz).to(self.params.device) for sz in self.feature_sz])
    self.window = TensorList([
        torch.ones((1, 1, int(sz[0].item()), int(sz[1].item()))).to(self.params.device)
        for sz in self.feature_sz
    ])
    # self.window = TensorList([dcf.tukey2d(sz).to(self.params.device) for sz in self.feature_sz])

    # Get interpolation function
    self.interp_fs = TensorList([dcf.get_interp_fourier(sz, self.params.interpolation_method,
                                                        self.params.interpolation_bicubic_a,
                                                        self.params.interpolation_centering,
                                                        self.params.interpolation_windowing,
                                                        self.params.device) for sz in self.filter_sz])

    # Get label function: one Gaussian label per point, shifted from the center to the point
    output_sigma_factor = self.fparams.attribute('output_sigma_factor')
    sigma = (self.filter_sz / self.img_support_sz) * torch.sqrt(self.target_sz.prod()) * output_sigma_factor
    yf_zero = TensorList([dcf.label_function(sz, sig).to(self.params.device)
                          for sz, sig in zip(self.filter_sz, sigma)])
    yf_zero = complex.complex(yf_zero)
    self.yf = TensorList()
    for p in self.points:
        shift_sample = 2 * math.pi * (self.mid_point - p) / self.img_support_sz
        self.yf.append(TensorList([fourier.shift_fs(yfs, shift_sample) for yfs in yf_zero]))

    # Optimization options
    self.params.precond_learning_rate = self.fparams.attribute('learning_rate')
    if self.params.CG_forgetting_rate is None or max(self.params.precond_learning_rate) >= 1:
        self.params.direction_forget_factor = 0
    else:
        self.params.direction_forget_factor = \
            (1 - max(self.params.precond_learning_rate))**self.params.CG_forgetting_rate

    # Extract and transform sample
    x = self.generate_init_samples(im).to(self.params.device)
    self.x = x

    # Transform to get the training sample
    train_xf = self.preprocess_sample(x)

    # Shift the samples back
    if 'shift' in self.params.augmentation:
        for xf in train_xf:
            if xf.shape[0] == 1:
                continue
            for i, shift in enumerate(self.params.augmentation['shift']):
                shift_samp = 2 * math.pi * torch.Tensor(shift) / self.img_support_sz
                xf[1 + i:2 + i, ...] = fourier.shift_fs(xf[1 + i:2 + i, ...], shift=shift_samp)

    # Initialize first-frame training samples
    num_init_samples = train_xf.size(0)
    self.init_training_samples = train_xf.permute(2, 3, 0, 1, 4)

    # Initialize memory and one filter per point
    self.training_samples = TensorList([
        xf.new_zeros(xf.shape[2], xf.shape[3], self.params.sample_memory_size, xf.shape[1], 2)
        for xf in train_xf
    ])
    self.filters = TensorList([
        TensorList([xf.new_zeros(1, xf.shape[1], xf.shape[2], xf.shape[3], 2) for xf in train_xf])
        for i in range(len(self.points))
    ])
    self.init_sample_weights = TensorList([xf.new_ones(1) / xf.shape[0] for xf in train_xf])
    self.sample_weights = TensorList([xf.new_zeros(self.params.sample_memory_size) for xf in train_xf])
    for sw, init_sw, num in zip(self.sample_weights, self.init_sample_weights, num_init_samples):
        sw[:num] = init_sw

    # Get regularization filter
    self.reg_filter = TensorList([
        dcf.get_reg_filter(self.img_support_sz, self.target_sz, fparams).to(self.params.device)
        for fparams in self.fparams
    ])
    self.reg_energy = self.reg_filter.view(-1) @ self.reg_filter.view(-1)

    # Sample counters and weights
    self.num_stored_samples = num_init_samples
    self.previous_replace_ind = [None] * len(self.num_stored_samples)
    for train_samp, init_samp in zip(self.training_samples, self.init_training_samples):
        train_samp[:, :, :init_samp.shape[2], :, :] = init_samp

    sample_energy = complex.abs_sqr(self.training_samples).mean(dim=2, keepdim=True).permute(2, 3, 0, 1)

    # Do joint optimization: train one filter per point
    for i in range(len(self.points)):
        print('{0}'.format(i), end=', ')
        ts = self.training_samples.clone()
        yf = self.yf[i]
        filters = self.filters[i]
        i_sw = self.init_sample_weights.clone()
        re = self.reg_energy.clone()
        sw = self.sample_weights.clone()
        rf = self.reg_filter.clone()

        filter_optimizer = FilterOptim(self.params, re)
        filter_optimizer.register(filters, ts, yf, sw, rf)
        filter_optimizer.sample_energy = sample_energy.clone()
        filter_optimizer.run(self.params.init_CG_iter)

        # Post optimization
        filter_optimizer.run(self.params.post_init_CG_iter)
        self.filters[i] = filter_optimizer.filter

    self.symmetrize_filter()
    print()
def track(self, image, info: dict = None) -> dict:
    self.debug_info = {}
    self.frame_num += 1
    self.debug_info['frame_num'] = self.frame_num

    # Obtain the merged segmentation prediction for the previous frame. This is used to update the target
    # model and to determine the search region for the current frame
    if self.object_id is None:
        prev_segmentation_prob_im = info['previous_output']['segmentation_raw']
    else:
        prev_segmentation_prob_im = info['previous_output']['segmentation_raw'][self.object_id]
    prev_segmentation_prob_im = torch.from_numpy(prev_segmentation_prob_im).unsqueeze(0).unsqueeze(0).float()

    # ********************************************************************************** #
    # ------- Update the target model using merged masks from the previous frame ------- #
    # ********************************************************************************** #

    if self.frame_num > 2:
        # Crop the segmentation mask for the previous search area
        if self.params.get('update_target_model', True):
            prev_segmentation_prob_crop, _ = sample_patch(
                prev_segmentation_prob_im, self.prev_pos,
                self.prev_scale * self.img_sample_sz, self.img_sample_sz,
                mode=self.params.get('border_mode', 'replicate'),
                max_scale_change=self.params.get('patch_max_scale_change'),
                is_mask=True)

            # Update the target model
            self.update_target_model(self.prev_test_x, prev_segmentation_prob_crop.clone())

    # ****************************************************************************************** #
    # -------- Estimate target box using the merged segmentation mask from prev. frame --------- #
    # --- The estimated target box is used to obtain the search region for the current frame --- #
    # ****************************************************************************************** #

    self.pos, self.target_sz = self.get_target_state(prev_segmentation_prob_im.squeeze())

    new_target_scale = torch.sqrt(self.target_sz.prod() / self.base_target_sz.prod())

    if self.params.get('max_scale_change') is not None:
        # Do not allow drastic scale changes, as these might be caused by occlusions or
        # incorrect mask predictions
        new_target_scale = self.clip_scale_change(new_target_scale)

    # Update target size and scale using the filtered target size
    self.target_scale = new_target_scale
    self.target_sz = self.base_target_sz * self.target_scale

    # ********************************************************************** #
    # ---------- Predict segmentation mask for the current frame ----------- #
    # ********************************************************************** #

    # Convert image
    im = numpy_to_torch(image)

    # Extract backbone features
    backbone_feat, sample_coords, im_patches = self.extract_backbone_features(
        im, self.get_centered_sample_pos(), self.target_scale, self.img_sample_sz)

    # Save the search region information as it is needed to merge the segmentation masks for the
    # next frame update
    self.prev_pos = self.get_centered_sample_pos()
    self.prev_scale = self.target_scale

    # Extract features input to the target model
    test_x = self.get_target_model_features(backbone_feat)

    # Location of sample
    sample_pos, sample_scale = self.get_sample_location(sample_coords)

    # Predict the segmentation mask. Note: These are raw scores, before the sigmoid
    segmentation_scores = self.segment_target(test_x, backbone_feat)

    self.prev_test_x = test_x

    # Get the segmentation scores for the full image.
    # Regions outside the search region are assigned low scores (-100)
    segmentation_scores_im = self.convert_scores_crop_to_image(segmentation_scores, im,
                                                               sample_scale, sample_pos)

    segmentation_mask_im = (segmentation_scores_im > 0.0).float()   # Binary segmentation mask
    segmentation_prob_im = torch.sigmoid(segmentation_scores_im)    # Probability of being target at each pixel

    # ************************************************************************ #
    # ---------- Output estimated segmentation mask and target box ----------- #
    # ************************************************************************ #

    # Get target box from the predicted segmentation
    pred_pos, pred_target_sz = self.get_target_state(segmentation_prob_im.squeeze())
    new_state = torch.cat((pred_pos[[1, 0]] - (pred_target_sz[[1, 0]] - 1) / 2,
                           pred_target_sz[[1, 0]]))
    output_state = new_state.tolist()

    if self.object_id is None:
        # In single-object mode, merging is not called. Hence return the probabilities
        segmentation_output = segmentation_prob_im
    else:
        # In multi-object mode, return the raw scores
        segmentation_output = segmentation_scores_im

    segmentation_mask_im = segmentation_mask_im.view(*segmentation_mask_im.shape[-2:]).cpu().numpy()
    segmentation_output = segmentation_output.cpu().numpy()

    if self.visdom is not None:
        self.visdom.register(segmentation_scores_im, 'heatmap', 2, 'Seg Scores' + self.id_str)
        self.visdom.register(self.debug_info, 'info_dict', 1, 'Status')

    out = {'segmentation': segmentation_mask_im,
           'target_bbox': output_state,
           'segmentation_raw': segmentation_output}
    return out
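# --- Illustrative aside (not part of the tracker) ---------------------------
# A quick standalone check of the thresholding convention used above:
# comparing raw scores against 0 is equivalent to comparing sigmoid
# probabilities against 0.5, so the binary mask never needs the sigmoid.
import torch

scores = torch.tensor([-2.0, -0.5, 0.0, 0.5, 2.0])
assert torch.equal(scores > 0.0, torch.sigmoid(scores) > 0.5)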
def track(self, image, info: dict = None) -> dict:
    self.debug_info = {}

    self.frame_num += 1
    self.debug_info['frame_num'] = self.frame_num

    # Convert image
    im = numpy_to_torch(image)

    # ------- LOCALIZATION ------- #

    # Extract backbone features
    backbone_feat, sample_coords, im_patches = self.extract_backbone_features(
        im, self.get_centered_sample_pos(),
        self.target_scale * self.params.scale_factors,
        self.img_sample_sz)

    # Extract classification features
    test_x = self.get_classification_features(backbone_feat)

    # Location of sample
    sample_pos, sample_scales = self.get_sample_location(sample_coords)

    # Compute classification scores
    scores_raw = self.classify_target(test_x)

    # Localize the target
    translation_vec, scale_ind, s, flag = self.localize_target(scores_raw, sample_pos, sample_scales)
    new_pos = sample_pos[scale_ind, :] + translation_vec

    # Update position and scale
    if flag != 'not_found':
        if self.params.get('use_iou_net', True):
            update_scale_flag = self.params.get('update_scale_when_uncertain', True) or flag != 'uncertain'
            if self.params.get('use_classifier', True):
                self.update_state(new_pos)
            self.refine_target_box(backbone_feat, sample_pos[scale_ind, :],
                                   sample_scales[scale_ind], scale_ind, update_scale_flag)
        elif self.params.get('use_classifier', True):
            self.update_state(new_pos, sample_scales[scale_ind])

    # ------- UPDATE ------- #

    update_flag = flag not in ['not_found', 'uncertain']
    hard_negative = (flag == 'hard_negative')
    learning_rate = self.params.get('hard_negative_learning_rate', None) if hard_negative else None

    if update_flag and self.params.get('update_classifier', False):
        # Get train sample
        train_x = test_x[scale_ind:scale_ind + 1, ...]

        # Create target_box and label for spatial sample
        target_box = self.get_iounet_box(self.pos, self.target_sz,
                                         sample_pos[scale_ind, :], sample_scales[scale_ind])

        # Update the classifier model
        self.update_classifier(train_x, target_box, learning_rate, s[scale_ind, ...])

    # Set the pos of the tracker to the iounet pos
    if self.params.get('use_iou_net', True) and flag != 'not_found' and hasattr(self, 'pos_iounet'):
        self.pos = self.pos_iounet.clone()

    score_map = s[scale_ind, ...]
    max_score = torch.max(score_map).item()

    # Visualize and set debug info
    self.search_area_box = torch.cat(
        (sample_coords[scale_ind, [1, 0]],
         sample_coords[scale_ind, [3, 2]] - sample_coords[scale_ind, [1, 0]] - 1))
    self.debug_info['flag' + self.id_str] = flag
    self.debug_info['max_score' + self.id_str] = max_score
    if self.visdom is not None:
        self.visdom.register(score_map, 'heatmap', 2, 'Score Map' + self.id_str)
        self.visdom.register(self.debug_info, 'info_dict', 1, 'Status')
    elif self.params.debug >= 2:
        show_tensor(score_map, 5, title='Max score = {:.2f}'.format(max_score))

    # Compute output bounding box
    new_state = torch.cat((self.pos[[1, 0]] - (self.target_sz[[1, 0]] - 1) / 2, self.target_sz[[1, 0]]))

    if self.params.get('output_not_found_box', False) and flag == 'not_found':
        output_state = [-1, -1, -1, -1]
    else:
        output_state = new_state.tolist()

    out = {'target_bbox': output_state}
    return out
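# --- The trackers in this file store the state as a (row, col) center 'pos' and a (height, width)
# --- 'target_sz', while outputs use [x, y, w, h] boxes; the torch.cat(...) lines above perform that
# --- conversion. A small self-contained round-trip sketch (the helper names are illustrative only):
import torch

def center_to_xywh(pos, target_sz):
    # (row, col) center and (height, width) size -> [x, y, w, h]
    return torch.cat((pos[[1, 0]] - (target_sz[[1, 0]] - 1) / 2, target_sz[[1, 0]]))

def xywh_to_center(state):
    # [x, y, w, h] -> (row, col) center and (height, width) size
    pos = torch.Tensor([state[1] + (state[3] - 1) / 2, state[0] + (state[2] - 1) / 2])
    target_sz = torch.Tensor([state[3], state[2]])
    return pos, target_sz

_pos, _sz = xywh_to_center([10., 20., 50., 30.])
assert center_to_xywh(_pos, _sz).tolist() == [10., 20., 50., 30.]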
def initialize(self, image, info: dict) -> dict:
    state = info['init_bbox']

    # Initialize some stuff
    self.frame_num = 1
    if not self.params.has('device'):
        self.params.device = 'cuda' if self.params.use_gpu else 'cpu'

    # Initialize features
    self.initialize_features()

    # Check if image is color
    self.params.features.set_is_color(image.shape[2] == 3)

    # Get feature specific params
    self.fparams = self.params.features.get_fparams('feature_params')

    tic = time.time()

    # Get position and size
    self.pos = torch.Tensor([state[1] + (state[3] - 1) / 2, state[0] + (state[2] - 1) / 2])
    self.target_sz = torch.Tensor([state[3], state[2]])

    # Set search area
    self.target_scale = 1.0
    search_area = torch.prod(self.target_sz * self.params.search_area_scale).item()
    if search_area > self.params.max_image_sample_size:
        self.target_scale = math.sqrt(search_area / self.params.max_image_sample_size)
    elif search_area < self.params.min_image_sample_size:
        self.target_scale = math.sqrt(search_area / self.params.min_image_sample_size)

    # Check if IoUNet is used
    self.use_iou_net = self.params.get('use_iou_net', True)

    # Target size in base scale
    self.base_target_sz = self.target_sz / self.target_scale

    # Use odd square search area and set sizes
    feat_max_stride = max(self.params.features.stride())
    if self.params.get('search_area_shape', 'square') == 'square':
        self.img_sample_sz = torch.round(
            torch.sqrt(torch.prod(self.base_target_sz * self.params.search_area_scale))) * torch.ones(2)
    elif self.params.search_area_shape == 'initrect':
        self.img_sample_sz = torch.round(self.base_target_sz * self.params.search_area_scale)
    else:
        raise ValueError('Unknown search area shape')
    if self.params.feature_size_odd:
        self.img_sample_sz += feat_max_stride - self.img_sample_sz % (2 * feat_max_stride)
    else:
        self.img_sample_sz += feat_max_stride - (self.img_sample_sz + feat_max_stride) % (2 * feat_max_stride)

    # Set sizes
    self.img_support_sz = self.img_sample_sz
    self.feature_sz = self.params.features.size(self.img_sample_sz)
    self.output_sz = self.params.score_upsample_factor * self.img_support_sz  # Interpolated size of the output
    self.kernel_size = self.fparams.attribute('kernel_size')

    self.iou_img_sample_sz = self.img_sample_sz

    # Optimization options
    self.params.precond_learning_rate = self.fparams.attribute('learning_rate')
    if self.params.CG_forgetting_rate is None or max(self.params.precond_learning_rate) >= 1:
        self.params.direction_forget_factor = 0
    else:
        self.params.direction_forget_factor = \
            (1 - max(self.params.precond_learning_rate)) ** self.params.CG_forgetting_rate

    self.output_window = None
    if self.params.get('window_output', False):
        if self.params.get('use_clipped_window', False):
            self.output_window = dcf.hann2d_clipped(
                self.output_sz.long(),
                self.output_sz.long() * self.params.effective_search_area / self.params.search_area_scale,
                centered=False).to(self.params.device)
        else:
            self.output_window = dcf.hann2d(self.output_sz.long(), centered=False).to(self.params.device)

    # Initialize some learning things
    self.init_learning()

    # Convert image
    im = numpy_to_torch(image)
    self.im = im  # For debugging only

    # Setup scale bounds
    self.image_sz = torch.Tensor([im.shape[2], im.shape[3]])
    self.min_scale_factor = torch.max(10 / self.base_target_sz)
    self.max_scale_factor = torch.min(self.image_sz / self.base_target_sz)

    # Extract and transform sample
    x = self.generate_init_samples(im)

    # Initialize iounet
    if self.use_iou_net:
        self.init_iou_net()

    # Initialize projection matrix
    self.init_projection_matrix(x)

    # Transform to get the training sample
    train_x = self.preprocess_sample(x)

    # Generate label function
    init_y = self.init_label_function(train_x)

    # Init memory
    self.init_memory(train_x)

    # Init optimizer and do initial optimization
    self.init_optimization(train_x, init_y)

    self.pos_iounet = self.pos.clone()

    out = {'time': time.time() - tic}
    return out
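# --- Worked example of the search-area logic in initialize() above: the target scale is chosen so the
# --- search area (target area times search_area_scale^2) fits between min/max_image_sample_size. The
# --- numbers below are illustrative, not taken from the source configs.
import math
import torch

target_sz = torch.Tensor([100., 80.])            # (height, width) of the target
search_area_scale = 5.0
max_image_sample_size = 250 ** 2
min_image_sample_size = 200 ** 2

target_scale = 1.0
search_area = torch.prod(target_sz * search_area_scale).item()      # 500 * 400 = 200000
if search_area > max_image_sample_size:
    target_scale = math.sqrt(search_area / max_image_sample_size)   # sqrt(200000 / 62500) ~ 1.79
elif search_area < min_image_sample_size:
    target_scale = math.sqrt(search_area / min_image_sample_size)

base_target_sz = target_sz / target_scale   # target size inside the fixed-size sample crop, ~[55.9, 44.7]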
def track(self, image) -> dict:
    self.debug_info = {}

    self.frame_num += 1
    self.debug_info['frame_num'] = self.frame_num

    # Convert image
    im = numpy_to_torch(image)

    # ------- LOCALIZATION ------- #

    # Get sample
    sample_pos = self.pos.round()
    sample_scales = self.target_scale * self.params.scale_factors
    test_xf = self.extract_fourier_sample(im, self.pos, sample_scales, self.img_sample_sz)

    # Compute scores
    sf = self.apply_filter(test_xf)
    translation_vec, scale_ind, s = self.localize_target(sf)
    scale_change_factor = self.params.scale_factors[scale_ind]

    # Update position and scale
    self.update_state(sample_pos + translation_vec, self.target_scale * scale_change_factor)

    score_map = s[scale_ind, ...]
    max_score = torch.max(score_map).item()
    self.debug_info['max_score'] = max_score

    if self.visdom is not None:
        self.visdom.register(score_map, 'heatmap', 2, 'Score Map')
        self.visdom.register(self.debug_info, 'info_dict', 1, 'Status')
    elif self.params.debug >= 2:
        show_tensor(score_map, 5, title='Max score = {:.2f}'.format(max_score))

    # metricnet: extract the metric feature of the current prediction
    state_tmp = torch.cat((self.pos[[1, 0]] - (self.target_sz[[1, 0]] - 1) / 2, self.target_sz[[1, 0]]))
    state_tmp = state_tmp.numpy()
    with torch.no_grad():
        self.current_target_metric_feature.append(
            get_target_feature(self.metric_model, state_tmp, np.array(image)).cpu().detach().numpy())

    # ------- UPDATE ------- #

    # Get train sample
    train_xf = TensorList([xf[scale_ind:scale_ind + 1, ...] for xf in test_xf])

    # Shift the sample
    shift_samp = 2 * math.pi * (self.pos - sample_pos) / (sample_scales[scale_ind] * self.img_support_sz)
    train_xf = fourier.shift_fs(train_xf, shift=shift_samp)
    self.train_xf.append(train_xf)

    if self.frame_num == 1:
        # Update memory and train the filter on the first frame
        self.update_memory(train_xf)
        self.filter_optimizer.run(self.params.CG_iter, train_xf)
        self.symmetrize_filter()
    elif self.frame_num % self.params.train_skipping == 1:
        # metricnet: use LOF-based outlier detection to decide which of the buffered samples are
        # reliable enough to be added to the training memory
        current_target_metric_feature = np.array(self.current_target_metric_feature).squeeze()
        current_target_metric_feature0 = torch.from_numpy(current_target_metric_feature).cuda()
        lof_predict, success = lof(current_target_metric_feature, self.clf, k=5, thresh=self.lof_thresh)

        last_id = -1
        if self.frame_num <= self.params.train_skipping + 1:
            # Calibrate the LOF threshold on the first batch of samples
            self.lof_thresh = lof_predict.mean() * self.params.lof_rate
            print('lof_thresh:', self.lof_thresh)

        for ii in range(len(self.train_xf)):
            if self.frame_num > self.params.train_skipping + 1 and success[ii]:
                # Reject samples that are too close to an already stored target feature
                for kk in range(len(self.target_features_all) - 1, -1, -1):
                    dist = torch.norm(
                        self.target_features_all[kk] - current_target_metric_feature0[ii].reshape([1, 1024]),
                        2, dim=1).view(-1)
                    if dist < self.similar:
                        success[ii] = 0
                        continue
            if self.frame_num <= self.params.train_skipping + 1 or success[ii]:
                self.target_features_all.append(current_target_metric_feature0[ii].reshape([1, 1024]))
                last_id = ii
                self.update_memory(self.train_xf[ii])

        if last_id > -1:
            self.filter_optimizer.run(self.params.CG_iter, self.train_xf[last_id])
            self.symmetrize_filter()

        self.current_target_metric_feature = []
        self.train_xf = []

    # Return new state
    new_state = torch.cat((self.pos[[1, 0]] - (self.target_sz[[1, 0]] - 1) / 2, self.target_sz[[1, 0]]))

    out = {'target_bbox': new_state.tolist()}
    return out
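# --- 'lof_fit' and 'lof' are project-specific helpers. As a rough stand-in for their behavior (an
# --- assumption, not the actual implementation), the same gating can be sketched with scikit-learn's
# --- LocalOutlierFactor in novelty mode: fit on first-frame positive features, then skip the model
# --- update for predictions whose LOF value exceeds a threshold.
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

def lof_fit_sketch(gt_pos_features, k=5):
    clf = LocalOutlierFactor(n_neighbors=k, novelty=True)
    clf.fit(gt_pos_features)
    return clf

def lof_sketch(features, clf, thresh):
    lof_values = -clf.score_samples(features)   # score_samples returns the negated LOF
    success = lof_values < thresh               # low LOF = consistent with first-frame samples
    return lof_values, success

_clf = lof_fit_sketch(np.random.randn(20, 1024).astype(np.float32))
_vals, _ok = lof_sketch(np.random.randn(4, 1024).astype(np.float32), _clf, thresh=2.0)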
def track(self, image) -> dict:
    self.debug_info = {}

    self.frame_num += 1
    self.debug_info['frame_num'] = self.frame_num

    # Convert image
    im = numpy_to_torch(image)

    # ------- LOCALIZATION ------- #

    # Extract backbone features
    backbone_feat, sample_coords = self.extract_backbone_features(
        im, self.get_centered_sample_pos(),
        self.target_scale * self.params.scale_factors,
        self.img_sample_sz)

    # Extract classification features
    test_x = self.get_classification_features(backbone_feat)

    # Location of sample
    sample_pos, sample_scales = self.get_sample_location(sample_coords)

    # Compute classification scores
    scores_raw = self.classify_target(test_x)

    # Localize the target
    translation_vec, scale_ind, s, flag = self.localize_target(scores_raw, sample_scales)
    new_pos = sample_pos[scale_ind, :] + translation_vec
    self.debug_info['flag'] = flag

    # Update position and scale
    if flag != 'not_found':
        if getattr(self.params, 'use_iou_net', True):
            update_scale_flag = getattr(self.params, 'update_scale_when_uncertain', True) or flag != 'uncertain'
            if getattr(self.params, 'use_classifier', True):
                self.update_state(new_pos)
            self.refine_target_box(backbone_feat, sample_pos[scale_ind, :],
                                   sample_scales[scale_ind], scale_ind, update_scale_flag)
        elif getattr(self.params, 'use_classifier', True):
            self.update_state(new_pos, sample_scales[scale_ind])

    # ------- UPDATE ------- #

    update_flag = flag not in ['not_found', 'uncertain']
    hard_negative = (flag == 'hard_negative')
    learning_rate = getattr(self.params, 'hard_negative_learning_rate', None) if hard_negative else None

    if getattr(self.params, 'update_classifier', False) and update_flag:
        # Get train sample
        train_x = test_x[scale_ind:scale_ind + 1, ...]

        # Create target_box and label for spatial sample
        target_box = self.get_iounet_box(self.pos, self.target_sz,
                                         sample_pos[scale_ind, :], sample_scales[scale_ind])

        # Update the classifier model
        self.update_classifier(train_x, target_box, learning_rate, s[scale_ind, ...])

    # Set the pos of the tracker to the iounet pos
    if getattr(self.params, 'use_iou_net', True) and flag != 'not_found' and hasattr(self, 'pos_iounet'):
        self.pos = self.pos_iounet.clone()

    score_map = s[scale_ind, ...]
    max_score = torch.max(score_map).item()
    self.debug_info['max_score'] = max_score

    if self.visdom is not None:
        self.visdom.register(score_map, 'heatmap', 2, 'Score Map')
        self.visdom.register(self.debug_info, 'info_dict', 1, 'Status')
    elif self.params.debug >= 2:
        show_tensor(score_map, 5, title='Max score = {:.2f}'.format(max_score))

    # Compute output bounding box
    new_state = torch.cat((self.pos[[1, 0]] - (self.target_sz[[1, 0]] - 1) / 2, self.target_sz[[1, 0]]))

    if flag == 'not_found':
        # Target lost, e.g. occluded or out of view: report an empty box
        out = {'target_bbox': [0, 0, 0, 0]}
    else:
        out = {'target_bbox': new_state.tolist()}
    return out
def initialize(self, image, info: dict) -> dict:
    # Fix random seeds for reproducibility
    initSeed = 1
    torch.manual_seed(initSeed)
    torch.cuda.manual_seed(initSeed)
    torch.cuda.manual_seed_all(initSeed)
    np.random.seed(initSeed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    os.environ['PYTHONHASHSEED'] = str(initSeed)

    state = info['init_bbox']

    # Initialize some stuff
    self.frame_num = 1
    if not hasattr(self.params, 'device'):
        self.params.device = 'cuda' if self.params.use_gpu else 'cpu'

    # Initialize features
    self.initialize_features()

    # metricnet: load the metric model and warm it up with a dummy batch
    self.metric_model = model_load(self.params.metric_model_path)
    with torch.no_grad():
        tmp = torch.Tensor(np.random.rand(5, 3, 107, 107))
        tmp = (Variable(tmp)).type(torch.FloatTensor).cuda()
        tmp = self.metric_model(tmp)

    self.target_metric_feature = get_target_feature(self.metric_model, np.array(state), np.array(image))

    # Sample positive boxes around the ground truth; relax the IoU threshold until at least one is found
    pos_generator = SampleGenerator('gaussian', np.array([image.shape[1], image.shape[0]]), 0.1, 1.3)
    gt_pos_examples = pos_generator(np.array(state).astype(int), 20, [0.7, 1])
    gt_iou = 0.7
    while gt_pos_examples.shape[0] == 0:
        gt_iou = gt_iou - 0.1
        gt_pos_examples = pos_generator(np.array(state).astype(int), 20, [gt_iou, 1])

    with torch.no_grad():
        gt_pos_features0 = get_anchor_feature(self.metric_model, np.array(image), gt_pos_examples)
        gt_pos_features = gt_pos_features0.cpu().detach().numpy()

    # Distance threshold used to reject near-duplicate training samples
    target_metric_feature = self.target_metric_feature.repeat(gt_pos_features.shape[0], 1)
    pos_all = torch.norm(gt_pos_features0 - target_metric_feature, 2, dim=1).view(-1)
    self.similar = pos_all.mean() * self.params.sim_rate
    print('similarThresh', self.similar)

    self.target_features_all = [self.target_metric_feature]
    self.clf = lof_fit(gt_pos_features, k=5)

    # Check if image is color
    self.params.features.set_is_color(image.shape[2] == 3)

    # Get feature specific params
    self.fparams = self.params.features.get_fparams('feature_params')

    # Get position and size
    self.pos = torch.Tensor([state[1] + (state[3] - 1) / 2, state[0] + (state[2] - 1) / 2])
    self.target_sz = torch.Tensor([state[3], state[2]])

    # Set search area
    self.target_scale = 1.0
    search_area = torch.prod(self.target_sz * self.params.search_area_scale).item()
    if search_area > self.params.max_image_sample_size:
        self.target_scale = math.sqrt(search_area / self.params.max_image_sample_size)
    elif search_area < self.params.min_image_sample_size:
        self.target_scale = math.sqrt(search_area / self.params.min_image_sample_size)

    # Target size in base scale
    self.base_target_sz = self.target_sz / self.target_scale

    # Use odd square search area and set sizes
    feat_max_stride = max(self.params.features.stride())
    self.img_sample_sz = torch.round(
        torch.sqrt(torch.prod(self.base_target_sz * self.params.search_area_scale))) * torch.ones(2)
    self.img_sample_sz += feat_max_stride - self.img_sample_sz % (2 * feat_max_stride)

    # Set other sizes (corresponds to ECO code)
    self.img_support_sz = self.img_sample_sz
    self.feature_sz = self.params.features.size(self.img_sample_sz)
    self.filter_sz = self.feature_sz + (self.feature_sz + 1) % 2
    self.output_sz = self.params.score_upsample_factor * self.img_support_sz  # Interpolated size of the output
    self.compressed_dim = self.fparams.attribute('compressed_dim')

    # Number of filters
    self.num_filters = len(self.filter_sz)

    # Get window function
    self.window = TensorList([dcf.hann2d(sz).to(self.params.device) for sz in self.feature_sz])

    # Get interpolation function
    self.interp_fs = TensorList([
        dcf.get_interp_fourier(sz, self.params.interpolation_method,
                               self.params.interpolation_bicubic_a,
                               self.params.interpolation_centering,
                               self.params.interpolation_windowing,
                               self.params.device) for sz in self.filter_sz])

    # Get regularization filter
    self.reg_filter = TensorList([
        dcf.get_reg_filter(self.img_support_sz, self.base_target_sz, fparams).to(self.params.device)
        for fparams in self.fparams])
    self.reg_energy = self.reg_filter.view(-1) @ self.reg_filter.view(-1)

    # Get label function
    output_sigma_factor = self.fparams.attribute('output_sigma_factor')
    sigma = (self.filter_sz / self.img_support_sz) * torch.sqrt(self.base_target_sz.prod()) * output_sigma_factor
    self.yf = TensorList([dcf.label_function(sz, sig).to(self.params.device)
                          for sz, sig in zip(self.filter_sz, sigma)])

    # Optimization options
    self.params.precond_learning_rate = self.fparams.attribute('learning_rate')
    if self.params.CG_forgetting_rate is None or max(self.params.precond_learning_rate) >= 1:
        self.params.direction_forget_factor = 0
    else:
        self.params.direction_forget_factor = \
            (1 - max(self.params.precond_learning_rate)) ** self.params.CG_forgetting_rate

    # Convert image
    im = numpy_to_torch(image)

    # Setup bounds
    self.image_sz = torch.Tensor([im.shape[2], im.shape[3]])
    self.min_scale_factor = torch.max(10 / self.base_target_sz)
    self.max_scale_factor = torch.min(self.image_sz / self.base_target_sz)

    # Extract and transform sample
    x = self.generate_init_samples(im)

    # Initialize projection matrix
    x_mat = TensorList([e.permute(1, 0, 2, 3).reshape(e.shape[1], -1).clone() for e in x])
    x_mat -= x_mat.mean(dim=1, keepdim=True)
    cov_x = x_mat @ x_mat.t()
    self.projection_matrix = TensorList([
        torch.svd(C)[0][:, :cdim].clone() for C, cdim in zip(cov_x, self.compressed_dim)])

    # Transform to get the training sample
    train_xf = self.preprocess_sample(x)

    # Shift the samples back
    if 'shift' in self.params.augmentation:
        for xf in train_xf:
            if xf.shape[0] == 1:
                continue
            for i, shift in enumerate(self.params.augmentation['shift']):
                shift_samp = 2 * math.pi * torch.Tensor(shift) / self.img_support_sz
                xf[1 + i:2 + i, ...] = fourier.shift_fs(xf[1 + i:2 + i, ...], shift=shift_samp)

    # Shift sample
    shift_samp = 2 * math.pi * (self.pos - self.pos.round()) / (self.target_scale * self.img_support_sz)
    train_xf = fourier.shift_fs(train_xf, shift=shift_samp)

    # Initialize first-frame training samples
    num_init_samples = train_xf.size(0)
    self.init_sample_weights = TensorList([xf.new_ones(1) / xf.shape[0] for xf in train_xf])
    self.init_training_samples = train_xf.permute(2, 3, 0, 1, 4)

    # Sample counters and weights
    self.num_stored_samples = num_init_samples
    self.previous_replace_ind = [None] * len(self.num_stored_samples)
    self.sample_weights = TensorList([xf.new_zeros(self.params.sample_memory_size) for xf in train_xf])
    for sw, init_sw, num in zip(self.sample_weights, self.init_sample_weights, num_init_samples):
        sw[:num] = init_sw

    # Initialize memory
    self.training_samples = TensorList([
        xf.new_zeros(xf.shape[2], xf.shape[3], self.params.sample_memory_size, cdim, 2)
        for xf, cdim in zip(train_xf, self.compressed_dim)])

    # Initialize filter
    self.filter = TensorList([
        xf.new_zeros(1, cdim, xf.shape[2], xf.shape[3], 2)
        for xf, cdim in zip(train_xf, self.compressed_dim)])

    # Do joint optimization
    self.joint_problem = FactorizedConvProblem(self.init_training_samples, self.yf, self.reg_filter,
                                               self.projection_matrix, self.params, self.init_sample_weights)
    joint_var = self.filter.concat(self.projection_matrix)
    self.joint_optimizer = GaussNewtonCG(self.joint_problem, joint_var,
                                         debug=(self.params.debug >= 1), visdom=self.visdom)

    if self.params.update_projection_matrix:
        self.joint_optimizer.run(self.params.init_CG_iter // self.params.init_GN_iter, self.params.init_GN_iter)

    # Re-project samples with the new projection matrix
    compressed_samples = complex.mtimes(self.init_training_samples, self.projection_matrix)
    for train_samp, init_samp in zip(self.training_samples, compressed_samples):
        train_samp[:, :, :init_samp.shape[2], :, :] = init_samp

    # Initialize optimizer
    self.filter_optimizer = FilterOptim(self.params, self.reg_energy)
    self.filter_optimizer.register(self.filter, self.training_samples, self.yf,
                                   self.sample_weights, self.reg_filter)
    self.filter_optimizer.sample_energy = self.joint_problem.sample_energy
    self.filter_optimizer.residuals = self.joint_optimizer.residuals.clone()

    if not self.params.update_projection_matrix:
        self.filter_optimizer.run(self.params.init_CG_iter)

    # Post optimization
    self.filter_optimizer.run(self.params.post_init_CG_iter)

    self.symmetrize_filter()

    # metricnet_lof: buffers for outlier-based sample filtering
    self.current_target_metric_feature = []
    self.train_xf = []
    self.lof_thresh = self.params.lof_rate
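# --- fourier.shift_fs above relies on the Fourier shift theorem: translating a sample by d multiplies
# --- its spectrum by a complex exponential, so sub-pixel shifts become an element-wise product. A 1-D
# --- sketch of that identity with torch.fft (illustrative; the library uses a packed real/imag layout):
import torch

x = torch.randn(16)
d = 3                                            # integer shift so the result can be checked with roll
k = torch.fft.fftfreq(x.numel())                 # normalized frequencies k/N
X = torch.fft.fft(x)
x_shifted = torch.fft.ifft(X * torch.exp(-2j * torch.pi * k * d)).real
assert torch.allclose(x_shifted, torch.roll(x, d), atol=1e-5)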
def initialize(self, image, info: dict) -> dict:
    state = info['init_bbox']

    # Initialize some stuff
    self.frame_num = 1
    if not hasattr(self.params, 'device'):
        self.params.device = 'cuda' if self.params.use_gpu else 'cpu'

    # Initialize features
    self.initialize_features()

    # metricnet: load the metric model and warm it up with a dummy batch
    self.metric_model = model_load(self.params.metric_model_path)
    with torch.no_grad():
        tmp = torch.Tensor(np.random.rand(5, 3, 107, 107))
        tmp = (Variable(tmp)).type(torch.FloatTensor).cuda()
        tmp = self.metric_model(tmp)

    self.target_metric_feature = get_target_feature(self.metric_model, np.array(state), np.array(image))

    # Sample positive boxes around the ground truth; relax the IoU threshold until at least one is found
    pos_generator = SampleGenerator('gaussian', np.array([image.shape[1], image.shape[0]]), 0.1, 1.3)
    gt_pos_examples = pos_generator(np.array(state).astype(int), 20, [0.7, 1])
    gt_iou = 0.7
    while gt_pos_examples.shape[0] == 0:
        gt_iou = gt_iou - 0.1
        gt_pos_examples = pos_generator(np.array(state).astype(int), 20, [gt_iou, 1])
    print('gt-iou:', gt_iou)

    with torch.no_grad():
        gt_pos_features0 = get_anchor_feature(self.metric_model, np.array(image), gt_pos_examples)
        gt_pos_features = gt_pos_features0.cpu().detach().numpy()

    self.clf = lof_fit(gt_pos_features, k=5)
    self.lof_thresh = 0
    self.target_features_all = [self.target_metric_feature]

    # Check if image is color
    self.params.features.set_is_color(image.shape[2] == 3)

    # Get feature specific params
    self.fparams = self.params.features.get_fparams('feature_params')

    tic = time.time()

    # Get position and size
    self.pos = torch.Tensor([state[1] + (state[3] - 1) / 2, state[0] + (state[2] - 1) / 2])
    self.target_sz = torch.Tensor([state[3], state[2]])

    # Set search area
    self.target_scale = 1.0
    search_area = torch.prod(self.target_sz * self.params.search_area_scale).item()
    if search_area > self.params.max_image_sample_size:
        self.target_scale = math.sqrt(search_area / self.params.max_image_sample_size)
    elif search_area < self.params.min_image_sample_size:
        self.target_scale = math.sqrt(search_area / self.params.min_image_sample_size)

    # Check if IoUNet is used
    self.use_iou_net = getattr(self.params, 'use_iou_net', True)

    # Target size in base scale
    self.base_target_sz = self.target_sz / self.target_scale

    # Use odd square search area and set sizes
    feat_max_stride = max(self.params.features.stride())
    if getattr(self.params, 'search_area_shape', 'square') == 'square':
        self.img_sample_sz = torch.round(
            torch.sqrt(torch.prod(self.base_target_sz * self.params.search_area_scale))) * torch.ones(2)
    elif self.params.search_area_shape == 'initrect':
        self.img_sample_sz = torch.round(self.base_target_sz * self.params.search_area_scale)
    else:
        raise ValueError('Unknown search area shape')
    if self.params.feature_size_odd:
        self.img_sample_sz += feat_max_stride - self.img_sample_sz % (2 * feat_max_stride)
    else:
        self.img_sample_sz += feat_max_stride - (self.img_sample_sz + feat_max_stride) % (2 * feat_max_stride)

    # Set sizes
    self.img_support_sz = self.img_sample_sz
    self.feature_sz = self.params.features.size(self.img_sample_sz)
    self.output_sz = self.params.score_upsample_factor * self.img_support_sz  # Interpolated size of the output
    self.kernel_size = self.fparams.attribute('kernel_size')

    self.iou_img_sample_sz = self.img_sample_sz

    # Optimization options
    self.params.precond_learning_rate = self.fparams.attribute('learning_rate')
    if self.params.CG_forgetting_rate is None or max(self.params.precond_learning_rate) >= 1:
        self.params.direction_forget_factor = 0
    else:
        self.params.direction_forget_factor = \
            (1 - max(self.params.precond_learning_rate)) ** self.params.CG_forgetting_rate

    self.output_window = None
    if getattr(self.params, 'window_output', False):
        if getattr(self.params, 'use_clipped_window', False):
            self.output_window = dcf.hann2d_clipped(
                self.output_sz.long(),
                self.output_sz.long() * self.params.effective_search_area / self.params.search_area_scale,
                centered=False).to(self.params.device)
        else:
            self.output_window = dcf.hann2d(self.output_sz.long(), centered=False).to(self.params.device)

    # Initialize some learning things
    self.init_learning()

    # Convert image
    im = numpy_to_torch(image)
    self.im = im  # For debugging only

    # Setup scale bounds
    self.image_sz = torch.Tensor([im.shape[2], im.shape[3]])
    self.min_scale_factor = torch.max(10 / self.base_target_sz)
    self.max_scale_factor = torch.min(self.image_sz / self.base_target_sz)

    # Extract and transform sample
    x = self.generate_init_samples(im)

    # Initialize iounet
    if self.use_iou_net:
        self.init_iou_net()

    # Initialize projection matrix
    self.init_projection_matrix(x)

    # Transform to get the training sample
    train_x = self.preprocess_sample(x)

    # Generate label function
    init_y = self.init_label_function(train_x)

    # Init memory
    self.init_memory(train_x)

    # Init optimizer and do initial optimization
    self.init_optimization(train_x, init_y)

    self.pos_iounet = self.pos.clone()

    out = {'time': time.time() - tic}
    return out
def track(self, image, info: dict = None) -> dict:
    self.debug_info = {}

    self.frame_num += 1
    self.debug_info['frame_num'] = self.frame_num

    # Convert image
    im = numpy_to_torch(image)
    self.im = im  # For debugging only

    # ------- LOCALIZATION ------- #

    # Get sample
    sample_pos = self.pos.round()
    sample_scales = self.target_scale * self.params.scale_factors

    # Note: a depth-aware variant of extract_processed_sample was explored here (reading a depth map
    # from info and selecting a 'hist_depth_mask' or 'kmeans_depth_mask' mode via info['depth_usage']),
    # but it is disabled; the color-only path below is used.
    test_x = self.extract_processed_sample(im, self.pos, sample_scales, self.img_sample_sz)

    # Compute scores
    scores_raw = self.apply_filter(test_x)
    translation_vec, scale_ind, s, flag = self.localize_target(scores_raw)

    # Update position and scale
    if flag != 'not_found':
        if self.use_iou_net:
            update_scale_flag = self.params.get('update_scale_when_uncertain', True) or flag != 'uncertain'
            if self.params.get('use_classifier', True):
                self.update_state(sample_pos + translation_vec)
            self.refine_target_box(sample_pos, sample_scales[scale_ind], scale_ind, update_scale_flag)
        elif self.params.get('use_classifier', True):
            self.update_state(sample_pos + translation_vec, sample_scales[scale_ind])

    score_map = s[scale_ind, ...]
    max_score = torch.max(score_map).item()
    self.debug_info['max_score'] = max_score
    self.debug_info['flag'] = flag

    if self.visdom is not None:
        self.visdom.register(score_map, 'heatmap', 2, 'Score Map')
        self.visdom.register(self.debug_info, 'info_dict', 1, 'Status')
    elif self.params.debug >= 2:
        show_tensor(score_map, 5, title='Max score = {:.2f}'.format(max_score))

    # ------- UPDATE ------- #

    # Check flags and set learning rate if hard negative
    update_flag = flag not in ['not_found', 'uncertain']
    hard_negative = (flag == 'hard_negative')
    learning_rate = self.params.hard_negative_learning_rate if hard_negative else None

    if update_flag:
        # Get train sample
        train_x = TensorList([x[scale_ind:scale_ind + 1, ...] for x in test_x])

        # Create label for sample
        train_y = self.get_label_function(sample_pos, sample_scales[scale_ind])

        # Update memory
        self.update_memory(train_x, train_y, learning_rate)

    # Train filter
    if hard_negative:
        self.filter_optimizer.run(self.params.hard_negative_CG_iter)
    elif (self.frame_num - 1) % self.params.train_skipping == 0:
        self.filter_optimizer.run(self.params.CG_iter)

    # Set the pos of the tracker to the iounet pos
    if self.use_iou_net and flag != 'not_found':
        self.pos = self.pos_iounet.clone()

    # Return new state, with the maximum score map value reported as a confidence estimate
    new_state = torch.cat((self.pos[[1, 0]] - (self.target_sz[[1, 0]] - 1) / 2, self.target_sz[[1, 0]]))

    out = {'target_bbox': new_state.tolist(), 'confidence': max_score}
    return out
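# --- All trackers in this file share the same two-call interface: initialize(image, info) once, then
# --- track(image) per frame. A minimal driver sketch; frame loading and the tracker object are
# --- placeholders, and only the bounding-box variants (not the segmentation one) fit this loop as-is.
import cv2

def run_tracker(tracker, frame_paths, init_bbox):
    first = cv2.cvtColor(cv2.imread(frame_paths[0]), cv2.COLOR_BGR2RGB)
    tracker.initialize(first, {'init_bbox': init_bbox})   # init_bbox is [x, y, w, h]

    boxes = [list(init_bbox)]
    for path in frame_paths[1:]:
        frame = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
        out = tracker.track(frame)
        boxes.append(out['target_bbox'])                  # some variants also return 'confidence'
    return boxes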