def feature(self, im: np.ndarray, target_pos, target_sz, avg_chans=None):
    """Extract template feature from the initial frame

    Parameters
    ----------
    im : np.ndarray
        initial frame
    target_pos : np.ndarray
        target position (x, y)
    target_sz : np.ndarray
        target size (w, h)
    avg_chans : np.ndarray, optional
        channel mean values, (B, G, R), by default None

    Returns
    -------
    tuple
        (features, im_z_crop, avg_chans): template feature, template image
        crop, and the channel mean values used for padding
    """
    if avg_chans is None:
        avg_chans = np.mean(im, axis=(0, 1))

    z_size = self._hyper_params['z_size']
    context_amount = self._hyper_params['context_amount']

    im_z_crop, _ = get_crop(
        im,
        target_pos,
        target_sz,
        z_size,
        avg_chans=avg_chans,
        context_amount=context_amount,
        func_get_subwindow=get_subwindow_tracking,
    )
    phase = self._hyper_params['phase_init']
    with torch.no_grad():
        features = self._model(imarray_to_tensor(im_z_crop).to(self.device),
                               phase=phase)

    return features, im_z_crop, avg_chans
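
# A minimal first-frame usage sketch, assuming `pipeline` is an instance of
# this class and `bbox` is an xywh box; both names are hypothetical and not
# part of this repo. It shows the intended call order: convert the box, then
# extract the template feature once, reusing the returned channel means for
# later crops.
def init_template_sketch(pipeline, first_frame: np.ndarray, bbox):
    # xywh -> center/size split, mirroring what `init` does below
    box = xywh2cxywh(bbox)
    target_pos, target_sz = box[:2], box[2:]
    features, im_z_crop, avg_chans = pipeline.feature(first_frame, target_pos,
                                                      target_sz)
    # keep avg_chans so later search crops pad with the same mean color
    return features, avg_chans
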
def global_modeling(self):
    """Update the global feature for the global modeling loop.

    Always runs after seg4vos: takes the newly predicted filtered image,
    extracts its high-level feature, and updates the global feature with an
    update rate weighted by the confidence score.
    """
    filtered_image = self._state['filtered_image']  # shape: (129, 129, 3)
    with torch.no_grad():
        deep_feature = self._segmenter(
            imarray_to_tensor(filtered_image).to(self.device),
            phase='global_feature')[0]

    seg_global_feature = self._state['seg_global_feature']
    seg_init_feature = self._state['seg_init_feature']
    u = self._hyper_params['seg_ema_u']
    s = self._hyper_params['seg_ema_s']
    conf_score = self._state['conf_score']

    u = u * conf_score
    seg_global_feature = seg_global_feature * (1 - u) + deep_feature * u
    gml_feature = seg_global_feature * s + seg_init_feature * (1 - s)

    self._state['seg_global_feature'] = seg_global_feature
    self._state['gml_feature'] = gml_feature
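
# The update above is a confidence-gated exponential moving average. Below is
# a standalone sketch of the same arithmetic; the names and the default
# values of `u` and `s` are illustrative, not the repo's configuration.
def ema_update_sketch(seg_global_feature, seg_init_feature, deep_feature,
                      conf_score, u=0.1, s=0.5):
    """Low-confidence frames shrink u, so they barely move the running
    memory; blending with the fixed first-frame feature (weight 1 - s)
    keeps the representation anchored to the initial target appearance."""
    u = u * conf_score
    seg_global_feature = seg_global_feature * (1 - u) + deep_feature * u
    gml_feature = seg_global_feature * s + seg_init_feature * (1 - s)
    return seg_global_feature, gml_feature
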
def track(self,
          im_x,
          target_pos,
          target_sz,
          features,
          update_state=False,
          **kwargs):
    if 'avg_chans' in kwargs:
        avg_chans = kwargs['avg_chans']
    else:
        avg_chans = self._state['avg_chans']

    z_size = self._hyper_params['z_size']
    x_size = self._hyper_params['x_size']
    context_amount = self._hyper_params['context_amount']
    phase_track = self._hyper_params['phase_track']
    im_x_crop, scale_x = get_crop(
        im_x,
        target_pos,
        target_sz,
        z_size,
        x_size=x_size,
        avg_chans=avg_chans,
        context_amount=context_amount,
        func_get_subwindow=get_subwindow_tracking,
    )

    # process batch of templates
    score_list = []
    box_list = []
    cls_list = []
    ctr_list = []
    fms_x = None
    for ith in range(self._hyper_params['mem_len']):
        if fms_x is None:
            # first template: run the full search branch and cache its
            # feature maps so the remaining templates can reuse them
            with torch.no_grad():
                score, box, cls, ctr, extra = self._model(
                    imarray_to_tensor(im_x_crop).to(self.device),
                    *(features[ith]),
                    phase=phase_track)
            fms_x = [extra['c_x'], extra['r_x']]
        else:
            with torch.no_grad():
                score, box, cls, ctr, extra = self._model(
                    *(features[ith]), fms_x[0], fms_x[1], phase=phase_track)
        box = tensor_to_numpy(box[0])
        score = tensor_to_numpy(score[0])[:, 0]
        cls = tensor_to_numpy(cls[0])[:, 0]
        ctr = tensor_to_numpy(ctr[0])[:, 0]
        # append to list
        box_list.append(box)
        score_list.append(score)
        cls_list.append(cls)
        ctr_list.append(ctr)

    # fusion: weighted sum of the initial template's score and the mean
    # score of the memory templates
    if self._hyper_params['mem_len'] > 1:
        score = score_list[0] * (1 - self._hyper_params['mem_coef']) + \
            np.stack(score_list[1:], axis=0).mean(axis=0) * \
            self._hyper_params['mem_coef']
    else:
        # single template
        score = score_list[0]
    box = box_list[0]
    box_wh = xyxy2cxywh(box)

    # score post-processing
    best_pscore_id, pscore, penalty = self._postprocess_score(
        score, box_wh, target_sz, scale_x)
    # box post-processing
    new_target_pos, new_target_sz = self._postprocess_box(
        best_pscore_id, score, box_wh, target_pos, target_sz, scale_x,
        x_size, penalty)

    if self.debug:
        box = self._cvt_box_crop2frame(box_wh, target_pos, x_size, scale_x)

    # restrict new_target_pos & new_target_sz
    new_target_pos, new_target_sz = self._restrict_box(
        new_target_pos, new_target_sz)

    # record basic mid-level info
    self._state['x_crop'] = im_x_crop
    bbox_pred_in_crop = np.rint(box[best_pscore_id]).astype(int)
    self._state['bbox_pred_in_crop'] = bbox_pred_in_crop

    # record optional mid-level info
    if update_state:
        self._state['score'] = score
        self._state['pscore'] = pscore
        self._state['all_box'] = box
        self._state['cls'] = cls
        self._state['ctr'] = ctr

    return new_target_pos, new_target_sz
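
# The fusion step above weights the initial template against the mean of the
# memory templates. A standalone sketch of the same rule; the `mem_coef`
# default is an arbitrary example value, not the repo's.
def fuse_scores_sketch(score_list, mem_coef=0.5):
    if len(score_list) == 1:
        return score_list[0]
    mem_score = np.stack(score_list[1:], axis=0).mean(axis=0)
    return score_list[0] * (1 - mem_coef) + mem_score * mem_coef

# e.g. fuse_scores_sketch([np.random.rand(289) for _ in range(3)])
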
def track(self,
          im_x,
          target_pos,
          target_sz,
          features,
          update_state=False,
          **kwargs):
    if 'avg_chans' in kwargs:
        avg_chans = kwargs['avg_chans']
    else:
        avg_chans = self._state['avg_chans']

    z_size = self._hyper_params['z_size']
    x_size = self._hyper_params['x_size']
    context_amount = self._hyper_params['context_amount']
    phase_track = self._hyper_params['phase_track']
    im_x_crop, scale_x = get_crop(
        im_x,
        target_pos,
        target_sz,
        z_size,
        x_size=x_size,
        avg_chans=avg_chans,
        context_amount=context_amount,
        func_get_subwindow=get_subwindow_tracking,
    )
    # store crop information
    self._state["crop_info"] = dict(
        target_pos=target_pos,
        target_sz=target_sz,
        scale_x=scale_x,
        avg_chans=avg_chans,
    )
    with torch.no_grad():
        score, box, cls, ctr, *args = self._model(
            imarray_to_tensor(im_x_crop).to(self.device),
            *features,
            phase=phase_track)
    box = tensor_to_numpy(box[0])
    score = tensor_to_numpy(score[0])[:, 0]
    cls = tensor_to_numpy(cls[0])
    ctr = tensor_to_numpy(ctr[0])
    box_wh = xyxy2cxywh(box)

    # score post-processing
    best_pscore_id, pscore, penalty = self._postprocess_score(
        score, box_wh, target_sz, scale_x)
    # box post-processing
    new_target_pos, new_target_sz = self._postprocess_box(
        best_pscore_id, score, box_wh, target_pos, target_sz, scale_x,
        x_size, penalty)

    if self.debug:
        box = self._cvt_box_crop2frame(box_wh, target_pos, x_size, scale_x)

    # restrict new_target_pos & new_target_sz
    # new_target_pos, new_target_sz = self._restrict_box(
    #     new_target_pos, new_target_sz)

    # record basic mid-level info
    self._state['x_crop'] = im_x_crop
    # bbox_pred_in_crop = np.rint(box[best_pscore_id]).astype(int)
    bbox_pred_in_crop = box[best_pscore_id]
    self._state['bbox_pred_in_crop'] = bbox_pred_in_crop
    self._state['bbox_pred_in_frame'] = bbox_pred_in_crop

    # record optional mid-level info
    if update_state:
        self._state['score'] = score
        self._state['pscore'] = pscore
        self._state['all_box'] = box
        self._state['cls'] = cls
        self._state['ctr'] = ctr

    return new_target_pos, new_target_sz
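
# `scale_x` returned by `get_crop` maps frame pixels to crop pixels, so a box
# predicted inside the crop can be mapped back by undoing that scaling around
# the crop center. A hypothetical sketch of such a back-projection; this is
# NOT the repo's `_cvt_box_crop2frame`, whose exact form is not shown here.
def box_crop2frame_sketch(box_cxywh: np.ndarray, target_pos, x_size, scale_x):
    cx = (box_cxywh[..., 0] - x_size / 2) / scale_x + target_pos[0]
    cy = (box_cxywh[..., 1] - x_size / 2) / scale_x + target_pos[1]
    w = box_cxywh[..., 2] / scale_x
    h = box_cxywh[..., 3] / scale_x
    return np.stack([cx, cy, w, h], axis=-1)
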
def joint_segmentation(self, im_x, target_pos, target_sz, corr_feature,
                       gml_feature, **kwargs):
    r"""Segment the current frame for VOS.

    crop image => segmentation => parameter update

    :param im_x: current image
    :param target_pos: target position (x, y)
    :param target_sz: target size (w, h)
    :param corr_feature: correlation feature produced by the Siamese encoder
    :param gml_feature: global feature produced by the global modeling loop
    :return: pred_mask  mask prediction in the patch of the saliency image
    :return: pred_mask_b  binary mask prediction in the patch of the saliency image
    """
    if 'avg_chans' in kwargs:
        avg_chans = kwargs['avg_chans']
    else:
        avg_chans = self._state['avg_chans']

    # crop image for saliency encoder
    saliency_image, scale_seg = get_crop(
        im_x,
        target_pos,
        target_sz,
        z_size=self._hyper_params["z_size"],
        output_size=self._hyper_params["saliency_image_size"],
        x_size=self._hyper_params["saliency_image_field"],
        avg_chans=avg_chans,
        context_amount=self._hyper_params["context_amount"],
        func_get_subwindow=get_subwindow_tracking,
    )
    self._state["scale_x"] = scale_seg

    # mask prediction
    pred_mask = self._segmenter(imarray_to_tensor(saliency_image).to(
        self.device),
                                corr_feature,
                                gml_feature,
                                phase='segment')[0]  # tensor, shape (1, 1, 257, 257)
    pred_mask = tensor_to_numpy(pred_mask[0]).transpose(
        (1, 2, 0))  # np.ndarray, shape (257, 257, 1)

    # post-processing
    mask_filter = (
        pred_mask > self._hyper_params['mask_filter_thresh']).astype(np.uint8)
    pred_mask_b = (
        pred_mask > self._hyper_params['mask_pred_thresh']).astype(np.uint8)

    if self._hyper_params['save_patch']:
        mask_red = np.zeros_like(saliency_image)
        mask_red[:, :, 0] = mask_filter[:, :, 0] * 255
        masked_image = saliency_image * 0.5 + mask_red * 0.5
        self._state['patch_prediction'] = masked_image

    filtered_image = saliency_image * mask_filter
    filtered_image = cv2.resize(filtered_image,
                                (self._hyper_params["GMP_image_size"],
                                 self._hyper_params["GMP_image_size"]))
    self._state['filtered_image'] = filtered_image

    # confidence score: mean predicted probability inside the binary mask
    if pred_mask_b.sum() > 0:
        conf_score = (pred_mask * pred_mask_b).sum() / pred_mask_b.sum()
    else:
        conf_score = 0
    self._state['conf_score'] = conf_score

    mask_in_full_image = self._mask_back(
        pred_mask,
        size=self._hyper_params["saliency_image_size"],
        region=self._hyper_params["saliency_image_field"])
    self._state['mask_in_full_image'] = mask_in_full_image

    # suppress the mask when the tracker score indicates tracking failure
    if self._tracker.get_track_score(
    ) < self._hyper_params["track_failed_score_th"]:
        self._state['mask_in_full_image'] *= 0

    return pred_mask, pred_mask_b
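
# The confidence score above is the mean predicted probability inside the
# binarized mask. A standalone sketch of the same computation; the names and
# the default threshold are illustrative only.
def mask_confidence_sketch(pred_mask: np.ndarray, pred_thresh: float = 0.5):
    pred_mask_b = (pred_mask > pred_thresh).astype(np.uint8)
    if pred_mask_b.sum() > 0:
        # average the soft probabilities over the positive region only
        return (pred_mask * pred_mask_b).sum() / pred_mask_b.sum()
    return 0.0
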
def init(self, im, state, init_mask):
    """Initialize the whole pipeline: tracker init => global modeling loop init.

    :param im: init frame
    :param state: bbox in xywh format
    :param init_mask: binary mask of the target object, shape (h, w)
    """
    # ========== SiamFC++ init ==============
    self._tracker.init(im, state)
    avg_chans = self._tracker.get_avg_chans()
    self._state['avg_chans'] = avg_chans

    rect = state  # bbox in xywh format is given for initialization in case of tracking
    box = xywh2cxywh(rect)
    target_pos, target_sz = box[:2], box[2:]
    self._state['state'] = (target_pos, target_sz)
    self._state['im_h'] = im.shape[0]
    self._state['im_w'] = im.shape[1]

    # ========== Global Modeling Loop init ==============
    init_image, _ = get_crop(
        im,
        target_pos,
        target_sz,
        z_size=self._hyper_params["z_size"],
        x_size=self._hyper_params["GMP_image_size"],
        avg_chans=avg_chans,
        context_amount=self._hyper_params["context_amount"],
        func_get_subwindow=get_subwindow_tracking,
    )
    init_mask_c3 = np.stack([init_mask, init_mask, init_mask],
                            -1).astype(np.uint8)
    init_mask_crop_c3, _ = get_crop(
        init_mask_c3,
        target_pos,
        target_sz,
        z_size=self._hyper_params["z_size"],
        x_size=self._hyper_params["GMP_image_size"],
        avg_chans=avg_chans * 0,
        context_amount=self._hyper_params["context_amount"],
        func_get_subwindow=get_subwindow_tracking,
    )
    init_mask_crop = init_mask_crop_c3[:, :, 0]
    init_mask_crop = (init_mask_crop >
                      self._hyper_params['mask_filter_thresh']).astype(
                          np.uint8)
    init_mask_crop = np.expand_dims(init_mask_crop,
                                    axis=-1)  # shape: (129, 129, 1)
    filtered_image = init_mask_crop * init_image
    self._state['filtered_image'] = filtered_image  # shape: (129, 129, 3)

    with torch.no_grad():
        deep_feature = self._segmenter(
            imarray_to_tensor(filtered_image).to(self.device),
            phase='global_feature')[0]

    self._state['seg_init_feature'] = deep_feature  # shape: (1, 256, 5, 5)
    self._state['seg_global_feature'] = deep_feature
    self._state['gml_feature'] = deep_feature
    self._state['conf_score'] = 1
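
# A minimal end-to-end driver sketch. Assumptions: `pipeline` exposes this
# class's `init` plus an `update(frame)` step that runs track/segment and
# refreshes the global feature; `frames`, `first_bbox`, and `first_mask` are
# hypothetical placeholders, and `update` is not shown in this excerpt.
def run_vos_sketch(pipeline, frames, first_bbox, first_mask):
    pipeline.init(frames[0], first_bbox, first_mask)
    masks = []
    for frame in frames[1:]:
        pipeline.update(frame)
        # the full-image mask is left in the pipeline state by segmentation
        masks.append(pipeline._state['mask_in_full_image'])
    return masks
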