def localize_advanced(self, scores, sample_scales):
    """Run the target advanced localization (as in ATOM).

    The strongest peak over all scale samples is located first. Its
    neighbourhood is then zeroed out and a second peak is searched for, and
    the two peaks are compared to classify the frame.

    Args:
        scores: Score maps; dim 0 indexes the scale sample and the last two
            dims are spatial. Multiplied IN PLACE by ``self.output_window``
            when a window exists and ``perform_hn_without_windowing`` is set.
        sample_scales: Scale factors, indexed by the winning scale index.

    Returns:
        ``(translation_vec, scale_ind, scores_hn, flag)``. ``scores_hn`` is
        the un-windowed copy of ``scores`` when the window was applied here,
        otherwise ``scores`` itself. ``flag`` is ``'not_found'``,
        ``'hard_negative'``, ``'uncertain'`` or ``'normal'``.
    """
    map_hw = scores.shape[-2:]
    map_size = torch.Tensor(list(map_hw))
    map_center = (map_size - 1) / 2

    # Keep an un-windowed copy for the distractor search if requested.
    hn_scores = scores
    if self.output_window is not None and getattr(self.params, 'perform_hn_without_windowing', False):
        hn_scores = scores.clone()
        scores *= self.output_window

    # Primary peak, taken at the scale with the highest response.
    # NOTE(review): dcf.max2d presumably returns (max value, (row, col)) per map - confirm.
    peak_vals, peak_pos = dcf.max2d(scores)
    _, scale_ind = torch.max(peak_vals, dim=0)
    sample_scale = sample_scales[scale_ind]
    best_score = peak_vals[scale_ind]
    best_pos = peak_pos[scale_ind, ...].float().cpu().view(-1)
    best_disp = best_pos - map_center
    best_shift = best_disp * (self.img_support_sz / self.feature_sz) * sample_scale

    if best_score.item() < self.params.target_not_found_threshold:
        return best_shift, scale_ind, hn_scores, 'not_found'

    # Zero out a window around the primary peak (size expressed in score-map cells).
    neigh_sz = self.params.target_neighborhood_scale * (self.target_sz / sample_scale) * (self.feature_sz / self.img_support_sz)
    peak_row, peak_col = best_pos[0].item(), best_pos[1].item()
    half_h, half_w = neigh_sz[0].item() / 2, neigh_sz[1].item() / 2
    row_lo = max(round(peak_row - half_h), 0)
    row_hi = min(round(peak_row + half_h + 1), map_hw[0])
    col_lo = max(round(peak_col - half_w), 0)
    col_hi = min(round(peak_col + half_w + 1), map_hw[1])
    masked = hn_scores[scale_ind:scale_ind + 1, ...].clone()
    masked[..., row_lo:row_hi, col_lo:col_hi] = 0

    # Secondary peak outside the masked neighbourhood.
    second_score, second_pos = dcf.max2d(masked)
    second_pos = second_pos.float().cpu().view(-1)
    second_disp = second_pos - map_center
    second_shift = second_disp * (self.img_support_sz / self.feature_sz) * sample_scale

    if second_score > self.params.distractor_threshold * best_score:
        # The second peak is strong enough to be a distractor: prefer the
        # peak that stays within the displacement threshold.
        best_dist = torch.sqrt(torch.sum(best_disp ** 2))
        second_dist = torch.sqrt(torch.sum(second_disp ** 2))
        max_dist = self.params.dispalcement_scale * math.sqrt(map_hw[0] * map_hw[1]) / 2

        if second_dist > max_dist and best_dist < max_dist:
            return best_shift, scale_ind, hn_scores, 'hard_negative'
        if second_dist < max_dist and best_dist > max_dist:
            return second_shift, scale_ind, hn_scores, 'hard_negative'
        # Both far away, or both close: keep the highest-scoring peak but
        # report that the result is not trustworthy.
        return best_shift, scale_ind, hn_scores, 'uncertain'

    is_hard_negative = (second_score > self.params.hard_negative_threshold * best_score
                        and second_score > self.params.target_not_found_threshold)
    if is_hard_negative:
        return best_shift, scale_ind, hn_scores, 'hard_negative'

    return best_shift, scale_ind, hn_scores, 'normal'
def localize_and_update_target(self, sf: TensorList, i):
    """Localize the score peak for point ``i`` and update ``self.points[i]``.

    Only the ``'weightedsum'`` score fusion strategy is supported. The
    per-feature Fourier-domain scores are weighted, summed and sampled at
    ``self.output_sz`` before the maximum is searched.

    Args:
        sf: Fourier-domain scores, one entry per feature.
        i: Index of the tracked point to update in ``self.points``.

    Returns:
        ``(position, max_score, scores)`` where ``position`` is the rounded,
        clamped new value of ``self.points[i]``.

    Raises:
        ValueError: If ``self.params.score_fusion_strategy`` is not
            ``'weightedsum'``.
    """
    # Guard first: every other strategy was rejected before any later
    # dispatch could run, so no per-strategy branching is needed below.
    if self.params.score_fusion_strategy != 'weightedsum':
        raise ValueError('Unknown score fusion strategy.')

    weight = self.fparams.attribute('translation_weight')
    scores = fourier.sample_fs(fourier.sum_fs(weight * sf), self.output_sz)

    # Get maximum
    max_score, max_disp = dcf.max2d(scores)
    max_disp = max_disp.float().cpu()

    # Convert to displacements in the base scale: the modulo wraps peak
    # positions into [-output_sz / 2, output_sz / 2).
    half_sz = self.output_sz / 2
    disp = (max_disp + half_sz) % self.output_sz - half_sz

    # Compute translation vector
    translation_vec = disp.view(-1) * (self.img_support_sz / self.output_sz)

    # Update position, clamping between ``inside_offset`` and
    # ``image_sz - inside_offset``.
    new_pos = self.mid_point.round() + translation_vec
    inside_ratio = 0.2
    inside_offset = (inside_ratio - 0.5) * self.target_sz
    upper_bound = self.image_sz - inside_offset
    self.points[i] = torch.max(torch.min(new_pos, upper_bound), inside_offset)

    return self.points[i].round(), max_score, scores
def localize_target(self, scores_raw):
    """Fuse the raw classifier scores and localize the target.

    Two fusion strategies are handled:

    * ``'weightedsum'``: scores are weighted, transformed to the Fourier
      domain, shifted per feature, summed and sampled at ``self.output_sz``.
    * ``'default'``: the single raw score map is used directly.

    Args:
        scores_raw: Raw score maps, one entry per feature.

    Returns:
        The result of ``self.localize_advanced(scores)`` when
        ``advanced_localization`` is enabled, otherwise
        ``(translation_vec, scale_ind, scores, None)``.

    Raises:
        ValueError: For an unknown ``score_fusion_strategy``.
        NotImplementedError: For ``'default'`` fusion with more than one score
            map, or when an output window is set and
            ``perform_hn_without_windowing`` is not enabled.
    """
    strategy = self.params.score_fusion_strategy
    offset = None

    if strategy == 'weightedsum':
        weight = self.fparams.attribute('translation_weight')
        scores_raw = weight * scores_raw
        sf_weighted = fourier.cfft2(scores_raw) / (scores_raw.size(2) * scores_raw.size(3))
        for i, (sz, ksz) in enumerate(zip(self.feature_sz, self.kernel_size)):
            shift = math.pi * (1 - torch.Tensor([ksz[0] % 2, ksz[1] % 2]) / sz)
            sf_weighted[i] = fourier.shift_fs(sf_weighted[i], shift)
        scores_fs = fourier.sum_fs(sf_weighted)
        scores = fourier.sample_fs(scores_fs, self.output_sz)
    elif strategy == 'default':
        if len(scores_raw) > 1:
            raise NotImplementedError('Not implemented')
        scores = scores_raw[0]
        ksz = self.kernel_size[0]
        offset = torch.Tensor([ksz[0] % 2, ksz[1] % 2]) / 2
    else:
        raise ValueError('Unknown score fusion strategy.')

    # Windowing at this stage is explicitly unsupported. The in-place
    # multiplication that used to follow the raise could never execute and
    # has been removed.
    if self.output_window is not None and not getattr(self.params, 'perform_hn_without_windowing', False):
        raise NotImplementedError

    if getattr(self.params, 'advanced_localization', False):
        return self.localize_advanced(scores)

    # Get maximum
    max_score, max_disp = dcf.max2d(scores)
    _, scale_ind = torch.max(max_score, dim=0)
    max_disp = max_disp.float().cpu()

    # Convert to displacements in the base scale
    if strategy == 'default':
        disp = max_disp + offset
    else:
        half_sz = self.output_sz / 2
        disp = (max_disp + half_sz) % self.output_sz - half_sz

    # Compute translation vector and scale change factor
    translation_vec = disp[scale_ind, ...].view(-1) * (self.img_support_sz / self.output_sz) * self.target_scale
    translation_vec *= self.params.scale_factors[scale_ind]

    # Shift the score output for visualization purposes: a circular shift by
    # half the map size along both spatial axes.
    if self.params.debug >= 2:
        sz = scores.shape[-2:]
        scores = torch.roll(scores, shifts=(-(sz[0] // 2), -(sz[1] // 2)), dims=(-2, -1))

    return translation_vec, scale_ind, scores, None
def localize_target(self, scores_raw):
    """Fuse the raw scores in the Fourier domain and localize the target.

    The raw scores are transformed with ``fourier.cfft2``, shifted per
    feature, summed and sampled at ``self.output_sz``. Intermediate values
    are printed when the module-level ``Debug`` flag is truthy.

    Args:
        scores_raw: Raw score maps, one entry per feature.

    Returns:
        The result of ``self.localize_advanced(scores)`` when
        ``advanced_localization`` is enabled, otherwise
        ``(translation_vec, scale_ind, scores, None)``.
    """
    # The translation weight is fetched for debugging only; it is
    # deliberately NOT applied to the raw scores here.
    weight = self.fparams.attribute('translation_weight', 1.0)
    if Debug:
        print("weight : ", weight)

    # Interpolation in the Fourier domain, normalized by the spatial size.
    norm = scores_raw.size(2) * scores_raw.size(3)
    sf_weighted = fourier.cfft2(scores_raw) / norm
    for idx, (feat_sz, ksz) in enumerate(zip(self.feature_sz, self.kernel_size)):
        # Shift each feature's scores in the Fourier domain.
        parity = torch.Tensor([ksz[0] % 2, ksz[1] % 2])
        sf_weighted[idx] = fourier.shift_fs(sf_weighted[idx], math.pi * (1 - parity / feat_sz))

    # Sum the Fourier series expansions of all features.
    scores_fs = fourier.sum_fs(sf_weighted)
    if Debug:
        print("scores_fs : ", scores_fs)

    # Sample the Fourier series on the output grid.
    scores = fourier.sample_fs(scores_fs, self.output_sz)
    if Debug:
        print("scores: ", scores)

    window = self.output_window
    if window is not None and not getattr(self.params, 'perform_hn_without_windowing', False):
        scores *= window

    if getattr(self.params, 'advanced_localization', False):
        if Debug:
            print("advanced: ")
        return self.localize_advanced(scores)

    # Get maximum
    peak_vals, peak_pos = dcf.max2d(scores)
    _, scale_ind = torch.max(peak_vals, dim=0)
    peak_pos = peak_pos.float().cpu()

    # Convert to displacements in the base scale (wrap into the centered range).
    half_sz = self.output_sz / 2
    disp = (peak_pos + half_sz) % self.output_sz - half_sz

    # Compute translation vector and scale change factor
    px_per_cell = self.img_support_sz / self.output_sz
    translation_vec = disp[scale_ind, ...].view(-1) * px_per_cell * self.target_scale
    translation_vec *= self.params.scale_factors[scale_ind]

    # Shift the score output for visualization purposes.
    if self.params.debug >= 2:
        rows, cols = scores.shape[-2:]
        scores = torch.roll(scores, shifts=(-(rows // 2), -(cols // 2)), dims=(-2, -1))

    return translation_vec, scale_ind, scores, None
def localize_target(self, sf: TensorList):
    """Fuse Fourier-domain scores and localize the target.

    Supported values of ``self.params.score_fusion_strategy``:

    * ``'sum'``: plain sum of the per-feature scores.
    * ``'weightedsum'``: sum weighted by ``translation_weight``.
    * ``'transcale'``: each scale is sampled at its own resolution, padded
      to ``self.output_sz`` and blended using ``scale_weight`` and
      ``translation_weight``.

    Args:
        sf: Fourier-domain scores, one entry per feature.

    Returns:
        ``(translation_vec, scale_ind, scores)``.

    Raises:
        ValueError: For an unknown fusion strategy.
    """
    strategy = self.params.score_fusion_strategy
    periodic = strategy in ['sum', 'weightedsum']

    if strategy == 'sum':
        scores = fourier.sample_fs(fourier.sum_fs(sf), self.output_sz)
    elif strategy == 'weightedsum':
        weight = self.fparams.attribute('translation_weight')
        scores = fourier.sample_fs(fourier.sum_fs(weight * sf), self.output_sz)
    elif strategy == 'transcale':
        alpha = self.fparams.attribute('scale_weight')
        beta = self.fparams.attribute('translation_weight')

        # One sampling resolution per scale factor.
        sample_sz = torch.round(self.output_sz.view(1, -1) * self.params.scale_factors.view(-1, 1))

        scores = 0
        for feat_sf, a, b in zip(sf, alpha, beta):
            feat_sf = fourier.shift_fs(feat_sf, math.pi * torch.ones(2))

            per_scale = []
            for scale_idx, scale_sz in enumerate(sample_sz):
                sampled = fourier.sample_fs(feat_sf[scale_idx:scale_idx + 1, ...], scale_sz)
                # Pad back to the common output size; F.pad takes the
                # padding of the last dimension first.
                margin = (self.output_sz - scale_sz) / 2
                margin_h, margin_w = margin[0].item(), margin[1].item()
                padding = (math.floor(margin_w), math.ceil(margin_w),
                           math.floor(margin_h), math.ceil(margin_h))
                per_scale.append(F.pad(sampled, padding))

            scores_cat = torch.cat(per_scale)
            scores = scores + (b - a) * scores_cat.mean(dim=0, keepdim=True) + a * scores_cat
    else:
        raise ValueError('Unknown score fusion strategy.')

    # Get maximum
    peak_vals, peak_pos = dcf.max2d(scores)
    _, scale_ind = torch.max(peak_vals, dim=0)
    peak_pos = peak_pos.float().cpu()

    # Convert to displacements in the base scale
    half_sz = self.output_sz / 2
    if periodic:
        disp = (peak_pos + half_sz) % self.output_sz - half_sz
    else:
        # Only 'transcale' can reach this point.
        disp = peak_pos - half_sz

    # Compute translation vector and scale change factor
    px_per_cell = self.img_support_sz / self.output_sz
    translation_vec = disp[scale_ind, ...].view(-1) * px_per_cell * self.target_scale
    if periodic:
        translation_vec *= self.params.scale_factors[scale_ind]

    return translation_vec, scale_ind, scores
def localize_target_no_fourier(self, scores_raw):
    """Localize the target directly on the first raw score map.

    No Fourier-domain interpolation is performed; peak positions are scaled
    by a fixed factor of 16 to reach output coordinates.

    Args:
        scores_raw: Raw score maps; only ``scores_raw[0]`` is used.

    Returns:
        The result of ``self.localize_advanced_no_fourier(scores_raw[0])``
        when ``advanced_localization`` is enabled, otherwise
        ``(translation_vec, 0, scores_raw, None)``.
    """
    first_map = scores_raw[0]

    if getattr(self.params, 'advanced_localization', False):
        return self.localize_advanced_no_fourier(first_map)

    _, peak_pos = dcf.max2d(first_map)
    peak_pos = peak_pos.float().cpu()

    # Convert to displacements in the base scale.
    # NOTE(review): 16 looks like the feature stride - confirm.
    disp = peak_pos * 16 - self.output_sz / 2

    # Compute translation vector and scale change factor (scale index 0 only).
    px_per_cell = self.img_support_sz / self.output_sz
    translation_vec = disp[0, ...].view(-1) * px_per_cell * self.target_scale
    translation_vec *= self.params.scale_factors[0]

    return translation_vec, 0, scores_raw, None
def localize_target(self, scores, sample_pos, sample_scales):
    """Run the target localization.

    The scores are optionally preprocessed (``score_preprocess``: ``'none'``,
    ``'exp'`` or ``'softmax'``) and smoothed with a box filter
    (``score_filter_ksz``) before the maximum is searched.

    Args:
        scores: Classifier scores; dim 1 is squeezed away first.
        sample_pos: Sample positions, only forwarded to
            ``self.localize_advanced``.
        sample_scales: Scale factors, indexed by the winning scale index.

    Returns:
        The result of ``self.localize_advanced(...)`` when
        ``advanced_localization`` is enabled, otherwise
        ``(translation_vec, scale_ind, scores, None)``.

    Raises:
        Exception: For an unknown ``score_preprocess`` value.
    """
    scores = scores.squeeze(1)

    preprocess_method = self.params.get('score_preprocess', 'none')
    if preprocess_method == 'exp':
        scores = scores.exp()
    elif preprocess_method == 'softmax':
        reg_val = getattr(self.net.classifier.filter_optimizer, 'softmax_reg', None)
        flat = scores.view(scores.shape[0], -1)
        scores = activation.softmax_reg(flat, dim=-1, reg=reg_val).view(scores.shape)
    elif preprocess_method != 'none':
        raise Exception('Unknown score_preprocess in params.')

    # Optional box filter: an all-ones kernel with "same" padding.
    filter_ksz = self.params.get('score_filter_ksz', 1)
    if filter_ksz > 1:
        assert filter_ksz % 2 == 1
        box = scores.new_ones(1, 1, filter_ksz, filter_ksz)
        stacked = scores.view(-1, 1, *scores.shape[-2:])
        smoothed = F.conv2d(stacked, box, padding=filter_ksz // 2)
        scores = smoothed.view(scores.shape)

    if self.params.get('advanced_localization', False):
        return self.localize_advanced(scores, sample_pos, sample_scales)

    # Get maximum
    map_size = torch.Tensor(list(scores.shape[-2:]))
    map_center = (map_size - 1) / 2
    peak_vals, peak_pos = dcf.max2d(scores)
    _, scale_ind = torch.max(peak_vals, dim=0)
    peak_pos = peak_pos[scale_ind, ...].float().cpu().view(-1)
    target_disp = peak_pos - map_center

    # Compute translation vector and scale change factor. The map size is
    # reduced by one along dimensions where the kernel size is even.
    output_sz = map_size - (self.kernel_size + 1) % 2
    translation_vec = target_disp * (self.img_support_sz / output_sz) * sample_scales[scale_ind]

    return translation_vec, scale_ind, scores, None
def localize_target(self, scores, sample_scales):
    """Run the target localization.

    Args:
        scores: Classifier scores; dim 1 is squeezed away first.
        sample_scales: Scale factors, indexed by the winning scale index.

    Returns:
        The result of ``self.localize_advanced(scores, sample_scales)`` when
        ``advanced_localization`` is enabled, otherwise
        ``(translation_vec, scale_ind, scores, None)``.
    """
    scores = scores.squeeze(1)

    if getattr(self.params, 'advanced_localization', False):
        return self.localize_advanced(scores, sample_scales)

    # Get maximum
    map_size = torch.Tensor(list(scores.shape[-2:]))
    map_center = (map_size - 1) / 2
    peak_vals, peak_pos = dcf.max2d(scores)
    _, scale_ind = torch.max(peak_vals, dim=0)

    # Displacement of the winning peak from the score-map center.
    peak_pos = peak_pos[scale_ind, ...].float().cpu().view(-1)
    target_disp = peak_pos - map_center

    # Compute translation vector and scale change factor
    px_per_cell = self.img_support_sz / self.feature_sz
    translation_vec = target_disp * px_per_cell * sample_scales[scale_ind]

    return translation_vec, scale_ind, scores, None
def localize_advanced(self, scores):
    """Does the advanced localization with hard negative detection and target not found.

    The score maps are circularly shifted by half their size, the strongest
    peak over all scales is located, its neighbourhood is zeroed out and a
    second peak is searched for. The two peaks are then compared.

    Args:
        scores: Score maps; dim 0 indexes the scale and the last two dims are
            spatial. Multiplied IN PLACE by ``self.output_window`` when a
            window exists and ``perform_hn_without_windowing`` is set.

    Returns:
        ``(translation_vec, scale_ind, scores, flag)`` where ``flag`` is
        ``'not_found'``, ``'hard_negative'``, ``'uncertain'`` or ``None``.
        The returned ``scores`` are the shifted maps (the un-windowed ones
        once the not-found check has passed and
        ``perform_hn_without_windowing`` is active).
    """
    map_hw = scores.shape[-2:]
    # Rolling by n // 2 is the same circular shift as concatenating the part
    # from index (n + 1) // 2 onwards in front of the leading part.
    half_shift = (map_hw[0] // 2, map_hw[1] // 2)
    keep_unwindowed = (self.output_window is not None
                       and getattr(self.params, 'perform_hn_without_windowing', False))

    if keep_unwindowed:
        unwindowed = torch.roll(scores.clone(), shifts=half_shift, dims=(-2, -1))
        scores *= self.output_window

    # Shift scores back
    scores = torch.roll(scores, shifts=half_shift, dims=(-2, -1))

    # Find maximum
    peak_vals, peak_pos = dcf.max2d(scores)
    _, scale_ind = torch.max(peak_vals, dim=0)
    best_score = peak_vals[scale_ind]
    best_pos = peak_pos[scale_ind, ...].float().cpu().view(-1)
    best_disp = best_pos - self.output_sz // 2
    px_per_cell = self.img_support_sz / self.output_sz
    best_shift = best_disp * px_per_cell * self.target_scale

    if best_score.item() < self.params.target_not_found_threshold:
        return best_shift, scale_ind, scores, 'not_found'

    if keep_unwindowed:
        scores = unwindowed

    # Mask out target neighborhood
    neigh_sz = self.params.target_neighborhood_scale * self.target_sz / self.target_scale
    peak_row, peak_col = best_pos[0].item(), best_pos[1].item()
    half_h, half_w = neigh_sz[0].item() / 2, neigh_sz[1].item() / 2
    row_lo = max(round(peak_row - half_h), 0)
    row_hi = min(round(peak_row + half_h + 1), map_hw[0])
    col_lo = max(round(peak_col - half_w), 0)
    col_hi = min(round(peak_col + half_w + 1), map_hw[1])
    masked = scores[scale_ind:scale_ind + 1, ...].clone()
    masked[..., row_lo:row_hi, col_lo:col_hi] = 0

    # Find new maximum
    second_score, second_pos = dcf.max2d(masked)
    second_pos = second_pos.float().cpu().view(-1)
    second_disp = second_pos - self.output_sz // 2
    second_shift = second_disp * px_per_cell * self.target_scale

    # Handle the different cases
    if second_score > self.params.distractor_threshold * best_score:
        best_dist = torch.sqrt(torch.sum(best_disp ** 2))
        second_dist = torch.sqrt(torch.sum(second_disp ** 2))
        max_dist = self.params.dispalcement_scale * math.sqrt(map_hw[0] * map_hw[1]) / 2

        if second_dist > max_dist and best_dist < max_dist:
            return best_shift, scale_ind, scores, 'hard_negative'
        if second_dist < max_dist and best_dist > max_dist:
            return second_shift, scale_ind, scores, 'hard_negative'
        # Both peaks far away, or the distractor is close as well: return
        # the one with the highest score and flag the result.
        return best_shift, scale_ind, scores, 'uncertain'

    is_hard_negative = (second_score > self.params.hard_negative_threshold * best_score
                        and second_score > self.params.target_not_found_threshold)
    if is_hard_negative:
        return best_shift, scale_ind, scores, 'hard_negative'

    return best_shift, scale_ind, scores, None
def localize_advanced(self, scores):
    """Advanced localization with hard-negative and target-not-found detection.

    With the ``'weightedsum'`` fusion strategy the score maps are first
    circularly shifted by half their size; with any other strategy they are
    used as given. The strongest peak over all scales is then located, its
    neighbourhood is zeroed out and a second peak is searched for.

    Fix: when ``perform_hn_without_windowing`` is active, the un-windowed
    copy is now shifted together with the windowed scores in
    ``'weightedsum'`` mode. Previously it stayed in the unshifted layout
    while the peak coordinates came from the shifted map, so the
    neighbourhood mask (and the returned score map) were misaligned.

    Args:
        scores: Score maps; dim 0 indexes the scale and the last two dims are
            spatial. Multiplied IN PLACE by ``self.output_window`` when a
            window exists and ``perform_hn_without_windowing`` is set.

    Returns:
        ``(translation_vec, scale_ind, scores, flag)`` where ``flag`` is
        ``'not_found'``, ``'hard_negative'``, ``'uncertain'`` or ``None``.
    """
    sz = scores.shape[-2:]
    hn_without_window = (self.output_window is not None
                         and getattr(self.params, 'perform_hn_without_windowing', False))

    def _shift_to_center(score_maps):
        # Circular shift along both spatial axes: the part starting at
        # (n + 1) // 2 is moved in front of the leading part.
        score_maps = torch.cat([score_maps[..., (sz[0] + 1) // 2:, :],
                                score_maps[..., :(sz[0] + 1) // 2, :]], -2)
        return torch.cat([score_maps[..., :, (sz[1] + 1) // 2:],
                          score_maps[..., :, :(sz[1] + 1) // 2]], -1)

    if hn_without_window:
        scores_orig = scores.clone()
        scores *= self.output_window

    if self.params.score_fusion_strategy == 'weightedsum':
        scores = _shift_to_center(scores)
        if hn_without_window:
            # Keep the un-windowed copy in the same coordinate frame as the
            # peak position computed below.
            scores_orig = _shift_to_center(scores_orig)
    # NOTE(review): the non-'weightedsum' branch used to compute a
    # half-pixel ``offset`` from ``self.kernel_size[0]`` that was never
    # used; confirm whether it should be added to the displacements.

    # Find maximum
    max_score1, max_disp1 = dcf.max2d(scores)
    _, scale_ind = torch.max(max_score1, dim=0)
    max_score1 = max_score1[scale_ind]
    max_disp1 = max_disp1[scale_ind, ...].float().cpu().view(-1)
    target_disp1 = max_disp1 - self.output_sz // 2
    translation_vec1 = target_disp1 * (self.img_support_sz / self.output_sz) * self.target_scale

    if max_score1.item() < self.params.target_not_found_threshold:
        return translation_vec1, scale_ind, scores, 'not_found'

    if hn_without_window:
        scores = scores_orig

    # Mask out target neighborhood. With ``use_hn_fix`` the neighbourhood
    # size is additionally rescaled by ``output_sz / img_support_sz``.
    if getattr(self.params, 'use_hn_fix', False):
        target_neigh_sz = (self.params.target_neighborhood_scale
                           * (self.target_sz / self.target_scale)
                           * (self.output_sz / self.img_support_sz))
    else:
        target_neigh_sz = self.params.target_neighborhood_scale * self.target_sz / self.target_scale

    tneigh_top = max(round(max_disp1[0].item() - target_neigh_sz[0].item() / 2), 0)
    tneigh_bottom = min(round(max_disp1[0].item() + target_neigh_sz[0].item() / 2 + 1), sz[0])
    tneigh_left = max(round(max_disp1[1].item() - target_neigh_sz[1].item() / 2), 0)
    tneigh_right = min(round(max_disp1[1].item() + target_neigh_sz[1].item() / 2 + 1), sz[1])
    scores_masked = scores[scale_ind:scale_ind + 1, ...].clone()
    scores_masked[..., tneigh_top:tneigh_bottom, tneigh_left:tneigh_right] = 0

    # Find new maximum
    max_score2, max_disp2 = dcf.max2d(scores_masked)
    max_disp2 = max_disp2.float().cpu().view(-1)
    target_disp2 = max_disp2 - self.output_sz // 2
    translation_vec2 = target_disp2 * (self.img_support_sz / self.output_sz) * self.target_scale

    # Handle the different cases
    if max_score2 > self.params.distractor_threshold * max_score1:
        disp_norm1 = torch.sqrt(torch.sum(target_disp1 ** 2))
        disp_norm2 = torch.sqrt(torch.sum(target_disp2 ** 2))
        disp_threshold = self.params.dispalcement_scale * math.sqrt(sz[0] * sz[1]) / 2

        if disp_norm2 > disp_threshold and disp_norm1 < disp_threshold:
            return translation_vec1, scale_ind, scores, 'hard_negative'
        if disp_norm2 < disp_threshold and disp_norm1 > disp_threshold:
            return translation_vec2, scale_ind, scores, 'hard_negative'
        # Both peaks far away, or the distractor is close as well: return
        # the one with the highest score and flag the result.
        return translation_vec1, scale_ind, scores, 'uncertain'

    if (max_score2 > self.params.hard_negative_threshold * max_score1
            and max_score2 > self.params.target_not_found_threshold):
        return translation_vec1, scale_ind, scores, 'hard_negative'

    return translation_vec1, scale_ind, scores, None