def dimpnet50(filter_size=1, optim_iter=5, optim_init_step=1.0, optim_init_reg=0.01,
              classification_layer='layer3', feat_stride=16, backbone_pretrained=True,
              clf_feat_blocks=0, clf_feat_norm=True, init_filter_norm=False,
              final_conv=True, out_feature_dim=512, init_gauss_sigma=1.0,
              num_dist_bins=5, bin_displacement=1.0, mask_init_factor=4.0,
              iou_input_dim=(256, 256), iou_inter_dim=(256, 256), score_act='relu',
              act_param=None, target_mask_act='sigmoid', detach_length=float('Inf'),
              frozen_backbone_layers=()):
    """Construct a ResNet-50 DiMP network whose classifier uses a transformer.

    Args:
        filter_size: Spatial size of the target classification filter.
        optim_iter: Number of steepest-descent iterations in the filter optimizer.
        optim_init_step: Initial step length of the filter optimizer.
        optim_init_reg: Initial filter regularization weight.
        classification_layer: Backbone layer used for classification
            ('layer3' or 'layer4').
        feat_stride: Stride of the classification features.
        backbone_pretrained: Whether to load pretrained backbone weights.
        frozen_backbone_layers: Backbone layers kept frozen during training.
        (Remaining arguments are forwarded unchanged to the classifier-feature,
        initializer, optimizer and IoU-net constructors.)

    Returns:
        The assembled DiMPnet module.

    Raises:
        ValueError: If `classification_layer` is neither 'layer3' nor 'layer4'.
    """
    # Backbone
    backbone_net = backbones.resnet50(pretrained=backbone_pretrained,
                                      frozen_layers=frozen_backbone_layers)

    # Feature normalization scale for the classification features.
    norm_scale = math.sqrt(1.0 / (out_feature_dim * filter_size * filter_size))

    # Classifier input dimensionality depends on the chosen backbone layer.
    if classification_layer == 'layer3':
        feature_dim = 256
    elif classification_layer == 'layer4':
        feature_dim = 512
    else:
        # Fix: the original bare `raise Exception` carried no diagnostic.
        # ValueError subclasses Exception, so existing handlers still match.
        raise ValueError('Unknown classification_layer: {!r}'.format(classification_layer))

    clf_feature_extractor = clf_features.residual_bottleneck(
        feature_dim=feature_dim, num_blocks=clf_feat_blocks, l2norm=clf_feat_norm,
        final_conv=final_conv, norm_scale=norm_scale, out_dim=out_feature_dim)

    # Initializer for the DiMP classifier
    initializer = clf_initializer.FilterInitializerLinear(
        filter_size=filter_size, filter_norm=init_filter_norm,
        feature_dim=out_feature_dim)

    # Optimizer for the DiMP classifier
    optimizer = clf_optimizer.DiMPSteepestDescentGN(
        num_iter=optim_iter, feat_stride=feat_stride,
        init_step_length=optim_init_step, init_filter_reg=optim_init_reg,
        init_gauss_sigma=init_gauss_sigma, num_dist_bins=num_dist_bins,
        bin_displacement=bin_displacement, mask_init_factor=mask_init_factor,
        score_act=score_act, act_param=act_param, mask_act=target_mask_act,
        detach_length=detach_length)

    # Transformer used inside the classifier.
    init_transformer = transformer.Transformer(d_model=512, nhead=1, num_layers=1)

    # The classifier module
    classifier = target_clf.LinearFilter(
        filter_size=filter_size, filter_initializer=initializer,
        filter_optimizer=optimizer, feature_extractor=clf_feature_extractor,
        transformer=init_transformer)

    # Bounding box regressor
    bb_regressor = bbmodels.AtomIoUNet(input_dim=(4 * 128, 4 * 256),
                                       pred_input_dim=iou_input_dim,
                                       pred_inter_dim=iou_inter_dim)

    # DiMP network
    net = DiMPnet(feature_extractor=backbone_net, classifier=classifier,
                  bb_regressor=bb_regressor,
                  classification_layer=classification_layer,
                  bb_regressor_layer=['layer2', 'layer3'])
    return net
def kysnet_res50(filter_size=4, optim_iter=3, appearance_feature_dim=512,
                 optim_init_step=0.9, optim_init_reg=0.1,
                 classification_layer='layer3', backbone_pretrained=True,
                 clf_feat_blocks=0, clf_feat_norm=True, final_conv=True,
                 init_filter_norm=False, mask_init_factor=3.0, score_act='relu',
                 target_mask_act='sigmoid', num_dist_bins=100, bin_displacement=0.1,
                 detach_length=float('Inf'), train_feature_extractor=True,
                 train_iounet=True, iou_input_dim=(256, 256), iou_inter_dim=(256, 256),
                 cv_kernel_size=3, cv_max_displacement=9, cv_stride=1,
                 init_gauss_sigma=1.0, state_dim=8,
                 representation_predictor_dims=(64, 32), gru_ksz=3,
                 conf_measure='max', dimp_thresh=None):
    """Construct a KYS network on a ResNet-50 backbone.

    Combines a DiMP-style appearance classifier, an ATOM IoU-net bounding-box
    regressor, and a cost-volume-based motion/response predictor.

    Returns:
        The assembled KYSNet module.
    """
    # ######################## backbone ########################
    resnet_backbone = backbones.resnet50(pretrained=backbone_pretrained)

    # Normalization scale for the classification features.
    feat_norm_scale = math.sqrt(
        1.0 / (appearance_feature_dim * filter_size * filter_size))

    # ######################## classifier ########################
    clf_feat_extractor = clf_features.residual_bottleneck(
        num_blocks=clf_feat_blocks, l2norm=clf_feat_norm,
        final_conv=final_conv, norm_scale=feat_norm_scale,
        out_dim=appearance_feature_dim)

    # Initializer for the DiMP classifier
    filter_init = clf_initializer.FilterInitializerLinear(
        filter_size=filter_size, filter_norm=init_filter_norm,
        feature_dim=appearance_feature_dim)

    # Optimizer for the DiMP classifier
    filter_opt = clf_optimizer.DiMPSteepestDescentGN(
        num_iter=optim_iter, feat_stride=16, init_step_length=optim_init_step,
        init_filter_reg=optim_init_reg, init_gauss_sigma=init_gauss_sigma,
        num_dist_bins=num_dist_bins, bin_displacement=bin_displacement,
        mask_init_factor=mask_init_factor, score_act=score_act, act_param=None,
        mask_act=target_mask_act, detach_length=detach_length)

    # The classifier module
    dimp_classifier = target_clf.LinearFilter(
        filter_size=filter_size, filter_initializer=filter_init,
        filter_optimizer=filter_opt, feature_extractor=clf_feat_extractor)

    # Bounding box regressor
    iou_net = bbmodels.AtomIoUNet(input_dim=(4 * 128, 4 * 256),
                                  pred_input_dim=iou_input_dim,
                                  pred_inter_dim=iou_inter_dim)

    # Cost volume plus the recurrent response predictor, wrapped together.
    cv_layer = cost_volume.CostVolume(cv_kernel_size, cv_max_displacement,
                                      stride=cv_stride,
                                      abs_coordinate_output=True)

    resp_predictor = resp_pred.ResponsePredictor(
        state_dim=state_dim,
        representation_predictor_dims=representation_predictor_dims,
        gru_ksz=gru_ksz, conf_measure=conf_measure, dimp_thresh=dimp_thresh)

    wrapped_predictor = predictor_wrappers.PredictorWrapper(
        cv_layer, resp_predictor)

    model = KYSNet(backbone_feature_extractor=resnet_backbone,
                   dimp_classifier=dimp_classifier,
                   predictor=wrapped_predictor,
                   bb_regressor=iou_net,
                   classification_layer=classification_layer,
                   bb_regressor_layer=['layer2', 'layer3'],
                   train_feature_extractor=train_feature_extractor,
                   train_iounet=train_iounet)
    return model
def dimpnet50(filter_size=1, optim_iter=5, optim_init_step=1.0, optim_init_reg=0.01,
              classification_layer='layer3', feat_stride=16, backbone_pretrained=True,
              clf_feat_blocks=0, clf_feat_norm=True, init_filter_norm=False,
              final_conv=True, out_feature_dim=512, init_gauss_sigma=1.0,
              num_dist_bins=5, bin_displacement=1.0, mask_init_factor=4.0,
              iou_input_dim=(256, 256), iou_inter_dim=(256, 256), score_act='relu',
              act_param=None, target_mask_act='sigmoid', detach_length=float('Inf')):
    """Construct the RGB-D blend variant (DiMPnet_rgbd_blend1) of DiMP-50.

    Fix: removed two leftover debug `print(...)` statements ("Song in
    ltr.models.tracking...") that logged on every construction.

    NOTE(review): `settings` is referenced throughout but is not a parameter
    of this function — presumably a module-level global in the original file;
    confirm before reuse. Also note that if this module defines another
    `dimpnet50` earlier, this later definition shadows it — verify intended.

    Returns:
        The assembled DiMPnet_rgbd_blend1 module.
    """
    # Backbone
    backbone_net = backbones.resnet50(pretrained=backbone_pretrained)

    # Feature normalization scale for the classification features.
    norm_scale = math.sqrt(1.0 / (out_feature_dim * filter_size * filter_size))

    # Classifier features
    clf_feature_extractor = clf_features.residual_bottleneck(
        num_blocks=clf_feat_blocks, l2norm=clf_feat_norm, final_conv=final_conv,
        norm_scale=norm_scale, out_dim=out_feature_dim)

    # Initializer for the DiMP classifier
    initializer = clf_initializer.FilterInitializerLinear(
        settings=settings, filter_size=filter_size,
        filter_norm=init_filter_norm, feature_dim=out_feature_dim)

    # Optimizer for the DiMP classifier
    optimizer = clf_optimizer.DiMPSteepestDescentGN(
        settings=settings, num_iter=optim_iter, feat_stride=feat_stride,
        init_step_length=optim_init_step, init_filter_reg=optim_init_reg,
        init_gauss_sigma=init_gauss_sigma, num_dist_bins=num_dist_bins,
        bin_displacement=bin_displacement, mask_init_factor=mask_init_factor,
        score_act=score_act, act_param=act_param, mask_act=target_mask_act,
        detach_length=detach_length)

    # The classifier module
    classifier = target_clf.LinearFilter(
        settings=settings, filter_size=filter_size,
        filter_initializer=initializer, filter_optimizer=optimizer,
        feature_extractor=clf_feature_extractor)

    # Bounding box regressor for rgb
    bb_regressor = bbmodels.AtomIoUNet(settings=settings,
                                       input_dim=(4 * 128, 4 * 256),
                                       pred_input_dim=iou_input_dim,
                                       pred_inter_dim=iou_inter_dim)

    # DiMP network
    net = DiMPnet_rgbd_blend1(settings=settings,
                              feature_extractor=backbone_net,
                              classifier=classifier,
                              bb_regressor=bb_regressor,
                              classification_layer=classification_layer,
                              bb_regressor_layer=['layer2', 'layer3'])
    return net
def fcotnet(clf_filter_size=4, reg_filter_size=3, optim_iter=5, optim_init_step=1.0,
            optim_init_reg=0.01, classification_layer='layer3', feat_stride=16,
            backbone_pretrained=True, clf_feat_blocks=0, clf_feat_norm=True,
            init_filter_norm=False, final_conv=True, out_feature_dim=512,
            norm_scale_coef=2, init_gauss_sigma=1.0, num_dist_bins=5,
            bin_displacement=1.0, mask_init_factor=4.0, score_act='relu',
            act_param=None, target_mask_act='sigmoid', detach_length=float('Inf'),
            train_cls_72_and_reg_init=True, train_reg_optimizer=False,
            train_cls_18=False):
    """Construct an FCOT network on a ResNet-50 backbone.

    Builds an FPN-style feature pyramid, two DiMP-style classifiers at
    strides 4 (72x72 map) and 16 (18x18 map), and a steepest-descent
    regression branch at stride 4.

    Returns:
        The assembled FCOTNet module.
    """
    # Backbone
    backbone = backbones.resnet50(pretrained=backbone_pretrained)

    # Feature pyramid: first lateral conv plus two upsampling stages.
    first_conv = FPNUpBlock(res_channels=1024, planes=256,
                            smooth_output=False, first_conv=True)
    pyramid_36 = FPNUpBlock(res_channels=512, planes=256,
                            smooth_output=False, first_conv=False)
    pyramid_72 = FPNUpBlock(res_channels=256, planes=256,
                            smooth_output=True, first_conv=False)

    # --- classifier operating on the stride-4 (72x72) features ---
    scale_72 = math.sqrt(norm_scale_coef /
                         (256 * clf_filter_size * clf_filter_size))
    head_72 = clf_features.clf_head_72(feature_dim=256, l2norm=clf_feat_norm,
                                       norm_scale=scale_72, out_dim=256,
                                       inner_dim=128)
    init_72 = clf_initializer.FilterInitializerLinear(
        filter_size=clf_filter_size, filter_norm=init_filter_norm,
        feature_dim=256, feature_stride=4)
    opt_72 = clf_optimizer.DiMPSteepestDescentGN(
        num_iter=optim_iter, feat_stride=4, init_step_length=optim_init_step,
        init_filter_reg=optim_init_reg, init_gauss_sigma=init_gauss_sigma,
        num_dist_bins=num_dist_bins, bin_displacement=bin_displacement,
        mask_init_factor=mask_init_factor, score_act=score_act,
        act_param=act_param, mask_act=target_mask_act,
        detach_length=detach_length)
    cls_72 = target_clf.LinearFilter(filter_size=clf_filter_size,
                                     filter_initializer=init_72,
                                     filter_optimizer=opt_72,
                                     feature_extractor=head_72)

    # --- classifier operating on the stride-16 (18x18) features ---
    # (Same architecture as the DiMP classifier.)
    scale_18 = math.sqrt(
        1.0 / (out_feature_dim * clf_filter_size * clf_filter_size))
    head_18 = clf_features.clf_head_18(num_blocks=clf_feat_blocks,
                                       l2norm=clf_feat_norm,
                                       final_conv=final_conv,
                                       norm_scale=scale_18,
                                       out_dim=out_feature_dim)
    init_18 = clf_initializer.FilterInitializerLinear(
        filter_size=clf_filter_size, filter_norm=init_filter_norm,
        feature_dim=out_feature_dim)
    opt_18 = clf_optimizer.DiMPSteepestDescentGN(
        num_iter=optim_iter, feat_stride=feat_stride,
        init_step_length=optim_init_step, init_filter_reg=optim_init_reg,
        init_gauss_sigma=init_gauss_sigma, num_dist_bins=num_dist_bins,
        bin_displacement=bin_displacement, mask_init_factor=mask_init_factor,
        score_act=score_act, act_param=act_param, mask_act=target_mask_act,
        detach_length=detach_length)
    cls_18 = target_clf.LinearFilter(filter_size=clf_filter_size,
                                     filter_initializer=init_18,
                                     filter_optimizer=opt_18,
                                     feature_extractor=head_18)

    # --- regression branch on the stride-4 features ---
    reg_opt_72 = reg_optimizer.RegSteepestDescentGN(
        num_iter=optim_iter, feat_stride=4, init_step_length=1.0,
        init_filter_reg=optim_init_reg, detach_length=detach_length)
    reg_72 = RegFilter(
        pool_size=reg_filter_size, filter_dim=4, filter_channel=256,
        input_features_size=72, input_features_channel=256, inner_channel=128,
        filter_optimizer=reg_opt_72, train_reg_optimizer=train_reg_optimizer,
        train_cls_72_and_reg_init=train_cls_72_and_reg_init)

    # FCOT network
    model = FCOTNet(feature_extractor=backbone,
                    classification_layer=classification_layer,
                    pyramid_first_conv=first_conv,
                    pyramid_36=pyramid_36,
                    pyramid_72=pyramid_72,
                    classifier_18=cls_18,
                    classifier_72=cls_72,
                    regressor_72=reg_72,
                    train_reg_optimizer=train_reg_optimizer,
                    train_cls_18=train_cls_18,
                    train_cls_72_and_reg_init=train_cls_72_and_reg_init)
    return model
def dimpnet18(filter_size=1, optim_iter=5, optim_init_step=1.0, optim_init_reg=0.01,
              classification_layer='layer3', feat_stride=16, backbone_pretrained=True,
              clf_feat_blocks=1, clf_feat_norm=True, init_filter_norm=False,
              final_conv=True, out_feature_dim=256, init_gauss_sigma=1.0,
              num_dist_bins=5, bin_displacement=1.0, mask_init_factor=4.0,
              iou_input_dim=(256, 256), iou_inter_dim=(256, 256), score_act='relu',
              act_param=None, target_mask_act='sigmoid', detach_length=float('Inf')):
    """Construct the RGB-D variant of the ResNet-18 DiMP network.

    Uses separate classifier feature extractors for the RGB and depth
    streams, an RGB-D filter initializer, and an RGB-D IoU-net.

    Returns:
        The assembled DiMPnet_rgbd_cls module.
    """
    # Backbone
    resnet_backbone = backbones.resnet18(pretrained=backbone_pretrained)

    # Normalization scale for the classification features.
    feat_norm_scale = math.sqrt(1.0 / (out_feature_dim * filter_size * filter_size))

    # One classifier feature head per modality (RGB and depth).
    clf_feat_rgb = clf_features.residual_basic_block(
        num_blocks=clf_feat_blocks, l2norm=clf_feat_norm,
        final_conv=final_conv, norm_scale=feat_norm_scale,
        out_dim=out_feature_dim)
    clf_feat_depth = clf_features.residual_basic_block(
        num_blocks=clf_feat_blocks, l2norm=clf_feat_norm,
        final_conv=final_conv, norm_scale=feat_norm_scale,
        out_dim=out_feature_dim)

    # Initializer for the DiMP classifier (RGB-D variant).
    filter_init = clf_initializer.FilterInitializerLinear_rgbd(
        filter_size=filter_size, filter_norm=init_filter_norm,
        feature_dim=out_feature_dim)

    # Optimizer for the DiMP classifier
    filter_opt = clf_optimizer.DiMPSteepestDescentGN(
        num_iter=optim_iter, feat_stride=feat_stride,
        init_step_length=optim_init_step, init_filter_reg=optim_init_reg,
        init_gauss_sigma=init_gauss_sigma, num_dist_bins=num_dist_bins,
        bin_displacement=bin_displacement, mask_init_factor=mask_init_factor,
        score_act=score_act, act_param=act_param, mask_act=target_mask_act,
        detach_length=detach_length)

    # The classifier module
    rgbd_classifier = target_clf_rgbd.LinearFilter(
        filter_size=filter_size, filter_initializer=filter_init,
        filter_optimizer=filter_opt, feature_extractor=clf_feat_rgb,
        feature_extractor_depth=clf_feat_depth)

    # Bounding box regressor
    iou_net = bbmodels.AtomIoUNet_rgbd(pred_input_dim=iou_input_dim,
                                       pred_inter_dim=iou_inter_dim)

    # DiMP network
    model = DiMPnet_rgbd_cls(feature_extractor=resnet_backbone,
                             classifier=rgbd_classifier,
                             bb_regressor=iou_net,
                             classification_layer=classification_layer,
                             bb_regressor_layer=['layer2', 'layer3'])
    return model
def initialize(self, image, gt_bbox, backbone, extract_classification_feat,
               info: dict) -> dict:
    """Initialize the tracker on the first frame.

    Builds the DiMP classifier components, sets up the target position/size,
    the sampling geometry and scale bounds, extracts the initial backbone
    features, and initializes the classifier (and IoU-net if enabled).

    Args:
        image: First frame as a numpy image.
        gt_bbox: Ground-truth box (x, y, w, h) for the target.
        backbone: Backbone feature extractor module.
        extract_classification_feat: Classification feature head.
        info: Extra initialization info (unused here).

    Returns:
        Dict with the elapsed initialization 'time' in seconds.
    """
    # Frame counter and device selection.
    self.frame_num = 1
    if not self.params.has('device'):
        self.params.device = 'cuda' if self.params.use_gpu else 'cpu'

    self.backbone = backbone
    self.extract_classification_feat = extract_classification_feat

    # Initializer for the DiMP classifier.
    # NOTE(review): feature_dim=512 assumed to match the classification
    # head's output dim — confirm against the feature extractor.
    self.initializer = clf_initializer.FilterInitializerLinear(
        filter_size=1, filter_norm=False, feature_dim=512)

    # Optimizer for the DiMP classifier.
    self.optimizer = clf_optimizer.DiMPSteepestDescentGN(
        num_iter=5, feat_stride=8, init_step_length=1.0, init_filter_reg=0.01,
        init_gauss_sigma=1.0, num_dist_bins=5, bin_displacement=1.0,
        mask_init_factor=4.0, score_act='relu', act_param=None,
        mask_act='sigmoid', detach_length=float('Inf'))

    # The classifier module.
    self.classifier = target_clf.LinearFilter(
        filter_size=1, filter_initializer=self.initializer,
        filter_optimizer=self.optimizer,
        feature_extractor=self.extract_classification_feat)

    # Start timing the initialization.
    start_time = time.time()

    # Convert image to a torch tensor.
    image_tensor = numpy_to_torch(image)

    # Target center (y, x) and size (h, w) from the (x, y, w, h) box.
    bbox = gt_bbox
    self.pos = torch.Tensor([bbox[1] + (bbox[3] - 1) / 2,
                             bbox[0] + (bbox[2] - 1) / 2])
    self.target_sz = torch.Tensor([bbox[3], bbox[2]])  # (h, w)

    # Image and sample sizes.
    self.image_sz = torch.Tensor([image_tensor.shape[2], image_tensor.shape[3]])
    sample_sz = self.params.image_sample_size
    sample_sz = torch.Tensor([sample_sz, sample_sz]
                             if isinstance(sample_sz, int) else sample_sz)
    if self.params.get('use_image_aspect_ratio', False):
        # Keep the image aspect ratio while preserving the sample area,
        # then snap to the feature stride.
        sample_sz = self.image_sz * sample_sz.prod().sqrt() / self.image_sz.prod().sqrt()
        stride = self.params.get('feature_stride', 8)
        sample_sz = torch.round(sample_sz / stride) * stride
    self.img_sample_sz = sample_sz
    self.img_support_sz = self.img_sample_sz

    # Search area and resulting target scale.
    area = torch.prod(self.target_sz * self.params.search_area_scale).item()
    self.target_scale = math.sqrt(area) / self.img_sample_sz.prod().sqrt()

    # Target size in the base scale.
    self.base_target_sz = self.target_sz / self.target_scale

    # Scale factors for multi-scale search.
    if not self.params.has('scale_factors'):
        self.params.scale_factors = torch.ones(1)
    elif isinstance(self.params.scale_factors, (list, tuple)):
        self.params.scale_factors = torch.Tensor(self.params.scale_factors)

    # Scale bounds so the target never shrinks below 10px or exceeds the image.
    self.min_scale_factor = torch.max(10 / self.base_target_sz)
    self.max_scale_factor = torch.min(self.image_sz / self.base_target_sz)

    # Extract and transform initial samples.
    init_backbone_feat = self.generate_init_samples(image_tensor)

    # Initialize classifier.
    self.init_classifier(init_backbone_feat)

    # Initialize IoUNet if enabled.
    if self.params.get('use_iou_net', True):
        self.init_iou_net(init_backbone_feat)

    return {'time': time.time() - start_time}