def SiamRPN_track_upd(state, im, updatenet):
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']
    target_sz = state['target_sz']

    wc_z = target_sz[1] + p.context_amount * sum(target_sz)
    hc_z = target_sz[0] + p.context_amount * sum(target_sz)
    s_z = np.sqrt(wc_z * hc_z)
    scale_z = p.exemplar_size / s_z
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad

    # extract scaled crops for search region x at previous target position
    x_crop = Variable(
        get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                               avg_chans).unsqueeze(0))

    target_pos, target_sz, score = tracker_eval(net, x_crop.cuda(), target_pos,
                                                target_sz * scale_z, window,
                                                scale_z, p)
    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))

    # extract a new exemplar crop z at the updated position to refresh the template
    z_crop = Variable(
        get_subwindow_tracking(im, target_pos, p.exemplar_size, round(s_z),
                               avg_chans).unsqueeze(0))
    z_f = net.featextract(z_crop.cuda())
    # z_f_ = (1 - zLR) * Variable(state['z_f']).cuda() + zLR * z_f
    # temp = np.concatenate((init, pre, cur), axis=1)
    temp = torch.cat(
        (Variable(state['z_0']).cuda(), Variable(state['z_f']).cuda(), z_f), 1)
    init_inp = Variable(state['z_0']).cuda()
    z_f_ = updatenet(temp, init_inp)
    # print('updatenet input: ', temp.shape, init_inp.shape)
    # print('updatenet output: ', z_f_.shape)
    net.kernel(z_f_)

    state['z_f'] = z_f_.cpu().data
    state['net'] = net
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = score
    return state
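# A worked example (illustration only, not part of the tracker) of the crop
# geometry that recurs throughout these functions, assuming the common default
# values exemplar_size=127, instance_size=271, context_amount=0.5 and an
# assumed 100x60 target:
import numpy as np

target_sz = np.array([100.0, 60.0])                       # (w, h) in image pixels
context_amount, exemplar_size, instance_size = 0.5, 127, 271
wc_z = target_sz[1] + context_amount * sum(target_sz)     # 60 + 80 = 140
hc_z = target_sz[0] + context_amount * sum(target_sz)     # 100 + 80 = 180
s_z = np.sqrt(wc_z * hc_z)                                # sqrt(25200) ~= 158.7 px
scale_z = exemplar_size / s_z                             # ~0.80 model px per image px
d_search = (instance_size - exemplar_size) / 2            # 72 model px of extra context
pad = d_search / scale_z                                  # ~90 image px per side
s_x = s_z + 2 * pad                                       # ~338.7 px crop, resized to 271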
def SiamRPN_init_batch(exemplar_list, exemplar_cxy_list, net):
    train_config = dict()
    batch_size = len(exemplar_list)
    p = TrackerConfig()
    p.update(net.cfg)
    p.anchor = generate_anchor(p.total_stride, p.scales, p.ratios,
                               int(p.score_size))

    avg_chans_list = [None for _ in range(batch_size)]
    z_list = [None for _ in range(batch_size)]
    z_large_list = [None for _ in range(batch_size)]
    for batch in range(batch_size):
        target_pos = exemplar_cxy_list[batch][0]
        target_sz = exemplar_cxy_list[batch][1]
        avg_chans = np.mean(exemplar_list[batch], axis=(0, 1))
        avg_chans_list[batch] = avg_chans

        wc_z = target_sz[0] + p.context_amount * sum(target_sz)
        hc_z = target_sz[1] + p.context_amount * sum(target_sz)
        s_z = round(np.sqrt(wc_z * hc_z))
        scale_z = p.exemplar_size / s_z
        d_search = (p.instance_size - p.exemplar_size) / 2
        pad = d_search / scale_z
        s_x = s_z + 2 * pad

        # initialize the exemplar
        z_crop = get_subwindow_tracking(exemplar_list[batch], target_pos,
                                        p.exemplar_size, s_z, avg_chans)
        z_crop_large = get_subwindow_tracking(exemplar_list[batch], target_pos,
                                              p.instance_size, round(s_x),
                                              avg_chans)
        z_list[batch] = z_crop.unsqueeze(0)
        z_large_list[batch] = z_crop_large.unsqueeze(0)

    z_batch = torch.cat(z_list, dim=0)
    z_large_batch = torch.cat(z_large_list, dim=0)
    assert z_batch.size(0) == batch_size
    assert z_large_batch.size(0) == batch_size
    net.temple(z_batch.cuda(), z_large_batch.cuda())

    train_config['avg_chans_list'] = avg_chans_list
    train_config['p'] = p
    train_config['net'] = net
    return train_config
def SiamRPN_track(state, im):
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']
    target_sz = state['target_sz']
    ctr = state['ctr']

    wc_z = target_sz[1] + p.context_amount * sum(target_sz)
    hc_z = target_sz[0] + p.context_amount * sum(target_sz)
    s_z = np.sqrt(wc_z * hc_z)
    scale_z = p.exemplar_size / s_z
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad

    # extract scaled crops for search region x at previous target position
    x_crop = Variable(
        get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                               avg_chans).unsqueeze(0))

    target_pos, target_sz, score = tracker_eval(net, x_crop.cuda(), target_pos,
                                                target_sz * scale_z, window,
                                                scale_z, p)
    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = score
    state['ctr'] = ctr + 1

    if ctr % 50 == 4:
        label = np.zeros(im.shape)
        x_low, x_high = target_pos[0] - target_sz[0] / 2, target_pos[0] + target_sz[0] / 2
        y_low, y_high = target_pos[1] - target_sz[1] / 2, target_pos[1] + target_sz[1] / 2
        x_low, x_high, y_low, y_high = int(x_low), int(x_high), int(y_low), int(y_high)
        label[y_low:y_high, x_low:x_high, :] = 2
        label = get_subwindow_tracking(label, target_pos, p.instance_size,
                                       round(s_x), 0, out_mode="image")
        label = cv2.split(label)[0]
        if p.instance_size == 271:
            label = cv2.resize(label, (19, 19))
        else:
            label = cv2.resize(label, (21, 21))
        label = torch.Tensor([(2 - label)] * 5 + [label] * 5).unsqueeze(0)
        net.make_at_small(x_crop.cuda(), label.cuda())
    return state
def init(self, frame, bbox):
    """Initialize the tracker.

    Args:
        frame: an RGB image
        bbox: zero-based bounding box [x, y, width, height]
    """
    self.pos = np.array([bbox[0] + bbox[2] / 2,
                         bbox[1] + bbox[3] / 2])  # center x, center y
    self.target_sz = np.array([bbox[2], bbox[3]])  # width, height
    wc_z = self.target_sz[0] + 0.5 * sum(self.target_sz)
    hc_z = self.target_sz[1] + 0.5 * sum(self.target_sz)
    self.s_z = np.sqrt(wc_z * hc_z)
    self.s_x = self.s_z * config.instance_size / config.exemplar_size

    # get exemplar img
    img_mean = tuple(map(int, frame.mean(axis=(0, 1))))
    exemplar_img = get_subwindow_tracking(frame, self.pos, config.exemplar_size,
                                          python2round(self.s_z), img_mean)
    exemplar_img = self.transforms(exemplar_img)[None, :, :, :]

    # get exemplar feature
    with torch.cuda.device(self.gpu_id):
        exemplar_img = Variable(exemplar_img.cuda(), requires_grad=False)
        self.model(exemplar_img, None)

    # create hanning window
    self.hann_window = np.outer(np.hanning(self.response_sz),
                                np.hanning(self.response_sz))
    self.hann_window = np.tile(
        self.hann_window.flatten(),
        len(config.anchor_ratios) * len(config.anchor_scales))
    self.counter_re = 0
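# A minimal sketch (not part of the tracker) of the shape of the tiled Hanning
# window built in init() above. The values response_sz=17 and num_anchors=5 are
# assumptions standing in for self.response_sz and
# len(config.anchor_ratios) * len(config.anchor_scales): the flattened window
# has one copy per anchor, matching the flattened per-anchor scores it is later
# blended with.
import numpy as np

response_sz = 17   # assumed response-map side length
num_anchors = 5    # assumed anchor count
hann = np.outer(np.hanning(response_sz), np.hanning(response_sz))
window = np.tile(hann.flatten(), num_anchors)
assert window.shape == (num_anchors * response_sz * response_sz,)  # (1445,)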
def SiamRPN_set_source_batch(train_config, source_list, source_cxy_list):
    p = train_config['p']
    net = train_config['net']
    avg_chans_list = train_config['avg_chans_list']
    batch_size = len(source_list)

    x_batch = None
    for batch in range(batch_size):
        target_pos = source_cxy_list[batch][0]
        target_sz = source_cxy_list[batch][1]

        wc_z = target_sz[1] + p.context_amount * sum(target_sz)
        hc_z = target_sz[0] + p.context_amount * sum(target_sz)
        s_z = np.sqrt(wc_z * hc_z)
        scale_z = p.exemplar_size / s_z  # the ratio between the in-model sizes and the real sizes
        d_search = (p.instance_size - p.exemplar_size) / 2
        pad = d_search / scale_z
        s_x = s_z + 2 * pad

        # extract scaled crops for search region x at previous target position
        x_crop = get_subwindow_tracking(source_list[batch], target_pos,
                                        p.instance_size, round(s_x),
                                        avg_chans_list[batch]).unsqueeze(0)
        if not isinstance(x_batch, torch.Tensor):
            x_batch = x_crop
        else:
            x_batch = torch.cat((x_batch, x_crop), dim=0)

    assert x_batch.size(0) == batch_size, '{}'.format(x_batch.size())
    net(x_batch.cuda(), set_source=True)
def SiamRPN_track(state, im):
    # unpack network and tracker parameters
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    # position and size of the target tracked in the previous frame
    target_pos = state['target_pos']
    target_sz = state['target_sz']

    # update the search region: the current frame's search area is derived
    # from the previous frame's detection result
    wc_z = target_sz[1] + p.context_amount * sum(target_sz)
    hc_z = target_sz[0] + p.context_amount * sum(target_sz)
    s_z = np.sqrt(wc_z * hc_z)
    # scale factor between the exemplar size and the real target size
    scale_z = p.exemplar_size / s_z
    # enlarge the search region
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad

    # extract scaled crops for search region x at previous target position
    x_crop = Variable(
        get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                               avg_chans).unsqueeze(0))

    # predict this frame: target_pos - position, target_sz - size, score - confidence
    # (target_sz * scale_z is the target size mapped into the search-crop scale)
    target_pos, target_sz, score = tracker_eval(net, x_crop.cuda(), target_pos,
                                                target_sz * scale_z, window,
                                                scale_z, p)
    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = score
    return state
def SiamRPN_track_bbox(net, state, im, next_mask, conf_mask, index_1, index_2,
                       frame_num, data_dir, gtbbox):
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']
    target_sz = state['target_sz']

    wc_z = target_sz[1] + p.context_amount * sum(target_sz)
    hc_z = target_sz[0] + p.context_amount * sum(target_sz)
    s_z = np.sqrt(wc_z * hc_z)
    scale_z = p.exemplar_size / s_z
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad

    # extract scaled crops for search region x at previous target position
    x_crop = Variable(
        get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                               avg_chans).unsqueeze(0))

    target_pos, target_sz, score, alternative = tracker_eval_record_data(
        net, x_crop.cuda(), target_pos, target_sz * scale_z, window, scale_z,
        p, im, next_mask, conf_mask, index_1, index_2, frame_num, data_dir,
        gtbbox)
    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = score
    state['fg'] = alternative
    return state
def SiamRPN_track(state, im):
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']
    target_sz = state['target_sz']

    wc_z = target_sz[1] + p.context_amount * sum(target_sz)
    hc_z = target_sz[0] + p.context_amount * sum(target_sz)
    s_z = np.sqrt(wc_z * hc_z)
    scale_z = p.exemplar_size / s_z
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad

    # extract scaled crops for search region x at previous target position
    x_crop = Variable(
        get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                               avg_chans).unsqueeze(0))

    target_pos, target_sz, score = tracker_eval(net, x_crop.cuda(), target_pos,
                                                target_sz * scale_z, window,
                                                scale_z, p)
    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = score
    return state
def SiamRPN_init(im, target_pos, target_sz, net):
    state = dict()
    p = TrackerConfig()
    p.update(net.cfg)
    state['im_h'] = im.shape[0]
    state['im_w'] = im.shape[1]

    if p.adaptive:
        if ((target_sz[0] * target_sz[1]) / float(state['im_h'] * state['im_w'])) < 0.004:
            p.instance_size = 287  # small object big search region
        else:
            p.instance_size = 271
        # int() keeps score_size integral under Python 3's true division
        p.score_size = int((p.instance_size - p.exemplar_size) / p.total_stride + 1)

    p.anchor = generate_anchor(p.total_stride, p.scales, p.ratios, p.score_size)
    avg_chans = np.mean(im, axis=(0, 1))

    wc_z = target_sz[0] + p.context_amount * sum(target_sz)
    hc_z = target_sz[1] + p.context_amount * sum(target_sz)
    s_z = round(np.sqrt(wc_z * hc_z))
    scale_z = p.exemplar_size / s_z
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad

    # initialize the exemplar
    z_crop = get_subwindow_tracking(im, target_pos, p.exemplar_size, s_z, avg_chans)
    z_crop_large = get_subwindow_tracking(im, target_pos, p.instance_size,
                                          round(s_x), avg_chans)
    z = z_crop.unsqueeze(0)  # removed the Variable interface
    z_large = z_crop_large.unsqueeze(0)
    net.temple(z.cuda(), z_large.cuda())

    if p.windowing == 'cosine':
        window = np.outer(np.hanning(p.score_size), np.hanning(p.score_size))
    elif p.windowing == 'uniform':
        window = np.ones((p.score_size, p.score_size))
    window = np.tile(window.flatten(), p.anchor_num)

    state['p'] = p
    state['net'] = net
    state['avg_chans'] = avg_chans
    state['window'] = window
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    return state
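# A minimal usage sketch showing how the SiamRPN_init / SiamRPN_track pair
# above is typically chained. Assumptions: `net` is a loaded SiamRPN model on
# the GPU, `frames` is a list of images, and (cx, cy, w, h) come from the
# first-frame ground truth; `run_tracker` itself is a hypothetical helper.
import numpy as np

def run_tracker(frames, cx, cy, w, h, net):
    state = SiamRPN_init(frames[0], np.array([cx, cy]), np.array([w, h]), net)
    boxes = []
    for im in frames[1:]:
        state = SiamRPN_track(state, im)
        pos, sz = state['target_pos'], state['target_sz']
        # convert center/size back to an (x, y, w, h) box
        boxes.append([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])
    return boxes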
def SiamRPN_init(im, target_pos, target_sz, net, net_name):
    state = dict()
    if 'SiamRPNPP' in net_name:
        p = TrackerConfig_SiamRPNPP()
    else:
        p = TrackerConfig()
    p.update(net.cfg)
    state['im_h'] = im.shape[0]
    state['im_w'] = im.shape[1]

    if p.adaptive:
        if ((target_sz[0] * target_sz[1]) / float(state['im_h'] * state['im_w'])) < 0.004:
            p.instance_size = 287  # small object big search region
        else:
            p.instance_size = 255
        # p.score_size = (p.instance_size - p.exemplar_size) / p.total_stride + 1

    p.anchor = generate_anchor(p.total_stride, p.scales, p.ratios,
                               int(p.score_size))
    avg_chans = np.mean(im, axis=(0, 1))

    wc_z = target_sz[0] + p.context_amount * sum(target_sz)
    hc_z = target_sz[1] + p.context_amount * sum(target_sz)
    s_z = round(np.sqrt(wc_z * hc_z))

    # initialize the exemplar
    z_crop = get_subwindow_tracking(im, target_pos, p.exemplar_size, s_z,
                                    avg_chans, out_mode='np')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    z = Variable(transform(z_crop).unsqueeze(0))
    net.temple(z.cuda())

    if p.windowing == 'cosine':
        window = np.outer(np.hanning(p.score_size), np.hanning(p.score_size))
    elif p.windowing == 'uniform':
        window = np.ones((p.score_size, p.score_size))
    window = np.tile(window.flatten(), p.anchor_num)

    state['p'] = p
    state['net'] = net
    state['avg_chans'] = avg_chans
    state['window'] = window
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    return state
def SiamRPN_init(im, target_pos, target_sz, net): """ SiamRPN_init:SiamRPN网络初始化 :param im: 跟踪的图片 :param target_pos: 目标的中心点 :param target_sz: 目标区域的宽高 :param net: 跟踪网络 """ state = dict() p = TrackerConfig() p.update(net.cfg) state['im_h'] = im.shape[0] # 图片的高度 state['im_w'] = im.shape[1] # 图片的宽度 if p.adaptive: # 根据目标和输入图像的大小调整搜索区域,比例小于0.4%,需要调大搜索区域 if ((target_sz[0] * target_sz[1]) / float(state['im_h'] * state['im_w'])) < 0.004: p.instance_size = 287 # small object big search region else: p.instance_size = 271 # 根据网络总步长计算出得分图大小 p.score_size = (p.instance_size - p.exemplar_size) / p.total_stride + 1 # generate_anchor:构造出以图像中心为原点,格式为[cx, cy, w, h]的锚点矩阵 p.anchor = generate_anchor(p.total_stride, p.scales, p.ratios, int(p.score_size)) # 求图片RGB三像素的行列均值,len(avg_chans)=3 avg_chans = np.mean(im, axis=(0, 1)) # wc_z和hc_z表示纹理填充后的宽高,s_z为等效边长 wc_z = target_sz[0] + p.context_amount * sum(target_sz) hc_z = target_sz[1] + p.context_amount * sum(target_sz) s_z = round(np.sqrt(wc_z * hc_z)) # initialize the exemplar # get_subwindow_tracking:填充并截取出目标 z_crop = get_subwindow_tracking(im, target_pos, p.exemplar_size, s_z, avg_chans) z = Variable(z_crop.unsqueeze(0)) # z.size=([1, 3, 127, 127]) net.temple(z.cuda()) # 运行 temple 函数计算模板结果 # 两种窗 if p.windowing == 'cosine': window = np.outer(np.hanning(p.score_size), np.hanning(p.score_size)) elif p.windowing == 'uniform': window = np.ones((p.score_size, p.score_size)) window = np.tile(window.flatten(), p.anchor_num) state['p'] = p state['net'] = net state['avg_chans'] = avg_chans state['window'] = window state['target_pos'] = target_pos state['target_sz'] = target_sz return state
def SiamRPN_init(im, target_pos, target_sz, net):
    state = dict()  # create the state dict
    p = TrackerConfig()  # create the tracker config
    p.update(net.cfg)  # override defaults with the model's config
    state['im_h'] = im.shape[0]  # full image height
    state['im_w'] = im.shape[1]  # full image width

    if p.adaptive:  # adapt the search region to the target/image size ratio
        if ((target_sz[0] * target_sz[1]) / float(state['im_h'] * state['im_w'])) < 0.004:
            # target occupies less than 0.4% of the image
            p.instance_size = 287  # small object big search region
        else:
            p.instance_size = 271
        p.score_size = int((p.instance_size - p.exemplar_size) / p.total_stride + 1)  # (271-127)/8 + 1 = 19

    # build the anchor matrix, centered on the image, in [cx, cy, w, h] format
    p.anchor = generate_anchor(p.total_stride, p.scales, p.ratios, p.score_size)
    # for i in range(p.anchor.shape[0]):
    #     box = p.anchor[i]
    #     cv2.rectangle(im, (box[0], box[1]), (box[0] + box[2], box[1] + box[3]), (0, 255, 255), 0)
    # cv2.imshow('im', im)
    # cv2.waitKey(0)

    avg_chans = np.mean(im, axis=(0, 1))

    # p.context_amount * sum(target_sz) is the context margin; wc_z and hc_z are
    # the padded width/height, and s_z is the equivalent square side length
    wc_z = target_sz[0] + p.context_amount * sum(target_sz)
    hc_z = target_sz[1] + p.context_amount * sum(target_sz)
    s_z = round(np.sqrt(wc_z * hc_z))  # e.g. 202

    # initialize the exemplar: pad and crop out the target
    z_crop = get_subwindow_tracking(im, target_pos, p.exemplar_size, s_z, avg_chans)
    # wrap the tensor (legacy Variable interface)
    z = Variable(z_crop.unsqueeze(0))
    # run temple() to compute the template features
    net.temple(z.cuda())

    if p.windowing == 'cosine':
        window = np.outer(np.hanning(p.score_size), np.hanning(p.score_size))
    elif p.windowing == 'uniform':
        window = np.ones((p.score_size, p.score_size))
    window = np.tile(window.flatten(), p.anchor_num)

    state['p'] = p
    state['net'] = net
    state['avg_chans'] = avg_chans
    state['window'] = window
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    # ------------------------------------------------------------------------
    state['s_z_original'] = s_z
    state['s_x'] = 0
    # ------------------------------------------------------------------------
    return state
def SiamRPN_init(im, target_pos, target_sz, net):
    # create an empty state dict
    state = dict()
    # tracker configuration
    p = TrackerConfig()
    # load the network's config into the tracker parameters
    p.update(net.cfg)
    # store the image size in state
    state['im_h'] = im.shape[0]
    state['im_w'] = im.shape[1]

    if p.adaptive:  # adaptive defaults to True
        if ((target_sz[0] * target_sz[1]) / float(state['im_h'] * state['im_w'])) < 0.004:
            p.instance_size = 287  # small object big search region
        else:
            p.instance_size = 271  # set instance_size, used below to compute score_size
        # same formula as in TrackerConfig
        p.score_size = int((p.instance_size - p.exemplar_size) / p.total_stride + 1)

    # generate the anchors
    p.anchor = generate_anchor(p.total_stride, p.scales, p.ratios, p.score_size)

    # per-channel image mean
    avg_chans = np.mean(im, axis=(0, 1))

    # context_amount defaults to 0.5; target_sz holds the template
    # width/height, which sum() adds together
    wc_z = target_sz[0] + p.context_amount * sum(target_sz)
    hc_z = target_sz[1] + p.context_amount * sum(target_sz)
    # round to an integer
    s_z = round(np.sqrt(wc_z * hc_z))

    # initialize the exemplar (target template)
    z_crop = get_subwindow_tracking(im, target_pos, p.exemplar_size, s_z, avg_chans)
    # add a batch dimension
    z = Variable(z_crop.unsqueeze(0))
    # run the template branch on the GPU
    net.temple(z.cuda())

    if p.windowing == 'cosine':
        window = np.outer(np.hanning(p.score_size), np.hanning(p.score_size))
    elif p.windowing == 'uniform':
        window = np.ones((p.score_size, p.score_size))
    window = np.tile(window.flatten(), p.anchor_num)

    # save everything into the state dict
    state['p'] = p
    state['net'] = net
    state['avg_chans'] = avg_chans
    state['window'] = window
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    # return the initialization as a dict
    return state
def _pad_crop_resize_detection(self):
    wc_z = self.ret['detection_target_sz'][1] + self.ret['p'].context_amount * sum(self.ret['detection_target_sz'])
    hc_z = self.ret['detection_target_sz'][0] + self.ret['p'].context_amount * sum(self.ret['detection_target_sz'])
    s_z = np.sqrt(wc_z * hc_z)
    scale_z = self.ret['p'].exemplar_size / s_z
    d_search = (self.ret['p'].instance_size - self.ret['p'].exemplar_size) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad
    avg_chans = np.mean(self.ret['img_detection'], axis=(0, 1))  # per-channel image mean

    # extract scaled crops for search region x at previous target position
    x_crop = Variable(
        get_subwindow_tracking(self.ret['img_detection'],
                               self.ret['detection_target_pos'],
                               self.ret['p'].instance_size, round(s_x),
                               avg_chans).unsqueeze(0))
    self.ret['detection'] = x_crop
def SiamRPN_track(state, im):
    # unpack the variables we need from state
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']
    target_sz = state['target_sz']
    # v-------------------------------------------------------------------------
    s_z_original = state['s_z_original']
    # ^-------------------------------------------------------------------------

    # compute the context-padded size (context_amount = 0.5, exemplar_size = 127,
    # instance_size = 271 or 287)
    wc_z = target_sz[1] + p.context_amount * sum(target_sz)
    hc_z = target_sz[0] + p.context_amount * sum(target_sz)
    s_z = np.sqrt(wc_z * hc_z)

    # v-------------------------------------------------------------------------
    # zoom factor relative to the initial template scale
    zoom = s_z / s_z_original
    # ^-------------------------------------------------------------------------

    scale_z = p.exemplar_size / s_z
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad  # depends on target_sz

    # extract scaled crops for search region x at previous target position
    x_crop = Variable(
        get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                               avg_chans).unsqueeze(0))

    # tracker_eval predicts the new position and score; .cuda() copies the data
    # into GPU memory
    # target_pos, target_sz, score = tracker_eval(net, x_crop.cuda(), target_pos, target_sz * scale_z, window, scale_z, p)
    target_pos, target_sz, score = tracker_eval(net, x_crop.cuda(), target_pos,
                                                target_sz * scale_z, window,
                                                scale_z, p, im, zoom)
    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))
    state['target_pos'] = target_pos  # updated target center
    state['target_sz'] = target_sz
    state['score'] = score
    # v-------------------------------------------------------------------------
    state['s_x'] = round(s_x)

    # <----- template update on deformation ----->
    print(score)
    # if score <= 0.8:
    #     z_crop = Variable(get_subwindow_tracking(im, target_pos, p.exemplar_size, round(s_x), avg_chans).unsqueeze(0))
    #     net.temple(z_crop.cuda())
    # ^-------------------------------------------------------------------------
    return state
def SiamRPN_init(im, target_pos, target_sz, net):
    state = dict()
    p = TrackerConfig()
    # p.zLR = zLR
    state['im_h'] = im.shape[0]
    state['im_w'] = im.shape[1]

    if ((target_sz[0] * target_sz[1]) / float(state['im_h'] * state['im_w'])) < 0.004:
        p.instance_size = 287  # small object big search region
    else:
        p.instance_size = 271

    p.score_size = int((p.instance_size - p.exemplar_size) / p.total_stride + 1)
    p.anchor = generate_anchor(p.total_stride, p.scales, p.ratios, p.score_size)
    avg_chans = np.mean(im, axis=(0, 1))

    wc_z = target_sz[0] + p.context_amount * sum(target_sz)
    hc_z = target_sz[1] + p.context_amount * sum(target_sz)
    s_z = round(np.sqrt(wc_z * hc_z))

    # initialize the exemplar
    z_crop = get_subwindow_tracking(im, target_pos, p.exemplar_size, s_z, avg_chans)
    z = Variable(z_crop.unsqueeze(0))
    z_f = net.featextract(z.cuda())
    net.kernel(z_f)
    # net.temple(z.cuda())

    if p.windowing == 'cosine':
        window = np.outer(np.hanning(p.score_size), np.hanning(p.score_size))
    elif p.windowing == 'uniform':
        window = np.ones((p.score_size, p.score_size))
    window = np.tile(window.flatten(), p.anchor_num)

    state['p'] = p
    state['net'] = net
    state['avg_chans'] = avg_chans
    state['window'] = window
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['z_0'] = z_f.cpu().data
    state['z_f'] = z_f.cpu().data
    return state
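# A sketch of the template-update loop this init variant is designed for: it
# stores z_0 / z_f, which SiamRPN_track_upd above consumes each frame.
# Assumptions: `updatenet` is a trained UpdateNet-style module and
# `run_tracker_with_update` is a hypothetical driver, not part of the source.
def run_tracker_with_update(frames, target_pos, target_sz, net, updatenet):
    state = SiamRPN_init(frames[0], target_pos, target_sz, net)
    for im in frames[1:]:
        # each call fuses (z_0, previous z_f, current z) into a fresh template
        state = SiamRPN_track_upd(state, im, updatenet)
    return state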
def SiamRPN_init(im, target_pos, target_sz, net):
    state = dict()
    p = TrackerConfig()
    p.update(net.cfg)
    state['im_h'] = im.shape[0]
    state['im_w'] = im.shape[1]

    if p.adaptive:
        if ((target_sz[0] * target_sz[1]) / float(state['im_h'] * state['im_w'])) < 0.004:
            p.instance_size = 255  # small object big search region
        else:
            p.instance_size = 255
        p.score_size = int((p.instance_size - p.exemplar_size) / p.total_stride + 1)

    p.anchor = generate_anchor(p.total_stride, p.scales, p.ratios, p.score_size)
    avg_chans = np.mean(im, axis=(0, 1))

    wc_z = target_sz[0] + p.context_amount * sum(target_sz)
    hc_z = target_sz[1] + p.context_amount * sum(target_sz)
    s_z = round(np.sqrt(wc_z * hc_z))

    # initialize the exemplar; normalize manually with ImageNet statistics
    z_crop = get_subwindow_tracking(im, target_pos, p.exemplar_size, s_z, avg_chans)
    z_crop = z_crop / 256
    z_crop[0, :, :] = (z_crop[0, :, :] - 0.485) / 0.229
    z_crop[1, :, :] = (z_crop[1, :, :] - 0.456) / 0.224
    z_crop[2, :, :] = (z_crop[2, :, :] - 0.406) / 0.225
    z = Variable(z_crop.unsqueeze(0))
    net.temple(z.cuda())

    if p.windowing == 'cosine':
        window = np.outer(np.hanning(p.score_size), np.hanning(p.score_size))
    elif p.windowing == 'uniform':
        window = np.ones((p.score_size, p.score_size))
    window = np.tile(window.flatten(), p.anchor_num)

    state['p'] = p
    state['net'] = net
    state['avg_chans'] = avg_chans
    state['window'] = window
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    return state
def SiamRPN_track(state, im, z_crop, ids, name):
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']
    target_sz = state['target_sz']

    # background bbox
    wc_z = target_sz[1] + p.context_amount * sum(target_sz)
    hc_z = target_sz[0] + p.context_amount * sum(target_sz)
    s_z = np.sqrt(wc_z * hc_z)
    scale_z = p.exemplar_size / s_z  # scale ratio of template
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad

    # extract scaled crops for search region x at previous target position:
    # the target is assumed to stay where it was, and the crop of size s_x is
    # taken around that center and rescaled. The drawback is that a
    # fast-moving target can fall outside the crop.
    x_crop = Variable(
        get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                               avg_chans).unsqueeze(0))
    # print(x_crop.shape)  # (1, 3, 271, 271)

    save_img = x_crop.data.squeeze(0).numpy().transpose((1, 2, 0)).astype(np.int32)
    save_path = os.path.join('/home/ly/chz/srpn_tmp', name,
                             '{:03d}_detection_input.jpg'.format(ids))
    cv2.imwrite(save_path, save_img)
    # print('save detection input image @ {}'.format(save_path))

    target_pos, target_sz, score = tracker_eval(net, x_crop.cuda(), z_crop.cuda(),
                                                target_pos, target_sz * scale_z,
                                                window, scale_z, p, ids, name, im)
    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = score
    return state
def SiamRPN_train_batch(train_config, instance_list, source_cxy_list, instance_cxy_list):
    p = train_config['p']
    net = train_config['net']
    avg_chans_list = train_config['avg_chans_list']
    batch_size = len(instance_list)

    x_batch = None
    shift = np.zeros([batch_size, 2])
    gt_sz_list = np.zeros([batch_size, 2])
    boxB = np.zeros([batch_size, 4])
    for batch in range(batch_size):
        target_pos = source_cxy_list[batch][0]
        target_sz = source_cxy_list[batch][1]
        gt_pos = instance_cxy_list[batch][0]
        gt_sz = instance_cxy_list[batch][1]

        wc_z = target_sz[1] + p.context_amount * sum(target_sz)
        hc_z = target_sz[0] + p.context_amount * sum(target_sz)
        s_z = np.sqrt(wc_z * hc_z)

        # transfer the ground-truth sizes into the in-model scale
        scale_z = p.exemplar_size / s_z  # the ratio between the in-model sizes and the real sizes
        gt_sz = gt_sz * scale_z
        gt_sz_list[batch, :] = gt_sz
        target_sz = target_sz * scale_z
        d_search = (p.instance_size - p.exemplar_size) / 2
        pad = d_search / scale_z
        s_x = s_z + 2 * pad

        # extract scaled crops for search region x at previous target position
        x_crop = get_subwindow_tracking(instance_list[batch], target_pos,
                                        p.instance_size, round(s_x),
                                        avg_chans_list[batch]).unsqueeze(0)
        if not isinstance(x_batch, torch.Tensor):
            x_batch = x_crop
        else:
            x_batch = torch.cat((x_batch, x_crop), dim=0)

        shift[batch, :] = np.asarray([(gt_pos[0] - target_pos[0]) * scale_z,
                                      (gt_pos[1] - target_pos[1]) * scale_z],
                                     dtype=np.float32)
        boxB[batch, :] = np.asarray([shift[batch, 0] - gt_sz[0] / 2,
                                     shift[batch, 1] - gt_sz[1] / 2,
                                     shift[batch, 0] + gt_sz[0] / 2,
                                     shift[batch, 1] + gt_sz[1] / 2],
                                    dtype=np.float32)

    assert x_batch.size(0) == batch_size
    cls_loss, box_loss = tracker_train_batch(net, x_batch.cuda(), shift, boxB,
                                             gt_sz_list, p)
    return cls_loss, box_loss
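# A sketch of how the three batch functions above fit together in one training
# step. Assumptions: each *_list holds images and each *_cxy_list holds
# (center, size) pairs as in the functions above; `optimizer` and `train_step`
# are hypothetical, and summing the two losses is one plausible weighting.
def train_step(net, optimizer, exemplar_list, exemplar_cxy_list,
               source_list, source_cxy_list, instance_list, instance_cxy_list):
    cfg = SiamRPN_init_batch(exemplar_list, exemplar_cxy_list, net)  # set templates
    SiamRPN_set_source_batch(cfg, source_list, source_cxy_list)      # set source crops
    cls_loss, box_loss = SiamRPN_train_batch(cfg, instance_list,
                                             source_cxy_list, instance_cxy_list)
    loss = cls_loss + box_loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return float(cls_loss), float(box_loss)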
def SiamRPN_set_source(state, im, source_pos, source_sz):
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    target_pos = source_pos
    target_sz = source_sz

    wc_z = target_sz[1] + p.context_amount * sum(target_sz)
    hc_z = target_sz[0] + p.context_amount * sum(target_sz)
    s_z = np.sqrt(wc_z * hc_z)
    scale_z = p.exemplar_size / s_z  # the ratio between the in-model sizes and the real sizes
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad

    # extract scaled crops for search region x at previous target position;
    # x_crop removed the torch.Variable interface, due to the deprecated
    # Variable in torch 0.4.0
    x_crop = get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                                    avg_chans).unsqueeze(0).cuda()
    net(x_crop, set_source=True)
def SiamRPN_track(state, im):
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']
    target_sz = state['target_sz']

    wc_z = target_sz[1] + p.context_amount * sum(target_sz)
    hc_z = target_sz[0] + p.context_amount * sum(target_sz)
    s_z = np.sqrt(wc_z * hc_z)
    scale_z = p.exemplar_size / s_z
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad

    # extract scaled crops for search region x at previous target position
    x_crop = get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                                    avg_chans, out_mode='np')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    x_crop = Variable(transform(x_crop).unsqueeze(0))

    target_pos, target_sz, score = tracker_eval(net, x_crop.cuda(), target_pos,
                                                target_sz * scale_z, window,
                                                scale_z, p)
    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = score
    return state
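# A note on normalization (illustration only): the transforms pipeline above
# (ToTensor + Normalize) scales uint8 pixels by 1/255 before standardizing,
# while the manual variant in the SiamRPN_init above that divides by 256
# scales by 1/256; the two therefore differ by a factor of 255/256 (~0.4%).
# A small sketch of the difference, with an assumed constant-valued crop:
import torch

x = torch.full((3, 127, 127), 128.0)               # fake crop, pixel value 128
mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
via_to_tensor = (x / 255.0 - mean) / std           # what ToTensor + Normalize computes
via_div_256 = (x / 256.0 - mean) / std             # what the manual code computes
print((via_to_tensor - via_div_256).abs().max())   # small but nonzero difference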
def _pad_crop_resize_template(self):
    self.ret['im_h'] = self.ret['img_template'].shape[0]
    self.ret['im_w'] = self.ret['img_template'].shape[1]
    self.ret['p'].score_size = int((self.ret['p'].instance_size
                                    - self.ret['p'].exemplar_size)
                                   / self.ret['p'].total_stride + 1)
    self.ret['p'].anchor = generate_anchor(self.ret['p'].total_stride,
                                           self.ret['p'].scales,
                                           self.ret['p'].ratios,
                                           int(self.ret['p'].score_size))
    avg_chans = np.mean(self.ret['img_template'], axis=(0, 1))  # per-channel image mean

    wc_z = self.ret['template_target_sz'][0] + self.ret['p'].context_amount * sum(self.ret['template_target_sz'])
    hc_z = self.ret['template_target_sz'][1] + self.ret['p'].context_amount * sum(self.ret['template_target_sz'])
    s_z = round(np.sqrt(wc_z * hc_z))

    # initialize the exemplar
    z_crop = get_subwindow_tracking(self.ret['img_template'],
                                    self.ret['template_target_pos'],
                                    self.ret['p'].exemplar_size, s_z, avg_chans)
    z = Variable(z_crop.unsqueeze(0))
    # net.temple(z.cuda())

    if self.ret['p'].windowing == 'cosine':
        window = np.outer(np.hanning(self.ret['p'].score_size),
                          np.hanning(self.ret['p'].score_size))
    elif self.ret['p'].windowing == 'uniform':
        window = np.ones((self.ret['p'].score_size, self.ret['p'].score_size))
    window = np.tile(window.flatten(), self.ret['p'].anchor_num)

    self.ret['temple'] = z
    self.ret['avg_chans'] = avg_chans
    self.ret['window'] = window
def SiamRPN_train(state, im, old_pos, old_sz, gt_pos, gt_sz):
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = old_pos  # actually the ground truth of the previous frame
    target_sz = old_sz

    wc_z = target_sz[1] + p.context_amount * sum(target_sz)
    hc_z = target_sz[0] + p.context_amount * sum(target_sz)
    s_z = np.sqrt(wc_z * hc_z)
    scale_z = p.exemplar_size / s_z  # the ratio between the in-model sizes and the real sizes
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad

    # extract scaled crops for search region x at previous target position
    x_crop = get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                                    avg_chans).unsqueeze(0).cuda()
    cls_loss, box_loss = tracker_train(net, x_crop, target_pos,
                                       target_sz * scale_z, scale_z, p,
                                       gt_pos, gt_sz * scale_z)
    return cls_loss, box_loss
def SiamRPN_track(state, im):
    p = state['p']  # tracking config
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']  # cosine window
    target_pos = state['target_pos']  # cx, cy of target in the previous frame
    target_sz = state['target_sz']  # w, h of target in the previous frame
    template_feat = state['template_feat']

    wc_z = target_sz[0] + p.context_amount * sum(target_sz)
    hc_z = target_sz[1] + p.context_amount * sum(target_sz)
    s_z = np.sqrt(wc_z * hc_z)
    scale_z = p.exemplar_size / s_z

    # 'Local to Global': if the failure mode is activated, expand d_search;
    # otherwise keep it at the normal value
    d_search = (p.instance_size - p.exemplar_size) / 2
    if state['score'] < 0.3:
        d_search *= 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad

    # extract scaled crops for search region x at previous target position;
    # the third argument is the model size, the fourth the original size in
    # the raw image
    x_crop = Variable(
        get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                               avg_chans).unsqueeze(0))

    target_pos, target_sz, score = tracker_eval_distractor_aware(
        x_crop.cuda(), target_sz * scale_z, scale_z, state)
    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = score
    return state
def SiamRPN_track(state, im, f, last_result, att_per, def_per, image_save, iter=10):
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']
    target_sz = state['target_sz']

    wc_z = target_sz[1] + p.context_amount * sum(target_sz)
    hc_z = target_sz[0] + p.context_amount * sum(target_sz)
    s_z = np.sqrt(wc_z * hc_z)
    scale_z = p.exemplar_size / s_z
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad

    # extract scaled crops for search region x at previous target position
    x_crop = Variable(
        get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                               avg_chans).unsqueeze(0))
    x_crop = x_crop.cuda()

    # adversarial attack (att_per is the int 0 on the first frame)
    if not isinstance(att_per, int):
        att_per = att_per.cpu().detach().numpy()
        att_per = np.resize(att_per,
                            (1, x_crop.shape[1], x_crop.shape[2], x_crop.shape[3]))
        att_per = torch.from_numpy(att_per).cuda()
    x_crop_init = x_crop + att_per * 1
    x_crop_init = torch.clamp(x_crop_init, 0, 255)
    x_adv1 = rtaa_attack(net, x_crop_init, x_crop, last_result, target_pos,
                         target_sz, scale_z, p, iteration=iter)
    att_per = x_adv1 - x_crop

    # adversarial defense (def_per is the int 0 on the first frame)
    if not isinstance(def_per, int):
        def_per = def_per.cpu().detach().numpy()
        def_per = np.resize(def_per,
                            (1, x_crop.shape[1], x_crop.shape[2], x_crop.shape[3]))
        def_per = torch.from_numpy(def_per).cuda()
    x_adv2_mask = x_adv1 + def_per * 0.01
    x_adv2_mask = torch.clamp(x_adv2_mask, 0, 255)
    x_adv2 = rtaa_defnese(net, x_adv2_mask, x_adv1, last_result, target_pos,
                          target_sz, scale_z, p, iteration=iter)
    def_per = x_adv2 - x_adv1

    target_pos, target_sz, score = tracker_eval(net, x_adv2, target_pos,
                                                target_sz * scale_z, window,
                                                scale_z, p, f, last_result, state)
    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = score
    return state, att_per, def_per
def redection(self, frame, s_x, img_mean, ratio):
    # get global instance img
    instance_img_global = get_subwindow_tracking(frame, self.pos,
                                                 config.instance_size * ratio,
                                                 ratio * s_x, img_mean)
    instance_img_global = self.transforms(instance_img_global)[None, :, :, :]

    # get global instance feature
    with torch.cuda.device(self.gpu_id):
        instance_img_global = Variable(instance_img_global.cuda())
        pred_cls, pred_reg = self.model(None, instance_img_global)

    # global response
    score_size = int((config.instance_size * ratio - config.exemplar_size)
                     / config.total_stride + 1)
    global_response = torch.sigmoid(pred_cls).squeeze().view(-1).detach().cpu().numpy()
    global_best_id = np.argmax(global_response)
    global_anchor_id = global_best_id // (score_size * score_size)
    extreme_points = extreme_point_detection(
        global_response.reshape(config.anchor_num, score_size,
                                score_size)[global_anchor_id])
    # print(extreme_points)
    if len(extreme_points) <= 0:
        return None, None, None, None, None

    max_value = 0.5
    for p in extreme_points:
        p[0] = float(p[0] - 8) * (config.total_stride * s_x * ratio) / config.instance_size
        p[1] = float(p[1] - 8) * (config.total_stride * s_x * ratio) / config.instance_size
        candidate_pos = self.pos + p

        # get candidate instance img
        instance_img = get_subwindow_tracking(frame, candidate_pos,
                                              config.instance_size, s_x, img_mean)
        instance_img = self.transforms(instance_img)[None, :, :, :]

        # get candidate instance feature
        with torch.cuda.device(self.gpu_id):
            instance_img = Variable(instance_img.cuda())
            pred_cls, pred_reg = self.model(None, instance_img)

        # candidate offsets
        candidate_offsets = pred_reg.squeeze().view(4, -1).detach().cpu().numpy()
        candidate_offsets[0] = candidate_offsets[0] * self.anchors[:, 2] + self.anchors[:, 0]
        candidate_offsets[1] = candidate_offsets[1] * self.anchors[:, 3] + self.anchors[:, 1]
        candidate_offsets[2] = np.exp(candidate_offsets[2]) * self.anchors[:, 2]
        candidate_offsets[3] = np.exp(candidate_offsets[3]) * self.anchors[:, 3]

        candidate_response = torch.sigmoid(pred_cls).squeeze().view(-1).detach().cpu().numpy()
        candidate_response_raw = candidate_response
        candidate_response = ((1 - config.window_influence) * candidate_response
                              + config.window_influence * self.hann_window)
        candidate_best_id = np.argmax(candidate_response)
        candidate_anchor_id = candidate_best_id // 289
        candidate_response_map = candidate_response_raw[
            candidate_anchor_id * 289:candidate_anchor_id * 289 + 289]

        if candidate_response.max() > max_value:
            best_candidate_pos = candidate_pos
            best_candidate_id = candidate_best_id
            best_candidate_anchor_id = candidate_anchor_id
            best_candidate_offsets = candidate_offsets
            best_candidate_response_map = candidate_response_map
            max_value = candidate_response.max()

    if max_value == 0.5:
        return None, None, None, None, None
    return (best_candidate_pos, best_candidate_id, best_candidate_anchor_id,
            best_candidate_offsets, best_candidate_response_map)
def SiamRPN_init(im, target_pos, target_sz, net, gtbox):
    state = dict()
    p = TrackerConfig()
    state['im_h'] = im.shape[0]
    state['im_w'] = im.shape[1]
    state['ctr'] = 0

    if ((target_sz[0] * target_sz[1]) / float(state['im_h'] * state['im_w'])) < 0.004:
        p.instance_size = 287  # small object big search region
    else:
        p.instance_size = 271

    p.score_size = (p.instance_size - p.exemplar_size) // p.total_stride + 1
    p.anchor = generate_anchor(p.total_stride, p.scales, p.ratios, p.score_size)
    avg_chans = np.mean(im, axis=(0, 1))

    wc_z = target_sz[0] + p.context_amount * sum(target_sz)
    hc_z = target_sz[1] + p.context_amount * sum(target_sz)
    s_z = round(np.sqrt(wc_z * hc_z))

    # initialize the exemplar
    z_crop = get_subwindow_tracking(im, target_pos, p.exemplar_size, s_z, avg_chans)
    z = Variable(z_crop.unsqueeze(0))
    net.temple(z.cuda())

    s_z = np.sqrt(wc_z * hc_z)
    scale_z = p.exemplar_size / s_z
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad

    # extract scaled crops for search region x at previous target position
    x_crop = Variable(
        get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                               avg_chans).unsqueeze(0))

    # build a foreground/background label map for the attention module
    label = np.zeros(im.shape)
    x_low, x_high = target_pos[0] - target_sz[0] / 2, target_pos[0] + target_sz[0] / 2
    y_low, y_high = target_pos[1] - target_sz[1] / 2, target_pos[1] + target_sz[1] / 2
    x_low, x_high, y_low, y_high = int(x_low), int(x_high), int(y_low), int(y_high)
    label[y_low:y_high, x_low:x_high, :] = 2
    label = get_subwindow_tracking(label, target_pos, p.instance_size,
                                   round(s_x), 0, out_mode="image")
    label = cv2.split(label)[0]
    if p.instance_size == 271:
        label = cv2.resize(label, (19, 19))
    else:
        label = cv2.resize(label, (21, 21))
    label = torch.Tensor([(2 - label)] * 5 + [label] * 5).unsqueeze(0)
    net.make_at(x_crop.cuda(), label.cuda())

    if p.windowing == 'cosine':
        window = np.outer(np.hanning(p.score_size), np.hanning(p.score_size))
    elif p.windowing == 'uniform':
        window = np.ones((p.score_size, p.score_size))
    window = np.tile(window.flatten(), p.anchor_num)

    state['p'] = p
    state['net'] = net
    state['avg_chans'] = avg_chans
    state['window'] = window
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    return state
def SiamRPN_init(im, target_pos, target_sz, net):
    ## target_pos is (cx, cy)
    ## target_sz is (w, h)
    state = dict()
    p = TrackerConfig()
    state['im_h'] = im.shape[0]
    state['im_w'] = im.shape[1]

    if ((target_sz[0] * target_sz[1]) / float(state['im_h'] * state['im_w'])) < 0.004:
        p.instance_size = 287  # small object big search region
    else:
        p.instance_size = 255  # 271
    # size of the last feature map, expected to be 17
    p.delta_score_size = int((p.instance_size - p.exemplar_size) / p.total_stride + 1)

    # all anchors of each aspect ratio and scale at each location are generated
    p.anchors, _ = generate_all_anchors(
        (p.delta_score_size, p.delta_score_size),
        (p.instance_size, p.instance_size))  # of shape (dropping from 2420 down to 433, 4)

    avg_chans = np.mean(im, axis=(0, 1))  # per-channel mean used for padding
    wc_z = target_sz[0] + p.context_amount * sum(target_sz)  # adding some context info
    hc_z = target_sz[1] + p.context_amount * sum(target_sz)  # adding some context info
    s_z = round(np.sqrt(wc_z * hc_z))

    # initialize the exemplar
    z_crop = get_subwindow_tracking(im, target_pos, p.exemplar_size, s_z, avg_chans)
    z = Variable(z_crop.unsqueeze(0))
    template_feat = net.template(z.cuda())

    if p.windowing == 'cosine':
        # outer product of two Hanning vectors: a matrix the same size as the
        # feature map of the search region
        window = np.outer(np.hanning(p.delta_score_size),
                          np.hanning(p.delta_score_size))  # p.score_size?
    elif p.windowing == 'uniform':
        window = np.ones((p.delta_score_size, p.delta_score_size))  # p.score_size?
    # flatten and replicate the cosine window
    window = np.tile(window.flatten(), p.basic_anchor_num)

    state['p'] = p
    state['net'] = net
    state['avg_chans'] = avg_chans
    state['window'] = window
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = 1.0

    # for distractor-aware incremental learning
    template_feat_cpu = template_feat.cpu().detach().numpy()
    state['template_feat'] = template_feat_cpu
    state['acc_beta_phi'] = template_feat_cpu
    state['acc_beta'] = 1.0
    state['acc_beta_alpha_phi'] = np.zeros_like(template_feat_cpu)
    return state
def update(self, frame, gt, clf, random_shift, frame_num):
    """Track the object based on the previous frame.

    Args:
        frame: a BGR image
    Returns:
        bbox: bounding box (xmin, ymin, xmax, ymax), 1-based
    """
    #######################
    if random_shift:
        pos_ = np.array([gt[0] + gt[2] / 2,
                         gt[1] + gt[3] / 2])  # center x, center y, zero based
        max_translate = 2 * (self.s_x / config.instance_size) * config.total_stride
        self.pos[0] = np.random.uniform(pos_[0] - max_translate,
                                        pos_[0] + max_translate)
        self.pos[1] = np.random.uniform(pos_[1] - max_translate,
                                        pos_[1] + max_translate)
    #########################

    # get instance img
    img_mean = tuple(map(int, frame.mean(axis=(0, 1))))
    instance_img = get_subwindow_tracking(frame, self.pos, config.instance_size,
                                          python2round(self.s_x), img_mean)
    instance_img = self.transforms(instance_img)[None, :, :, :]

    # get instance feature
    with torch.cuda.device(self.gpu_id):
        instance_img = Variable(instance_img.cuda(), requires_grad=False)
        pred_cls, pred_reg = self.model(None, instance_img)

    # offsets
    offsets = pred_reg.squeeze().view(4, -1).detach().cpu().numpy()
    offsets[0] = offsets[0] * self.anchors[:, 2] + self.anchors[:, 0]
    offsets[1] = offsets[1] * self.anchors[:, 3] + self.anchors[:, 1]
    offsets[2] = np.exp(offsets[2]) * self.anchors[:, 2]
    offsets[3] = np.exp(offsets[3]) * self.anchors[:, 3]

    # scale and ratio penalty
    penalty = self._create_penalty(self.target_sz, offsets)

    # response
    max_value = pred_cls.max().detach().cpu().numpy()
    response = torch.sigmoid(pred_cls).squeeze().view(-1).detach().cpu().numpy()
    response_raw = response
    response = response * penalty
    response = (1 - config.window_influence) * response \
        + config.window_influence * self.hann_window
    best_id = np.argmax(response)

    # anomaly detection
    anchor_id = best_id // 289
    response_map = response_raw[anchor_id * 289:anchor_id * 289 + 289]
    vis_heatmap(response_map.reshape(17, 17), max_value)
    clf_output = clf(
        torch.from_numpy(response_map).float().cuda()).data.cpu().numpy()
    state = np.argmax(clf_output)
    # print(state)
    # response_label = self.create_response_label(response_map.reshape(17, 17), self.s_x, anchor_id)
    # dae_output = sigmoid(dae(torch.from_numpy(response_map).float().cuda()).data.cpu().numpy())
    # loss = np.mean((dae_output - response_label.flatten())**2)

    update_flag = 1
    if state == 0:
        # print(' frame:' + str(frame_num) + ' ' + str(response_raw.max()))
        update_flag = 0
        # self.counter_re += 1
        # window_influence_re = 0.26
        # response_re = response_raw * penalty
        # response_re = (1 - window_influence_re) * response_re + window_influence_re * self.hann_window
        # best_id = np.argmax(response_re)
        # if self.counter_re % 12 == 0 and self.counter_re != 0:
        #     best_candidate_pos, best_candidate_id, best_candidate_anchor_id, best_candidate_offsets, \
        #         best_candidate_response_map = self.redection(frame, self.s_x, img_mean, 2)
        #     if best_candidate_pos is not None:
        #         clf_output_re = clf(torch.from_numpy(best_candidate_response_map).float().cuda()).data.cpu().numpy()
        #         state_re = np.argmax(clf_output_re)
        #         if state_re == 1:
        #             self.pos = best_candidate_pos
        #             offsets = best_candidate_offsets
        #             best_id = best_candidate_id
        #             self.counter_re = 0
        # else:
        #     self.counter_re = 0

    # peak location
    offset = offsets[:, best_id] * self.s_z / config.exemplar_size

    # update center
    self.pos += np.array([offset[0], offset[1]])
    self.pos = np.clip(self.pos, 0, [frame.shape[1], frame.shape[0]])

    # update scale
    lr = response[best_id] * config.scale_lr
    self.target_sz = (1 - lr) * self.target_sz + lr * np.array([offset[2], offset[3]])
    self.target_sz = np.clip(self.target_sz, 10, [frame.shape[1], frame.shape[0]])
    wc_z = self.target_sz[1] + 0.5 * sum(self.target_sz)
    hc_z = self.target_sz[0] + 0.5 * sum(self.target_sz)
    self.s_z = np.sqrt(wc_z * hc_z)
    self.s_x = self.s_z * config.instance_size / config.exemplar_size

    # update model
    if update_flag:
        exemplar_img = get_subwindow_tracking(frame, self.pos,
                                              config.exemplar_size,
                                              python2round(self.s_z), img_mean)
        exemplar_img = self.transforms(exemplar_img)[None, :, :, :]
        with torch.cuda.device(self.gpu_id):
            exemplar_img = Variable(exemplar_img.cuda(), requires_grad=False)
            self.model.update_model(exemplar_img)

    # return 1-indexed, corner-based bounding box
    bbox = np.array([
        self.pos[0] - self.target_sz[0] / 2,
        self.pos[1] - self.target_sz[1] / 2,
        self.pos[0] + self.target_sz[0] / 2,
        self.pos[1] + self.target_sz[1] / 2
    ])
    return bbox, response_map
def SiamRPN_init(im, target_pos, target_sz, net): """ 输入第一帧 target_pos [center_x, center_y] target_sz [w, h] net return: state['im_h'] state['im_w'] state['p'] config for tracker state['net'] state['avg_chan'] 通道均值 """ state = dict() p = TrackerConfig() state['im_h'] = im.shape[0] state['im_w'] = im.shape[1] # Input size, if target is small, input should be large? if ((target_sz[0] * target_sz[1]) / float(state['im_h'] * state['im_w'])) < 0.004: p.instance_size = 287 # small object big search region else: p.instance_size = 271 # Input size - Template size # 计算每行有多少个感受野 # 每个感受野size instance_size # 每次移动total_stride p.score_size = (p.instance_size - p.exemplar_size) / p.total_stride + 1 p.anchor = generate_anchor(p.total_stride, p.scales, p.ratios, p.score_size) # 每在一维做平均则下降一维 # 1024 * 1024 * 3 => [x1, x2, x3] avg_chans = np.mean(im, axis=(0, 1)) # 扩大template范围 # 并且需要归一成正方形 # detection不需要归一 # w_ -> w + (w+h)/2 # h_ -> h + (w+h)/2 # s_ -> sqrt(w_ * h_) # target是实际bg 而s_z是相当于把bg变成了正方形 wc_z = target_sz[0] + p.context_amount * sum(target_sz) hc_z = target_sz[1] + p.context_amount * sum(target_sz) s_z = round(np.sqrt(wc_z * hc_z)) scale_z = p.exemplar_size / s_z d_search = (p.instance_size - p.exemplar_size) / 2 pad = d_search / scale_z s_x = s_z + 2 * pad # initialize the exemplar # 将溢出部分用avg补充 # target_pos是中心点 # s_z是归一后正方形大小 # exempler_size是后面需要resize的127 z_crop = get_subwindow_tracking(im, target_pos, p.exemplar_size, s_z, avg_chans) template = z_crop.numpy().transpose((1, 2, 0)) state['template'] = template z = Variable(z_crop.unsqueeze(0)) x_crop = Variable( get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x), avg_chans).unsqueeze(0)) #net.temple(z.cuda()) net(z.cuda(), x_crop.cuda()) if p.windowing == 'cosine': #outer (x1, x2) #x1中的每个值变为x2行向量的倍数 window = np.outer(np.hanning(p.score_size), np.hanning(p.score_size)) elif p.windowing == 'uniform': window = np.ones((p.score_size, p.score_size)) window = np.tile(window.flatten(), p.anchor_num) #np.tile复制(row, col)倍 or directly copy x state['p'] = p state['net'] = net state['avg_chans'] = avg_chans state['window'] = window state['target_pos'] = target_pos state['target_sz'] = target_sz return state, z