def get_exemplar_images(images, exemplar_size, targets_pos=None):
    """Crop exemplar images from input images."""
    with tf.name_scope('get_exemplar_image'):
        batch_size, x_height, x_width = images.get_shape().as_list()[:3]
        z_height, z_width = exemplar_size

        if targets_pos is None:  # crop from the center
            target_pos_single = [[get_center(x_height), get_center(x_width)]]
            targets_pos_ = tf.tile(target_pos_single, [batch_size, 1])
        else:
            targets_pos_ = targets_pos

        # Convert to top-left corner based coordinates
        top = tf.to_int32(tf.round(targets_pos_[:, 0] - get_center(z_height)))
        bottom = tf.to_int32(top + z_height)
        left = tf.to_int32(tf.round(targets_pos_[:, 1] - get_center(z_width)))
        right = tf.to_int32(left + z_width)

        def _slice(x):
            f, t, l, b, r = x
            c = f[t:b, l:r]
            return c

        exemplar_img = tf.map_fn(_slice, (images, top, left, bottom, right),
                                 dtype=images.dtype)
        exemplar_img.set_shape([batch_size, z_height, z_width, 3])
        return exemplar_img
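# `get_center` is defined elsewhere in the repo; all of the cropping math in
# this section assumes it returns the zero-indexed center coordinate of a 1-D
# extent. A minimal sketch of that assumption:
def get_center(x):
    """Zero-indexed center, e.g. get_center(255) == 127.0, get_center(3) == 1.0."""
    return (x - 1.) / 2.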
def build_extract_crops(self):
    model_config = self.model_config
    track_config = self.track_config

    context_amount = 0.5
    size_z = model_config['z_image_size']
    size_x = model_config['x_image_size']

    num_scales = track_config['num_scales']
    scales = np.arange(num_scales) - get_center(num_scales)
    assert np.sum(scales) == 0, 'scales should be symmetric'
    assert track_config['scale_step'] >= 1.0, 'scale step should be >= 1.0'
    search_factors = [track_config['scale_step']**x for x in scales]

    frame_sz = tf.shape(self.image)
    target_yx = self.target_bbox_feed[0:2]
    target_size = self.target_bbox_feed[2:4]
    avg_chan = tf.reduce_mean(self.image, axis=(0, 1), name='avg_chan')

    # Compute base values
    base_z_size = target_size
    base_z_context_size = base_z_size + context_amount * tf.reduce_sum(base_z_size)
    base_s_z = tf.sqrt(tf.reduce_prod(base_z_context_size))  # Canonical size
    base_scale_z = tf.div(tf.to_float(size_z), base_s_z)
    d_search = (size_x - size_z) / 2.0
    base_pad = tf.div(d_search, base_scale_z)
    base_s_x = base_s_z + 2 * base_pad
    base_scale_x = tf.div(tf.to_float(size_x), base_s_x)

    boxes = []
    for factor in search_factors:
        s_x = factor * base_s_x
        frame_sz_1 = tf.to_float(frame_sz[0:2] - 1)
        topleft = tf.div(target_yx - get_center(s_x), frame_sz_1)
        bottomright = tf.div(target_yx + get_center(s_x), frame_sz_1)
        box = tf.concat([topleft, bottomright], axis=0)
        boxes.append(box)
    boxes = tf.stack(boxes)

    scale_xs = []
    for factor in search_factors:
        scale_x = base_scale_x / factor
        scale_xs.append(scale_x)
    self.scale_xs = tf.stack(scale_xs)

    image_minus_avg = tf.expand_dims(self.image - avg_chan, 0)
    image_cropped = tf.image.crop_and_resize(
        image_minus_avg, boxes,
        box_ind=tf.zeros((track_config['num_scales']), tf.int32),
        crop_size=[size_x, size_x])
    self.images = image_cropped + avg_chan
def convert_bbox_format(bbox, to):
    x, y, target_width, target_height = bbox.x, bbox.y, bbox.width, bbox.height
    if to == 'top-left-based':
        x -= get_center(target_width)
        y -= get_center(target_height)
    elif to == 'center-based':
        y += get_center(target_height)
        x += get_center(target_width)
    else:
        raise ValueError("Bbox format: {} was not recognized".format(to))
    return Rectangle(x, y, target_width, target_height)
def convert_bbox(bbox, to, offsetx=0, offsety=0):
    # Note: default offsets of 0 are needed because callers below (e.g. the
    # VOT-style track() loop) invoke this with only (bbox, to).
    x, y, target_width, target_height = bbox.x, bbox.y, bbox.width, bbox.height
    if to == 'top-left-based':
        x -= get_center(target_width)
        y -= get_center(target_height)
    elif to == 'center-based':
        y += get_center(target_height)
        x += get_center(target_width)
        x += offsetx
        y += offsety
    else:
        raise NotImplementedError
    return Rectangle(x, y, target_width, target_height)
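# Usage sketch for the converters above. Rectangle is assumed to be the usual
# namedtuple with fields (x, y, width, height); the numbers are illustrative.
from collections import namedtuple

Rectangle = namedtuple('Rectangle', ['x', 'y', 'width', 'height'])

center_bbox = Rectangle(x=100.0, y=50.0, width=21.0, height=41.0)
tl_bbox = convert_bbox_format(center_bbox, 'top-left-based')
# Shifted by get_center(width) = 10.0 and get_center(height) = 20.0:
assert (tl_bbox.x, tl_bbox.y) == (90.0, 30.0)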
def build_template(self):
    model_config = self.model_config
    track_config = self.track_config

    exemplar_images = get_exemplar_images(
        self.images,
        [model_config['z_image_size'], model_config['z_image_size']])
    templates = self.get_image_embedding(exemplar_images, deform=False)
    center_scale = int(get_center(track_config['num_scales']))
    center_template = tf.identity(templates[center_scale])
    templates = tf.stack(
        [center_template for _ in range(model_config['batch_size'])])

    with tf.variable_scope('target_template'):
        template_fn = template_factory.get_network_fn(
            model_config['template_name'],
            weight_decay=model_config['weight_decay'],
            is_training=False)
        templates, _ = template_fn(templates)

    # Store the template in a Variable so that we don't have to feed it
    # on every inference step.
    with tf.variable_scope('State'):
        state = tf.get_variable('exemplar',
                                initializer=tf.zeros_like(templates),
                                trainable=False)
        with tf.control_dependencies([templates]):
            self.init = tf.assign(state, templates, validate_shape=True)
        self.templates = state
def __init__(self, model, model_config, track_config):
    """Initializes the tracker.

    Args:
      model: Object encapsulating a trained track model. Must have an
        inference_step() method, e.g. an instance of InferenceWrapperBase.
      model_config: Track model configuration.
      track_config: Tracking configuration.
    """
    self.model = model
    self.model_config = model_config
    self.track_config = track_config

    self.z_image_size = model_config['z_image_size']
    self.x_image_size = model_config['x_image_size']
    self.r_embed_size = model_config['r_embed_size']
    self.r_image_size = model_config['u_image_size']
    self.num_scales = track_config['num_scales']
    self.log_level = track_config['log_level']
    logging.info('track num scales -- {}'.format(track_config['num_scales']))

    scales = np.arange(self.num_scales) - get_center(self.num_scales)
    self.search_factors = [
        self.track_config['scale_step']**x for x in scales
    ]

    # Cosine (Hanning) window, normalized to sum to 1
    window = np.dot(np.expand_dims(np.hanning(self.r_image_size), 1),
                    np.expand_dims(np.hanning(self.r_image_size), 0))
    self.window = window / np.sum(window)
def extract_patch(inputs, patch_size, top_left=None):
    """Extract patches from an input tensor.

    Args:
      inputs: Tensor of shape [batch, height, width, feature_num]
      patch_size: [height, width] of the patch to extract
      top_left: patch top-left positions in the input tensor, of shape [batch, 2]

    Returns:
      Patches of shape [batch, height, width, feature_num]
    """
    with tf.name_scope('extract_patch'):
        batch_size, x_height, x_width, feat_num = inputs.get_shape().as_list()
        z_height, z_width = patch_size

        if top_left is None:  # crop from the center
            pos_single = [[get_center(x_height), get_center(x_width)]]
            patch_center_ = tf.tile(pos_single, [batch_size, 1])
            # Convert to top-left corner based coordinates
            top = tf.to_int32(
                tf.round(patch_center_[:, 0] - get_center(z_height)))
            left = tf.to_int32(
                tf.round(patch_center_[:, 1] - get_center(z_width)))
        else:
            top = tf.to_int32(top_left[:, 0])
            left = tf.to_int32(top_left[:, 1])
        bottom = tf.to_int32(top + z_height)
        right = tf.to_int32(left + z_width)

        def _slice(x):
            f, t, l, b, r = x
            c = f[t:b, l:r]
            return c

        patch = tf.map_fn(_slice, (inputs, top, left, bottom, right),
                          dtype=inputs.dtype)
        # Restore the static shape lost by map_fn
        patch.set_shape([batch_size, z_height, z_width, feat_num])
        return patch
def get_subwindow_avg(im, pos, model_sz, original_sz):
    # avg_chans = np.mean(im, axis=(0, 1))  # This version is 3x slower
    avg_chans = [
        np.mean(im[:, :, 0]), np.mean(im[:, :, 1]), np.mean(im[:, :, 2])
    ]
    if not original_sz:
        original_sz = model_sz
    sz = original_sz
    im_sz = im.shape
    # Make sure the size is not too small
    assert im_sz[0] > 2 and im_sz[1] > 2
    c = [get_center(s) for s in sz]

    # Check out-of-bounds coordinates, and set them to avg_chans
    context_xmin = int(np.round(pos[1] - c[1]))
    context_xmax = int(context_xmin + sz[1] - 1)
    context_ymin = int(np.round(pos[0] - c[0]))
    context_ymax = int(context_ymin + sz[0] - 1)
    left_pad = int(np.maximum(0, -context_xmin))
    top_pad = int(np.maximum(0, -context_ymin))
    right_pad = int(np.maximum(0, context_xmax - im_sz[1] + 1))
    bottom_pad = int(np.maximum(0, context_ymax - im_sz[0] + 1))

    context_xmin = context_xmin + left_pad
    context_xmax = context_xmax + left_pad
    context_ymin = context_ymin + top_pad
    context_ymax = context_ymax + top_pad

    if top_pad > 0 or bottom_pad > 0 or left_pad > 0 or right_pad > 0:
        R = np.pad(im[:, :, 0], ((top_pad, bottom_pad), (left_pad, right_pad)),
                   'constant', constant_values=(avg_chans[0]))
        G = np.pad(im[:, :, 1], ((top_pad, bottom_pad), (left_pad, right_pad)),
                   'constant', constant_values=(avg_chans[1]))
        B = np.pad(im[:, :, 2], ((top_pad, bottom_pad), (left_pad, right_pad)),
                   'constant', constant_values=(avg_chans[2]))
        im = np.stack((R, G, B), axis=2)

    im_patch_original = im[context_ymin:context_ymax + 1,
                           context_xmin:context_xmax + 1, :]
    if not np.array_equal(model_sz, original_sz):
        im_patch = imresize(im_patch_original, model_sz, interp='bilinear')
    else:
        im_patch = im_patch_original
    return im_patch, left_pad, top_pad, right_pad, bottom_pad
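# Usage sketch for get_subwindow_avg with a synthetic frame (illustrative
# values; assumes scipy.misc.imresize is available, as the function above
# requires). Out-of-frame regions are padded with the per-channel mean.
import numpy as np

im = (np.random.rand(240, 320, 3) * 255).astype(np.uint8)  # fake 240x320 frame
s_z = 118  # context size in frame pixels, as computed by build_search_image
patch, lp, tp, rp, bp = get_subwindow_avg(
    im, pos=[120.0, 200.0], model_sz=[127, 127], original_sz=[s_z, s_z])
assert patch.shape[:2] == (127, 127)  # resized to the exemplar size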
def build_search_image(self, image, bbox, scale_factor):
    # image: [H, W, 3]
    # bbox: [4], (cy, cx, height, width)
    context_amount = self.context_amount
    size_z = self.z_image_size
    size_x = self.x_image_size

    frame_sz = tf.shape(image)
    target_yx = bbox[0:2]    # (y, x)
    target_size = bbox[2:4]  # (height, width)
    avg_chan = tf.reduce_mean(image, axis=(0, 1), name='avg_chan')

    # Compute base values
    base_z_size = target_size
    # Context-padded size: (h + 2p, w + 2p)
    base_z_context_size = base_z_size + context_amount * tf.reduce_sum(base_z_size)
    base_s_z = tf.sqrt(tf.reduce_prod(base_z_context_size))  # Canonical size
    # s = sqrt(A**2 / ((w + 2p) * (h + 2p)))
    base_scale_z = tf.div(tf.to_float(size_z), base_s_z)
    d_search = (size_x - size_z) / 2.0
    base_pad = tf.div(d_search, base_scale_z)
    base_s_x = base_s_z + 2 * base_pad
    base_scale_x = tf.div(tf.to_float(size_x), base_s_x)

    s_x = scale_factor * base_s_x
    frame_sz_1 = tf.to_float(frame_sz[0:2] - 1)
    topleft = tf.div(target_yx - get_center(s_x), frame_sz_1)
    bottomright = tf.div(target_yx + get_center(s_x), frame_sz_1)
    crop_box = tf.concat([topleft, bottomright], axis=0)
    scale_x = base_scale_x / scale_factor

    image_minus_avg = tf.expand_dims(image - avg_chan, 0)
    image_cropped = tf.image.crop_and_resize(image_minus_avg, crop_box[None],
                                             box_ind=tf.zeros((1), tf.int32),
                                             crop_size=[size_x, size_x])
    search_image = image_cropped + avg_chan
    search_image = search_image[0]  # [1, H, W, 3] -> [H, W, 3]
    return search_image, scale_x, crop_box
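# Worked example of the crop geometry above, with the canonical SiamFC sizes
# (size_z = 127, size_x = 255, context_amount = 0.5) and an illustrative
# target of (height, width) = (40, 80):
#
#   base_z_context_size = (40, 80) + 0.5 * (40 + 80) = (100, 140)
#   base_s_z     = sqrt(100 * 140)  ~= 118.3   # side of the context square
#   base_scale_z = 127 / 118.3      ~= 1.073   # exemplar resize factor
#   d_search     = (255 - 127) / 2   = 64.0
#   base_pad     = 64 / 1.073       ~= 59.6
#   base_s_x     = 118.3 + 2 * 59.6 ~= 237.6   # side of the search square
#   base_scale_x = 255 / 237.6      ~= 1.073   # equals base_scale_z by construction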
def __init__(self, siamese_model, config):
    self.siamese_model = siamese_model
    self.config = config

    self.num_scales = self.config.num_scales
    logging.info('track num scales -- {}'.format(self.num_scales))
    scales = np.arange(self.num_scales) - get_center(self.num_scales)
    self.search_factors = [self.config.scale_step ** x for x in scales]

    self.x_image_size = self.config.x_image_size  # Search image size
    self.window = None  # Cosine window
    self.log_level = self.config.log_level

    if config.net_type == 'cfcf':
        self.update_template = True
    else:
        self.update_template = False
def track(self, sess, first_bbox, frames, logdir='/tmp', write_summary=True):
    """Runs tracking on a single image sequence."""
    # Get the initial target bounding box and convert it to center-based format
    bbox = convert_bbox_format(first_bbox, 'center-based')

    # Feed in the first frame image to set the initial state.
    bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
    input_feed = [frames[0], bbox_feed]
    frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

    # Storing target state
    original_target_height = bbox.height
    original_target_width = bbox.width
    search_center = np.array([get_center(self.x_image_size),
                              get_center(self.x_image_size)])
    current_target_state = TargetState(
        bbox=bbox,
        search_pos=search_center,
        scale_idx=int(get_center(self.num_scales)))

    include_first = False
    logging.info('Tracking include first -- {}'.format(include_first))

    if write_summary:
        summary_writer = tf.summary.FileWriter(
            osp.join(logdir, 'summary'), graph=sess.graph)
        self.siamese_model.build_summary(summary_writer)

    # Run tracking loop
    reported_bboxs = []
    for i, filename in enumerate(frames):
        if i > 0 or include_first:
            # We don't really want to process the first image unless
            # intended to do so.
            bbox_feed = [current_target_state.bbox.y,
                         current_target_state.bbox.x,
                         current_target_state.bbox.height,
                         current_target_state.bbox.width]
            input_feed = [filename, bbox_feed]

            outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
            search_scale_list = outputs['scale_xs']
            response = outputs['response']
            response_size = response.shape[1]

            # Choose the scale whose response map has the highest peak
            if self.num_scales > 1:
                response_max = np.max(response, axis=(1, 2))
                penalties = self.config.scale_penalty * np.ones((self.num_scales))
                current_scale_idx = int(get_center(self.num_scales))
                penalties[current_scale_idx] = 1.0
                response_penalized = response_max * penalties
                best_scale = np.argmax(response_penalized)
            else:
                best_scale = 0
            response = response[best_scale]

            if self.update_template:
                mmr = outputs['MMRs'][best_scale]
                if mmr > self.config.mmr_thresh:
                    print('update templates MMRs={}'.format(mmr))
                    self.siamese_model.update(sess, input_feed)

            with np.errstate(all='raise'):  # Raise error if something goes wrong
                response = response - np.min(response)
                response = response / np.sum(response)

            if self.window is None:
                window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                                np.expand_dims(np.hanning(response_size), 0))
                self.window = window / np.sum(window)  # normalize window
            window_influence = self.config.window_influence
            response = (1 - window_influence) * response + \
                       window_influence * self.window

            # Find the maximum response
            r_max, c_max = np.unravel_index(response.argmax(), response.shape)

            # Convert from crop-relative coordinates to frame coordinates
            p_coor = np.array([r_max, c_max])
            # displacement from the center in the instance final representation ...
            disp_instance_final = p_coor - get_center(response_size)
            # ... in the instance feature space ...
            upsample_factor = self.config.upsample_factor
            disp_instance_feat = disp_instance_final / upsample_factor
            # ... avoid empty positions ...
            r_radius = int(response_size / upsample_factor / 2)
            disp_instance_feat = np.maximum(
                np.minimum(disp_instance_feat, r_radius), -r_radius)
            # ... in the instance input ...
            disp_instance_input = disp_instance_feat * self.config.embed_stride
            # ... and in the original crop (in frame coordinates)
            disp_instance_frame = disp_instance_input / search_scale_list[best_scale]

            # Position within the frame, in frame coordinates
            y = current_target_state.bbox.y
            x = current_target_state.bbox.x
            y += disp_instance_frame[0]
            x += disp_instance_frame[1]

            # Target scale damping and saturation
            target_scale = current_target_state.bbox.height / original_target_height
            search_factor = self.search_factors[best_scale]
            scale_damp = self.config.scale_damp  # damping factor for scale update
            target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
            target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

            # Some bookkeeping
            height = original_target_height * target_scale
            width = original_target_width * target_scale
            current_target_state.bbox = Rectangle(x, y, width, height)
            current_target_state.scale_idx = best_scale
            current_target_state.search_pos = search_center + disp_instance_input

            assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
                'target position in feature space should be no larger than input image size'
            assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
                'target position in feature space should be no larger than input image size'

            if self.log_level > 0:
                np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

                # Select the crop at the best-scoring scale and convert it to uint8
                image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
                # Note that imwrite in cv2 assumes the image is in BGR format,
                # while the cropped image returned by TensorFlow is RGB.
                # Therefore, we convert the color format using cv2.cvtColor.
                imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                        cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale])
                np.save(osp.join(logdir, 'response{}.npy'.format(i)), response)

                y_search, x_search = current_target_state.search_pos
                search_scale = search_scale_list[best_scale]
                target_height_search = height * search_scale
                target_width_search = width * search_scale
                bbox_search = Rectangle(x_search, y_search,
                                        target_width_search, target_height_search)
                bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
                np.save(osp.join(logdir, 'bbox{}.npy'.format(i)),
                        [bbox_search.x, bbox_search.y,
                         bbox_search.width, bbox_search.height])

        reported_bbox = convert_bbox_format(current_target_state.bbox,
                                            'top-left-based')
        reported_bboxs.append(reported_bbox)
        # --- END OF FRAME ---
    return reported_bboxs
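# Worked example of the displacement chain in the tracking loop above
# (illustrative numbers: response_size = 272, upsample_factor = 16,
# embed_stride = 8, so the raw score map has 272 / 16 = 17 cells per side):
#
#   peak at (r_max, c_max) = (150, 140); center = get_center(272) = 135.5
#   disp_instance_final = (14.5, 4.5)     # pixels in the upsampled response
#   disp_instance_feat  = (0.906, 0.281)  # divided by upsample_factor
#   (clipped to +/- r_radius = int(272 / 16 / 2) = 8 -> unchanged here)
#   disp_instance_input = (7.25, 2.25)    # times embed_stride
#   disp_instance_frame = disp_instance_input / search_scale_list[best_scale]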
def _construct_gt_response(response_size, batch_size, stride, gt_config=None):
    """Construct a batch of 2D ground truth responses.

    Args:
      response_size: a list or tuple with two elements [ho, wo]
      batch_size: an integer, e.g. 16
      stride: embedding stride, e.g. 8
      gt_config: configurations for ground truth generation

    Returns:
      A float tensor of shape [batch_size] + response_size
    """
    with tf.variable_scope('construct_gt'):
        ho = response_size[0]
        wo = response_size[1]
        y = tf.cast(tf.range(0, ho), dtype=tf.float32) - get_center(ho)
        x = tf.cast(tf.range(0, wo), dtype=tf.float32) - get_center(wo)
        [Y, X] = tf.meshgrid(y, x)

        gt_type = gt_config['type']
        if gt_type == 'gaussian':
            def _gaussian_2d(X, Y, sigma):
                x0, y0 = 0, 0  # the target position, i.e. the center
                return tf.exp(-0.5 * (((X - x0) / sigma)**2 +
                                      ((Y - y0) / sigma)**2))

            sigma = gt_config['rPos'] / stride / 3.0
            gt = _gaussian_2d(X, Y, sigma)
        elif gt_type == 'overlap':
            def _overlap_score(X, Y, stride, area):
                area_x, area_y = [tf.to_float(a) / stride for a in area]
                x_diff = (area_x - tf.abs(X))
                y_diff = (area_y - tf.abs(Y))
                # Intersection over union
                Z = x_diff * y_diff / (2 * area_x * area_y - x_diff * y_diff)
                # Remove negative intersections
                Z = tf.where(x_diff > 0, Z, tf.zeros_like(Z))
                Z = tf.where(y_diff > 0, Z, tf.zeros_like(Z))
                return Z

            area = [64, 64]
            logging.info('area is fixed for the overlap gt type')
            gt = _overlap_score(X, Y, stride, area)
        elif gt_type == 'logistic':
            def _logistic_label(X, Y, rPos, rNeg):
                # dist_to_center = tf.sqrt(tf.square(X) + tf.square(Y))  # L2 dist
                dist_to_center = tf.abs(X) + tf.abs(Y)  # Block (L1) dist
                Z = tf.where(dist_to_center <= rPos,
                             tf.ones_like(X),
                             tf.where(dist_to_center < rNeg,
                                      0.5 * tf.ones_like(X),
                                      tf.zeros_like(X)))
                return Z

            rPos = gt_config['rPos'] / stride
            rNeg = gt_config['rNeg'] / stride
            gt = _logistic_label(X, Y, rPos, rNeg)
        else:
            raise NotImplementedError

        # Create a batch of ground truth responses
        gt_expand = tf.reshape(gt, [1] + response_size)
        gt = tf.tile(gt_expand, [batch_size, 1, 1])
        return gt
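# Usage sketch for _construct_gt_response (hypothetical but typical SiamFC-style
# config values; rPos/rNeg are radii in input pixels, converted to score-map
# cells by dividing by the stride):
import tensorflow as tf

gt_config = {'type': 'logistic', 'rPos': 16, 'rNeg': 0}
gt = _construct_gt_response(response_size=[17, 17], batch_size=8,
                            stride=8, gt_config=gt_config)
# gt has shape [8, 17, 17]: 1.0 within an L1 radius of rPos / stride = 2 cells
# of the center, 0.0 elsewhere (the 0.5 band is empty here since rNeg = 0).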
def track(self, sess, handle, logdir='/tmp'):
    """Runs tracking on a single image sequence.

    Args:
      sess: TensorFlow Session object.
      handle: a handle which provides image files and the target position in
        the first frame, mimicking the VOT interface. Results are reported
        back via handle.report().
    """
    # Get the initial target bounding box and convert it to center-based format
    bbox = handle.region()
    bbox = convert_bbox(bbox, 'center-based')

    # Feed in the first frame image to set the initial state.
    # Note that we use different padding values for each image, while the
    # original implementation uses only the average value of the first image
    # for all images.
    bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
    input_feed = [handle.frame(), bbox_feed]
    frame2crop_scale = self.model.initialize(sess, input_feed)

    # Storing target state
    original_target_height = bbox.height
    original_target_width = bbox.width
    search_center = np.array([get_center(self.x_image_size),
                              get_center(self.x_image_size)])
    current_target_state = TargetState(
        bbox=bbox,
        search_pos=search_center,
        scale_idx=int(get_center(self.num_scales)))

    # Whether to track the first frame
    include_first = self.track_config.get('include_first', False)
    logging.info('tracking include first -- {}'.format(include_first))

    # Run tracking loop.
    # i is the index of the frame being processed. Note that the first image
    # is used twice in total:
    # 1. It initializes the tracker.
    # 2. It serves as a test example for the tracker; the detected result
    #    won't affect the final metrics, though. This is needed because both
    #    the OTB and VOT benchmarks require a list of tracking results equal
    #    in length to the test image sequence, including the first image.
    i = -1
    while True:
        # Read a new image
        filename = handle.frame()
        if not filename:
            if self.log_level > 0:
                np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])
            break  # All image files are processed; exit the loop
        i += 1

        if i > 0 or include_first:
            # We don't really want to process the first image unless
            # intended to do so.
            # Prepare input feed
            bbox_feed = [
                current_target_state.bbox.y, current_target_state.bbox.x,
                current_target_state.bbox.height, current_target_state.bbox.width
            ]
            input_feed = [filename, bbox_feed]

            # Feed in the input
            outputs, metadata = self.model.inference_step(sess, input_feed)
            search_scale_list = outputs['scale_xs']
            response = outputs['response']

            # Choose the scale whose response map has the highest peak
            if self.num_scales > 1:
                current_scale_idx = int(get_center(self.num_scales))
                best_scale = current_scale_idx
                best_peak = -np.inf
                for s in range(self.num_scales):
                    this_response = response[s]
                    this_peak = np.max(this_response[:])
                    # Penalize change of scale
                    if s != current_scale_idx:
                        this_peak *= self.track_config['scale_penalty']
                    if this_peak > best_peak:
                        best_peak = this_peak
                        best_scale = s
            else:
                best_scale = 0
            response = response[best_scale]

            if self.log_level > 0:
                np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)),
                        [best_scale])
                np.save(osp.join(logdir, 'response{}.npy'.format(i)), response)

            # Normalize the response
            with np.errstate(all='raise'):  # Raise error if something goes wrong
                logging.debug('mean response: {}'.format(np.mean(response)))
                response = response - np.min(response)
                response = response / np.sum(response)

            # Apply windowing
            window_influence = self.track_config['window_influence']
            response = (1 - window_influence) * response + \
                       window_influence * self.window
            if self.log_level > 0:
                np.save(osp.join(logdir, 'response_windowed{}.npy'.format(i)),
                        response)

            # Find the maximum response
            r_max, c_max = np.unravel_index(response.argmax(), response.shape)

            # Convert from crop-relative coordinates to frame coordinates
            p_coor = np.array([r_max, c_max])
            # displacement from the center in the instance final representation ...
            disp_instance_final = p_coor - get_center(self.r_image_size)
            # ... in the instance feature space ...
            upsampling_factor = self.r_image_size / self.r_embed_size
            disp_instance_feat = disp_instance_final / upsampling_factor
            # ... avoid empty positions ...
            r_radius = int(self.r_embed_size / 2)
            disp_instance_feat = np.maximum(
                np.minimum(disp_instance_feat, r_radius), -r_radius)
            # ... in the instance input ...
            disp_instance_input = disp_instance_feat * self.model_config['stride']
            # ... and in the original crop (in frame coordinates)
            disp_instance_frame = disp_instance_input / search_scale_list[best_scale]

            # Position within the frame, in frame coordinates
            y = current_target_state.bbox.y
            x = current_target_state.bbox.x
            y += disp_instance_frame[0]
            x += disp_instance_frame[1]

            # Target scale damping and saturation
            target_scale = current_target_state.bbox.height / original_target_height
            search_factor = self.search_factors[best_scale]
            scale_damp = self.track_config['scale_damp']  # damping factor for scale update
            target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
            target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

            # Some bookkeeping
            height = original_target_height * target_scale
            width = original_target_width * target_scale
            current_target_state.bbox = Rectangle(x, y, width, height)
            current_target_state.scale_idx = best_scale
            current_target_state.search_pos = search_center + disp_instance_input

            assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
                'target position in feature space should be no larger than input image size'
            assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
                'target position in feature space should be no larger than input image size'
            logging.debug('search_position: {}'.format(
                current_target_state.search_pos))

        # This report used to sit at the beginning of the loop, which looked
        # visually cleaner, but it made the logic harder to follow and more
        # error-prone. Easy is better than concise, if you can't have both.

        # Record the tracked target position
        reported_bbox = convert_bbox(current_target_state.bbox, 'top-left-based')
        handle.report(reported_bbox)