def construct_seg_score_maps(response_size, bboxes, im_size):
    """Construct a batch of groundtruth score maps.

    Args:
      response_size: A list or tuple with two elements [ho, wo].
      bboxes: A tensor whose rows give the target height and width in image pixels.
      im_size: A list or tuple [image_height, image_width].

    Returns:
      A float tensor of shape [batch_size] + response_size.
    """
    with tf.name_scope('construct_gt'):
        ho = response_size[0]
        wo = response_size[1]
        y = tf.cast(tf.range(0, ho), dtype=tf.float32) - get_center(ho)
        x = tf.cast(tf.range(0, wo), dtype=tf.float32) - get_center(wo)
        [X, Y] = tf.meshgrid(x, y)

        def _logistic_label(Y, X, H, W):
            # Mark positions whose offset from the map center lies inside the
            # target box after rescaling it from image size to response size.
            Y = tf.abs(Y)
            X = tf.abs(X)
            Z = tf.where(Y <= H * ho / im_size[0] / 2, tf.ones_like(Y), tf.zeros_like(Y))
            Z = tf.where(X <= W * wo / im_size[1] / 2, Z, tf.zeros_like(X))
            return Z

        gt = tf.map_fn(lambda x: _logistic_label(Y, X, tf.to_float(x[0]), tf.to_float(x[1])),
                       bboxes, dtype=tf.float32)
        return gt
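# Illustrative sketch (not from the original repo): a NumPy equivalent of the box
# mask built by construct_seg_score_maps for a single target. The sizes and the
# box used below are invented for the example; get_center(x) is assumed to be
# (x - 1) / 2, as it is used throughout this code.
import numpy as np

def seg_score_map_np(response_size, box_hw, im_size):
    ho, wo = response_size
    h, w = box_hw
    y = np.arange(ho, dtype=np.float32) - (ho - 1) / 2.0
    x = np.arange(wo, dtype=np.float32) - (wo - 1) / 2.0
    X, Y = np.meshgrid(x, y)
    inside_y = np.abs(Y) <= h * ho / im_size[0] / 2.0
    inside_x = np.abs(X) <= w * wo / im_size[1] / 2.0
    return (inside_y & inside_x).astype(np.float32)

# A 17x17 map for a 100x60 (h x w) target in a 255x255 image: the ones form a
# centered rectangle covering roughly 100/255 and 60/255 of each dimension.
gt = seg_score_map_np([17, 17], [100.0, 60.0], [255.0, 255.0])
print(gt.shape, gt.sum())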
def get_exemplar_images(images, exemplar_size, targets_pos=None):
    """Crop exemplar image from input images"""
    with tf.name_scope('get_exemplar_image'):
        batch_size, x_height, x_width = images.get_shape().as_list()[:3]
        z_height, z_width = exemplar_size

        if targets_pos is None:
            target_pos_single = [[get_center(x_height), get_center(x_width)]]
            targets_pos_ = tf.tile(target_pos_single, [batch_size, 1])
        else:
            targets_pos_ = targets_pos

        # convert to top-left corner based coordinates
        top = tf.to_int32(tf.round(targets_pos_[:, 0] - get_center(z_height)))
        bottom = tf.to_int32(top + z_height)
        left = tf.to_int32(tf.round(targets_pos_[:, 1] - get_center(z_width)))
        right = tf.to_int32(left + z_width)

        def _slice(x):
            f, t, l, b, r = x
            c = f[t:b, l:r]
            return c

        exemplar_img = tf.map_fn(_slice, (images, top, left, bottom, right), dtype=images.dtype)
        exemplar_img.set_shape([batch_size, z_height, z_width, 3])
        return exemplar_img
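# Quick geometry check (illustrative, not part of the original code): for the
# default case where targets_pos is None, get_exemplar_images takes the centered
# z_height x z_width window. Assuming get_center(x) == (x - 1) / 2:
x_size, z_size = 255, 127
center = (x_size - 1) / 2.0                      # 127.0
top = int(round(center - (z_size - 1) / 2.0))    # 64
bottom = top + z_size                            # 191
print(top, bottom, bottom - top)                 # 64 191 127 -> a 127-pixel slice [64:191)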
def build_search_images(self): """Crop search images from the input image based on the last target position 1. The input image is scaled such that the area of target&context takes up to (scale_factor * z_image_size) ^ 2 2. Crop an image patch as large as x_image_size centered at the target center. 3. If the cropped image region is beyond the boundary of the input image, mean values are padded. """ size_z = 127 size_x = 255 context_amount = 0.5 num_scales = 3 scales = np.arange(num_scales) - get_center(num_scales) assert np.sum(scales) == 0, 'scales should be symmetric' search_factors = [1.0375**x for x in scales] frame_sz = tf.shape(self.image) target_yx = self.target_bbox_feed[0:2] target_size = self.target_bbox_feed[2:4] avg_chan = tf.reduce_mean(self.image, axis=(0, 1), name='avg_chan') # Compute base values base_z_size = target_size base_z_context_size = base_z_size + context_amount * tf.reduce_sum( base_z_size) base_s_z = tf.sqrt( tf.reduce_prod(base_z_context_size)) # Canonical size base_scale_z = tf.div(tf.to_float(size_z), base_s_z) d_search = (size_x - size_z) / 2.0 base_pad = tf.div(d_search, base_scale_z) base_s_x = base_s_z + 2 * base_pad base_scale_x = tf.div(tf.to_float(size_x), base_s_x) boxes = [] for factor in search_factors: s_x = factor * base_s_x frame_sz_1 = tf.to_float(frame_sz[0:2] - 1) topleft = tf.div(target_yx - get_center(s_x), frame_sz_1) bottomright = tf.div(target_yx + get_center(s_x), frame_sz_1) box = tf.concat([topleft, bottomright], axis=0) boxes.append(box) boxes = tf.stack(boxes) scale_xs = [] for factor in search_factors: scale_x = base_scale_x / factor scale_xs.append(scale_x) self.scale_xs = tf.stack(scale_xs) # Note we use different padding values for each image # while the original implementation uses only the average value # of the first image for all images. image_minus_avg = tf.expand_dims(self.image - avg_chan, 0) image_cropped = tf.image.crop_and_resize(image_minus_avg, boxes, box_ind=tf.zeros((3), tf.int32), crop_size=[size_x, size_x]) self.search_images = image_cropped + avg_chan
def build_search_images(self): """Crop search images from the input image based on the last target position 1. The input image is scaled such that the area of target&context takes up to (scale_factor * z_image_size) ^ 2 2. Crop an image patch as large as x_image_size centered at the target center. 3. If the cropped image region is beyond the boundary of the input image, mean values are padded. """ model_config = self.model_config track_config = self.track_config size_z = model_config['z_image_size'] size_x = track_config['x_image_size'] context_amount = 0.5 num_scales = track_config['num_scales'] scales = np.arange(num_scales) - get_center(num_scales) assert np.sum(scales) == 0, 'scales should be symmetric' search_factors = [track_config['scale_step'] ** x for x in scales] frame_sz = tf.shape(self.image) target_yx = self.target_bbox_feed[0:2] target_size = self.target_bbox_feed[2:4] avg_chan = tf.reduce_mean(self.image, axis=(0, 1), name='avg_chan') # Compute base values base_z_size = target_size base_z_context_size = base_z_size + context_amount * tf.reduce_sum(base_z_size) base_s_z = tf.sqrt(tf.reduce_prod(base_z_context_size)) # Canonical size base_scale_z = tf.div(tf.to_float(size_z), base_s_z) d_search = (size_x - size_z) / 2.0 base_pad = tf.div(d_search, base_scale_z) base_s_x = base_s_z + 2 * base_pad base_scale_x = tf.div(tf.to_float(size_x), base_s_x) boxes = [] for factor in search_factors: s_x = factor * base_s_x frame_sz_1 = tf.to_float(frame_sz[0:2] - 1) topleft = tf.div(target_yx - get_center(s_x), frame_sz_1) bottomright = tf.div(target_yx + get_center(s_x), frame_sz_1) box = tf.concat([topleft, bottomright], axis=0) boxes.append(box) boxes = tf.stack(boxes) scale_xs = [] for factor in search_factors: scale_x = base_scale_x / factor scale_xs.append(scale_x) self.scale_xs = tf.stack(scale_xs) # Note we use different padding values for each image # while the original implementation uses only the average value # of the first image for all images. image_minus_avg = tf.expand_dims(self.image - avg_chan, 0) image_cropped = tf.image.crop_and_resize(image_minus_avg, boxes, box_ind=tf.zeros((track_config['num_scales']), tf.int32), crop_size=[size_x, size_x]) self.search_images = image_cropped + avg_chan
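# Sketch of the crop-size arithmetic in build_search_images in plain NumPy
# (illustrative only; the target size below is invented, while size_z=127,
# size_x=255 and context_amount=0.5 match the code above). With half the target
# perimeter added as context, s_z is the canonical exemplar size and s_x the
# matching search-region size in frame pixels.
import numpy as np

target_h, target_w = 60.0, 40.0
size_z, size_x, context_amount = 127, 255, 0.5

context = context_amount * (target_h + target_w)
s_z = np.sqrt((target_h + context) * (target_w + context))  # canonical exemplar size
scale_z = size_z / s_z
d_search = (size_x - size_z) / 2.0
pad = d_search / scale_z
s_x = s_z + 2 * pad              # side of the search crop in frame pixels
scale_x = size_x / s_x
print(round(s_z, 2), round(s_x, 2), round(scale_x, 4))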
def build_search_images(self): """Crop search images from the input image based on the last target position 1. The input image is scaled such that the area of target&context takes up to (scale_factor * z_image_size) ^ 2 2. Crop an image patch as large as x_image_size centered at the target center. 3. If the cropped image region is beyond the boundary of the input image, mean values are padded. """ model_config = self.model_config track_config = self.track_config ratio = self.target_bbox_feed[2] / self.target_bbox_feed[3] size_z = model_config['z_image_size'] size_x = self.size_x_feed context_amount = 0.3 num_scales = track_config['num_scales'] scales = np.arange(num_scales) - get_center(num_scales) assert np.sum(scales) == 0, 'scales should be symmetric' search_factors = tf.split(self.scale_feed, 3) frame_sz = tf.shape(self.image) target_yx = self.target_bbox_feed[0:2] target_size = self.target_bbox_feed[2:4] avg_chan = tf.reduce_mean(self.image, axis=(0, 1), name='avg_chan') # Compute base values base_z_size = target_size base_z_context_size = base_z_size + context_amount * tf.reduce_sum(base_z_size) base_s_z = tf.sqrt(tf.reduce_prod(base_z_context_size)) # Canonical size base_scale_z = tf.div(tf.to_float(size_z), base_s_z) d_search = tf.div(tf.to_float(size_x) - tf.to_float(size_z), 2.0) base_pad = tf.div(d_search, base_scale_z) base_s_x = base_s_z + 2 * base_pad base_scale_x = tf.div(tf.to_float(size_x), base_s_x) boxes = [] for factor in search_factors: s_x = factor * base_s_x frame_sz_1 = tf.to_float(frame_sz[0:2] - 1) topleft = tf.div(target_yx - get_center(s_x), frame_sz_1) bottomright = tf.div(target_yx + get_center(s_x), frame_sz_1) box = tf.concat([topleft, bottomright], axis=0) boxes.append(box) boxes = tf.stack(boxes) self.target_size = target_size * 127. / base_s_z scale_xs = [] for factor in search_factors: scale_x = base_scale_x / factor scale_xs.append(scale_x) self.scale_xs = tf.stack(scale_xs) # Pad with average value of the image image_minus_avg = tf.expand_dims(self.image - avg_chan, 0) image_cropped = tf.image.crop_and_resize(image_minus_avg, boxes, box_ind=tf.zeros((track_config['num_scales']), tf.int32), crop_size=[size_x, size_x]) self.search_images = image_cropped + avg_chan
def convert_bbox_format(bbox, to):
    x, y, target_width, target_height = bbox.x, bbox.y, bbox.width, bbox.height
    if to == 'top-left-based':
        x -= get_center(target_width)
        y -= get_center(target_height)
    elif to == 'center-based':
        y += get_center(target_height)
        x += get_center(target_width)
    else:
        raise ValueError("Bbox format: {} was not recognized".format(to))
    return Rectangle(x, y, target_width, target_height)
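# Usage sketch (illustrative): Rectangle is assumed to be the usual
# namedtuple('Rectangle', ['x', 'y', 'width', 'height']) and get_center(x) is
# assumed to be (x - 1) / 2, as elsewhere in this code.
from collections import namedtuple

Rectangle = namedtuple('Rectangle', ['x', 'y', 'width', 'height'])

def get_center(x):
    return (x - 1.) / 2.

top_left = Rectangle(x=50., y=80., width=40., height=60.)
center = convert_bbox_format(top_left, 'center-based')
back = convert_bbox_format(center, 'top-left-based')
print(center)  # Rectangle(x=69.5, y=109.5, width=40.0, height=60.0)
print(back)    # round-trips to the original top-left box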
def build_examplar_ini(self): model_config = self.model_config track_config = self.track_config # Exemplar image lies at the center of the search image in the first frame exemplar_images = get_exemplar_images( self.search_images, [model_config['z_image_size'], model_config['z_image_size']]) [templates_ini, templates] = self.get_image_embedding(exemplar_images) center_scale = int(get_center(track_config['num_scales'])) center_template = tf.identity(templates[center_scale]) templates = tf.stack( [center_template for _ in range(track_config['num_scales'])]) center_template_ini = tf.identity(templates_ini[center_scale]) templates_ini = tf.stack( [center_template_ini for _ in range(track_config['num_scales'])]) with tf.variable_scope('target_examplar'): # Store template in Variable such that we don't have to feed this template every time. with tf.variable_scope('State'): state = tf.get_variable( 'exemplar', initializer=tf.zeros(templates_ini.get_shape().as_list(), dtype=templates_ini.dtype), trainable=False) with tf.control_dependencies([templates_ini]): self.init_examplar = tf.assign(state, templates_ini, validate_shape=True) self.examplar_ini = state
def construct_gt_score_maps(response_size, batch_size, stride, gt_config=None, n_out=1):
    """Construct a batch of groundtruth score maps.

    Args:
      response_size: A list or tuple with two elements [ho, wo].
      batch_size: An integer, e.g. 16.
      stride: Embedding stride, e.g. 8.
      gt_config: Configurations for groundtruth generation (keys 'rPos' and 'rNeg').
      n_out: Number of output channels; when greater than 1 the map is tiled
        along a trailing channel dimension.

    Returns:
      A float tensor of shape [batch_size] + response_size, with a trailing
      channel dimension of size n_out when n_out > 1.
    """
    with tf.name_scope('construct_gt'):
        ho = response_size[0]
        wo = response_size[1]
        y = tf.cast(tf.range(0, ho), dtype=tf.float32) - get_center(ho)
        x = tf.cast(tf.range(0, wo), dtype=tf.float32) - get_center(wo)
        [Y, X] = tf.meshgrid(y, x)

        def _logistic_label(X, Y, rPos, rNeg):
            # dist_to_center = tf.sqrt(tf.square(X) + tf.square(Y))  # L2 metric
            dist_to_center = tf.abs(X) + tf.abs(Y)  # Block metric
            Z = tf.where(dist_to_center <= rPos,
                         tf.ones_like(X),
                         tf.where(dist_to_center < rNeg,
                                  0.5 * tf.ones_like(X),
                                  tf.zeros_like(X)))
            return Z

        rPos = gt_config['rPos'] / stride
        rNeg = gt_config['rNeg'] / stride
        gt = _logistic_label(X, Y, rPos, rNeg)

        # Duplicate a batch of maps
        if n_out > 1:
            gt_expand = tf.reshape(gt, [1] + response_size + [1])
            gt = tf.tile(gt_expand, [batch_size, 1, 1, n_out])
        else:
            gt_expand = tf.reshape(gt, [1] + response_size)
            gt = tf.tile(gt_expand, [batch_size, 1, 1])
        return gt
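# Illustrative NumPy version of _logistic_label above (values invented): with the
# block metric, positions within rPos of the center get 1, the ring between rPos
# and rNeg gets 0.5, and everything else gets 0. For example, hypothetical
# gt_config values rPos=16, rNeg=0 with stride 8 give rPos=2.0, rNeg=0.0, so the
# 0.5 ring is empty.
import numpy as np

def block_label(response_size, rPos, rNeg):
    ho, wo = response_size
    y = np.arange(ho, dtype=np.float32) - (ho - 1) / 2.0
    x = np.arange(wo, dtype=np.float32) - (wo - 1) / 2.0
    Y, X = np.meshgrid(y, x, indexing='ij')
    dist = np.abs(X) + np.abs(Y)
    return np.where(dist <= rPos, 1.0, np.where(dist < rNeg, 0.5, 0.0))

print(block_label([15, 15], rPos=2.0, rNeg=0.0))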
def track_vot_init(self, sess, first_bbox, frame): # Get initial target bounding box and convert to center based bbox = convert_bbox_format(first_bbox, 'center-based') # Feed in the first frame image to set initial state. bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width] input_feed = [frame, bbox_feed] frame2crop_scale = self.siamese_model.initialize(sess, input_feed) # Storing target state self.vot_original_target_height = bbox.height self.vot_original_target_width = bbox.width self.vot_search_center = np.array( [get_center(self.x_image_size), get_center(self.x_image_size)]) self.vot_current_target_state = TargetState( bbox=bbox, search_pos=self.vot_search_center, scale_idx=int(get_center(self.num_scales)))
def init(self, sess, frame, first_bbox, logdir='/tmp'): # Get initial target bounding box and convert to center based self.i = 0 first_bbox = Rectangle(first_bbox[0], first_bbox[1], first_bbox[2], first_bbox[3]) bbox = convert_bbox_format(first_bbox, 'center-based') # Feed in the first frame image to set initial state. bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width] input_feed = [ frame, bbox_feed, self.x_image_size_init, self.search_factors_init ] frame2crop_scale, self.image_z = self.siamese_model.initialize( sess, input_feed) imwrite(osp.join(logdir, 'aimagez.jpg'), cv2.cvtColor(self.image_z, cv2.COLOR_RGB2BGR)) # Storing target state self.original_target_height = bbox.height self.original_target_width = bbox.width self.search_center = np.array([ get_center(self.x_image_size_init), get_center(self.x_image_size_init) ]) self.current_target_state = TargetState( bbox=bbox, search_pos=self.search_center, scale_idx=int(get_center(self.num_scales))) self.store_thresh = 0.9 self.conf_thresh = 0.7 self.bound_thresh = 0.5 self.sup_thresh = 0.1 self.mem_count = 0 self.update_delay = 0 self.lost = 0 self.x_image_size = self.x_image_size_init self.image_c = None self.moved2border = False self.prev_score = self.conf_thresh + 0.01 return True
def initialize(self, sess, first_bbox, frame, logdir='/tmp'): """Runs tracking on a single image sequence.""" # Get initial target bounding box and convert to center based bbox = convert_bbox_format(first_bbox, 'center-based') # Feed in the first frame image to set initial state. bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width] input_feed = [frame, bbox_feed] self.frame2crop_scale = self.siamese_model.initialize(sess, input_feed) # Storing target state self.original_target_height = bbox.height self.original_target_width = bbox.width self.search_center = np.array([get_center(self.x_image_size), get_center(self.x_image_size)]) self.current_target_state = TargetState(bbox=bbox, search_pos=self.search_center, scale_idx=int(get_center(self.num_scales))) self.logdir = logdir self.frame_cnt = 0
def __init__(self, siamese_model, model_config, track_config): self.siamese_model = siamese_model self.model_config = model_config self.track_config = track_config self.num_scales = track_config['num_scales'] logging.info('track num scales -- {}'.format(self.num_scales)) scales = np.arange(self.num_scales) - get_center(self.num_scales) self.search_factors_init = [ self.track_config['scale_step']**x for x in scales ] scales_5 = np.arange(self.num_scales + 2) - get_center(self.num_scales + 2) self.search_factors_init5 = [ self.track_config['scale_step']**x for x in scales_5 ] self.x_image_size_init = track_config[ 'x_image_size'] # Search image size self.window = None # Cosine window self.log_level = track_config['log_level']
def build_examplar(self):
    model_config = self.model_config
    track_config = self.track_config

    # Exemplar image lies at the center of the search image in the first frame
    exemplar_images = get_exemplar_images(
        self.search_images,
        [model_config['z_image_size'], model_config['z_image_size']])
    [_, templates] = self.get_image_embedding(exemplar_images, reuse=tf.AUTO_REUSE)
    center_scale = int(get_center(track_config['num_scales']))
    self.examplar = tf.identity(templates[center_scale])
    self.examplar = tf.nn.relu(self.examplar)
def __init__(self, siamese_model, model_config, track_config):
    self.siamese_model = siamese_model
    self.model_config = model_config
    self.track_config = track_config

    self.num_scales = track_config['num_scales']
    logging.info('track num scales -- {}'.format(self.num_scales))
    scales = np.arange(self.num_scales) - get_center(self.num_scales)
    self.search_factors = [self.track_config['scale_step'] ** x for x in scales]

    self.x_image_size = track_config['x_image_size']  # Search image size
    self.window = None  # Cosine window
    self.log_level = track_config['log_level']
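# Scale-pyramid sketch (illustrative): with num_scales = 3 and
# scale_step = 1.0375 (the values hard-coded elsewhere in this code), the
# exponents are symmetric around zero, so the middle search image keeps the
# previous target scale unchanged.
import numpy as np

num_scales, scale_step = 3, 1.0375
scales = np.arange(num_scales) - (num_scales - 1) / 2.0   # [-1., 0., 1.]
search_factors = [scale_step ** s for s in scales]
print(search_factors)  # approximately [0.9639, 1.0, 1.0375]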
def construct_gt_score_maps(response_size, batch_size, stride, gt_config=None): """Construct a batch of groundtruth score maps Args: response_size: A list or tuple with two elements [ho, wo] batch_size: An integer e.g., 16 stride: Embedding stride e.g., 8 gt_config: Configurations for groundtruth generation Return: A float tensor of shape [batch_size] + response_size """ with tf.name_scope('construct_gt'): ho = response_size[0] wo = response_size[1] y = tf.cast(tf.range(0, ho), dtype=tf.float32) - get_center(ho) x = tf.cast(tf.range(0, wo), dtype=tf.float32) - get_center(wo) [Y, X] = tf.meshgrid(y, x) def _logistic_label(X, Y, rPos, rNeg): # dist_to_center = tf.sqrt(tf.square(X) + tf.square(Y)) # L2 metric dist_to_center = tf.abs(X) + tf.abs(Y) # Block metric Z = tf.where(dist_to_center <= rPos, tf.ones_like(X), tf.where(dist_to_center < rNeg, 0.5 * tf.ones_like(X), tf.zeros_like(X))) return Z rPos = gt_config['rPos'] / stride rNeg = gt_config['rNeg'] / stride gt = _logistic_label(X, Y, rPos, rNeg) # Duplicate a batch of maps gt_expand = tf.reshape(gt, [1] + response_size) gt = tf.tile(gt_expand, [batch_size, 1, 1]) return gt
def get_subwindow_avg(im, pos, model_sz, original_sz): # avg_chans = np.mean(im, axis=(0, 1)) # This version is 3x slower avg_chans = [ np.mean(im[:, :, 0]), np.mean(im[:, :, 1]), np.mean(im[:, :, 2]) ] if not original_sz: original_sz = model_sz sz = original_sz im_sz = im.shape # make sure the size is not too small assert im_sz[0] > 2 and im_sz[1] > 2 c = [get_center(s) for s in sz] # check out-of-bounds coordinates, and set them to avg_chans context_xmin = np.int(np.round(pos[1] - c[1])) context_xmax = np.int(context_xmin + sz[1] - 1) context_ymin = np.int(np.round(pos[0] - c[0])) context_ymax = np.int(context_ymin + sz[0] - 1) left_pad = np.int(np.maximum(0, -context_xmin)) top_pad = np.int(np.maximum(0, -context_ymin)) right_pad = np.int(np.maximum(0, context_xmax - im_sz[1] + 1)) bottom_pad = np.int(np.maximum(0, context_ymax - im_sz[0] + 1)) context_xmin = context_xmin + left_pad context_xmax = context_xmax + left_pad context_ymin = context_ymin + top_pad context_ymax = context_ymax + top_pad if top_pad > 0 or bottom_pad > 0 or left_pad > 0 or right_pad > 0: R = np.pad(im[:, :, 0], ((top_pad, bottom_pad), (left_pad, right_pad)), 'constant', constant_values=(avg_chans[0])) G = np.pad(im[:, :, 1], ((top_pad, bottom_pad), (left_pad, right_pad)), 'constant', constant_values=(avg_chans[1])) B = np.pad(im[:, :, 2], ((top_pad, bottom_pad), (left_pad, right_pad)), 'constant', constant_values=(avg_chans[2])) im = np.stack((R, G, B), axis=2) im_patch_original = im[context_ymin:context_ymax + 1, context_xmin:context_xmax + 1, :] if not (model_sz[0] == original_sz[0] and model_sz[1] == original_sz[1]): im_patch = resize(im_patch_original, tuple(model_sz)) else: im_patch = im_patch_original return im_patch, left_pad, top_pad, right_pad, bottom_pad
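# Usage sketch (illustrative): crop a 64x64 window centered near the image
# border with get_subwindow_avg; out-of-bounds pixels are filled with the
# per-channel mean. pos is (y, x). No resize is triggered here because
# model_sz == original_sz. Note the function above relies on np.int, so this
# assumes an older NumPy where that alias still exists.
import numpy as np

im = (np.random.rand(120, 160, 3) * 255).astype(np.float32)
patch, l, t, r, b = get_subwindow_avg(im, pos=[10.0, 10.0],
                                      model_sz=[64, 64], original_sz=[64, 64])
print(patch.shape, (l, t, r, b))  # (64, 64, 3) with nonzero left/top padding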
def __init__(self, siamese_model, model_config, track_config): self.siamese_model = siamese_model self.model_config = model_config self.track_config = track_config self.num_scales = track_config['num_scales'] logging.info('track num scales -- {}'.format(self.num_scales)) scales = np.arange(self.num_scales) - get_center(self.num_scales) self.search_factors = [ self.track_config['scale_step']**x for x in scales ] self.x_image_size = track_config['x_image_size'] # Search image size self.window = None # Cosine window self.log_level = track_config['log_level'] self.vot_original_target_height = None self.vot_original_target_width = None self.vot_current_target_state = None self.vot_search_center = None
def get_bg_images(images, exemplar_size, original_size):
    x_height, x_width = original_size
    z_height, z_width = exemplar_size
    exem_r = int(get_center(z_height))

    # Crop exemplar-sized patches from the four corners of the search images
    # to serve as background samples.
    topleft = get_exemplar_images(images, exemplar_size,
                                  targets_pos=np.array([[exem_r, exem_r]]))
    topright = get_exemplar_images(images, exemplar_size,
                                   targets_pos=np.array([[exem_r, x_width - exem_r - 1]]))
    bottomleft = get_exemplar_images(images, exemplar_size,
                                     targets_pos=np.array([[x_height - exem_r - 1, exem_r]]))
    bottomright = get_exemplar_images(images, exemplar_size,
                                      targets_pos=np.array([[x_height - exem_r - 1, x_width - exem_r - 1]]))
    return tf.concat([topleft, topright, bottomleft, bottomright], 0)
def build_template(self): model_config = self.model_config track_config = self.track_config # Exemplar image lies at the center of the search image in the first frame exemplar_images = get_exemplar_images(self.search_images, [model_config['z_image_size'], model_config['z_image_size']]) templates = self.get_image_embedding(exemplar_images) center_scale = int(get_center(track_config['num_scales'])) center_template = tf.identity(templates[center_scale]) templates = tf.stack([center_template for _ in range(track_config['num_scales'])]) with tf.variable_scope('target_template'): # Store template in Variable such that we don't have to feed this template every time. with tf.variable_scope('State'): state = tf.get_variable('exemplar', initializer=tf.zeros(templates.get_shape().as_list(), dtype=templates.dtype), trainable=False) with tf.control_dependencies([templates]): self.init = tf.assign(state, templates, validate_shape=True) self.templates = state
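# Minimal sketch of the "store the exemplar in a non-trainable Variable" pattern
# used in build_template, written outside the tracker (illustrative; the tensor
# shape and names are invented). After running the assign op once with the first
# frame, later inference steps read the stored value without re-feeding it.
import numpy as np
import tensorflow as tf

feat = tf.placeholder(tf.float32, [3, 6, 6, 256], name='template_feed')
state = tf.get_variable('demo_exemplar',
                        initializer=tf.zeros([3, 6, 6, 256], tf.float32),
                        trainable=False)
init_op = tf.assign(state, feat, validate_shape=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(init_op, feed_dict={feat: np.ones([3, 6, 6, 256], np.float32)})
    print(sess.run(state).mean())  # 1.0 -- the template persists across later runs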
def build_template(self): # Exemplar image lies at the center of the search image in the first frame exemplar_images = get_exemplar_images(self.search_images, [127, 127]) templates = self.get_image_embedding(exemplar_images, self.classid) center_scale = int(get_center(3)) center_template = tf.identity(templates[center_scale]) templates = tf.stack([center_template for _ in range(3)]) with tf.variable_scope('target_template'): # Store template in Variable such that we don't have to feed this template every time. with tf.variable_scope('State'): state = tf.get_variable('exemplar', initializer=tf.zeros( templates.get_shape().as_list(), dtype=templates.dtype), trainable=False) with tf.control_dependencies([templates]): self.init = tf.assign(state, templates, validate_shape=True) self.templates = state
def build_template(self): model_config = self.model_config track_config = self.track_config size_z = model_config['z_image_size'] ratio = self.target_size[0] / self.target_size[1] # Exemplar image lies at the center of the search image in the first frame center_scale = int(get_center(track_config['num_scales'])) search_images = tf.expand_dims(self.search_images[center_scale], 0) exemplar_images = get_exemplar_images( search_images, [size_z, size_z], np.array([[ get_center(track_config['x_image_size']), get_center(track_config['x_image_size']) ]])) def boundary_suppression(embeds, embeds2, ratio): offsets = tf.cond( tf.greater(ratio, 1.5), lambda: [0, 4, 0, 4], lambda: tf.cond(tf.less(ratio, 0.67), lambda: [4, 0, 4, 0], lambda: [2, 2, 2, 2])) embeds = tf.image.resize_image_with_crop_or_pad( embeds, t_shape[1] - offsets[0], t_shape[2] - offsets[1]) embeds = tf.image.resize_image_with_crop_or_pad( embeds, t_shape[1], t_shape[2]) embeds2 = tf.image.resize_image_with_crop_or_pad( embeds2, t_shape2[1] - offsets[2], t_shape2[2] - offsets[3]) embeds2 = tf.image.resize_image_with_crop_or_pad( embeds2, t_shape2[1], t_shape2[2]) return embeds, embeds2 def background_suppression(embeds, ratio): offsets = tf.cond( tf.greater(ratio, 1.5), # 1.2 / 0.83; 1.5 / 0.67 lambda: [1., 1.2 / ratio], lambda: tf.cond( tf.less(ratio, 0.67), lambda: [1.2 * ratio, 1.], lambda: tf .cond( tf.greater(ratio, 1.2), lambda: [1., 1.1 / ratio], lambda: tf.cond(tf.less(ratio, 0.83), lambda: [ 1.1 * ratio, 1. ], lambda: [0.7, 0.7])))) h = tf.cast(size_z * offsets[0], tf.int32) w = tf.cast(size_z * offsets[1], tf.int32) embeds_mean = tf.reduce_mean(embeds, axis=(0, 1), keepdims=True) embeds = embeds - embeds_mean embeds = tf.image.resize_image_with_crop_or_pad(embeds, h, w) embeds = tf.image.resize_image_with_crop_or_pad( embeds, size_z, size_z) return embeds + embeds_mean exemplar_images = tf.map_fn( lambda x: background_suppression(x[0], x[1]), (exemplar_images, tf.expand_dims(ratio, 0)), dtype=exemplar_images.dtype) self.exemplar_images = exemplar_images templates, templates2 = self.get_image_embedding(exemplar_images) t_shape = templates.get_shape().as_list() t_shape2 = templates2.get_shape().as_list() templates, templates2 = tf.map_fn( lambda x: boundary_suppression(x[0], x[1], x[2]), (templates, templates2, tf.expand_dims(ratio, 0)), dtype=(templates.dtype, templates2.dtype)) templates = templates templates2 = templates2 with tf.variable_scope('target_template'): # Store template in Variable such that we don't have to feed this template every time. 
with tf.variable_scope('State'): state = tf.get_variable('exemplar', initializer=tf.zeros( templates.get_shape().as_list(), dtype=templates.dtype), trainable=False) state2 = tf.get_variable('exemplar2', initializer=tf.zeros( templates2.get_shape().as_list(), dtype=templates2.dtype), trainable=False) with tf.control_dependencies([templates, templates2]): self.init = tf.assign(state, templates, validate_shape=True) self.init2 = tf.assign(state2, templates2, validate_shape=True) self.templates = state self.templates2 = state2 # Store Pseudo Templates def _euc_distance(x, z): z = tf.expand_dims(z, 0) return tf.reduce_sum(tf.abs(x - z), -1) num_k = 3 # 3 state_pseu = [] state_pseu2 = [] image_pseu = [] self.init_pseu = [] self.init2_pseu = [] self.init_pseu_img = [] for i in range(num_k): state_pseu.append( tf.get_variable('exemplar_pseu' + str(i), initializer=tf.zeros( templates.get_shape().as_list(), dtype=templates.dtype), trainable=False)) state_pseu2.append( tf.get_variable('exemplar2_pseu' + str(i), initializer=tf.zeros( templates2.get_shape().as_list(), dtype=templates2.dtype), trainable=False)) image_pseu.append( tf.get_variable( 'exemplar_pseu_image' + str(i), initializer=tf.zeros( exemplar_images.get_shape().as_list(), dtype=exemplar_images.dtype), trainable=False)) with tf.control_dependencies( [templates, templates2, exemplar_images]): self.init_pseu.append( tf.assign(state_pseu[i], templates, validate_shape=True)) self.init2_pseu.append( tf.assign(state_pseu2[i], templates2, validate_shape=True)) self.init_pseu_img.append( tf.assign(image_pseu[i], exemplar_images, validate_shape=True)) self.image_pseu = image_pseu self.pseu_temp = state_pseu self.pseu_temp2 = state_pseu2 state_pseus = tf.concat([self.templates] + state_pseu + [templates], 0) sp_shape = state_pseus.get_shape().as_list()[0] state_pseus_c = tf.reshape(state_pseus, [sp_shape, -1]) state_pseus_dis = tf.map_fn( lambda x: _euc_distance(state_pseus_c, x), state_pseus_c, dtype=state_pseus_c.dtype) state_pseus_dis = tf.reshape(state_pseus_dis, [sp_shape, sp_shape])[1:, :] state_pseus_dis = tf.reduce_sum(state_pseus_dis, -1) self.state_pseus_dis = state_pseus_dis _, state_pseus_idx = tf.nn.top_k(state_pseus_dis, k=len(state_pseu)) image_pseu_extra = tf.concat(image_pseu + [exemplar_images], 0) state_pseus2 = tf.concat(state_pseu2 + [templates2], 0) self.up_img = [] self.up_pseu = [] self.up2_pseu = [] for i in range(len(state_pseu)): with tf.control_dependencies([ state_pseus_idx, image_pseu_extra, state_pseus, state_pseus2 ]): self.up_pseu.append( tf.assign(state_pseu[i], tf.expand_dims( state_pseus[state_pseus_idx[i] + 1], 0), validate_shape=True)) self.up2_pseu.append( tf.assign(state_pseu2[i], tf.expand_dims( state_pseus2[state_pseus_idx[i]], 0), validate_shape=True)) self.up_img.append( tf.assign(image_pseu[i], tf.expand_dims( image_pseu_extra[state_pseus_idx[i]], 0), validate_shape=True))
def main(_): # load model model_config, _, track_config = load_cfgs(CHECKPOINT) track_config["log_level"] = 0 track_config["is_video"] = True g = tf.Graph() with g.as_default(): model = inference_wrapper.InferenceWrapper() restore_fn = model.build_graph_from_config(model_config, track_config, CHECKPOINT) g.finalize() if not os.path.isdir(track_config['log_dir']): tf.logging.info('Creating inference directory: %s', track_config['log_dir']) mkdir_p(track_config['log_dir']) gpu_options = tf.GPUOptions(allow_growth=True) sess_config = tf.ConfigProto(gpu_options=gpu_options) with tf.Session(graph=g, config=sess_config) as sess: restore_fn(sess) tracker = Tracker(model, model_config=model_config, track_config=track_config) video_name = os.path.basename(FLAGS.video_path) video_log_dir = os.path.join(track_config["log_dir"], video_name) mkdir_p(video_log_dir) if str(FLAGS.video_path) in ["0", "1"]: # read from camera video_path = int(FLAGS.video_path) with_camera = True else: # read from video video_path = glob(os.path.join(FLAGS.video_path, "*.mp4"))[0] with_camera = False video_capture = cv2.VideoCapture(video_path) bb = [-1, -1, -1, -1] cv2.namedWindow("template") cv2.setMouseCallback("template", draw_init_box, bb) trajectory = [] f_count = 0 f_rate = 0 start_time = time.time() while True: # capture frame by frame ret_, frame = video_capture.read() if ret_ == False: continue f_width, f_height = [ int(a) for a in FLAGS.video_resolution.split("*") ] try: o_frame = cv2.resize(frame, (f_width, f_height), interpolation=cv2.INTER_CUBIC) except: break i_frame = cv2.cvtColor(o_frame, cv2.COLOR_BGR2RGB) # cv2.imwrite("test.jpg",o_frame) # pdb.set_trace() if f_count == 0: # initialize the tracker # wait for drawing init box while True: init_frame = o_frame.copy() cv2.imshow("template", init_frame) k = cv2.waitKey(0) if k == 32: # space cx = int((bb[0] + bb[2]) / 2) cy = int((bb[1] + bb[3]) / 2) w = int(bb[2] - bb[0]) h = int(bb[3] - bb[1]) # Rectangle: [x,y,width,height] init_bb = Rectangle(cx - 1, cy - 1, w, h) # 0-index in python draw_box(init_frame, init_bb, "exemplar") break first_box = convert_bbox_format(init_bb, "center-based") bbox_feed = [ first_box.y, first_box.x, first_box.height, first_box.width ] input_feed = [i_frame, bbox_feed] frame2crop_scale = tracker.siamese_model.initialize( sess, input_feed) # Storing target state original_target_height = first_box.height original_target_width = first_box.width search_center = np.array([ get_center(tracker.x_image_size), get_center(tracker.x_image_size) ]) current_target_state = TargetState( bbox=first_box, search_pos=search_center, scale_idx=int(get_center(tracker.num_scales))) # setup initialized params current_param = { "original_target_width": original_target_width, "original_target_height": original_target_height, "search_center": search_center, "current_target_state": current_target_state } bbox, current_param = tracker.track_frame(sess, i_frame, current_param, video_log_dir) # add overlays end_time = time.time() f_rate = int(1 / (end_time - start_time)) start_time = time.time() draw_box(o_frame, bbox) cv2.putText(o_frame, str(f_rate) + "fps", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), thickness=2, lineType=2) trajectory.append(bbox) f_count += 1 cv2.imshow("Real-time Ouput", o_frame) cv2.imshow("template", init_frame) # if f_count > 30: # cv2.imwrite("test.jpg",o_frame) # pdb.set_trace() if cv2.waitKey(1) & 0xFF == ord("q"): cv2.imwrite("./assets/instance.jpg", o_frame) cv2.imwrite("./assets/exemplar.jpg", init_frame) break 
video_capture.release() cv2.destroyAllWindows() # save track results # pdb.set_trace() with open(os.path.join(video_log_dir, "track_rect.txt"), "w") as f: for region in trajectory: rect_str = "{},{},{},{}\n".format(region.x + 1, region.y + 1, region.width, region.height) f.write(rect_str)
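# Illustrative helper (not in the original script): read the saved
# "track_rect.txt" back into 0-indexed boxes. The file stores 1-indexed x,y as
# written above, so 1 is subtracted; Rectangle is the same namedtuple the
# script uses.
def load_track_rects(path):
    rects = []
    with open(path) as f:
        for line in f:
            x, y, w, h = [float(v) for v in line.strip().split(',')]
            rects.append(Rectangle(x - 1, y - 1, w, h))
    return rects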
def build_search_images(self): """Crop search images from the input image based on the last target position 1. The input image is scaled such that the area of target&context takes up to (scale_factor * z_image_size) ^ 2 2. Crop an image patch as large as x_image_size centered at the target center. 3. If the cropped image region is beyond the boundary of the input image, mean values are padded. """ model_config = self.model_config track_config = self.track_config size_z = model_config['z_image_size'] size_x = track_config['x_image_size'] context_amount = 0.5 num_scales = track_config['num_scales'] scales = np.arange(num_scales) - get_center(num_scales) assert np.sum(scales) == 0, 'scales should be symmetric' search_factors = [track_config['scale_step']**x for x in scales] frame_sz = tf.shape(self.image) target_yx = self.target_bbox_feed[0:2] target_size = self.target_bbox_feed[2:4] avg_chan = tf.reduce_mean(self.image, axis=(0, 1), name='avg_chan') # Compute base values base_z_size = target_size base_z_context_size = base_z_size + context_amount * tf.reduce_sum( base_z_size) base_s_z = tf.sqrt( tf.reduce_prod(base_z_context_size)) # Canonical size base_scale_z = tf.div(tf.to_float(size_z), base_s_z) d_search = (size_x - size_z) / 2.0 base_pad = tf.div(d_search, base_scale_z) base_s_x = base_s_z + 2 * base_pad base_scale_x = tf.div(tf.to_float(size_x), base_s_x) # Note we use different padding values for each image # while the original implementation uses only the average value # of the first image for all images. image_minus_avg = self.image - avg_chan # for original implementation, fail on TX2 # # image_minus_avg = tf.expand_dims(image_minus_avg, 0) # boxes = [] # for factor in search_factors: # s_x = factor * base_s_x # frame_sz_1 = tf.to_float(frame_sz[0:2] - 1) # topleft = tf.div(target_yx - get_center(s_x), frame_sz_1) # bottomright = tf.div(target_yx + get_center(s_x), frame_sz_1) # box = tf.concat([topleft, bottomright], axis=0) # boxes.append(box) # boxes = tf.stack(boxes) # image_cropped = tf.image.crop_and_resize(image_minus_avg, boxes, # box_ind=tf.zeros((track_config['num_scales']), tf.int32), # crop_size=[size_x, size_x]) def pad_frame(im, frame_sz, topleft, bottomright): xleft_pad = tf.maximum(0, -tf.cast(tf.round(topleft[1]), tf.int32)) ytop_pad = tf.maximum(0, -tf.cast(tf.round(topleft[0]), tf.int32)) xright_pad = tf.maximum( 0, tf.cast(tf.round(bottomright[1]), tf.int32) - frame_sz[1]) ybottom_pad = tf.maximum( 0, tf.cast(tf.round(bottomright[0]), tf.int32) - frame_sz[0]) npad = tf.reduce_max( [xleft_pad, ytop_pad, xright_pad, ybottom_pad]) paddings = [[npad, npad], [npad, npad], [0, 0]] im_padded = im im_padded = tf.pad(im_padded, paddings, mode='CONSTANT', constant_values=0) return im_padded, npad def extract_crops(im, npad, topleft, bottomright): # get top-right corner of bbox and consider padding tr_x = npad + tf.cast(tf.round(topleft[1]), tf.int32) # Compute size from rounded co-ords to ensure rectangle lies inside padding. 
tr_y = npad + tf.cast(tf.round(topleft[0]), tf.int32) width = tf.round(bottomright[1]) - tf.round(topleft[1]) height = tf.round(bottomright[0]) - tf.round(topleft[0]) crop = tf.image.crop_to_bounding_box(im, tf.cast(tr_y, tf.int32), tf.cast(tr_x, tf.int32), tf.cast(height, tf.int32), tf.cast(width, tf.int32)) # crop = tf.image.resize_images(crop, [sz_dst, sz_dst], method=tf.image.ResizeMethod.BILINEAR) # crops = tf.expand_dims(crop, axis=0) return crop image_cropped = [] for factor in search_factors: s_x = factor * base_s_x frame_sz = tf.to_int32(frame_sz[0:2]) topleft = target_yx - get_center(s_x) bottomright = target_yx + get_center(s_x) image_crop, npad = pad_frame(image_minus_avg, frame_sz, topleft, bottomright) image_crop = extract_crops(image_crop, npad, topleft, bottomright) image_crop = tf.image.resize_images( image_crop, [size_x, size_x], method=tf.image.ResizeMethod.BILINEAR) image_cropped.append(image_crop) image_cropped = tf.stack(image_cropped) scale_xs = [] for factor in search_factors: scale_x = base_scale_x / factor scale_xs.append(scale_x) self.scale_xs = tf.stack(scale_xs, name='out_scale_xs') self.debug = image_cropped self.search_images = tf.add(image_cropped, avg_chan, name="out_search_images")
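# NumPy sketch of the pad-then-crop strategy used by pad_frame/extract_crops
# above (illustrative values): pad the frame by the largest overshoot npad on
# every side, shift the crop window by npad, and the crop is guaranteed to stay
# inside the padded frame.
import numpy as np

frame = np.zeros((100, 150, 3), np.float32)
topleft = np.array([-20.0, -10.0])        # (y, x) of the crop corner, may be negative
bottomright = np.array([90.0, 160.0])

overshoot = [max(0, -int(round(topleft[0]))), max(0, -int(round(topleft[1]))),
             max(0, int(round(bottomright[0])) - frame.shape[0]),
             max(0, int(round(bottomright[1])) - frame.shape[1])]
npad = max(overshoot)
padded = np.pad(frame, [[npad, npad], [npad, npad], [0, 0]], mode='constant')
y0 = npad + int(round(topleft[0]))
x0 = npad + int(round(topleft[1]))
h = int(round(bottomright[0]) - round(topleft[0]))
w = int(round(bottomright[1]) - round(topleft[1]))
crop = padded[y0:y0 + h, x0:x0 + w]
print(npad, crop.shape)   # 20 (110, 170, 3)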
def build_template(self): model_config = self.model_config track_config = self.track_config # Exemplar image lies at the center of the search image in the first frame exemplar_images = get_exemplar_images( self.search_images, [model_config['z_image_size'], model_config['z_image_size']]) templates_s_c5, templates_s_c4, templates_s_c3, templates_a_c5, templates_a_c4, templates_a_c3 = self.get_image_embedding( exemplar_images) # ============================================================================= # templates_s_c5, templates_s_c4, templates_s_c3 = self.get_image_embedding(exemplar_images) # ============================================================================= # ============================================================================= # templates_a_c5, templates_a_c4, templates_a_c3 = self.get_image_embedding(exemplar_images) # ============================================================================= center_scale = int(get_center(track_config['num_scales'])) center_template_s_c5 = tf.identity(templates_s_c5[center_scale]) center_template_s_c4 = tf.identity(templates_s_c4[center_scale]) center_template_s_c3 = tf.identity(templates_s_c3[center_scale]) templates_s_c5 = tf.stack( [center_template_s_c5 for _ in range(track_config['num_scales'])]) templates_s_c4 = tf.stack( [center_template_s_c4 for _ in range(track_config['num_scales'])]) templates_s_c3 = tf.stack( [center_template_s_c3 for _ in range(track_config['num_scales'])]) center_template_a_c5 = tf.identity(templates_a_c5[center_scale]) center_template_a_c4 = tf.identity(templates_a_c4[center_scale]) center_template_a_c3 = tf.identity(templates_a_c3[center_scale]) templates_a_c5 = tf.stack( [center_template_a_c5 for _ in range(track_config['num_scales'])]) templates_a_c4 = tf.stack( [center_template_a_c4 for _ in range(track_config['num_scales'])]) templates_a_c3 = tf.stack( [center_template_a_c3 for _ in range(track_config['num_scales'])]) with tf.variable_scope('target_template'): # Store template in Variable such that we don't have to feed this template every time. 
with tf.variable_scope('State'): state_s_c5 = tf.get_variable( 'exemplar_s_c5', initializer=tf.zeros(templates_s_c5.get_shape().as_list(), dtype=templates_s_c5.dtype), trainable=False) state_s_c4 = tf.get_variable( 'exemplar_s_c4', initializer=tf.zeros(templates_s_c4.get_shape().as_list(), dtype=templates_s_c4.dtype), trainable=False) state_s_c3 = tf.get_variable( 'exemplar_s_c3', initializer=tf.zeros(templates_s_c3.get_shape().as_list(), dtype=templates_s_c3.dtype), trainable=False) state_a_c5 = tf.get_variable( 'exemplar_a_c5', initializer=tf.zeros(templates_a_c5.get_shape().as_list(), dtype=templates_a_c5.dtype), trainable=False) state_a_c4 = tf.get_variable( 'exemplar_a_c4', initializer=tf.zeros(templates_a_c4.get_shape().as_list(), dtype=templates_a_c4.dtype), trainable=False) state_a_c3 = tf.get_variable( 'exemplar_a_c3', initializer=tf.zeros(templates_a_c3.get_shape().as_list(), dtype=templates_a_c3.dtype), trainable=False) with tf.control_dependencies([templates_s_c5]): self.init_s_c5 = tf.assign(state_s_c5, templates_s_c5, validate_shape=True) with tf.control_dependencies([templates_s_c4]): self.init_s_c4 = tf.assign(state_s_c4, templates_s_c4, validate_shape=True) with tf.control_dependencies([templates_s_c3]): self.init_s_c3 = tf.assign(state_s_c3, templates_s_c3, validate_shape=True) with tf.control_dependencies([templates_a_c5]): self.init_a_c5 = tf.assign(state_a_c5, templates_a_c5, validate_shape=True) with tf.control_dependencies([templates_a_c4]): self.init_a_c4 = tf.assign(state_a_c4, templates_a_c4, validate_shape=True) with tf.control_dependencies([templates_a_c3]): self.init_a_c3 = tf.assign(state_a_c3, templates_a_c3, validate_shape=True) self.templates_s_c5 = state_s_c5 self.templates_s_c4 = state_s_c4 self.templates_s_c3 = state_s_c3 self.templates_a_c5 = state_a_c5 self.templates_a_c4 = state_a_c4 self.templates_a_c3 = state_a_c3
def track(self, sess, first_bbox, frames, logdir='/tmp'): """Runs tracking on a single image sequence.""" # Get initial target bounding box and convert to center based bbox = convert_bbox_format(first_bbox, 'center-based') # Feed in the first frame image to set initial state. bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width] input_feed = [frames[0], bbox_feed] frame2crop_scale = self.siamese_model.initialize(sess, input_feed) # Storing target state original_target_height = bbox.height original_target_width = bbox.width search_center = np.array([get_center(self.x_image_size), get_center(self.x_image_size)]) current_target_state = TargetState(bbox=bbox, search_pos=search_center, scale_idx=int(get_center(self.num_scales))) include_first = get(self.track_config, 'include_first', False) logging.info('Tracking include first -- {}'.format(include_first)) # Run tracking loop reported_bboxs = [] for i, filename in enumerate(frames): if i > 0 or include_first: # We don't really want to process the first image unless intended to do so. bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x, current_target_state.bbox.height, current_target_state.bbox.width] input_feed = [filename, bbox_feed] outputs, metadata = self.siamese_model.inference_step(sess, input_feed) search_scale_list = outputs['scale_xs'] response = outputs['response'] response_size = response.shape[1] # Choose the scale whole response map has the highest peak if self.num_scales > 1: response_max = np.max(response, axis=(1, 2)) penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales)) current_scale_idx = int(get_center(self.num_scales)) penalties[current_scale_idx] = 1.0 response_penalized = response_max * penalties best_scale = np.argmax(response_penalized) else: best_scale = 0 response = response[best_scale] with np.errstate(all='raise'): # Raise error if something goes wrong response = response - np.min(response) response = response / np.sum(response) if self.window is None: window = np.dot(np.expand_dims(np.hanning(response_size), 1), np.expand_dims(np.hanning(response_size), 0)) self.window = window / np.sum(window) # normalize window window_influence = self.track_config['window_influence'] response = (1 - window_influence) * response + window_influence * self.window # Find maximum response r_max, c_max = np.unravel_index(response.argmax(), response.shape) # Convert from crop-relative coordinates to frame coordinates p_coor = np.array([r_max, c_max]) # displacement from the center in instance final representation ... disp_instance_final = p_coor - get_center(response_size) # ... in instance feature space ... upsample_factor = self.track_config['upsample_factor'] disp_instance_feat = disp_instance_final / upsample_factor # ... Avoid empty position ... r_radius = int(response_size / upsample_factor / 2) disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius) # ... in instance input ... disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride'] # ... 
in instance original crop (in frame coordinates) disp_instance_frame = disp_instance_input / search_scale_list[best_scale] # Position within frame in frame coordinates y = current_target_state.bbox.y x = current_target_state.bbox.x y += disp_instance_frame[0] x += disp_instance_frame[1] # Target scale damping and saturation target_scale = current_target_state.bbox.height / original_target_height search_factor = self.search_factors[best_scale] scale_damp = self.track_config['scale_damp'] # damping factor for scale update target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor) target_scale = np.maximum(0.2, np.minimum(5.0, target_scale)) # Some book keeping height = original_target_height * target_scale width = original_target_width * target_scale current_target_state.bbox = Rectangle(x, y, width, height) current_target_state.scale_idx = best_scale current_target_state.search_pos = search_center + disp_instance_input assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \ 'target position in feature space should be no larger than input image size' assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \ 'target position in feature space should be no larger than input image size' if self.log_level > 0: np.save(osp.join(logdir, 'num_frames.npy'), [i + 1]) # Select the image with the highest score scale and convert it to uint8 image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8) # Note that imwrite in cv2 assumes the image is in BGR format. # However, the cropped image returned by TensorFlow is RGB. # Therefore, we convert color format using cv2.cvtColor imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)), cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR)) np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale]) np.save(osp.join(logdir, 'response{}.npy'.format(i)), response) y_search, x_search = current_target_state.search_pos search_scale = search_scale_list[best_scale] target_height_search = height * search_scale target_width_search = width * search_scale bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search) bbox_search = convert_bbox_format(bbox_search, 'top-left-based') np.save(osp.join(logdir, 'bbox{}.npy'.format(i)), [bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height]) reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based') reported_bboxs.append(reported_bbox) return reported_bboxs
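# NumPy sketch of the scale selection and cosine-window step in the tracking
# loop above (illustrative; the response values, scale_penalty and
# window_influence below are made up rather than taken from track_config).
import numpy as np

response = np.random.rand(3, 17, 17).astype(np.float32)   # one map per scale
scale_penalty, window_influence = 0.97, 0.25
num_scales = response.shape[0]

penalties = scale_penalty * np.ones(num_scales)
penalties[(num_scales - 1) // 2] = 1.0            # do not penalize the current scale
best_scale = np.argmax(np.max(response, axis=(1, 2)) * penalties)

r = response[best_scale]
r = r - r.min()
r = r / r.sum()
window = np.outer(np.hanning(17), np.hanning(17))
window /= window.sum()
r = (1 - window_influence) * r + window_influence * window
r_max, c_max = np.unravel_index(r.argmax(), r.shape)
print(best_scale, (r_max, c_max))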
def build_template(self): model_config = self.model_config track_config = self.track_config size_z = model_config['z_image_size'] ratio = self.target_bbox_feed[2] / self.target_bbox_feed[3] # Exemplar image lies at the center of the search image in the first frame search_images = self.search_images shape = search_images.get_shape().as_list() exemplar_images = get_exemplar_images( search_images, [size_z, size_z], tf.tile([[ get_center(track_config['x_image_size']), get_center(track_config['x_image_size']) ]], [shape[0], 1])) center_scale = int(get_center(track_config['num_scales'])) exemplar_images = tf.expand_dims(exemplar_images[center_scale], 0) def background_suppression(embeds, embeds2, ratio): offsets = tf.cond( tf.greater(ratio, 1.5), lambda: [0, 2, 0], lambda: tf.cond(tf.less(ratio, 0.66), lambda: [2, 0, 0], lambda: [1, 1, 0])) embeds = tf.image.resize_image_with_crop_or_pad( embeds, t_shape[1] - offsets[0], t_shape[2] - offsets[1]) embeds = tf.image.resize_image_with_crop_or_pad( embeds, t_shape[1], t_shape[2]) embeds2 = tf.image.resize_image_with_crop_or_pad( embeds2, t_shape2[1] - offsets[0] * 2, t_shape2[2] - offsets[1] * 2) embeds2 = tf.image.resize_image_with_crop_or_pad( embeds2, t_shape2[1], t_shape2[2]) return embeds, embeds2 self.exemplar_images = exemplar_images templates, templates2 = self.get_image_embedding(exemplar_images) t_shape = templates.get_shape().as_list() t_shape2 = templates2.get_shape().as_list() templates, templates2 = tf.map_fn( lambda x: background_suppression(x[0], x[1], x[2]), (templates, templates2, tf.expand_dims(ratio, 0)), dtype=(templates.dtype, templates2.dtype)) with tf.variable_scope('target_template'): # Store template in Variable such that we don't have to feed this template every time. with tf.variable_scope('State'): state = tf.get_variable('exemplar', initializer=tf.zeros( templates.get_shape().as_list(), dtype=templates.dtype), trainable=False) state2 = tf.get_variable('exemplar2', initializer=tf.zeros( templates2.get_shape().as_list(), dtype=templates2.dtype), trainable=False) with tf.control_dependencies([templates, templates2]): self.init = tf.assign(state, templates, validate_shape=True) self.init2 = tf.assign(state2, templates2, validate_shape=True) self.templates = state self.templates2 = state2 # Store Pseudo Templates def _euc_distance(x, z): z = tf.expand_dims(z, 0) return tf.reduce_sum(tf.abs(x - z), -1) n_mem = 5 temp1 = tf.concat([templates for _ in range(n_mem)], 0) temp2 = tf.concat([templates2 for _ in range(n_mem)], 0) temp3 = tf.concat([exemplar_images for _ in range(n_mem)], 0) state_mem1 = tf.get_variable('exemplar_mem', initializer=tf.zeros( temp1.get_shape().as_list(), dtype=temp1.dtype), trainable=False) state_mem2 = tf.get_variable('exemplar2_mem', initializer=tf.zeros( temp2.get_shape().as_list(), dtype=temp2.dtype), trainable=False) image_mem = tf.get_variable('exemplar_image_mem', initializer=tf.zeros( temp3.get_shape().as_list(), dtype=temp3.dtype), trainable=False) with tf.control_dependencies([temp1, temp2, temp3]): self.init_mem = tf.assign(state_mem1, temp1, validate_shape=True) self.init_mem2 = tf.assign(state_mem2, temp2, validate_shape=True) self.init_img_mem = tf.assign(image_mem, temp3, validate_shape=True) up_mem = tf.scatter_update(state_mem1, self.mem_id_feed, templates[0]) up_mem2 = tf.scatter_update(state_mem2, self.mem_id_feed, templates2[0]) up_img_mem = tf.scatter_update(image_mem, self.mem_id_feed, exemplar_images[0]) with tf.control_dependencies([up_mem, up_mem2, up_img_mem]): self.up_mem = 
up_mem self.up_mem2 = up_mem2 self.up_img_mem = up_img_mem state_pseu = [] state_pseu2 = [] image_pseu = [] self.init_pseu = [] self.init2_pseu = [] self.init_pseu_img = [] for i in range(3): state_pseu.append( tf.get_variable('exemplar_pseu' + str(i), initializer=tf.zeros( templates.get_shape().as_list(), dtype=templates.dtype), trainable=False)) state_pseu2.append( tf.get_variable('exemplar2_pseu' + str(i), initializer=tf.zeros( templates2.get_shape().as_list(), dtype=templates2.dtype), trainable=False)) image_pseu.append( tf.get_variable( 'exemplar_pseu_image' + str(i), initializer=tf.zeros( exemplar_images.get_shape().as_list(), dtype=exemplar_images.dtype), trainable=False)) with tf.control_dependencies( [templates, templates2, exemplar_images]): self.init_pseu.append( tf.assign(state_pseu[i], templates, validate_shape=True)) self.init2_pseu.append( tf.assign(state_pseu2[i], templates2, validate_shape=True)) self.init_pseu_img.append( tf.assign(image_pseu[i], exemplar_images, validate_shape=True)) self.image_pseu = image_pseu self.pseu_temp = state_pseu self.pseu_temp2 = state_pseu2 state_pseus = tf.concat([self.templates] + state_pseu + [state_mem1], 0) sp_shape = state_pseus.get_shape().as_list()[0] state_pseus_c = tf.reshape(state_pseus, [sp_shape, -1]) state_pseus_dis = tf.map_fn( lambda x: _euc_distance(state_pseus_c, x), state_pseus_c, dtype=state_pseus_c.dtype) state_pseus_dis = tf.reshape(state_pseus_dis, [sp_shape, sp_shape])[1:, :] state_pseus_dis = tf.reduce_sum(state_pseus_dis, -1) self.state_pseus_dis = state_pseus_dis _, state_pseus_idx = tf.nn.top_k(state_pseus_dis, k=len(state_pseu)) image_pseu_extra = tf.concat(image_pseu + [image_mem], 0) state_pseus2 = tf.concat(state_pseu2 + [state_mem2], 0) self.up_img = [] self.up_pseu = [] self.up2_pseu = [] for i in range(len(state_pseu)): with tf.control_dependencies([ state_pseus_idx, image_pseu_extra, state_pseus, state_pseus2 ]): self.up_pseu.append( tf.assign(state_pseu[i], tf.expand_dims( state_pseus[state_pseus_idx[i] + 1], 0), validate_shape=True)) self.up2_pseu.append( tf.assign(state_pseu2[i], tf.expand_dims( state_pseus2[state_pseus_idx[i]], 0), validate_shape=True)) self.up_img.append( tf.assign(image_pseu[i], tf.expand_dims( image_pseu_extra[state_pseus_idx[i]], 0), validate_shape=True))
def track(self, sess, first_bbox, frames, logdir='/tmp'): """Runs tracking on a single image sequence.""" # Get initial target bounding box and convert to center based bbox = convert_bbox_format(first_bbox, 'center-based') smooth_rate = self.track_config['smooth'] update_interval = self.track_config['update_interval'] feature_balance = self.track_config['feature_balance'] # Feed in the first frame image to set initial state. bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width] input_feed = [frames[0], bbox_feed] frame2crop_scale = self.siamese_model.initialize(sess, input_feed) examplar = self.siamese_model.get_examplar(sess, input_feed) examplar_smooth = examplar st_template = [] for i in range(self.siamese_model.train_config['time_range']): st_template.append(examplar) st_template_np = np.array(st_template) self.siamese_model.update_st_template_step(sess, st_template_np) # Storing target state original_target_height = bbox.height original_target_width = bbox.width search_center = np.array( [get_center(self.x_image_size), get_center(self.x_image_size)]) current_target_state = TargetState(bbox=bbox, search_pos=search_center, scale_idx=int( get_center(self.num_scales))) include_first = get(self.track_config, 'include_first', False) logging.info('Tracking include first -- {}'.format(include_first)) # Set padding for refining search region img = mpimg.imread(frames[0]) context_amount = self.track_config['context_amount'] size_z = self.model_config['z_image_size'] size_x = self.track_config['x_image_size'] padding_h = 10 padding_w = 10 if original_target_height / original_target_width > 2: #2 padding_h = 1.4 #1.4 padding_w = 6 # Run tracking loop reported_bboxs = [] for i, filename in enumerate(frames): if i > 0 or include_first: # We don't really want to process the first image unless intended to do so. 
bbox_feed = [ current_target_state.bbox.y, current_target_state.bbox.x, current_target_state.bbox.height, current_target_state.bbox.width ] input_feed = [filename, bbox_feed] outputs, metadata = self.siamese_model.inference_step( sess, input_feed) search_scale_list = outputs['scale_xs'] response = outputs['response'] response2 = outputs['response2'] response_size = response.shape[1] # Choose the scale whole response map has the highest peak if self.num_scales > 1: response_max = np.max(response2, axis=(1, 2)) penalties = self.track_config['scale_penalty'] * np.ones( (self.num_scales)) current_scale_idx = int(get_center(self.num_scales)) penalties[current_scale_idx] = 1.0 response_penalized = response_max * penalties best_scale = np.argmax(response_penalized) else: best_scale = 0 response = response[best_scale] response2 = response2[best_scale] response = feature_balance * response + ( 1 - feature_balance) * response2 with np.errstate( all='raise'): # Raise error if something goes wrong response = response - np.min(response) response = response / np.sum(response) if self.window is None: window = np.dot( np.expand_dims(np.hanning(response_size), 1), np.expand_dims(np.hanning(response_size), 0)) self.window = window / np.sum(window) # normalize window window_influence = self.track_config['window_influence'] response = (1 - window_influence ) * response + window_influence * self.window # Refine the response base_z_size = np.array([ current_target_state.bbox.height, current_target_state.bbox.width ]) base_z_context_size = base_z_size + context_amount * np.sum( base_z_size) base_s_z = np.sqrt( np.prod(base_z_context_size)) # Canonical size base_scale_z = size_z / base_s_z d_search = (size_x - size_z) / 2.0 base_pad = d_search / base_scale_z base_s_x = base_s_z + 2 * base_pad if base_s_x / current_target_state.bbox.height > padding_h: start_h = np.ceil( response_size * (base_s_x - current_target_state.bbox.height * padding_h) / (2 * base_s_x)) end_h = np.floor(response_size - start_h) start_h = np.int(start_h) end_h = np.int(end_h) response[0:start_h, :] = 0 response[end_h:-1, :] = 0 if base_s_x / current_target_state.bbox.width > padding_w: start_w = np.ceil( response_size * (base_s_x - current_target_state.bbox.width * padding_w) / (2 * base_s_x)) end_w = np.floor(response_size - start_w) start_w = np.int(start_w) end_w = np.int(end_w) response[:, :start_w] = 0 response[:, end_w:] = 0 # Find maximum response r_max, c_max = np.unravel_index(response.argmax(), response.shape) # Convert from crop-relative coordinates to frame coordinates p_coor = np.array([r_max, c_max]) # displacement from the center in instance final representation ... disp_instance_final = p_coor - get_center(response_size) # ... in instance feature space ... upsample_factor = self.track_config['upsample_factor'] disp_instance_feat = disp_instance_final / upsample_factor # ... Avoid empty position ... r_radius = int(response_size / upsample_factor / 2) disp_instance_feat = np.maximum( np.minimum(disp_instance_feat, r_radius), -r_radius) # ... in instance input ... disp_instance_input = disp_instance_feat * self.model_config[ 'embed_config']['stride'] # ... 
in instance original crop (in frame coordinates) disp_instance_frame = disp_instance_input / search_scale_list[ best_scale] # Position within frame in frame coordinates y = current_target_state.bbox.y x = current_target_state.bbox.x y += disp_instance_frame[0] x += disp_instance_frame[1] # Target scale damping and saturation target_scale = current_target_state.bbox.height / original_target_height search_factor = self.search_factors[best_scale] scale_damp = self.track_config[ 'scale_damp'] # damping factor for scale update target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor) target_scale = np.maximum(0.2, np.minimum(5.0, target_scale)) # Some book keeping height = original_target_height * target_scale width = original_target_width * target_scale current_target_state.bbox = Rectangle(x, y, width, height) current_target_state.scale_idx = best_scale current_target_state.search_pos = search_center + disp_instance_input # Update the spatial-temporal template using gcn if i % update_interval == 0: bbox_feed = [ current_target_state.bbox.y, current_target_state.bbox.x, current_target_state.bbox.height, current_target_state.bbox.width ] input_feed = [filename, bbox_feed] current_examplar = self.siamese_model.get_examplar( sess, input_feed) # examplar_smooth[2:4,2:4,:] = current_examplar[2:4,2:4,:] examplar_smooth = current_examplar current_examplar = smooth_rate * examplar_smooth + ( 1 - smooth_rate) * examplar st_template.pop(1) st_template.append(current_examplar) st_template_np = np.array(st_template) self.siamese_model.update_st_template_step( sess, st_template_np) assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \ 'target position in feature space should be no larger than input image size' assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \ 'target position in feature space should be no larger than input image size' if self.log_level > 0: np.save(osp.join(logdir, 'num_frames.npy'), [i + 1]) # Select the image with the highest score scale and convert it to uint8 image_cropped = outputs['image_cropped'][ best_scale].astype(np.uint8) # Note that imwrite in cv2 assumes the image is in BGR format. # However, the cropped image returned by TensorFlow is RGB. # Therefore, we convert color format using cv2.cvtColor imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)), cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR)) np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale]) np.save(osp.join(logdir, 'response{}.npy'.format(i)), response) y_search, x_search = current_target_state.search_pos search_scale = search_scale_list[best_scale] target_height_search = height * search_scale target_width_search = width * search_scale bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search) bbox_search = convert_bbox_format(bbox_search, 'top-left-based') np.save(osp.join(logdir, 'bbox{}.npy'.format(i)), [ bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height ]) reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based') reported_bboxs.append(reported_bbox) return reported_bboxs
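The template update above keeps a fixed-length list of exemplar embeddings: the first-frame exemplar stays at index 0, the oldest refreshed entry (index 1) is dropped, and the freshly extracted exemplar is blended with the first-frame one before being appended. A minimal NumPy sketch of that bookkeeping; the buffer length, feature shape, and helper name are illustrative assumptions, not values from the config:

import numpy as np

def update_template_buffer(st_template, new_exemplar, smooth_rate):
    """Blend the newest exemplar with the first-frame one and refresh the buffer.

    st_template  : list of arrays, each [h, w, c]; index 0 holds the first-frame exemplar
    new_exemplar : array [h, w, c] extracted at the current target estimate
    smooth_rate  : float in [0, 1]; weight given to the current exemplar
    """
    first_exemplar = st_template[0]
    blended = smooth_rate * new_exemplar + (1 - smooth_rate) * first_exemplar
    # keep index 0, drop index 1, append the blended exemplar (mirrors pop(1) / append above)
    st_template = st_template[:1] + st_template[2:] + [blended]
    return st_template, np.array(st_template)   # list and stacked [time_range, h, w, c]

# usage sketch with assumed sizes
time_range, smooth_rate = 3, 0.5
init = np.random.rand(6, 6, 256).astype(np.float32)
buffer = [init.copy() for _ in range(time_range)]
buffer, stacked = update_template_buffer(
    buffer, np.random.rand(6, 6, 256).astype(np.float32), smooth_rate)
assert stacked.shape == (time_range, 6, 6, 256)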
def track(self, sess, first_bbox, frames, logdir='/tmp'): """Runs tracking on a single image sequence.""" # Get initial target bounding box and convert to center based bbox = convert_bbox_format(first_bbox, 'center-based') # Feed in the first frame image to set initial state. bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width] input_feed = [frames[0], bbox_feed] frame2crop_scale = self.siamese_model.initialize(sess, input_feed) # Storing target state original_target_height = bbox.height original_target_width = bbox.width search_center = np.array( [get_center(self.x_image_size), get_center(self.x_image_size)]) current_target_state = TargetState(bbox=bbox, search_pos=search_center, scale_idx=int( get_center(self.num_scales))) include_first = get(self.track_config, 'include_first', False) logging.info('Tracking include first -- {}'.format(include_first)) # Run tracking loop reported_bboxs = [] for i, filename in enumerate(frames): if i > 0 or include_first: # We don't really want to process the first image unless intended to do so. bbox_feed = [ current_target_state.bbox.y, current_target_state.bbox.x, current_target_state.bbox.height, current_target_state.bbox.width ] input_feed = [filename, bbox_feed] outputs, metadata = self.siamese_model.inference_step( sess, input_feed) search_scale_list = outputs['scale_xs'] response = outputs['response'] response_size = response.shape[1] # Choose the scale whole response map has the highest peak if self.num_scales > 1: response_max = np.max(response, axis=(1, 2)) penalties = self.track_config['scale_penalty'] * np.ones( (self.num_scales)) current_scale_idx = int(get_center(self.num_scales)) penalties[current_scale_idx] = 1.0 response_penalized = response_max * penalties best_scale = np.argmax(response_penalized) if np.max(response_max) < 0: logging.warning('MAX_RESPONSE LESS THAN ZERO!') # best_scale = current_scale_idx else: best_scale = 0 response = response[best_scale] with np.errstate( all='raise'): # Raise error if something goes wrong response = response - np.min(response) response = response / np.sum(response) if self.window is None: window = np.dot( np.expand_dims(np.hanning(response_size), 1), np.expand_dims(np.hanning(response_size), 0)) self.window = window / np.sum(window) # normalize window window_influence = self.track_config['window_influence'] response = (1 - window_influence ) * response + window_influence * self.window # Find maximum response r_max, c_max = np.unravel_index(response.argmax(), response.shape) # Convert from crop-relative coordinates to frame coordinates p_coor = np.array([r_max, c_max]) # displacement from the center in instance final representation ... disp_instance_final = p_coor - get_center(response_size) # ... in instance feature space ... upsample_factor = self.track_config['upsample_factor'] disp_instance_feat = disp_instance_final / upsample_factor # ... Avoid empty position ... r_radius = int(response_size / upsample_factor / 2) disp_instance_feat = np.maximum( np.minimum(disp_instance_feat, r_radius), -r_radius) # ... in instance input ... disp_instance_input = disp_instance_feat * self.model_config[ 'embed_config']['stride'] # ... 
in instance original crop (in frame coordinates) disp_instance_frame = disp_instance_input / search_scale_list[ best_scale] # Position within frame in frame coordinates y = current_target_state.bbox.y x = current_target_state.bbox.x y += disp_instance_frame[0] x += disp_instance_frame[1] # Target scale damping and saturation target_scale = current_target_state.bbox.height / original_target_height search_factor = self.search_factors[best_scale] scale_damp = self.track_config[ 'scale_damp'] # damping factor for scale update target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor) target_scale = np.maximum(0.2, np.minimum(5.0, target_scale)) # Some book keeping height = original_target_height * target_scale width = original_target_width * target_scale current_target_state.bbox = Rectangle(x, y, width, height) current_target_state.scale_idx = best_scale current_target_state.search_pos = search_center + disp_instance_input assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \ 'target position in feature space should be no larger than input image size' assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \ 'target position in feature space should be no larger than input image size' if self.log_level > 0: np.save(osp.join(logdir, 'num_frames.npy'), [i + 1]) # Select the image with the highest score scale and convert it to uint8 image_cropped = outputs['image_cropped'][ best_scale].astype(np.uint8) # Note that imwrite in cv2 assumes the image is in BGR format. # However, the cropped image returned by TensorFlow is RGB. # Therefore, we convert color format using cv2.cvtColor imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)), cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR)) np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale]) np.save(osp.join(logdir, 'response{}.npy'.format(i)), response) y_search, x_search = current_target_state.search_pos search_scale = search_scale_list[best_scale] target_height_search = height * search_scale target_width_search = width * search_scale bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search) bbox_search = convert_bbox_format(bbox_search, 'top-left-based') np.save(osp.join(logdir, 'bbox{}.npy'.format(i)), [ bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height ]) reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based') reported_bboxs.append(reported_bbox) return reported_bboxs
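Scale selection and window regularization in these trackers follow the usual SiamFC recipe: penalize the non-central scales, renormalize the winning response map into a probability map, then blend in a normalized Hanning window. A self-contained NumPy sketch of just this step; the `scale_penalty` and `window_influence` defaults are placeholder values standing in for `track_config`:

import numpy as np

def pick_scale_and_smooth(response, scale_penalty=0.97, window_influence=0.25):
    """response: [num_scales, s, s] score maps; returns (best_scale, regularized map)."""
    num_scales, size, _ = response.shape
    penalties = scale_penalty * np.ones(num_scales)
    penalties[num_scales // 2] = 1.0            # no penalty for the current (middle) scale
    best_scale = int(np.argmax(np.max(response, axis=(1, 2)) * penalties))

    r = response[best_scale]
    r = r - r.min()
    r = r / r.sum()                             # normalize to a probability map
    hann = np.outer(np.hanning(size), np.hanning(size))
    hann = hann / hann.sum()                    # normalized cosine window
    return best_scale, (1 - window_influence) * r + window_influence * hann

best, r = pick_scale_and_smooth(np.random.rand(3, 17, 17))
print(best, np.unravel_index(r.argmax(), r.shape))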
def track(self, sess, first_bbox, frames, logdir='/tmp'): """Runs tracking on a single image sequence.""" # Get initial target bounding box and convert to center based bbox = convert_bbox_format(first_bbox, 'center-based') # Feed in the first frame image to set initial state. bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width] input_feed = [ frames[0], bbox_feed, self.x_image_size, self.search_factors ] frame2crop_scale, image_z = self.siamese_model.initialize( sess, input_feed) imwrite(osp.join(logdir, 'aimagez.jpg'), cv2.cvtColor(image_z, cv2.COLOR_RGB2BGR)) # Storing target state original_target_height = bbox.height original_target_width = bbox.width search_center = np.array( [get_center(self.x_image_size), get_center(self.x_image_size)]) current_target_state = TargetState(bbox=bbox, search_pos=search_center, scale_idx=int( get_center(self.num_scales))) include_first = get(self.track_config, 'include_first', False) logging.info('Tracking include first -- {}'.format(include_first)) # Run tracking loop reported_bboxs = [] image_c = None x_image_size = self.x_image_size lost = 0 moved2border = False conf_thresh = 0.2 # 0.2 bound_thresh = 0.2 # 0.2 sup_thresh = 0.15 # 0.15 prev_score = conf_thresh + 0.01 upsample_factor = self.track_config['upsample_factor'] search_factors = self.search_factors for i, filename in enumerate(frames): if i > 0 or include_first: bbox_feed = [ current_target_state.bbox.y, current_target_state.bbox.x, current_target_state.bbox.height, current_target_state.bbox.width ] if prev_score > bound_thresh: lost = 0 else: lost += 1 if prev_score > 0.9: self.siamese_model.update(sess, [ frames[i - 1], bbox_feed, self.x_image_size, search_factors ]) with open(filename, 'rb') as f: wi, hi = GetWidthAndHeight(f) t_i_ratio = max([ current_target_state.bbox.height / hi, current_target_state.bbox.width / wi ]) if prev_score < conf_thresh: x_image_size += 100 #x_image_size = min(x_image_size, ((1. - t_i_ratio) * 1.6 + 1.) 
* self.x_image_size_init) if t_i_ratio < 0.05: x_image_size = min(x_image_size, 555) elif t_i_ratio < 0.25: x_image_size = min(x_image_size, 455) elif t_i_ratio > 0.5: x_image_size = min(x_image_size, 255) else: x_image_size = min(x_image_size, 355) else: x_image_size = self.x_image_size if i > 1: top = (current_target_state.bbox.y - (current_target_state.bbox.height / 2) < 10) left = (current_target_state.bbox.x - (current_target_state.bbox.width / 2) < 10) bottom = (current_target_state.bbox.y + (current_target_state.bbox.height / 2) > hi - 10) right = (current_target_state.bbox.x + (current_target_state.bbox.width / 2) > wi - 10) bound_flag = top or left or bottom or right #if top or left or bottom or right: #if not prev_score < bound_thresh: #moved2border = True #if not moved2border: #current_target_state.bbox = Rectangle(wi / 2, hi / 2, #current_target_state.bbox.width, #current_target_state.bbox.height) #bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x, #current_target_state.bbox.height, current_target_state.bbox.width] #else: #if not prev_score < bound_thresh: #moved2border = False if lost > 5 and bound_flag: lost = 0 diffy = hi * 0.5 - bbox_feed[0] diffx = wi * 0.5 - bbox_feed[1] bbox_feed = [ diffy * 0.25 + bbox_feed[0], diffx * 0.25 + bbox_feed[1], bbox_feed[2], bbox_feed[3] ] current_target_state.bbox = Rectangle(bbox_feed[1], bbox_feed[0], bbox_feed[3], bbox_feed[2]) input_feed = [ filename, bbox_feed, x_image_size, search_factors ] outputs, metadata = self.siamese_model.inference_step( sess, input_feed) search_scale_list = outputs['scale_xs'] response = outputs['response'] response_size = response.shape[1] re_out = np.around(1 / (1 + np.exp(-response)), 2) if np.max(re_out) < conf_thresh and not t_i_ratio > 0.5: x_image_sizeb4 = x_image_size x_image_size += 100 #x_image_size_l = ((1. - t_i_ratio) * 1.6 + 1.) 
* self.x_image_size_init if t_i_ratio < 0.05: x_image_size_l = 555 elif t_i_ratio < 0.25: x_image_size_l = 455 elif t_i_ratio > 0.5: x_image_size_l = 255 else: x_image_size_l = 355 if not x_image_size > x_image_size_l: input_feed = [ filename, bbox_feed, x_image_size, search_factors ] outputs, metadata = self.siamese_model.inference_step( sess, input_feed) search_scale_list = outputs['scale_xs'] response = outputs['response'] response_size = response.shape[1] re_out = np.around(1 / (1 + np.exp(-response)), 2) else: x_image_size = x_image_sizeb4 # Choose the scale whole response map has the highest peak if self.num_scales > 1: response_max = np.max(response * (re_out > sup_thresh), axis=(1, 2)) penalties = self.track_config['scale_penalty'] * np.ones( (self.num_scales)) current_scale_idx = int(get_center(self.num_scales)) penalties[current_scale_idx] = 1.0 response_penalized = response_max * penalties if max(response_penalized) == 0.: best_scale = 1 else: best_scale = np.argmax(response_penalized) else: best_scale = 0 response = response[best_scale] re_out = re_out[best_scale] with np.errstate( all='raise'): # Raise error if something goes wrong response = response - np.min(response) response = response / np.sum(response) response = response * (re_out > sup_thresh) window = np.dot(np.expand_dims(np.hanning(response_size), 1), np.expand_dims(np.hanning(response_size), 0)) self.window = window / np.sum(window) # normalize window window_influence = self.track_config['window_influence'] response = (1 - window_influence ) * response + window_influence * self.window if np.max(re_out) < sup_thresh: r_max, c_max = response.shape r_max, c_max = int(r_max / 2), int(c_max / 2) disp_instance_input = [0, 0] disp_instance_frame = [0, 0] else: # Find maximum response r_max, c_max = np.unravel_index(response.argmax(), response.shape) # Convert from crop-relative coordinates to frame coordinates p_coor = np.array([r_max, c_max]) # displacement from the center in instance final representation ... disp_instance_final = p_coor - get_center(response_size) # ... in instance feature space ... disp_instance_feat = disp_instance_final / upsample_factor # ... Avoid empty position ... r_radius = int(response_size / upsample_factor / 2) disp_instance_feat = np.maximum( np.minimum(disp_instance_feat, r_radius), -r_radius) # ... in instance input ... disp_instance_input = disp_instance_feat * self.model_config[ 'embed_config']['stride'] # ... 
in instance original crop (in frame coordinates) disp_instance_frame = disp_instance_input / search_scale_list[ best_scale] # Position within frame in frame coordinates y = current_target_state.bbox.y x = current_target_state.bbox.x y += disp_instance_frame[0] x += disp_instance_frame[1] y = np.round(y) x = np.round(x) prev_score = re_out[r_max, c_max] # Target scale damping and saturation target_scale = current_target_state.bbox.height / original_target_height search_factor = search_factors[best_scale] scale_damp = self.track_config[ 'scale_damp'] # damping factor for scale update target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor) # Some book keeping search_center = np.array( [get_center(x_image_size), get_center(x_image_size)]) height = original_target_height * target_scale width = original_target_width * target_scale current_target_state.bbox = Rectangle(x, y, width, height) current_target_state.scale_idx = best_scale current_target_state.search_pos = search_center + disp_instance_input assert 0 <= current_target_state.search_pos[0] < x_image_size, \ 'target position in feature space should be no larger than input image size' assert 0 <= current_target_state.search_pos[1] < x_image_size, \ 'target position in feature space should be no larger than input image size' if self.log_level > 0: # Select the image with the highest score scale and convert it to uint8 image_cropped = outputs['image_cropped'][ best_scale].astype(np.uint8) y_search, x_search = current_target_state.search_pos search_scale = search_scale_list[best_scale] target_height_search = height * search_scale target_width_search = width * search_scale bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search) bbox_search = convert_bbox_format(bbox_search, 'top-left-based') # Add score colormap image_cropped = outputs['image_cropped'][ best_scale].astype(np.uint8) #im_shape = image_cropped.shape #re_shape = response_size / upsample_factor * self.model_config['embed_config']['stride'] #pad = int((im_shape[0] - re_shape) / 2) #response_crop = imresize(re_out, [im_shape[0]-2*pad, im_shape[1]-2*pad]) #response_crop = np.pad(response_crop, ((pad, pad), (pad, pad)), 'constant') #response_crop = response_crop / response_crop.max() #response_crop = np.uint8(response_crop * 255) #cmap = cv2.cvtColor(cv2.applyColorMap(response_crop, cv2.COLORMAP_JET), cv2.COLOR_BGR2RGB) #image_cropped = cv2.addWeighted(cmap, 0.3, image_cropped, 0.5, 0) xmin = bbox_search.x.astype(np.int32) ymin = bbox_search.y.astype(np.int32) xmax = xmin + bbox_search.width.astype(np.int32) ymax = ymin + bbox_search.height.astype(np.int32) cv2.rectangle(image_cropped, (xmin, ymin), (xmax, ymax), (255, 0, 0), 2) text = str(prev_score) cv2.putText(image_cropped, text, (xmin, ymin), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 0, 0), lineType=cv2.LINE_AA) imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)), cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR)) #if image_c is not None: #his_dir = logdir + '_his' #if not osp.exists(his_dir): #os.mkdir(his_dir) #image_c_p = np.concatenate([np.expand_dims(image_z, 0)] + image_c, 2)[0] #image_c_p = np.uint8(image_c_p) #imwrite(osp.join(his_dir, 'image{}.jpg'.format(i)), #cv2.cvtColor(image_c_p, cv2.COLOR_RGB2BGR)) reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based') reported_bboxs.append(reported_bbox) return reported_bboxs
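The displacement bookkeeping shared by every variant maps the response-map peak back to frame coordinates in three hops: upsampled response to feature map (divide by `upsample_factor`), feature map to search-crop pixels (multiply by the embedding `stride`), search crop to original frame (divide by the crop scale). A small worked sketch with illustrative numbers, assuming `get_center(x)` is `(x - 1) / 2`:

import numpy as np

def peak_to_frame_disp(r_max, c_max, response_size, upsample_factor, stride, search_scale):
    """Map a peak location in the upsampled response map to a displacement in frame pixels."""
    center = (response_size - 1) / 2.0                      # same role as get_center()
    disp_final = np.array([r_max, c_max], dtype=np.float64) - center
    disp_feat = disp_final / upsample_factor                # back to feature-map units
    radius = int(response_size / upsample_factor / 2)
    disp_feat = np.clip(disp_feat, -radius, radius)         # keep the peak inside the map
    disp_input = disp_feat * stride                         # pixels in the search crop
    return disp_input / search_scale                        # pixels in the original frame

# e.g. a peak at (151, 143) in a 272x272 map, upsample_factor=16, stride=8,
# and a crop that scaled the frame by 0.5:
print(peak_to_frame_disp(151, 143, 272, 16, 8, 0.5))        # -> [15.5  7.5] frame pixels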
def track_vot(self, sess, frame): bbox_feed = [ self.vot_current_target_state.bbox.y, self.vot_current_target_state.bbox.x, self.vot_current_target_state.bbox.height, self.vot_current_target_state.bbox.width ] input_feed = [frame, bbox_feed] outputs, metadata = self.siamese_model.inference_step(sess, input_feed) search_scale_list = outputs['scale_xs'] response_s_c5 = outputs['response_s_c5'] response_s_c4 = outputs['response_s_c4'] response_s_c3 = outputs['response_s_c3'] response_a_c5 = outputs['response_a_c5'] response_a_c4 = outputs['response_a_c4'] response_a_c3 = outputs['response_a_c3'] response_size = response_s_c5.shape[1] # Choose the scale whole response map has the highest peak if self.num_scales > 1: response_a_c5_max = np.max(response_a_c5) response_a_c4_max = np.max(response_a_c4) response_a_c3_max = np.max(response_a_c3) response_a_c5 = response_a_c5 / response_a_c5_max response_a_c4 = response_a_c4 / response_a_c4_max response_a_c3 = response_a_c3 / response_a_c3_max response_s_all = 0.7 * response_s_c5 + 0.3 * response_s_c4 + 0.1 * response_s_c3 response_a_all = 0.3 * response_a_c5 + 0.6 * response_a_c4 + 0.1 * response_a_c3 response_s_all_max = np.max(response_s_all) response_s_all = response_s_all / response_s_all_max response_a_all_max = np.max(response_a_all) response_a_all = response_a_all / response_a_all_max response = 0.3 * response_s_all + 0.7 * response_a_all response_max = np.max(response, axis=(1, 2)) penalties = self.track_config['scale_penalty'] * np.ones( (self.num_scales)) current_scale_idx = int(get_center(self.num_scales)) penalties[current_scale_idx] = 1.0 response_penalized = response_max * penalties best_scale = np.argmax(response_penalized) else: ## TODO combine siamfc and alexnet best_scale = 0 response = response[best_scale] with np.errstate(all='raise'): # Raise error if something goes wrong response = response - np.min(response) response = response / np.sum(response) if self.window is None: window = np.dot(np.expand_dims(np.hanning(response_size), 1), np.expand_dims(np.hanning(response_size), 0)) self.window = window / np.sum(window) # normalize window window_influence = self.track_config['window_influence'] response = ( 1 - window_influence) * response + window_influence * self.window # Find maximum response r_max, c_max = np.unravel_index(response.argmax(), response.shape) # Convert from crop-relative coordinates to frame coordinates p_coor = np.array([r_max, c_max]) # displacement from the center in instance final representation ... disp_instance_final = p_coor - get_center(response_size) # ... in instance feature space ... upsample_factor = self.track_config['upsample_factor'] disp_instance_feat = disp_instance_final / upsample_factor # ... Avoid empty position ... r_radius = int(response_size / upsample_factor / 2) disp_instance_feat = np.maximum( np.minimum(disp_instance_feat, r_radius), -r_radius) # ... in instance input ... disp_instance_input = disp_instance_feat * 8 #self.model_config['embed_config']['stride'] # ... 
in instance original crop (in frame coordinates) disp_instance_frame = disp_instance_input / search_scale_list[ best_scale] # Position within frame in frame coordinates y = self.vot_current_target_state.bbox.y x = self.vot_current_target_state.bbox.x y += disp_instance_frame[0] x += disp_instance_frame[1] # Target scale damping and saturation target_scale = self.vot_current_target_state.bbox.height / self.vot_original_target_height search_factor = self.search_factors[best_scale] scale_damp = self.track_config[ 'scale_damp'] # damping factor for scale update target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor) target_scale = np.maximum(0.2, np.minimum(5.0, target_scale)) # Some book keeping height = self.vot_original_target_height * target_scale width = self.vot_original_target_width * target_scale self.vot_current_target_state.bbox = Rectangle(x, y, width, height) self.vot_current_target_state.scale_idx = best_scale self.vot_current_target_state.search_pos = self.vot_search_center + disp_instance_input reported_bbox = convert_bbox_format(self.vot_current_target_state.bbox, 'top-left-based') return reported_bbox
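track_vot fuses six correlation maps, semantic (`response_s_*`) and appearance (`response_a_*`) branches from three backbone levels, with fixed weights, after max-normalizing the appearance maps and each fused branch. A compact NumPy sketch of that weighting with the weights copied from the code above; the [num_scales, s, s] shapes are assumptions:

import numpy as np

def fuse_responses(s_c5, s_c4, s_c3, a_c5, a_c4, a_c3):
    """Weighted fusion of semantic (s_*) and appearance (a_*) correlation maps."""
    # appearance maps are max-normalized individually
    a_c5, a_c4, a_c3 = (m / np.max(m) for m in (a_c5, a_c4, a_c3))
    s_all = 0.7 * s_c5 + 0.3 * s_c4 + 0.1 * s_c3
    a_all = 0.3 * a_c5 + 0.6 * a_c4 + 0.1 * a_c3
    s_all = s_all / np.max(s_all)
    a_all = a_all / np.max(a_all)
    return 0.3 * s_all + 0.7 * a_all   # appearance branch dominates the final score

maps = [np.random.rand(3, 17, 17) for _ in range(6)]
print(fuse_responses(*maps).shape)     # (3, 17, 17)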
def track(self, sess, frame, logdir='/tmp'): """Runs tracking on a single image.""" i = self.i = self.i + 1 current_target_state = self.current_target_state original_target_height = self.original_target_height original_target_width = self.original_target_width search_center = self.search_center mem_count = self.mem_count moved2border = self.moved2border update_delay = self.update_delay + 1 lost = self.lost + 1 image_c = self.image_c x_image_size = self.x_image_size search_factors = self.search_factors_init conf_thresh = self.conf_thresh bound_thresh = self.bound_thresh sup_thresh = self.sup_thresh prev_score = self.prev_score hi, wi, _ = frame.shape h_ratio = current_target_state.bbox.height / hi w_ratio = current_target_state.bbox.width / wi t_i_ratio = max([h_ratio, w_ratio]) if prev_score < conf_thresh: x_image_size += 100 #x_image_size = min(x_image_size, ((1. - t_i_ratio) * 1.8 + 1.) * self.x_image_size_init) if t_i_ratio < 0.05: x_image_size = min(x_image_size, 555) elif t_i_ratio > 0.6: x_image_size = min(x_image_size, 255) elif t_i_ratio > 0.4: x_image_size = min(x_image_size, 355) else: x_image_size = min(x_image_size, 455) else: x_image_size = self.x_image_size_init num_scales = len(search_factors) bbx = current_target_state.bbox.x bby = current_target_state.bbox.y bbw = current_target_state.bbox.width bbh = current_target_state.bbox.height bbox_feed = [bby, bbx, bbh, bbw] if i > 1: top = (current_target_state.bbox.y - (current_target_state.bbox.height / 2) < 10) left = (current_target_state.bbox.x - (current_target_state.bbox.width / 2) < 10) bottom = (current_target_state.bbox.y + (current_target_state.bbox.height / 2) > hi - 10) right = (current_target_state.bbox.x + (current_target_state.bbox.width / 2) > wi - 10) if top or left or bottom or right: if not prev_score < bound_thresh: moved2border = True if not moved2border: current_target_state.bbox = Rectangle( wi / 2, hi / 2, current_target_state.bbox.width, current_target_state.bbox.height) bbox_feed = [ current_target_state.bbox.y, current_target_state.bbox.x, current_target_state.bbox.height, current_target_state.bbox.width ] else: if not prev_score < bound_thresh: moved2border = False if t_i_ratio < 0.3 and lost > 5: lost = 0 diffy = hi * 0.5 - bbox_feed[0] diffx = wi * 0.5 - bbox_feed[1] bbox_feed = [ diffy * 0.25 + bbox_feed[0], diffx * 0.25 + bbox_feed[1], bbox_feed[2], bbox_feed[3] ] current_target_state.bbox = Rectangle(bbox_feed[1], bbox_feed[0], bbox_feed[3], bbox_feed[2]) input_feed = [frame, bbox_feed, x_image_size, search_factors] outputs, metadata = self.siamese_model.inference_step(sess, input_feed) search_scale_list = outputs['scale_xs'] response = outputs['response'] response_size = response.shape[1] re_out = np.around(1 / (1 + np.exp(-response)), 2) if np.max(re_out) < conf_thresh: x_image_sizeb4 = x_image_size x_image_size += 100 #x_image_size_l = ((1. - t_i_ratio) * 1.8 + 1.) 
* self.x_image_size_init if t_i_ratio < 0.05: x_image_size_l = 555 elif t_i_ratio > 0.6: x_image_size_l = 255 elif t_i_ratio > 0.4: x_image_size_l = 355 else: x_image_size_l = 455 if not x_image_size > x_image_size_l: input_feed = [frame, bbox_feed, x_image_size, search_factors] outputs, metadata = self.siamese_model.inference_step( sess, input_feed) search_scale_list = outputs['scale_xs'] response = outputs['response'] response_size = response.shape[1] re_out = np.around(1 / (1 + np.exp(-response)), 2) else: x_image_size = x_image_sizeb4 # Choose the scale whole response map has the highest peak if num_scales > 1: response_max = np.max(response * (re_out > sup_thresh), axis=(1, 2)) penalties = self.track_config['scale_penalty'] * np.ones( (num_scales)) current_scale_idx = int(get_center(num_scales)) penalties[current_scale_idx] = 1.0 response_penalized = response_max * penalties if max(response_penalized) == 0.: best_scale = 1 else: best_scale = np.argmax(response_penalized) else: best_scale = 0 response = response[best_scale] re_out = re_out[best_scale] with np.errstate(all='raise'): # Raise error if something goes wrong response = response - np.min(response) response = response / np.sum(response) response = response * (re_out > sup_thresh) window = np.dot(np.expand_dims(np.hanning(response_size), 1), np.expand_dims(np.hanning(response_size), 0)) self.window = window / np.sum(window) # normalize window window_influence = self.track_config['window_influence'] response = ( 1 - window_influence) * response + window_influence * self.window # Find maximum response r_max, c_max = np.unravel_index(response.argmax(), response.shape) prev_score = re_out[r_max, c_max] # Convert from crop-relative coordinates to frame coordinates p_coor = np.array([r_max, c_max]) # displacement from the center in instance final representation ... disp_instance_final = p_coor - get_center(response_size) # ... in instance feature space ... upsample_factor = self.track_config['upsample_factor'] disp_instance_feat = disp_instance_final / upsample_factor # ... Avoid empty position ... r_radius = int(response_size / upsample_factor / 2) disp_instance_feat = np.maximum( np.minimum(disp_instance_feat, r_radius), -r_radius) # ... in instance input ... disp_instance_input = disp_instance_feat * self.model_config[ 'embed_config']['stride'] # ... 
in instance original crop (in frame coordinates) disp_instance_frame = disp_instance_input / search_scale_list[ best_scale] # Position within frame in frame coordinates y = current_target_state.bbox.y x = current_target_state.bbox.x y += disp_instance_frame[0] x += disp_instance_frame[1] y = np.round(y) x = np.round(x) # Target scale damping and saturation target_scale = current_target_state.bbox.height / original_target_height search_factor = search_factors[best_scale] scale_damp = self.track_config[ 'scale_damp'] # damping factor for scale update target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor) # Some book keeping search_center = np.array( [get_center(x_image_size), get_center(x_image_size)]) height = original_target_height * target_scale width = original_target_width * target_scale current_target_state.bbox = Rectangle(x, y, width, height) current_target_state.scale_idx = best_scale current_target_state.search_pos = search_center + disp_instance_input assert 0 <= current_target_state.search_pos[0] < x_image_size, \ 'target position in feature space should be no larger than input image size' assert 0 <= current_target_state.search_pos[1] < x_image_size, \ 'target position in feature space should be no larger than input image size' if self.log_level > 0: # Select the image with the highest score scale and convert it to uint8 image_cropped = outputs['image_cropped'][best_scale].astype( np.uint8) y_search, x_search = current_target_state.search_pos search_scale = search_scale_list[best_scale] target_height_search = height * search_scale target_width_search = width * search_scale bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search) bbox_search = convert_bbox_format(bbox_search, 'top-left-based') xmin = bbox_search.x.astype(np.int32) ymin = bbox_search.y.astype(np.int32) xmax = xmin + bbox_search.width.astype(np.int32) ymax = ymin + bbox_search.height.astype(np.int32) cv2.rectangle(image_cropped, (xmin, ymin), (xmax, ymax), (255, 0, 0), 2) text = str(prev_score) cv2.putText(image_cropped, text, (xmin, ymin), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 0, 0), lineType=cv2.LINE_AA) imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)), cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR)) if prev_score > self.store_thresh: bbox_feed = [ current_target_state.bbox.y, current_target_state.bbox.x, current_target_state.bbox.height, current_target_state.bbox.width ] self.siamese_model.update_mem(sess, [ frame, bbox_feed, self.x_image_size_init, self.search_factors_init, mem_count ]) mem_count += 1 if mem_count > 4 or (mem_count > 0 and update_delay > 5): self.siamese_model.update(sess) mem_count = 0 update_delay = 0 if prev_score > bound_thresh: lost = 0 self.mem_count = mem_count self.update_delay = update_delay self.moved2border = moved2border self.lost = lost self.x_image_size = x_image_size self.prev_score = prev_score self.current_target_state = current_target_state reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based') #return prev_score>0.4, reported_bbox, prev_score return prev_score > 0.4, reported_bbox
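When the confidence stays low, the per-frame tracker above enlarges the search image in 100-pixel steps (capped by the target-to-frame size ratio) and, after several consecutive low-score frames on a small target, nudges the predicted center a quarter of the way toward the image center. A hedged sketch of just that recovery policy; the helper name is made up and the thresholds are copied from the code above:

def recover_search_region(bbox_yxhw, frame_hw, x_image_size, x_image_size_init,
                          prev_score, lost, conf_thresh=0.2, bound_thresh=0.2):
    """Return (possibly re-centered bbox, possibly enlarged search size, updated lost counter)."""
    hi, wi = frame_hw
    y, x, h, w = bbox_yxhw
    lost = 0 if prev_score > bound_thresh else lost + 1

    # grow the search crop when confidence is low; cap it by the target/frame size ratio
    t_i_ratio = max(h / hi, w / wi)
    if prev_score < conf_thresh:
        x_image_size += 100
        cap = 555 if t_i_ratio < 0.05 else 255 if t_i_ratio > 0.6 else \
              355 if t_i_ratio > 0.4 else 455
        x_image_size = min(x_image_size, cap)
    else:
        x_image_size = x_image_size_init

    # after several lost frames on a small target, drift the center toward the middle
    if t_i_ratio < 0.3 and lost > 5:
        y += (hi * 0.5 - y) * 0.25
        x += (wi * 0.5 - x) * 0.25
        lost = 0
    return (y, x, h, w), x_image_size, lost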
def track(self, sess, frame): bbox_feed = [ self.current_target_state.bbox.y, self.current_target_state.bbox.x, self.current_target_state.bbox.height, self.current_target_state.bbox.width ] input_feed = [frame, bbox_feed] outputs, metadata = self.siamese_model.inference_step(sess, input_feed) search_scale_list = outputs['scale_xs'] response = outputs['response'] response_size = response.shape[1] # Choose the scale whose response map has the highest peak if self.num_scales > 1: response_max = np.max(response, axis=(1, 2)) penalties = self.track_config['scale_penalty'] * np.ones( (self.num_scales)) current_scale_idx = int(get_center(self.num_scales)) penalties[current_scale_idx] = 1.0 response_penalized = response_max * penalties best_scale = np.argmax(response_penalized) else: best_scale = 0 response = response[best_scale] with np.errstate(all='raise'): # Raise error if something goes wrong response = response - np.min(response) response = response / np.sum(response) if self.window is None: window = np.dot(np.expand_dims(np.hanning(response_size), 1), np.expand_dims(np.hanning(response_size), 0)) self.window = window / np.sum(window) # normalize window window_influence = self.track_config['window_influence'] response = ( 1 - window_influence) * response + window_influence * self.window # Find maximum response r_max, c_max = np.unravel_index(response.argmax(), response.shape) # Convert from crop-relative coordinates to frame coordinates p_coor = np.array([r_max, c_max]) # displacement from the center in instance final representation ... disp_instance_final = p_coor - get_center(response_size) # ... in instance feature space ... upsample_factor = self.track_config['upsample_factor'] disp_instance_feat = disp_instance_final / upsample_factor # ... Avoid empty position ... r_radius = int(response_size / upsample_factor / 2) disp_instance_feat = np.maximum( np.minimum(disp_instance_feat, r_radius), -r_radius) # ... in instance input ... disp_instance_input = disp_instance_feat * self.model_config[ 'embed_config']['stride'] # ... 
in instance original crop (in frame coordinates) disp_instance_frame = disp_instance_input / search_scale_list[ best_scale] # Position within frame in frame coordinates y = self.current_target_state.bbox.y x = self.current_target_state.bbox.x y += disp_instance_frame[0] x += disp_instance_frame[1] # Target scale damping and saturation target_scale = self.current_target_state.bbox.height / self.original_target_height search_factor = self.search_factors[best_scale] scale_damp = self.track_config[ 'scale_damp'] # damping factor for scale update target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor) target_scale = np.maximum(0.2, np.minimum(5.0, target_scale)) # Some book keeping height = self.original_target_height * target_scale width = self.original_target_width * target_scale self.current_target_state.bbox = Rectangle(x, y, width, height) self.current_target_state.scale_idx = best_scale self.current_target_state.search_pos = self.search_center + disp_instance_input assert 0 <= self.current_target_state.search_pos[0] < self.x_image_size, \ 'target position in feature space should be no larger than input image size' assert 0 <= self.current_target_state.search_pos[1] < self.x_image_size, \ 'target position in feature space should be no larger than input image size' reported_bbox = convert_bbox_format(self.current_target_state.bbox, 'top-left-based') self.frame_cnt += 1 if self.log_level > 0: np.save(osp.join(self.logdir, 'num_frames.npy'), [self.frame_cnt]) # Select the image with the highest score scale and convert it to uint8 image_cropped = outputs['image_cropped'][best_scale].astype( np.uint8) # Note that imwrite in cv2 assumes the image is in BGR format. # However, the cropped image returned by TensorFlow is RGB. # Therefore, we convert color format using cv2.cvtColor cv2.imwrite( osp.join(self.logdir, 'image_cropped{}.jpg'.format(self.frame_cnt)), cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR)) cv2.imwrite( osp.join(self.logdir, 'image_origin{}.jpg'.format(self.frame_cnt)), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) np.save( osp.join(self.logdir, 'best_scale{}.npy'.format(self.frame_cnt)), [best_scale]) np.save( osp.join(self.logdir, 'response{}.npy'.format(self.frame_cnt)), response) y_search, x_search = self.current_target_state.search_pos search_scale = search_scale_list[best_scale] target_height_search = height * search_scale target_width_search = width * search_scale bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search) bbox_search = convert_bbox_format(bbox_search, 'top-left-based') np.save(osp.join(self.logdir, 'bbox{}.npy'.format(self.frame_cnt)), [ bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height ]) with open(osp.join(self.logdir, 'track_rect.txt'), 'a') as f: rect_str = '{},{},{},{}\n'.format(int(reported_bbox[0]), int(reported_bbox[1]), int(reported_bbox[2]), int(reported_bbox[3])) f.write(rect_str) return reported_bbox
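All the trackers shuttle boxes between the 'center-based' and 'top-left-based' conventions via `Rectangle` and `convert_bbox_format`, which live in the project's utility code. A minimal stand-in consistent with how they are used here, assuming the `(size - 1) / 2` center convention of `get_center`:

from collections import namedtuple

Rectangle = namedtuple('Rectangle', ['x', 'y', 'width', 'height'])

def get_center(x):
    return (x - 1.) / 2.

def convert_bbox_format(bbox, to):
    """Switch (x, y) between the box center and the top-left corner."""
    x, y, w, h = bbox.x, bbox.y, bbox.width, bbox.height
    if to == 'top-left-based':
        x -= get_center(w)
        y -= get_center(h)
    elif to == 'center-based':
        x += get_center(w)
        y += get_center(h)
    else:
        raise ValueError("Bbox format '{}' not supported".format(to))
    return Rectangle(x, y, w, h)

# round-trip example
b = Rectangle(100., 80., 40., 30.)                  # top-left corner at (100, 80)
c = convert_bbox_format(b, 'center-based')          # center at (119.5, 94.5)
print(convert_bbox_format(c, 'top-left-based'))     # back to (100.0, 80.0, 40.0, 30.0)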
def build_detection(self): track_config = self.track_config [self.embeds_ini, self.embeds] = self.get_image_embedding(self.search_images, reuse=True) center_scale = int(get_center(track_config['num_scales'])) new_template = tf.identity(self.templates[center_scale]) x_size = self.embeds.get_shape().as_list() hw_size = x_size[2] c_size = x_size[3] z_size = new_template.get_shape().as_list() temp_size = z_size[1] temp_c_size = z_size[-1] final_temp_c_size = self.gcn_config['g2_output'] with tf.variable_scope('instance_gcn_layer', 'instance_gcn_layer', reuse=tf.AUTO_REUSE): with slim.arg_scope([slim.conv2d, slim.max_pool2d], padding='VALID'): x_region_merged = tf.nn.relu(self.embeds) x_region_merged = slim.conv2d( x_region_merged, temp_c_size, [3, 3], 1, scope='conv_att', padding='SAME', ) x_region_merged = slim.max_pool2d(x_region_merged, [hw_size, hw_size], 1) x_region_merged = tf.reshape(x_region_merged, [-1, c_size, 1, 1]) x_region_merged = slim.conv2d(x_region_merged, temp_size * temp_size, [1, 1], 1, scope='conv_s') x_region_merged = tf.identity(x_region_merged[center_scale]) x_region_merged = tf.transpose(x_region_merged, perm=[2, 0, 1]) x_region_merged = tf.reshape(x_region_merged, [temp_size, temp_size, c_size]) z_merged = tf.add(new_template, x_region_merged) z_merged = tf.expand_dims(z_merged, 0) support_att = attention(z_merged, c_size) self.support_att = support_att new_template = tf.reshape(new_template, [temp_size * temp_size, temp_c_size]) new_template = gcn_tracking2(gcn_config=self.gcn_config, inputs=tf.squeeze(new_template), supports=support_att) new_template = tf.reshape( new_template, [temp_size, temp_size, final_temp_c_size]) self.templates_final = tf.stack( [new_template for _ in range(track_config['num_scales'])]) with tf.variable_scope('detection'): def _translation_match(x, z): x = tf.expand_dims( x, 0) # [batch, in_height, in_width, in_channels] z = tf.expand_dims( z, -1 ) # [filter_height, filter_width, in_channels, out_channels] return tf.nn.conv2d(x, z, strides=[1, 1, 1, 1], padding='VALID', name='translation_match') output = tf.map_fn( lambda x: _translation_match(x[0], x[1]), (self.embeds, self.templates_final), dtype=self.embeds.dtype) # of shape [3, 1, 17, 17, 1] output = tf.squeeze(output, [1, 4]) # of shape e.g. [3, 17, 17] bias = tf.get_variable('biases', [1], dtype=tf.float32, initializer=tf.constant_initializer( 0.0, dtype=tf.float32), trainable=False) response = self.model_config['adjust_response_config'][ 'scale'] * output + bias self.response = response
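`_translation_match` scores every translation of the template over the search embedding with a channel-summed 'VALID' cross-correlation, and the batched `tf.map_fn` output is then affinely adjusted by the configured scale and a frozen bias. A NumPy sketch of the same correlation for a single scale, useful for sanity-checking shapes; the sizes and the 1e-3 adjustment factor are illustrative, not values from the config:

import numpy as np

def translation_match(x, z):
    """x: [H, W, C] search embedding, z: [h, w, C] template; returns [H-h+1, W-w+1] scores."""
    H, W, C = x.shape
    h, w, _ = z.shape
    out = np.zeros((H - h + 1, W - w + 1), dtype=np.float32)
    for i in range(out.shape[0]):
        for j in range(out.shape[1]):
            # 'VALID' cross-correlation, summed over all channels
            out[i, j] = np.sum(x[i:i + h, j:j + w, :] * z)
    return out

embed = np.random.rand(22, 22, 256).astype(np.float32)
template = np.random.rand(6, 6, 256).astype(np.float32)
score = 1e-3 * translation_match(embed, template)   # stand-in for the adjust_response scale
print(score.shape)                                  # (17, 17)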
def track(self, sess, first_bbox, frames, logdir='/tmp'): """Runs tracking on a single image sequence.""" # Get initial target bounding box and convert to center based bbox = convert_bbox_format(first_bbox, 'center-based') print(frames) # Feed in the first frame image to set initial state. bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width] input_feed = [frames[0], bbox_feed] frame2crop_scale = self.siamese_model.initialize(sess, input_feed) # Storing target state original_target_height = bbox.height original_target_width = bbox.width search_center = np.array([get_center(self.x_image_size), get_center(self.x_image_size)]) current_target_state = TargetState(bbox=bbox, search_pos=search_center, scale_idx=int(get_center(self.num_scales))) include_first = get(self.track_config, 'include_first', False) logging.info('Tracking include first -- {}'.format(include_first)) # Run tracking loop reported_bboxs = [] output_json={} #dump all bboxes in this output file for i, filename in enumerate(frames): if i > 0 or include_first: # We don't really want to process the first image unless intended to do so. bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x, current_target_state.bbox.height, current_target_state.bbox.width] input_feed = [filename, bbox_feed] outputs, metadata = self.siamese_model.inference_step(sess, input_feed) search_scale_list = outputs['scale_xs'] response = outputs['response'] response_size = response.shape[1] # Choose the scale whole response map has the highest peak if self.num_scales > 1: response_max = np.max(response, axis=(1, 2)) penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales)) current_scale_idx = int(get_center(self.num_scales)) penalties[current_scale_idx] = 1.0 response_penalized = response_max * penalties best_scale = np.argmax(response_penalized) else: best_scale = 0 response = response[best_scale] #print(response) with np.errstate(all='raise'): # Raise error if something goes wrong response = response - np.min(response) response = response / np.sum(response) if self.window is None: window = np.dot(np.expand_dims(np.hanning(response_size), 1), np.expand_dims(np.hanning(response_size), 0)) self.window = window / np.sum(window) # normalize window window_influence = self.track_config['window_influence'] response = (1 - window_influence) * response + window_influence * self.window # Find maximum response srtd=response.argsort(axis=None) v = response.argmax() r_max, c_max = np.unravel_index(v, response.shape) if not osp.exists(osp.join(logdir,"Intermediate")): os.mkdir(osp.join(logdir,"Intermediate")) to_save = np.interp(response,(response.min(),response.max()),(0,255)) cv2.imwrite(osp.join(logdir,"Intermediate",f"response_{i}.png"),to_save) to_save = to_save.reshape(to_save.shape[0],to_save.shape[1],1) ret,thresh1 = cv2.threshold(to_save,185,255,cv2.THRESH_BINARY) cv2.imwrite(osp.join(logdir,"Intermediate",f"response_{i}_thresh.png"),thresh1) image = np.uint8(thresh1.copy()) cnts = cv2.findContours(image, cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_SIMPLE) cnts = imutils.grab_contours(cnts) backtorgb = cv2.cvtColor(image,cv2.COLOR_GRAY2RGB) image = cv2.drawContours(backtorgb, cnts, -1, (0, 255, 0), 2) cv2.imwrite(osp.join(logdir,"Intermediate",f"response_{i}_cntrs.png"),image) centres=[] for c in cnts: M = cv2.moments(c) cX = int(M["m10"] / M["m00"]) cY = int(M["m01"] / M["m00"]) centres.append((cY,cX,False)) centres.append((r_max,c_max,True)) #print(centres) #cts_copy = copy(current_target_state) #cts_copy2 = copy(current_target_state) 
output_json[filename]=[] for (r_max,c_max,to_deep_copy) in centres: if to_deep_copy: cts_copy = deepcopy(current_target_state) else: cts_copy = copy(current_target_state) # Convert from crop-relative coordinates to frame coordinates p_coor = np.array([r_max, c_max]) # displacement from the center in instance final representation ... disp_instance_final = p_coor - get_center(response_size) # ... in instance feature space ... upsample_factor = self.track_config['upsample_factor'] disp_instance_feat = disp_instance_final / upsample_factor # ... Avoid empty position ... r_radius = int(response_size / upsample_factor / 2) disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius) # ... in instance input ... disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride'] # ... in instance original crop (in frame coordinates) disp_instance_frame = disp_instance_input / search_scale_list[best_scale] # Position within frame in frame coordinates y = cts_copy.bbox.y x = cts_copy.bbox.x y += disp_instance_frame[0] x += disp_instance_frame[1] # Target scale damping and saturation target_scale = cts_copy.bbox.height / original_target_height search_factor = self.search_factors[best_scale] scale_damp = self.track_config['scale_damp'] # damping factor for scale update target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor) target_scale = np.maximum(0.2, np.minimum(5.0, target_scale)) # Some book keeping height = original_target_height * target_scale width = original_target_width * target_scale cts_copy.bbox = Rectangle(x, y, width, height) cts_copy.scale_idx = best_scale cts_copy.search_pos = search_center + disp_instance_input assert 0 <= cts_copy.search_pos[0] < self.x_image_size, \ 'target position in feature space should be no larger than input image size' assert 0 <= cts_copy.search_pos[1] < self.x_image_size, \ 'target position in feature space should be no larger than input image size' if self.log_level > 0 and to_deep_copy: np.save(osp.join(logdir, 'num_frames.npy'), [i + 1]) # Select the image with the highest score scale and convert it to uint8 image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8) # Note that imwrite in cv2 assumes the image is in BGR format. # However, the cropped image returned by TensorFlow is RGB. # Therefore, we convert color format using cv2.cvtColor imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)), cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR)) np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale]) np.save(osp.join(logdir, 'response{}.npy'.format(i)), response) y_search, x_search = cts_copy.search_pos search_scale = search_scale_list[best_scale] target_height_search = height * search_scale target_width_search = width * search_scale bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search) bbox_search = convert_bbox_format(bbox_search, 'top-left-based') np.save(osp.join(logdir, 'bbox{}.npy'.format(i)), [bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height]) reported_bbox = convert_bbox_format(cts_copy.bbox, 'top-left-based') #print(f"reported bbox {reported_bbox}") if to_deep_copy: reported_bboxs.append(reported_bbox) else: rect_str = '{},{},{},{}\n'.format(reported_bbox.x + 1, reported_bbox.y + 1, reported_bbox.width, reported_bbox.height) arr = output_json[filename] arr.append(rect_str) with open(osp.join(logdir,'bboxes.json'),'w') as f: json.dump(output_json,f,indent=4) return reported_bboxs
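The last variant reports, besides the global argmax, one candidate per connected blob in the thresholded response map, using contour moments for the blob centers (the code above relies on imutils.grab_contours to stay OpenCV-version agnostic). A trimmed sketch of that candidate extraction, assuming OpenCV 4's two-value findContours return and guarding the zero-area case that the original would divide by:

import numpy as np
import cv2

def response_candidates(response, thresh=185):
    """Return (row, col) centers of blobs in the response map plus the global argmax."""
    scaled = np.interp(response, (response.min(), response.max()), (0, 255)).astype(np.uint8)
    _, binary = cv2.threshold(scaled, thresh, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    centres = []
    for c in contours:
        m = cv2.moments(c)
        if m['m00'] > 0:                       # skip degenerate blobs with zero area
            centres.append((int(m['m01'] / m['m00']), int(m['m10'] / m['m00'])))
    centres.append(np.unravel_index(response.argmax(), response.shape))
    return centres

r = np.random.rand(272, 272)
print(response_candidates(r)[:3])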