def _clip_rect_size(rect, min_size=None, max_size=None, name='clip_rect_size'):
    with tf.name_scope(name) as scope:
        center, size = geom.rect_center_size(rect)
        if max_size is not None:
            size = tf.minimum(size, max_size)
        if min_size is not None:
            size = tf.maximum(size, min_size)
        return geom.make_rect_center_size(center, size)
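
# Illustrative usage sketch (not part of the tracker; values and the demo
# function name are hypothetical). Assuming rects are normalized
# [xmin, ymin, xmax, ymax] tensors, clipping clamps width and height while
# keeping the center fixed.
def _demo_clip_rect_size():
    rect = tf.constant([[0.4, 0.4, 0.6, 0.6]])  # 0.2 x 0.2 box at (0.5, 0.5).
    clipped = _clip_rect_size(rect, min_size=0.3)  # Width/height grow to 0.3.
    with tf.Session() as sess:
        print(sess.run(clipped))  # approx [[0.35, 0.35, 0.65, 0.65]]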
def _rect_translate_scale(rect, translate, scale, name='rect_translate_scale'):
    '''
    Args:
        rect: [..., 4]
        translate: [..., 2]
        scale: [..., 1]
    '''
    with tf.name_scope(name) as scope:
        center, size = geom.rect_center_size(rect)
        return geom.make_rect_center_size(center + translate, size * scale)
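
# Illustrative usage sketch (hypothetical values): translation shifts the
# center in the rect's coordinate frame and scale acts about the center, so a
# 0.2 x 0.2 box translated by (0.1, 0) and scaled by 2 becomes a 0.4 x 0.4
# box centered at (0.6, 0.5).
def _demo_rect_translate_scale():
    rect = tf.constant([[0.4, 0.4, 0.6, 0.6]])
    moved = _rect_translate_scale(rect,
                                  translate=tf.constant([[0.1, 0.0]]),
                                  scale=tf.constant([[2.0]]))
    with tf.Session() as sess:
        print(sess.run(moved))  # approx [[0.4, 0.3, 0.8, 0.7]]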
def modify_aspect_ratio(rect, method='stretch', axis=-1, eps=1e-3,
                        name='modify_aspect_ratio'):
    if method == 'stretch':
        return rect  # No change.
    with tf.name_scope(name) as scope:
        center, size = geom.rect_center_size(rect)
        with tf.control_dependencies([tf.assert_non_negative(size)]):
            size = tf.identity(size)
        size = tf.maximum(size, eps)
        width = scalar_size(size, method, axis=axis, keepdims=True)
        return geom.make_rect_center_size(center, width)
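
# Illustrative sketch (hypothetical values; 'area' is an assumed method name
# for scalar_size, taken to mean the geometric mean of width and height).
# With method='stretch' the rect passes through unchanged; otherwise width
# and height are replaced by a single scalar, giving a square rect with the
# same center: a 0.4 x 0.1 box becomes 0.2 x 0.2.
def _demo_modify_aspect_ratio():
    rect = tf.constant([[0.3, 0.45, 0.7, 0.55]])  # 0.4 x 0.1 box at (0.5, 0.5).
    square = modify_aspect_ratio(rect, method='area')
    with tf.Session() as sess:
        print(sess.run(square))  # approx [[0.4, 0.4, 0.6, 0.6]]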
def next(self, features, labels, state, name=None, reset_position=False):
    '''
    Args:
        reset_position: Keep the appearance model but reset the position.
            If this is true, then features['rect'] must be present.
    '''
    with tf.name_scope(name, 'next_{}'.format(self._num_frames)) as scope:
        im = features['image']['data']
        run_opts = state['run_opts']
        aspect = state['aspect']
        prev_im = state['image']
        mean_color = state['mean_color']

        # If the label is not valid, there will be no loss for this frame.
        # However, the input image may still be processed.
        # In this case, adopt the previous rectangle as the "ground-truth".
        if self.mode in MODE_KEYS_SUPERVISED:
            gt_rect = tf.where(labels['valid'], labels['rect'], state['rect'])
        else:
            gt_rect = None
        # Use the previous rectangle.
        # This will be the ground-truth rect during training if `use_predictions` is false.
        prev_target_rect = state['rect']

        # Coerce the aspect ratio of the rectangle to construct the search area.
        # search_rect = self._context_rect(prev_target_rect, aspect, self.search_scale)
        base_rect = model_util.coerce_aspect(
            prev_target_rect, aspect, aspect_method=self.aspect_method)
        # Apply perturbation to aspect-coerced "previous" rect (may be current gt).
        if self.use_perturb and self.mode == tf.estimator.ModeKeys.TRAIN:
            base_rect = tf.cond(
                run_opts['is_training'],
                lambda: siamfc.perturb(base_rect, **self.perturb_params),
                lambda: base_rect)
        search_rect = geom.grow_rect(self.search_scale, base_rect)
        # Coerce the aspect ratio of the rectangle to construct the context area.
        # context_rect = self._context_rect(prev_target_rect, aspect, self.context_scale)
        context_rect = geom.grow_rect(self.context_scale, base_rect)

        # Extract the same rectangle from the past and current images and feed into conv-net.
        context_curr = self._crop(im, context_rect, self.context_size, mean_color)
        context_prev = self._crop(prev_im, context_rect, self.context_size, mean_color)
        with tf.name_scope('summary_context'):
            tf.summary.image('curr', context_curr)
            tf.summary.image('prev', context_prev)
        motion = [context_curr] if self.stateless else [context_curr, context_prev]
        motion = tf.stack(motion, axis=1)
        # How to obtain template from previous state?
        template_feat = state['template_init']

        # Extract an image pyramid (use 1 scale when not in tracking mode).
        mid_scale = (self.num_scales - 1) // 2
        if self.num_scales == 1:
            scales = tf.constant([1.0], dtype=tf.float32)
        else:
            scales = model_util.scale_range(
                tf.constant(self.num_scales), tf.to_float(self.log_scale_step))
        search_ims, search_rects = self._crop_pyr(
            im, search_rect, self.search_size, scales, mean_color)

        with tf.name_scope('summary'):
            _image_sequence_summary('search', search_ims, elem_name='scale')

        with tf.variable_scope('appearance', reuse=False) as appearance_scope:
            # Extract features, perform search, get receptive field of response wrt image.
            search_input = self._preproc(search_ims)
            search_input = cnn.as_tensor(search_input, add_to_set=True)
            with tf.variable_scope('embed', reuse=True):
                search_feat, search_layers, _ = self._embed_net(
                    search_input,
                    (False if not self.learn_appearance else run_opts['is_training']))
            rf_search = search_feat.fields[search_input.value]
            search_feat_size = search_feat.value.shape[-3:-1].as_list()
            receptive_field.assert_center_alignment(
                self.search_size, search_feat_size, rf_search)

            with tf.variable_scope('join', reuse=(self._num_frames >= 1)):
                join_fn = join_nets.BY_NAME[self.join_arch]
                if self.join_type == 'single':
                    response = join_fn(
                        template_feat, search_feat,
                        is_training=(False if not self.learn_appearance
                                     else run_opts['is_training']),
                        trainable=self.learn_appearance,
                        **self.join_params)
                elif self.join_type == 'multi':
                    response = join_fn(
                        template_feat, search_feat,
                        self.multi_join_layers,
                        template_layers, search_layers, search_input,
                        is_training=(False if not self.learn_appearance
                                     else run_opts['is_training']),
                        trainable=self.learn_appearance,
                        **self.join_params)
                else:
                    raise ValueError('unknown join type: "{}"'.format(self.join_type))

            rf_response = response.fields[search_input.value]
            response = cnn.get_value(response)
            response_size = response.shape[-3:-1].as_list()
            receptive_field.assert_center_alignment(
                self.search_size, response_size, rf_response)
            response = tf.verify_tensor_all_finite(
                response, 'output of xcorr is not finite')

        if self._num_frames == 0:
            # Define appearance model saver.
            if self.appearance_model_file:
                # Create the graph ops for the saver.
                var_list = appearance_scope.global_variables()
                var_list = {var.op.name: var for var in var_list}
                if self.appearance_scope_dst or self.appearance_scope_src:
                    # Replace 'dst' with 'src'.
                    # Caution: This string replacement is a little dangerous.
                    var_list = {
                        k.replace(self.appearance_scope_dst,
                                  self.appearance_scope_src, 1): v
                        for k, v in var_list.items()
                    }
                self._appearance_var_list = var_list
                self._appearance_saver = tf.train.Saver(var_list)

        # Post-process scores.
        with tf.variable_scope('output', reuse=(self._num_frames > 0)):
            if not self.learn_appearance:
                # TODO: Prevent batch-norm updates as well.
                # TODO: Set trainable=False for all variables above.
                response = tf.stop_gradient(response)
            # Regress response to translation and log(scale).
            output_shapes = {'translation': [2], 'log_scale': [1]}
            outputs = _output_net(response, motion, output_shapes,
                                  run_opts['is_training'],
                                  weight_decay=self.wd,
                                  use_response=self.output_use_response,
                                  use_images=self.output_use_images)

        _image_sequence_summary(
            'response', model_util.colormap(tf.sigmoid(response), _COLORMAP),
            elem_name='scale')

        losses = {}
        if self.mode in MODE_KEYS_SUPERVISED:
            # Get ground-truth translation and scale relative to search window.
            gt_rect_in_search = geom.crop_rect(gt_rect, search_rect)
            gt_position, gt_rect_size = geom.rect_center_size(gt_rect_in_search)
            # Positions in real interval [0, 1] correspond to real interval [0, search_size].
            # Pixel centers range from 0.5 to search_size - 0.5 in [0, search_size].
            gt_translation = gt_position - 0.5  # Displacement relative to center.
            gt_size = helpers.scalar_size(gt_rect_size, self.aspect_method)
            target_size_in_search = self.target_size / self.search_size
            # size = target_size * scale
            gt_scale = gt_size / target_size_in_search
            gt_log_scale = tf.log(gt_scale)

            if self.appearance_loss:
                target_size_in_response = self.target_size / rf_response.stride
                loss_name, loss = siamfc.compute_loss(
                    response[:, mid_scale], target_size_in_response,
                    **self.appearance_loss_params)
                losses[loss_name] = loss

            loss_name, loss = regress.compute_loss_vector(
                outputs['translation'], outputs['log_scale'],
                gt_translation, gt_log_scale, **self.loss_params)
            losses[loss_name] = loss

            if reset_position:
                # TODO: Something better!
                # TODO: Keep appearance loss even when `reset_position` is true?
                losses = {k: tf.zeros_like(v) for k, v in losses.items()}

        translation = outputs['translation']  # [b, 2]
        scale = tf.exp(outputs['log_scale'])  # [b, 1]
        # Damp the scale update towards 1 (no change).
        # TODO: Should this be in log space?
        scale = self.scale_update_rate * scale + (1. - self.scale_update_rate) * 1.

        # Get rectangle in search image.
        prev_target_in_search = geom.crop_rect(prev_target_rect, search_rect)
        pred_in_search = _rect_translate_scale(prev_target_in_search,
                                               translation, scale)
        # Move from search back to original image.
        pred = geom.crop_rect(pred_in_search, geom.crop_inverse(search_rect))
        # Limit size of object.
        pred = _clip_rect_size(pred, min_size=0.001, max_size=10.0)

        # Rectangle to use in next frame for search area.
        # If using gt and rect not valid, use previous.
        if self.mode in MODE_KEYS_SUPERVISED:
            next_prev_rect = pred if self.use_predictions else gt_rect
        else:
            next_prev_rect = pred

        # outputs = {'rect': pred, 'score': confidence}
        outputs = {'rect': pred}
        state = {
            'run_opts': run_opts,
            'aspect': aspect,
            'image': im,
            'rect': next_prev_rect,
            'template_init': state['template_init'],
            'mean_color': state['mean_color'],
        }
        self._num_frames += 1
        return outputs, state, losses
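
# Worked example (hypothetical values) of the regression targets computed in
# next() above. search_size=255 and target_size=127 are typical SiamFC sizes,
# not necessarily this model's defaults.
def _demo_regression_targets():
    import math
    search_size, target_size = 255., 127.
    gt_position = (0.6, 0.5)  # Center within search window, in [0, 1].
    gt_size = 0.6             # Scalar size within search window.
    gt_translation = tuple(p - 0.5 for p in gt_position)  # (0.1, 0.0)
    gt_scale = gt_size / (target_size / search_size)      # ~1.205
    gt_log_scale = math.log(gt_scale)                     # ~0.186
    return gt_translation, gt_log_scale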
def next(self, features, labels, state, name='timestep'):
    with tf.name_scope(name) as scope:
        im = features['image']['data']
        run_opts = state['run_opts']
        aspect = state['aspect']
        mean_color = state['mean_color']
        prev_im = state['image']

        # If the label is not valid, there will be no loss for this frame.
        # However, the input image may still be processed.
        # In this case, adopt the previous rectangle as the "ground-truth".
        if self.mode in MODE_KEYS_SUPERVISED:
            gt_rect = tf.where(labels['valid'], labels['rect'], state['rect'])
        else:
            gt_rect = None
        # Use the previous rectangle.
        # This will be the ground-truth rect during training if `use_predictions` is false.
        prev_target_rect = state['rect']

        # Coerce the aspect ratio of the rectangle to construct the context area.
        context_rect = self._context_rect(prev_target_rect, aspect, self.context_scale)
        # Extract the same rectangle from the past and current images and feed into conv-net.
        context_curr = self._crop(im, context_rect, CONTEXT_SIZE, mean_color)
        context_prev = self._crop(prev_im, context_rect, CONTEXT_SIZE, mean_color)
        with tf.name_scope('summary_context'):
            tf.summary.image('curr', context_curr)
            tf.summary.image('prev', context_prev)
        ims = [context_curr] if self.stateless else [context_curr, context_prev]
        ims = tf.stack(ims, axis=1)

        if self.output_form == 'discrete':
            output_shapes = {
                'response': [self.num_scales, self.response_size, self.response_size, 1]
            }
        elif self.output_form == 'vector':
            output_shapes = {'translation': [2], 'log_scale': [1]}
        else:
            raise ValueError('unknown output form: "{}"'.format(self.output_form))

        # Extract features, perform search, get receptive field of response wrt image.
        ims_preproc = self._preproc(ims)
        with tf.variable_scope('motion', reuse=(self._num_frames > 0)):
            outputs = _motion_net(ims_preproc, output_shapes,
                                  run_opts['is_training'],
                                  weight_decay=self.wd)
        outputs = {
            k: tf.verify_tensor_all_finite(v, 'output "{}" not finite'.format(k))
            for k, v in outputs.items()
        }

        losses = {}
        if self.mode in MODE_KEYS_SUPERVISED:
            # Get ground-truth translation and scale relative to context window.
            gt_rect_in_context = geom.crop_rect(gt_rect, context_rect)
            gt_position, gt_rect_size = geom.rect_center_size(gt_rect_in_context)
            gt_translation = gt_position - 0.5  # Displacement relative to center.
            gt_size = helpers.scalar_size(gt_rect_size, self.aspect_method)
            # Scale is size relative to target_size.
            gt_scale = gt_size / (self.target_size / CONTEXT_SIZE)
            gt_log_scale = tf.log(gt_scale)

            if self.output_form == 'discrete':
                # base_translations = ((self.response_stride / self.context_size) *
                #                      util.displacement_from_center(self.response_size))
                # scales = util.scale_range(tf.constant(self.num_scales),
                #                           tf.to_float(self.log_scale_step))
                base_target_size = self.target_size / CONTEXT_SIZE
                translation_stride = self.response_stride / CONTEXT_SIZE
                loss_name, loss = compute_loss_discrete(
                    outputs['response'], self.num_scales,
                    translation_stride, self.log_scale_step, base_target_size,
                    gt_translation, gt_size, **self.loss_params)
            else:
                loss_name, loss = compute_loss_vector(
                    outputs['translation'], outputs['log_scale'],
                    gt_translation, gt_log_scale, **self.loss_params)
            # if reset_position:
            #     # TODO: Something better!
            #     losses[loss_name] = tf.zeros_like(loss)
            # else:
            #     losses[loss_name] = loss
            losses[loss_name] = loss

        if self.output_form == 'discrete':
            response = outputs['response']
            scales = util.scale_range(tf.constant(self.num_scales),
                                      tf.to_float(self.log_scale_step))
            # Use pyramid from loss function to obtain position.
            # Get relative translation and scale from response.
            # TODO: Upsample to higher resolution than original image?
            response_resize = cnn.get_value(
                cnn.upsample(response, self.response_stride,
                             method=tf.image.ResizeMethod.BICUBIC))
            response_final = response_resize
            # if self.learn_motion:
            #     response_final = response_resize
            # else:
            #     response_final = apply_motion_penalty(
            #         response_resize, radius=self.window_radius * self.target_size,
            #         **self.window_params)
            translation, scale, in_arg_max = util.find_peak_pyr(
                response_final, scales, eps_abs=self.arg_max_eps)
            scale = tf.expand_dims(scale, -1)  # [b, 1]
            # Obtain translation in relative co-ordinates within search image.
            translation = 1 / tf.to_float(CONTEXT_SIZE) * translation
            # Get scalar representing confidence in prediction.
            # Use raw appearance score (before motion penalty).
            confidence = helpers.weighted_mean(response_resize, in_arg_max,
                                               axis=(-4, -3, -2))
        else:
            translation = outputs['translation']  # [b, 2]
            scale = tf.exp(outputs['log_scale'])  # [b, 1]

        # Damp the scale update towards 1 (no change).
        # TODO: Should this be in log space?
        scale = self.scale_update_rate * scale + (1. - self.scale_update_rate) * 1.

        # Get rectangle in search image.
        prev_target_in_context = geom.crop_rect(prev_target_rect, context_rect)
        pred_in_context = _rect_translate_scale(prev_target_in_context,
                                                translation, scale)
        # Move from search back to original image.
        pred = geom.crop_rect(pred_in_context, geom.crop_inverse(context_rect))
        # Limit size of object.
        pred = _clip_rect_size(pred, min_size=0.001, max_size=10.0)

        # Rectangle to use in next frame for search area.
        # If using gt and rect not valid, use previous.
        if self.mode in MODE_KEYS_SUPERVISED:
            next_prev_rect = pred if self.use_predictions else gt_rect
        else:
            next_prev_rect = pred

        self._num_frames += 1
        # outputs = {'rect': pred, 'score': confidence}
        predictions = {'rect': pred}
        state = {
            'run_opts': run_opts,
            'aspect': aspect,
            # 'image': tf.image.resize_images(im, [self.image_size, self.image_size]),
            'image': im,
            'rect': next_prev_rect,
            'mean_color': state['mean_color'],
        }
        return predictions, state, losses
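
# Illustrative round trip (hypothetical values; demo function name is not
# part of the module) of the prediction path shared by both next() methods
# above: express the previous rect in window coordinates, apply the predicted
# translation and scale, then map back to image coordinates via the inverse
# crop, and clip the size.
def _demo_rect_update_roundtrip():
    window = tf.constant([[0.2, 0.2, 0.8, 0.8]])    # Search/context window.
    prev = tf.constant([[0.45, 0.45, 0.55, 0.55]])  # Previous target rect.
    prev_in_window = geom.crop_rect(prev, window)
    pred_in_window = _rect_translate_scale(prev_in_window,
                                           tf.constant([[0.05, 0.0]]),
                                           tf.constant([[1.1]]))
    pred = geom.crop_rect(pred_in_window, geom.crop_inverse(window))
    pred = _clip_rect_size(pred, min_size=0.001, max_size=10.0)
    with tf.Session() as sess:
        print(sess.run(pred))  # Rect in original image coordinates.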