def abs_diff(template, search, is_training,
             trainable=True,
             use_pre_conv=True,
             pre_conv_output_dim=256,
             reduce_channels=True,
             use_mean=False,
             use_batch_norm=False,
             scope='abs_diff'):
    '''Scores search positions by absolute difference to a 1x1 template.

    Requires that template is 1x1 (after the optional pre-conv).

    Args:
        template: [b, ht, wt, c]
        search: [b, s, hs, ws, c]
        is_training: Passed to the batch-norm layers and to `_calibrate`.
        trainable: Passed to `_calibrate`.
        use_pre_conv: If true, apply batch-norm, ReLU and a VALID conv whose
            kernel equals the template's spatial size to both inputs, sharing
            variables between template and search.
        pre_conv_output_dim: Number of outputs of the pre-conv.
        reduce_channels: If true, sum the absolute differences over channels.
        use_mean: If true (and channels are reduced), scale the channel sum
            by one over the number of channels.
        use_batch_norm: Passed to `_calibrate`.
        scope: Variable scope; 'abs_diff' is used as the default name.

    Returns:
        The result of `_calibrate` applied to the difference map.

    Raises:
        ValueError: If the template is not spatially [1, 1].
    '''
    with tf.variable_scope(scope, 'abs_diff'):
        template = cnn.as_tensor(template)
        search = cnn.as_tensor(search)

        if use_pre_conv:
            # A VALID conv with a template-sized kernel reduces template to 1x1.
            template_extent = template.value.shape[-3:-1].as_list()
            bnorm = partial(slim.batch_norm, is_training=is_training)

            def _embed(inputs):
                activated = cnn.pixelwise(bnorm, inputs)
                activated = cnn.pixelwise(tf.nn.relu, activated)
                merged, unmerge = cnn.merge_batch_dims(activated)
                merged = cnn.slim_conv2d(
                    merged, pre_conv_output_dim, template_extent,
                    padding='VALID',
                    activation_fn=None,
                    normalizer_fn=slim.batch_norm,
                    normalizer_params={'is_training': is_training},
                    scope='conv')
                return unmerge(merged)

            # Perform pre-activation because the output layer did not have
            # activations. The first call creates the variables; the second
            # call re-uses them.
            with tf.variable_scope('pre_conv', reuse=False):
                template = _embed(template)
            with tf.variable_scope('pre_conv', reuse=True):
                search = _embed(search)

        template_value = cnn.get_value(template)
        spatial_dims = template_value.shape[-3:-1].as_list()
        if spatial_dims != [1, 1]:
            raise ValueError(
                'template shape is not [1, 1]: {}'.format(spatial_dims))

        # Insert an axis after the batch axis so that the template broadcasts
        # against the search tensor in the element-wise operation.
        template_bcast = tf.expand_dims(template_value, 1)
        delta = cnn.pixelwise(lambda x: tf.abs(x - template_bcast), search)

        if reduce_channels:
            delta = cnn.channel_sum(delta)
            if use_mean:
                num_channels = template_bcast.shape[-1].value
                delta = cnn.pixelwise(
                    lambda x: (1 / tf.to_float(num_channels)) * x, delta)

        # TODO: No bias if attaching more layers?
        return _calibrate(delta, is_training, use_batch_norm,
                          learn_gain=False,
                          gain_init=1,
                          trainable=trainable)
def concat_fc(template, search, is_training,
              trainable=True,
              join_dim=128,
              mlp_num_outputs=1,
              mlp_num_layers=2,
              mlp_num_hidden=128,
              mlp_kwargs=None,
              scope=None):
    '''Joins template and search with separate convs, a sum and an MLP.

    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]
        is_training: Passed to the batch-norm applied after the sum.
        trainable: Passed to `cnn.mlp`.
        join_dim: Number of outputs of each of the two convs.
        mlp_num_outputs: Passed to `cnn.mlp` as `num_outputs`.
        mlp_num_layers: Passed to `cnn.mlp` as `num_layers`.
        mlp_num_hidden: Passed to `cnn.mlp` as `num_hidden`.
        mlp_kwargs: Optional dict of extra keyword arguments for `cnn.mlp`.
        scope: Variable scope; 'concat_fc' is used as the default name.

    Returns:
        The output of the MLP with the batch dimensions restored.
    '''
    with tf.variable_scope(scope, 'concat_fc'):
        template = cnn.as_tensor(template)
        search = cnn.as_tensor(search)

        # Instead of sliding-window concat, we do separate conv and sum the
        # results. Activation and normalizer are disabled here and performed
        # after the sum.
        template_extent = template.value.shape[-3:-1].as_list()
        fc_kwargs = {
            'padding': 'VALID',
            'activation_fn': None,
            'normalizer_fn': None,
            # Disable bias because bnorm is performed later.
            'biases_initializer': None,
        }

        with tf.variable_scope('template'):
            template = cnn.slim_conv2d(
                template, join_dim, template_extent, scope='fc', **fc_kwargs)
        with tf.variable_scope('search'):
            merged, unmerge = cnn.merge_batch_dims(search)
            merged = cnn.slim_conv2d(
                merged, join_dim, template_extent, scope='fc', **fc_kwargs)
            search = unmerge(merged)

        # Add an axis after the batch axis so that the addition broadcasts.
        # Receptive field in template not tracked.
        template_bcast = tf.expand_dims(cnn.get_value(template), 1)
        joined = cnn.pixelwise(lambda x: x + template_bcast, search)
        joined = cnn.pixelwise(
            partial(slim.batch_norm, is_training=is_training), joined)
        joined = cnn.pixelwise(tf.nn.relu, joined)

        extra_mlp_kwargs = mlp_kwargs or {}
        joined, unmerge = cnn.merge_batch_dims(joined)
        joined = cnn.mlp(joined,
                         num_layers=mlp_num_layers,
                         num_hidden=mlp_num_hidden,
                         num_outputs=mlp_num_outputs,
                         trainable=trainable,
                         **extra_mlp_kwargs)
        return unmerge(joined)
def distance(template, search, is_training,
             trainable=True,
             use_mean=False,
             use_batch_norm=False,
             learn_gain=False,
             gain_init=1,
             scope='distance'):
    '''Computes the Euclidean distance between the template and each window.

    The squared distance is expanded as
        (x - y)**2 = x**2 - 2 x y + y**2
    where every term is summed over the template window and over channels.

    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]
        is_training: Passed to `_calibrate`.
        trainable: Passed to `_calibrate`.
        use_mean: If true, divide the squared distance by the number of
            template elements (h * w * c) before the square root, giving the
            root-mean-square difference.
        use_batch_norm: Passed to `_calibrate`.
        learn_gain: Passed to `_calibrate`.
        gain_init: Passed to `_calibrate`.
        scope: Variable scope; 'distance' is used as the default name.

    Returns:
        The result of `_calibrate` applied to the distance map.

    Raises:
        ValueError: If `search` does not have 5 dimensions.
    '''
    search = cnn.as_tensor(search)
    num_search_dims = len(search.value.shape)
    if num_search_dims != 5:
        raise ValueError('search should have 5 dims: {}'.format(num_search_dims))

    with tf.variable_scope(scope, 'distance'):
        # Discard receptive field of template and get underlying tf.Tensor.
        template = cnn.get_value(template)
        num_channels = template.shape[-1].value
        template_size = template.shape[-3:-1].as_list()
        ones = tf.ones(template_size + [num_channels, 1], tf.float32)

        # The other join functions in this file apply `channel_sum` to the
        # output of `diag_xcorr`, i.e. it yields one value per channel.
        # Reduce over channels *before* combining with dot_xx and dot_yy,
        # which are already summed over channels. Combining first and summing
        # afterwards would broadcast dot_xx and dot_yy across the channel
        # axis and count them once per channel.
        dot_xy = cnn.channel_sum(cnn.diag_xcorr(search, template))

        dot_xx = tf.reduce_sum(tf.square(template), axis=(-3, -2, -1), keepdims=True)
        # Search is known to have 5 dims: add an axis after the batch axis.
        dot_xx = tf.expand_dims(dot_xx, 1)

        sq_search = cnn.pixelwise(tf.square, search)
        sq_search, restore = cnn.merge_batch_dims(sq_search)
        # Convolution with ones sums the squares over window and channels.
        dot_yy = cnn.nn_conv2d(sq_search, ones, strides=[1, 1, 1, 1], padding='VALID')
        dot_yy = restore(dot_yy)

        # sq_dist = dot_xx - 2 * dot_xy + dot_yy
        # Rounding can make this slightly negative when the window is almost
        # identical to the template; clamp so that sqrt does not give NaN.
        sq_dist = cnn.pixelwise_binary(
            lambda dot_xy, dot_yy: tf.maximum(dot_xx - 2 * dot_xy + dot_yy, 0.0),
            dot_xy, dot_yy)

        if use_mean:
            # Take root-mean-square of difference.
            num_elems = np.prod(template.shape[-3:].as_list())
            sq_dist = cnn.pixelwise(
                lambda sq_dist: (1 / tf.to_float(num_elems)) * sq_dist, sq_dist)
        dist = cnn.pixelwise(tf.sqrt, sq_dist)
        return _calibrate(dist, is_training, use_batch_norm, learn_gain, gain_init,
                          trainable=trainable)
def cosine(template, search, is_training,
           trainable=True,
           use_batch_norm=False,
           gain_init=1,
           eps=1e-3,
           scope='cosine'):
    '''Computes the cosine similarity between the template and each window.

    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]
        is_training: Passed to `_calibrate`.
        trainable: Passed to `_calibrate`.
        use_batch_norm: Passed to `_calibrate`.
        gain_init: Passed to `_calibrate`; the gain is always learned.
        eps: Constant added to the denominator before dividing.
        scope: Variable scope; 'cosine' is used as the default name.

    Returns:
        The result of `_calibrate` applied to the similarity map.

    Raises:
        ValueError: If `search` does not have 5 dimensions.
    '''
    search = cnn.as_tensor(search)
    rank = len(search.value.shape)
    if rank != 5:
        raise ValueError('search should have 5 dims: {}'.format(rank))

    with tf.variable_scope(scope, 'cosine'):
        # Discard receptive field of template and get underlying tf.Tensor.
        template = cnn.get_value(template)

        # <x, y>: correlation of search with template, summed over channels.
        xcorr = cnn.diag_xcorr(search, template, padding='VALID')
        dot_xy = cnn.channel_sum(xcorr)

        # <x, x>: squared norm of the whole template.
        template_sq = tf.square(template)
        dot_xx = tf.reduce_sum(template_sq, axis=(-3, -2, -1), keepdims=True)

        # <y, y>: correlate the squared search with a template of ones.
        # TODO: Faster and less memory to use sum (e.g. an ordinary conv with
        # a single-output kernel of ones instead of diag_xcorr + channel_sum).
        search_sq = cnn.pixelwise(tf.square, search)
        unit_template = tf.ones_like(template)
        dot_yy = cnn.channel_sum(
            cnn.diag_xcorr(search_sq, unit_template, padding='VALID'))

        # Add an axis after the batch axis to broadcast against search.
        dot_xx = tf.expand_dims(dot_xx, 1)

        non_negative_checks = [
            tf.assert_non_negative(dot_xx, message='assert dot_xx non negative'),
            tf.assert_non_negative(dot_yy.value, message='assert dot_yy non negative'),
        ]
        with tf.control_dependencies(non_negative_checks):
            denom = cnn.pixelwise(lambda yy: tf.sqrt(dot_xx * yy), dot_yy)

        similarity = cnn.pixelwise_binary(
            lambda xy, norm: xy / (norm + eps), dot_xy, denom)

        # Gain is necessary here because similarity is always in [-1, 1].
        return _calibrate(similarity, is_training, use_batch_norm,
                          learn_gain=True,
                          gain_init=gain_init,
                          trainable=trainable)
def test_instantiate(self):
    '''Instantiates the join functions.

    Builds each single-join function in a fresh graph, checks that the
    spatial size of the output is (search - template + 1) and runs it once
    on random inputs.
    '''
    search_size = np.array([10, 10])
    for join_arch in join_nets.SINGLE_JOIN_FNS:
        with trySubTest(self, join_arch=join_arch), tf.Graph().as_default():
            join_fn = join_nets.BY_NAME[join_arch]

            # Fully-connected joins are given a 1x1 template.
            is_fully_connected = join_arch in join_nets.FULLY_CONNECTED_FNS
            template_size = np.array([1, 1] if is_fully_connected else [4, 4])

            template_shape = [3] + list(template_size) + [16]
            search_shape = [3, 2] + list(search_size) + [16]

            template = tf.placeholder(tf.float32, template_shape, name='template')
            search = tf.placeholder(tf.float32, search_shape, name='search')
            is_training = tf.placeholder(tf.bool, (), name='is_training')

            output = cnn.get_value(join_fn(template, search, is_training))
            output_size = output.shape[-3:-1].as_list()
            self.assertAllEqual(output_size, search_size - template_size + 1)

            init_op = tf.global_variables_initializer()
            # with self.test_session() as sess:
            with tf.Session() as sess:
                sess.run(init_op)
                feed_dict = {
                    template: np.random.normal(size=template_shape),
                    search: np.random.normal(size=search_shape),
                    is_training: False,
                }
                sess.run(output, feed_dict=feed_dict)
def start(self, features_init, run_opts, name=None):
    '''Builds the initial tracker state from the first frame.

    Crops the template around the initial rectangle, embeds it with the
    feature network and checks that the features are centered on the crop.

    Args:
        features_init: Dict read at ['image']['data'], ['aspect'] and ['rect'].
        run_opts: Dict; run_opts['is_training'] is used by the embedding
            network when `self.learn_appearance` is true.
        name: Name scope; 'start' is used as the default name.

    Returns:
        Dict with keys 'run_opts', 'aspect', 'image', 'rect', 'template_init'
        and 'mean_color'.
    '''
    with tf.name_scope(name, 'start'):
        im = features_init['image']['data']
        aspect = features_init['aspect']
        target_rect = features_init['rect']
        # Average over the two axes before the last (presumably height and
        # width); passed to the crop function.
        mean_color = tf.reduce_mean(im, axis=(-3, -2), keepdims=True)

        # Only run the embedding in training mode when appearance is learnt.
        embed_is_training = (
            run_opts['is_training'] if self.learn_appearance else False)

        with tf.variable_scope('appearance', reuse=False):
            template_rect = self._context_rect(
                target_rect, aspect, self.template_scale)
            template_im = self._crop(
                im, template_rect, self.template_size, mean_color)
            template_input = cnn.as_tensor(
                self._preproc(template_im), add_to_set=True)

            with tf.variable_scope('embed', reuse=False):
                template_feat, template_layers, feature_scope = self._embed_net(
                    template_input, embed_is_training)
                # Get names relative to this scope for loading pre-trained.
                # self._feature_vars = _global_variables_relative_to_scope(feature_scope)

            rf_template = template_feat.fields[template_input.value]
            template_feat = cnn.get_value(template_feat)
            feat_size = template_feat.shape[-3:-1].as_list()
            receptive_field.assert_center_alignment(
                self.template_size, feat_size, rf_template)
            # self._feature_saver = tf.train.Saver(self._feature_vars)

        with tf.name_scope('summary'):
            tf.summary.image('template', template_im)

        return {
            'run_opts': run_opts,
            'aspect': aspect,
            'image': im,
            'rect': tf.identity(target_rect),
            'template_init': tf.identity(template_feat),
            'mean_color': tf.identity(mean_color),
        }
def next(self, features, labels, state, name=None, reset_position=False):
    '''Processes one frame: searches for the template and regresses the motion.

    Args:
        features: Dict read at ['image']['data'].
        labels: Dict read at ['valid'] and ['rect'] in supervised modes.
        state: Dict returned by `start()` or by the previous call.
        name: Name scope; 'next_<num_frames>' is used as the default name.
        reset_position: Keep the appearance model but reset the position.
            If this is true, then features['rect'] must be present.
            In this implementation it only causes all losses to be zeroed.

    Returns:
        Tuple (outputs, state, losses) where outputs is {'rect': prediction},
        state is the dict for the next call and losses maps names to tensors
        (empty if the mode is not supervised).

    Raises:
        ValueError: If `self.join_type` is neither 'single' nor 'multi'.
    '''
    with tf.name_scope(name, 'next_{}'.format(self._num_frames)):
        im = features['image']['data']
        run_opts = state['run_opts']
        aspect = state['aspect']
        prev_im = state['image']
        mean_color = state['mean_color']

        # If the label is not valid, there will be no loss for this frame.
        # However, the input image may still be processed.
        # In this case, adopt the previous rectangle as the "ground-truth".
        if self.mode in MODE_KEYS_SUPERVISED:
            gt_rect = tf.where(labels['valid'], labels['rect'], state['rect'])
        else:
            gt_rect = None
        # Use the previous rectangle.
        # This will be the ground-truth rect during training if `use_predictions` is false.
        prev_target_rect = state['rect']

        # Coerce the aspect ratio of the rectangle to construct the search area.
        base_rect = model_util.coerce_aspect(
            prev_target_rect, aspect, aspect_method=self.aspect_method)
        # Apply perturbation to aspect-coerced "previous" rect (may be current gt).
        if self.use_perturb and self.mode == tf.estimator.ModeKeys.TRAIN:
            # The lambdas are evaluated inside tf.cond(), that is, before
            # `base_rect` is re-bound to the result.
            base_rect = tf.cond(
                run_opts['is_training'],
                lambda: siamfc.perturb(base_rect, **self.perturb_params),
                lambda: base_rect)
        search_rect = geom.grow_rect(self.search_scale, base_rect)
        context_rect = geom.grow_rect(self.context_scale, base_rect)

        # Extract same rectangle in past and current images and feed into conv-net.
        context_curr = self._crop(im, context_rect, self.context_size, mean_color)
        context_prev = self._crop(prev_im, context_rect, self.context_size, mean_color)
        with tf.name_scope('summary_context'):
            tf.summary.image('curr', context_curr)
            # Fixed: the 'prev' summary used to show the current crop.
            tf.summary.image('prev', context_prev)
        motion = [context_curr] if self.stateless else [context_curr, context_prev]
        motion = tf.stack(motion, axis=1)

        # How to obtain template from previous state?
        template_feat = state['template_init']

        # Extract an image pyramid (use 1 scale when not in tracking mode).
        mid_scale = (self.num_scales - 1) // 2
        if self.num_scales == 1:
            scales = tf.constant([1.0], dtype=tf.float32)
        else:
            scales = model_util.scale_range(
                tf.constant(self.num_scales), tf.to_float(self.log_scale_step))
        search_ims, search_rects = self._crop_pyr(
            im, search_rect, self.search_size, scales, mean_color)

        with tf.name_scope('summary'):
            _image_sequence_summary('search', search_ims, elem_name='scale')

        appearance_is_training = (
            False if not self.learn_appearance else run_opts['is_training'])

        with tf.variable_scope('appearance', reuse=False) as appearance_scope:
            # Extract features, perform search, get receptive field of response wrt image.
            search_input = self._preproc(search_ims)
            search_input = cnn.as_tensor(search_input, add_to_set=True)
            with tf.variable_scope('embed', reuse=True):
                search_feat, search_layers, _ = self._embed_net(
                    search_input, appearance_is_training)
                rf_search = search_feat.fields[search_input.value]
                search_feat_size = search_feat.value.shape[-3:-1].as_list()
                receptive_field.assert_center_alignment(
                    self.search_size, search_feat_size, rf_search)

            with tf.variable_scope('join', reuse=(self._num_frames >= 1)):
                join_fn = join_nets.BY_NAME[self.join_arch]
                if self.join_type == 'single':
                    response = join_fn(
                        template_feat, search_feat,
                        is_training=appearance_is_training,
                        trainable=self.learn_appearance,
                        **self.join_params)
                elif self.join_type == 'multi':
                    # NOTE(review): `template_layers` is not defined anywhere
                    # in this method and the state built here does not carry
                    # it, so this branch looks like it raises NameError unless
                    # the name exists at module level. Confirm how the
                    # template's intermediate layers should reach this call.
                    response = join_fn(
                        template_feat, search_feat, self.multi_join_layers,
                        template_layers, search_layers, search_input,
                        is_training=appearance_is_training,
                        trainable=self.learn_appearance,
                        **self.join_params)
                else:
                    raise ValueError('unknown join type: "{}"'.format(
                        self.join_type))
                rf_response = response.fields[search_input.value]
                response = cnn.get_value(response)
                response_size = response.shape[-3:-1].as_list()
                receptive_field.assert_center_alignment(
                    self.search_size, response_size, rf_response)
                response = tf.verify_tensor_all_finite(
                    response, 'output of xcorr is not finite')

        if self._num_frames == 0:
            # Define appearance model saver.
            if self.appearance_model_file:
                # Create the graph ops for the saver.
                var_list = appearance_scope.global_variables()
                var_list = {var.op.name: var for var in var_list}
                if self.appearance_scope_dst or self.appearance_scope_src:
                    # Replace 'dst' with 'src'.
                    # Caution: This string replacement is a little dangerous.
                    var_list = {
                        k.replace(self.appearance_scope_dst,
                                  self.appearance_scope_src, 1): v
                        for k, v in var_list.items()
                    }
                self._appearance_var_list = var_list
                self._appearance_saver = tf.train.Saver(var_list)

        # Post-process scores.
        with tf.variable_scope('output', reuse=(self._num_frames > 0)):
            if not self.learn_appearance:
                # TODO: Prevent batch-norm updates as well.
                # TODO: Set trainable=False for all variables above.
                response = tf.stop_gradient(response)

            # Regress response to translation and log(scale).
            output_shapes = {'translation': [2], 'log_scale': [1]}
            outputs = _output_net(response, motion, output_shapes,
                                  run_opts['is_training'],
                                  weight_decay=self.wd,
                                  use_response=self.output_use_response,
                                  use_images=self.output_use_images)

        _image_sequence_summary(
            'response',
            model_util.colormap(tf.sigmoid(response), _COLORMAP),
            elem_name='scale')

        losses = {}
        if self.mode in MODE_KEYS_SUPERVISED:
            # Get ground-truth translation and scale relative to search window.
            gt_rect_in_search = geom.crop_rect(gt_rect, search_rect)
            gt_position, gt_rect_size = geom.rect_center_size(
                gt_rect_in_search)
            # Positions in real interval [0, 1] correspond to real interval [0, search_size].
            # Pixel centers range from 0.5 to search_size - 0.5 in [0, search_size].
            gt_translation = gt_position - 0.5  # Displacement relative to center.
            gt_size = helpers.scalar_size(gt_rect_size, self.aspect_method)
            target_size_in_search = self.target_size / self.search_size
            # size = target_size * scale
            gt_scale = gt_size / target_size_in_search
            gt_log_scale = tf.log(gt_scale)

            if self.appearance_loss:
                target_size_in_response = self.target_size / rf_response.stride
                loss_name, loss = siamfc.compute_loss(
                    response[:, mid_scale], target_size_in_response,
                    **self.appearance_loss_params)
                losses[loss_name] = loss

            loss_name, loss = regress.compute_loss_vector(
                outputs['translation'], outputs['log_scale'],
                gt_translation, gt_log_scale,
                **self.loss_params)
            losses[loss_name] = loss

            if reset_position:
                # TODO: Something better!
                # TODO: Keep appearance loss even when `reset_position` is true?
                losses = {k: tf.zeros_like(v) for k, v in losses.items()}

        translation = outputs['translation']  # [b, 2]
        scale = tf.exp(outputs['log_scale'])  # [b, 1]

        # Damp the scale update towards 1 (no change).
        # TODO: Should this be in log space?
        scale = self.scale_update_rate * scale + (
            1. - self.scale_update_rate) * 1.
        # Get rectangle in search image.
        prev_target_in_search = geom.crop_rect(prev_target_rect, search_rect)
        pred_in_search = _rect_translate_scale(
            prev_target_in_search, translation, scale)
        # Move from search back to original image.
        pred = geom.crop_rect(pred_in_search, geom.crop_inverse(search_rect))
        # Limit size of object.
        pred = _clip_rect_size(pred, min_size=0.001, max_size=10.0)

        # Rectangle to use in next frame for search area.
        # If using gt and rect not valid, use previous.
        if self.mode in MODE_KEYS_SUPERVISED:
            next_prev_rect = pred if self.use_predictions else gt_rect
        else:
            next_prev_rect = pred

        # outputs = {'rect': pred, 'score': confidence}
        outputs = {'rect': pred}
        state = {
            'run_opts': run_opts,
            'aspect': aspect,
            'image': im,
            'rect': next_prev_rect,
            'template_init': state['template_init'],
            'mean_color': state['mean_color'],
        }
        self._num_frames += 1
        return outputs, state, losses
def test_output_equal(self):
    '''Compares output to library implementation of networks.

    For each architecture, the library network and our feature function are
    built on the same placeholder with shared variables (scope 'net'), and
    the chosen end-point of the former must match the output of the latter.
    '''
    # The desired_size may need to be chosen such that original network structure is valid.
    TestCase = collections.namedtuple(
        'TestCase', ['kwargs', 'desired_size', 'end_point'])

    cases = {}
    cases['slim_alexnet_v2'] = TestCase(
        kwargs={
            'output_layer': 'conv5',
            'output_act': 'relu',
            'conv_padding': 'SAME',
            'pool_padding': 'VALID',
        },
        desired_size=np.array([13, 13]),  # 3 + (6 - 1) * 2
        end_point='alexnet_v2/conv5',
    )
    cases['slim_resnet_v1_50'] = TestCase(
        kwargs={
            'num_blocks': 4,
            'conv_padding': 'SAME',
            'pool_padding': 'SAME',
        },
        desired_size=np.array([3, 3]),
        end_point='resnet_v1_50/block4',
    )
    cases['slim_vgg_16'] = TestCase(
        kwargs={
            'output_layer': 'fc6',
            'output_act': 'relu',
            'conv_padding': 'SAME',
            'pool_padding': 'VALID',
        },
        desired_size=np.array([1, 1]),
        end_point='vgg_16/fc6',
    )

    for feature_arch, test_case in cases.items():
        graph = tf.Graph()
        sub_test = trySubTest(self, feature_arch=feature_arch)
        with sub_test, graph.as_default():
            # The library implementation is looked up by name in this module.
            original_fn = globals()[feature_arch]
            feature_fn = functools.partial(
                feature_nets.BY_NAME[feature_arch], **test_case.kwargs)

            # Choose the input size that yields the desired output size.
            field = feature_nets.get_receptive_field(feature_fn)
            input_size = receptive_field.input_size(field, test_case.desired_size)
            input_shape = [None] + list(input_size) + [3]
            image = tf.placeholder(tf.float32, input_shape, name='image')

            with tf.variable_scope('net', reuse=False):
                _, end_points = original_fn(image, is_training=True)
            end_point_key = 'net/' + test_case.end_point
            try:
                original = end_points[end_point_key]
            except KeyError as ex:
                raise ValueError('key not found ({}) in list: {}'.format(
                    ex, sorted(end_points.keys())))
            init_op = tf.global_variables_initializer()

            # Second network re-uses the variables of the first.
            with tf.variable_scope('net', reuse=True):
                ours, _ = feature_fn(image, is_training=True)
            ours = cnn.get_value(ours)
            # self.assertEqual(original.shape.as_list(), ours.shape.as_list())

            with self.session(graph=graph) as sess:
                sess.run(init_op)
                batch = np.random.uniform(size=[BATCH_LEN] + input_shape[1:])
                want, got = sess.run((original, ours), feed_dict={image: batch})
                self.assertAllClose(want, got)
def multi_xcorr(template, search, layer_names, template_layers, search_layers, search_image,
                is_training,
                trainable=True,
                use_final_conv=False,
                final_conv_params=None,
                hidden_conv_num_outputs=None,
                hidden_conv_activation='linear',
                use_batch_norm=False,
                use_mean=False,
                scope='multi_xcorr'):
    '''Sums cross-correlation scores of the final and intermediate layers.

    A score map is computed for the final features and for each layer in
    `layer_names`. All maps are upsampled to the minimum stride, trimmed to
    the minimum size and added.

    Args:
        template: Final template features.
        search: Final search features.
        layer_names: Sequence of names of intermediate layers to use.
            Must not contain 'final'.
        template_layers: Dict that maps names to tensors.
        search_layers: Dict that maps names to tensors.
        search_image: Input of the search network; the stride of each score
            is taken from its receptive field with respect to this image.
        is_training: Passed to `_xcorr_general`.
        trainable: Passed to `_xcorr_general`.
        use_final_conv: Whether to apply a pre-conv to the final features.
        final_conv_params: Optional dict of pre-conv params for the final
            features.
        hidden_conv_num_outputs: Number of outputs of the 1x1 pre-conv that
            is applied to each intermediate layer.
        hidden_conv_activation: Activation of that 1x1 pre-conv.
        use_batch_norm: Passed to `_xcorr_general`.
        use_mean: Passed to `_xcorr_general`.
        scope: Variable scope; 'multi_xcorr' is used as the default name.

    Returns:
        The sum of all score maps.

    Raises:
        ValueError: If a score map cannot be center-cropped to the minimum
            size because the difference in size is odd.
    '''
    with tf.variable_scope(scope, 'multi_xcorr'):
        template_layers = template_layers or {}
        search_layers = search_layers or {}
        final_conv_params = final_conv_params or {}
        # Accept any sequence of names (the list is concatenated below).
        layer_names = list(layer_names)
        assert 'final' not in layer_names
        all_names = ['final'] + layer_names

        scores = {}
        scores['final'] = _xcorr_general(template, search, is_training,
                                         trainable=trainable,
                                         use_pre_conv=use_final_conv,
                                         pre_conv_params=final_conv_params,
                                         use_mean=use_mean,
                                         use_batch_norm=use_batch_norm,
                                         scope='final_xcorr')
        # (An extra correlation of the raw final features used to be built
        # here; its result was never used, so it has been removed.)

        for name in layer_names:
            # TODO: Add batch-norm to each cross-correlation?
            # Must be a 1x1 convolution to ensure that receptive fields of different layers align.
            scores[name] = _xcorr_general(template_layers[name], search_layers[name],
                                          is_training,
                                          trainable=trainable,
                                          use_pre_conv=True,
                                          pre_conv_params=dict(
                                              num_outputs=hidden_conv_num_outputs,
                                              kernel_size=1,
                                              stride=1,
                                              activation=hidden_conv_activation),
                                          use_mean=use_mean,
                                          use_batch_norm=use_batch_norm,
                                          scope=name + '_xcorr')

        # Upsample all to minimum stride.
        # Then take center-crop of minimum size.
        field_strides = {
            name: _unique(score.fields[cnn.get_value(search_image)].stride)
            for name, score in scores.items()
        }
        min_stride = min(field_strides.values())
        for name in all_names:
            stride = field_strides[name]
            if stride != min_stride:
                assert stride % min_stride == 0
                relative = stride // min_stride
                scores[name] = cnn.upsample(scores[name], relative,
                                            method=tf.image.ResizeMethod.BILINEAR)

        # Sizes are measured after upsampling.
        sizes = {
            name: _unique(score.value.shape[-3:-1].as_list())
            for name, score in scores.items()
        }
        min_size = min(sizes.values())
        for name in all_names:
            size = sizes[name]
            if (size - min_size) % 2 != 0:
                raise ValueError('remainder is not even: {} within {}'.format(min_size, size))
            margin = (size - min_size) // 2
            scores[name] = cnn.spatial_trim(scores[name], margin, margin)

        # TODO: How to handle calibration here?
        total = scores['final']
        for name in layer_names:
            total += scores[name]
        return total
def _xcorr_general(template, search, is_training,
                   trainable=True,
                   use_pre_conv=False,
                   pre_conv_params=None,
                   learn_spatial_weight=False,
                   weight_init_method='ones',
                   reduce_channels=True,
                   use_mean=False,
                   use_batch_norm=False,
                   learn_gain=False,
                   gain_init=1,
                   scope='xcorr'):
    '''Convolves template with search.

    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]
        is_training: Passed to `_pre_conv` and `_calibrate`.
        trainable: Passed to `_pre_conv`, the spatial weight and `_calibrate`.
        use_pre_conv: If true, apply `_pre_conv` (with shared variables) to
            template and search before the correlation.
        pre_conv_params: Optional dict of keyword arguments for `_pre_conv`.
        learn_spatial_weight: If true, multiply the template by a variable
            with one weight per spatial position.
        weight_init_method: 'ones' or 'mean'; initial value of the spatial
            weights (one, or one over the number of template positions).
        reduce_channels: If true, reduce the channels of the result
            (mean if `use_mean` is true, otherwise sum).
        use_mean: If true, normalize by the number of template positions.
        use_batch_norm: Passed to `_calibrate`.
        learn_gain: Passed to `_calibrate`.
        gain_init: Passed to `_calibrate`.
        scope: Variable scope; 'xcorr' is used as the default name.

    If use_batch_norm is true, then an output gain will always be incorporated.
    Otherwise, it will only be incorporated if learn_gain is true.

    When `learn_spatial_weight` is false:
    If `use_batch_norm` is true, `use_mean` should have no effect.

    When `learn_spatial_weight` is true:
    The `use_mean` parameter also controls the initialization of the spatial weights.
    This may have an effect on gradient descent, even if `use_batch_norm` is true.

    Raises:
        ValueError: If `learn_spatial_weight` is true and `weight_init_method`
            is not recognized.
    '''
    with tf.variable_scope(scope, 'xcorr'):
        pre_conv_kwargs = pre_conv_params or {}
        if use_pre_conv:
            # Template creates the variables, search re-uses them.
            template = _pre_conv(template, is_training,
                                 trainable=trainable,
                                 scope='pre',
                                 reuse=False,
                                 **pre_conv_kwargs)
            search = _pre_conv(search, is_training,
                               trainable=trainable,
                               scope='pre',
                               reuse=True,
                               **pre_conv_kwargs)

        # Discard receptive field of template and get underlying tf.Tensor.
        template = cnn.get_value(template)
        template_size = template.shape[-3:-1].as_list()

        # There are two separate issues here:
        # 1. Whether to make the initial output equal to the mean?
        # 2. How to share this between a constant multiplier and initialization?
        spatial_normalizer = 1 / np.prod(template_size)

        weight_init = 1
        if learn_spatial_weight:
            if weight_init_method == 'mean':
                weight_init = spatial_normalizer
            elif weight_init_method != 'ones':
                raise ValueError(
                    'unknown weight init method: "{}"'.format(weight_init_method))

        # Maintain property (when use_mean is true):
        # normalize_factor * weight_init = spatial_normalizer
        normalize_factor = (spatial_normalizer / weight_init) if use_mean else 1

        if learn_spatial_weight:
            # Initialize with spatial normalizer.
            spatial_weight = tf.get_variable(
                'spatial_weight', template_size, tf.float32,
                initializer=tf.constant_initializer(weight_init),
                trainable=trainable)
            # Trailing axis broadcasts the weight over channels.
            template = template * tf.expand_dims(spatial_weight, -1)

        dot = cnn.diag_xcorr(search, template)
        dot = cnn.pixelwise(lambda x: normalize_factor * x, dot)
        if reduce_channels:
            if use_mean:
                dot = cnn.channel_mean(dot)
            else:
                dot = cnn.channel_sum(dot)
        return _calibrate(dot, is_training, use_batch_norm, learn_gain, gain_init,
                          trainable=trainable)
def next(self, features, labels, state, name='timestep'):
    '''Processes one frame: predicts the motion from current and previous crops.

    Args:
        features: Dict read at ['image']['data'].
        labels: Dict read at ['valid'] and ['rect'] in supervised modes.
        state: Dict with keys 'run_opts', 'aspect', 'mean_color', 'image'
            and 'rect' (as returned by the previous call).
        name: Name scope for the ops of this frame.

    Returns:
        Tuple (predictions, state, losses) where predictions is
        {'rect': prediction}, state is the dict for the next call and losses
        maps names to tensors (empty if the mode is not supervised).

    Raises:
        ValueError: If `self.output_form` is neither 'discrete' nor 'vector'.
    '''
    with tf.name_scope(name):
        im = features['image']['data']
        run_opts = state['run_opts']
        aspect = state['aspect']
        mean_color = state['mean_color']
        prev_im = state['image']

        # If the label is not valid, there will be no loss for this frame.
        # However, the input image may still be processed.
        # In this case, adopt the previous rectangle as the "ground-truth".
        if self.mode in MODE_KEYS_SUPERVISED:
            gt_rect = tf.where(labels['valid'], labels['rect'], state['rect'])
        else:
            gt_rect = None
        # Use the previous rectangle.
        # This will be the ground-truth rect during training if `use_predictions` is false.
        prev_target_rect = state['rect']

        # Coerce the aspect ratio of the rectangle to construct the context area.
        context_rect = self._context_rect(prev_target_rect, aspect, self.context_scale)
        # Extract same rectangle in past and current images and feed into conv-net.
        context_curr = self._crop(im, context_rect, CONTEXT_SIZE, mean_color)
        context_prev = self._crop(prev_im, context_rect, CONTEXT_SIZE, mean_color)
        with tf.name_scope('summary_context'):
            tf.summary.image('curr', context_curr)
            # Fixed: the 'prev' summary used to show the current crop.
            tf.summary.image('prev', context_prev)
        ims = [context_curr] if self.stateless else [context_curr, context_prev]
        ims = tf.stack(ims, axis=1)

        if self.output_form == 'discrete':
            output_shapes = {
                'response': [
                    self.num_scales, self.response_size, self.response_size, 1
                ]
            }
        elif self.output_form == 'vector':
            output_shapes = {'translation': [2], 'log_scale': [1]}
        else:
            # Fixed: the message used to format the undefined name
            # `output_form`, which raised NameError instead of ValueError.
            raise ValueError(
                'unknown output form: "{}"'.format(self.output_form))

        # Extract features, perform search, get receptive field of response wrt image.
        ims_preproc = self._preproc(ims)
        with tf.variable_scope('motion', reuse=(self._num_frames > 0)):
            outputs = _motion_net(ims_preproc, output_shapes, run_opts['is_training'],
                                  weight_decay=self.wd)

        outputs = {
            k: tf.verify_tensor_all_finite(v, 'output "{}" not finite'.format(k))
            for k, v in outputs.items()
        }

        losses = {}
        if self.mode in MODE_KEYS_SUPERVISED:
            # Get ground-truth translation and scale relative to context window.
            gt_rect_in_context = geom.crop_rect(gt_rect, context_rect)
            gt_position, gt_rect_size = geom.rect_center_size(
                gt_rect_in_context)
            gt_translation = gt_position - 0.5  # Displacement relative to center.
            gt_size = helpers.scalar_size(gt_rect_size, self.aspect_method)
            # Scale is size relative to target_size.
            gt_scale = gt_size / (self.target_size / CONTEXT_SIZE)
            gt_log_scale = tf.log(gt_scale)

            if self.output_form == 'discrete':
                base_target_size = self.target_size / CONTEXT_SIZE
                translation_stride = self.response_stride / CONTEXT_SIZE
                loss_name, loss = compute_loss_discrete(
                    outputs['response'], self.num_scales, translation_stride,
                    self.log_scale_step, base_target_size, gt_translation, gt_size,
                    **self.loss_params)
            else:
                loss_name, loss = compute_loss_vector(
                    outputs['translation'], outputs['log_scale'],
                    gt_translation, gt_log_scale,
                    **self.loss_params)
            losses[loss_name] = loss

        if self.output_form == 'discrete':
            response = outputs['response']
            scales = util.scale_range(tf.constant(self.num_scales),
                                      tf.to_float(self.log_scale_step))
            # Use pyramid from loss function to obtain position.
            # Get relative translation and scale from response.
            # TODO: Upsample to higher resolution than original image?
            response_resize = cnn.get_value(
                cnn.upsample(response, self.response_stride,
                             method=tf.image.ResizeMethod.BICUBIC))
            # No motion penalty is applied to the response.
            response_final = response_resize
            translation, scale, in_arg_max = util.find_peak_pyr(
                response_final, scales, eps_abs=self.arg_max_eps)
            scale = tf.expand_dims(scale, -1)  # [b, 1]
            # Obtain translation in relative co-ordinates within search image.
            translation = 1 / tf.to_float(CONTEXT_SIZE) * translation
            # Get scalar representing confidence in prediction.
            # Use raw appearance score (before motion penalty).
            # (Currently not part of the returned predictions.)
            confidence = helpers.weighted_mean(
                response_resize, in_arg_max, axis=(-4, -3, -2))
        else:
            translation = outputs['translation']  # [b, 2]
            scale = tf.exp(outputs['log_scale'])  # [b, 1]

        # Damp the scale update towards 1 (no change).
        # TODO: Should this be in log space?
        scale = self.scale_update_rate * scale + (
            1. - self.scale_update_rate) * 1.
        # Get rectangle in search image.
        prev_target_in_context = geom.crop_rect(prev_target_rect, context_rect)
        pred_in_context = _rect_translate_scale(
            prev_target_in_context, translation, scale)
        # Move from search back to original image.
        pred = geom.crop_rect(pred_in_context, geom.crop_inverse(context_rect))
        # Limit size of object.
        pred = _clip_rect_size(pred, min_size=0.001, max_size=10.0)

        # Rectangle to use in next frame for search area.
        # If using gt and rect not valid, use previous.
        if self.mode in MODE_KEYS_SUPERVISED:
            next_prev_rect = pred if self.use_predictions else gt_rect
        else:
            next_prev_rect = pred

        self._num_frames += 1
        # outputs = {'rect': pred, 'score': confidence}
        predictions = {'rect': pred}
        state = {
            'run_opts': run_opts,
            'aspect': aspect,
            # 'image': tf.image.resize_images(im, [self.image_size, self.image_size]),
            'image': im,
            'rect': next_prev_rect,
            'mean_color': state['mean_color'],
        }
        return predictions, state, losses