def abs_diff(template, search, is_training,
             trainable=True,
             use_pre_conv=True,
             pre_conv_output_dim=256,
             reduce_channels=True,
             use_mean=False,
             use_batch_norm=False,
             scope='abs_diff'):
    '''
    Requires that template is 1x1.

    Args:
        template: [b, ht, wt, c]
        search: [b, s, hs, ws, c]
    '''
    with tf.variable_scope(scope, 'abs_diff'):
        template = cnn.as_tensor(template)
        search = cnn.as_tensor(search)
        if use_pre_conv:
            # Reduce template to 1x1.
            kernel_size = template.value.shape[-3:-1].as_list()

            def pre_conv(x):
                x = cnn.pixelwise(partial(slim.batch_norm, is_training=is_training), x)
                x = cnn.pixelwise(tf.nn.relu, x)
                x, restore = cnn.merge_batch_dims(x)
                x = cnn.slim_conv2d(x, pre_conv_output_dim, kernel_size,
                                    padding='VALID',
                                    activation_fn=None,
                                    normalizer_fn=slim.batch_norm,
                                    normalizer_params=dict(is_training=is_training),
                                    scope='conv')
                x = restore(x)
                return x

            # Perform pre-activation because the output layer did not have activations.
            with tf.variable_scope('pre_conv', reuse=False):
                template = pre_conv(template)
            with tf.variable_scope('pre_conv', reuse=True):
                search = pre_conv(search)

        template = cnn.get_value(template)
        template_size = template.shape[-3:-1].as_list()
        if template_size != [1, 1]:
            raise ValueError('template shape is not [1, 1]: {}'.format(template_size))
        # Use broadcasting to perform element-wise operation.
        template = tf.expand_dims(template, 1)
        delta = cnn.pixelwise(lambda x: tf.abs(x - template), search)
        if reduce_channels:
            delta = cnn.channel_sum(delta)
            if use_mean:
                num_channels = template.shape[-1].value
                delta = cnn.pixelwise(lambda x: (1 / tf.to_float(num_channels)) * x, delta)
        # TODO: No bias if attaching more layers?
        return _calibrate(delta, is_training, use_batch_norm,
                          learn_gain=False, gain_init=1, trainable=trainable)

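# Illustrative usage sketch (not part of the original module). The helper name
# and all sizes (256 channels, 3 scales, 17x17 search, 1x1 template) are
# arbitrary assumptions chosen to match the docstring shapes above.
def _example_abs_diff():
    template = tf.placeholder(tf.float32, [None, 1, 1, 256])     # [b, ht, wt, c]
    search = tf.placeholder(tf.float32, [None, 3, 17, 17, 256])  # [b, s, hs, ws, c]
    is_training = tf.placeholder(tf.bool, ())
    # With reduce_channels=True (default) the response has a single channel
    # per search position; _calibrate applies the optional batch-norm / gain.
    return abs_diff(template, search, is_training)
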
def concat_fc(template, search, is_training,
              trainable=True,
              join_dim=128,
              mlp_num_outputs=1,
              mlp_num_layers=2,
              mlp_num_hidden=128,
              mlp_kwargs=None,
              scope=None):
    '''
    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]
    '''
    with tf.variable_scope(scope, 'concat_fc'):
        template = cnn.as_tensor(template)
        search = cnn.as_tensor(search)
        # Instead of sliding-window concat, we do separate conv and sum the results.
        # Disable activation and normalizer. Perform these after the sum.
        kernel_size = template.value.shape[-3:-1].as_list()
        conv_kwargs = dict(
            padding='VALID',
            activation_fn=None,
            normalizer_fn=None,
            biases_initializer=None,  # Disable bias because bnorm is performed later.
        )
        with tf.variable_scope('template'):
            template = cnn.slim_conv2d(template, join_dim, kernel_size,
                                       scope='fc', **conv_kwargs)
        with tf.variable_scope('search'):
            search, restore = cnn.merge_batch_dims(search)
            search = cnn.slim_conv2d(search, join_dim, kernel_size,
                                     scope='fc', **conv_kwargs)
            search = restore(search)

        template = cnn.get_value(template)
        template = tf.expand_dims(template, 1)
        # This is a broadcasting addition. Receptive field in template not tracked.
        output = cnn.pixelwise(lambda search: search + template, search)
        output = cnn.pixelwise(partial(slim.batch_norm, is_training=is_training), output)
        output = cnn.pixelwise(tf.nn.relu, output)

        mlp_kwargs = mlp_kwargs or {}
        output, restore = cnn.merge_batch_dims(output)
        output = cnn.mlp(output,
                         num_layers=mlp_num_layers,
                         num_hidden=mlp_num_hidden,
                         num_outputs=mlp_num_outputs,
                         trainable=trainable,
                         **mlp_kwargs)
        output = restore(output)
        return output

def _pre_conv(x, is_training,
              num_outputs=None,
              kernel_size=1,
              stride=1,
              padding='VALID',
              activation='linear',
              trainable=True,
              scope='preconv',
              reuse=None):
    '''
    Args:
        num_outputs: If num_outputs is None, the input dimension is used.
    '''
    # TODO: Support multi-scale.
    x = cnn.as_tensor(x)
    if not num_outputs:
        num_outputs = x.value.shape[-1].value
    return cnn.slim_conv2d(x, num_outputs, kernel_size,
                           stride=stride,
                           padding=padding,
                           activation_fn=helpers.get_act(activation),
                           normalizer_fn=None,  # No batch-norm.
                           trainable=trainable,
                           scope=scope,
                           reuse=reuse)

def distance(template, search, is_training,
             trainable=True,
             use_mean=False,
             use_batch_norm=False,
             learn_gain=False,
             gain_init=1,
             scope='distance'):
    '''
    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]
    '''
    search = cnn.as_tensor(search)
    num_search_dims = len(search.value.shape)
    if num_search_dims != 5:
        raise ValueError('search should have 5 dims: {}'.format(num_search_dims))

    with tf.variable_scope(scope, 'distance'):
        search = cnn.as_tensor(search)
        # Discard receptive field of template and get underlying tf.Tensor.
        template = cnn.get_value(template)

        num_channels = template.shape[-1].value
        template_size = template.shape[-3:-1].as_list()
        ones = tf.ones(template_size + [num_channels, 1], tf.float32)

        dot_xy = cnn.diag_xcorr(search, template)
        dot_xx = tf.reduce_sum(tf.square(template), axis=(-3, -2, -1), keepdims=True)
        if len(search.value.shape) == 5:
            dot_xx = tf.expand_dims(dot_xx, 1)
        sq_search = cnn.pixelwise(tf.square, search)
        sq_search, restore = cnn.merge_batch_dims(sq_search)
        dot_yy = cnn.nn_conv2d(sq_search, ones, strides=[1, 1, 1, 1], padding='VALID')
        dot_yy = restore(dot_yy)
        # (x - y)**2 = x**2 - 2 x y + y**2
        # sq_dist = dot_xx - 2 * dot_xy + dot_yy
        sq_dist = cnn.pixelwise_binary(
            lambda dot_xy, dot_yy: dot_xx - 2 * dot_xy + dot_yy, dot_xy, dot_yy)
        sq_dist = cnn.pixelwise(
            lambda sq_dist: tf.reduce_sum(sq_dist, axis=-1, keepdims=True), sq_dist)
        if use_mean:
            # Take root-mean-square of difference.
            num_elems = np.prod(template.shape[-3:].as_list())
            sq_dist = cnn.pixelwise(
                lambda sq_dist: (1 / tf.to_float(num_elems)) * sq_dist, sq_dist)
        dist = cnn.pixelwise(tf.sqrt, sq_dist)
        return _calibrate(dist, is_training, use_batch_norm, learn_gain, gain_init,
                          trainable=trainable)

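# Minimal usage sketch (illustrative only; the helper name and the 4x4 / 17x17
# / 512-channel sizes are assumptions). The function follows the expansion
# ||x - y||^2 = ||x||^2 - 2<x, y> + ||y||^2 noted in the comments above,
# evaluating it at every aligned search window via cross-correlation.
def _example_distance():
    template = tf.placeholder(tf.float32, [None, 4, 4, 512])     # [b, h, w, c]
    search = tf.placeholder(tf.float32, [None, 3, 17, 17, 512])  # [b, s, h, w, c]
    is_training = tf.placeholder(tf.bool, ())
    return distance(template, search, is_training, use_mean=True)
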
def test_no_padding_by_default(self):
    '''Tests that feature functions with default options have zero padding.'''
    for feature_arch in feature_nets.NAMES:
        sub_test = trySubTest(self, feature_arch=feature_arch)
        with sub_test, tf.Graph().as_default():
            feature_fn = feature_nets.BY_NAME[feature_arch]
            image = tf.placeholder(tf.float32, (None, None, None, 3), name='image')
            image = cnn.as_tensor(image, add_to_set=True)
            feat, _ = feature_fn(image, is_training=True)
            field = feat.fields[image.value]
            self.assertAllEqual(field.padding, [0, 0])

def get_receptive_field(feature_fn):
    '''
    Args:
        feature_fn: Function that maps (image, is_training) to (image, end_points).
    '''
    graph = tf.Graph()
    with graph.as_default():
        image = tf.placeholder(tf.float32, (None, None, None, 3), name='image')
        is_training = tf.placeholder(tf.bool, (), name='is_training')
        image = cnn.as_tensor(image, add_to_set=True)
        feat, _ = feature_fn(image, is_training)
        return feat.fields[image.value]

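# Hypothetical usage sketch: query the receptive field of a feature function
# registered in feature_nets.BY_NAME. The 'alexnet' key is assumed here from
# the default arch used elsewhere in this code; any registered name works.
def _example_receptive_field():
    feature_fn = feature_nets.BY_NAME['alexnet']
    field = get_receptive_field(feature_fn)
    # `field` describes the size, stride and padding of the feature map
    # with respect to the input image.
    return field
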
def _embed_net(
        x, is_training, trainable, variables_collections,
        weight_decay=0,
        name='embed',
        # Additional arguments:
        arch='alexnet',
        arch_params=None,
        extra_conv_enable=False,
        extra_conv_params=None):
    '''
    Args:
        x: Image of which to compute features. Shape [..., h, w, c]

    Returns:
        Output of network, intermediate layers, variable scope of feature net.
        The variables in the feature scope can be loaded from a pre-trained model.
    '''
    with tf.name_scope(name) as scope:
        arch_params = arch_params or {}
        extra_conv_params = extra_conv_params or {}
        weight_decay = float(weight_decay)
        try:
            func = feature_nets.BY_NAME[arch]
        except KeyError:
            raise ValueError('unknown architecture: {}'.format(arch))

        x = cnn.as_tensor(x)
        num_dims = len(x.value.shape)
        if num_dims > 4:
            merged, unmerge = helpers.merge_dims(x.value, 0, num_dims - 3)
            x = cnn.Tensor(merged, x.fields)

        with tf.variable_scope('feature') as feature_vs:
            x, end_points = func(x, is_training, trainable, variables_collections,
                                 weight_decay=weight_decay,
                                 **arch_params)
        if extra_conv_enable:
            with tf.variable_scope('extra'):
                x = _extra_conv(x, is_training, trainable, variables_collections,
                                **extra_conv_params)
        if num_dims > 4:
            x = cnn.Tensor(unmerge(x.value, 0), x.fields)
        return x, end_points, feature_vs

def cosine(template, search, is_training,
           trainable=True,
           use_batch_norm=False,
           gain_init=1,
           eps=1e-3,
           scope='cosine'):
    '''
    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]
    '''
    search = cnn.as_tensor(search)
    num_search_dims = len(search.value.shape)
    if num_search_dims != 5:
        raise ValueError('search should have 5 dims: {}'.format(num_search_dims))

    with tf.variable_scope(scope, 'cosine'):
        # Discard receptive field of template and get underlying tf.Tensor.
        template = cnn.get_value(template)

        dot_xy = cnn.channel_sum(cnn.diag_xcorr(search, template, padding='VALID'))
        dot_xx = tf.reduce_sum(tf.square(template), axis=(-3, -2, -1), keepdims=True)
        sq_search = cnn.pixelwise(tf.square, search)
        ones = tf.ones_like(template)
        # TODO: Faster and less memory to use sum.
        dot_yy = cnn.channel_sum(cnn.diag_xcorr(sq_search, ones, padding='VALID'))
        # num_channels = template.shape[-1].value
        # template_size = template.shape[-3:-1].as_list()
        # ones = tf.ones(template_size + [num_channels, 1], tf.float32)
        # sq_search, restore = cnn.merge_batch_dims(sq_search)
        # dot_yy = cnn.nn_conv2d(sq_search, ones, strides=[1, 1, 1, 1], padding='VALID')
        # dot_yy = restore(dot_yy)
        dot_xx = tf.expand_dims(dot_xx, 1)

        assert_ops = [
            tf.assert_non_negative(dot_xx, message='assert dot_xx non negative'),
            tf.assert_non_negative(dot_yy.value, message='assert dot_yy non negative'),
        ]
        with tf.control_dependencies(assert_ops):
            denom = cnn.pixelwise(lambda dot_yy: tf.sqrt(dot_xx * dot_yy), dot_yy)
        similarity = cnn.pixelwise_binary(
            lambda dot_xy, denom: dot_xy / (denom + eps), dot_xy, denom)
        # Gain is necessary here because similarity is always in [-1, 1].
        return _calibrate(similarity, is_training, use_batch_norm,
                          learn_gain=True, gain_init=gain_init, trainable=trainable)

def _branch_net_receptive_field(arch='alexnet',
                                arch_params=None,
                                extra_conv_enable=False,
                                extra_conv_params=None):
    arch_params = arch_params or {}

    graph = tf.Graph()
    with graph.as_default():
        image = tf.placeholder(tf.float32, (None, None, None, 3), name='image')
        is_training = tf.placeholder(tf.bool, (), name='is_training')
        image = cnn.as_tensor(image, add_to_set=True)
        retvals = _embed_net(image, is_training,
                             trainable=True,
                             variables_collections=None,
                             arch=arch,
                             arch_params=arch_params,
                             extra_conv_enable=extra_conv_enable,
                             extra_conv_params=extra_conv_params)
        feat = retvals[0]
        return feat.fields[image.value]

def start(self, features_init, run_opts, name=None):
    with tf.name_scope(name, 'start') as scope:
        im = features_init['image']['data']
        aspect = features_init['aspect']
        target_rect = features_init['rect']
        mean_color = tf.reduce_mean(im, axis=(-3, -2), keepdims=True)

        with tf.variable_scope('appearance', reuse=False):
            template_rect = self._context_rect(target_rect, aspect, self.template_scale)
            template_im = self._crop(im, template_rect, self.template_size, mean_color)
            template_input = self._preproc(template_im)
            template_input = cnn.as_tensor(template_input, add_to_set=True)
            with tf.variable_scope('embed', reuse=False):
                template_feat, template_layers, feature_scope = self._embed_net(
                    template_input,
                    (False if not self.learn_appearance else run_opts['is_training']))
                # Get names relative to this scope for loading pre-trained.
                # self._feature_vars = _global_variables_relative_to_scope(feature_scope)
            rf_template = template_feat.fields[template_input.value]
            template_feat = cnn.get_value(template_feat)
            feat_size = template_feat.shape[-3:-1].as_list()
            receptive_field.assert_center_alignment(
                self.template_size, feat_size, rf_template)

        # self._feature_saver = tf.train.Saver(self._feature_vars)

        with tf.name_scope('summary'):
            tf.summary.image('template', template_im)

        state = {
            'run_opts': run_opts,
            'aspect': aspect,
            'image': im,
            'rect': tf.identity(target_rect),
            'template_init': tf.identity(template_feat),
            'mean_color': tf.identity(mean_color),
        }
        return state

def _extra_conv(x, is_training, trainable, variables_collections,
                num_outputs=None,
                kernel_size=1,
                stride=1,
                padding='VALID',
                activation='linear'):
    if not trainable:
        raise NotImplementedError('trainable not supported')

    x = cnn.as_tensor(x)
    if num_outputs is None:
        num_outputs = x.value.shape[-1].value
    with slim.arg_scope([slim.batch_norm], is_training=is_training):
        with slim.arg_scope([cnn.slim_conv2d],
                            variables_collections=variables_collections):
            return cnn.slim_conv2d(x, num_outputs, kernel_size,
                                   stride=stride,
                                   padding=padding,
                                   activation_fn=helpers.get_act(activation))

def next(self, features, labels, state, name=None, reset_position=False):
    '''
    Args:
        reset_position: Keep the appearance model but reset the position.
            If this is true, then features['rect'] must be present.
    '''
    with tf.name_scope(name, 'next_{}'.format(self._num_frames)) as scope:
        im = features['image']['data']
        run_opts = state['run_opts']
        aspect = state['aspect']
        prev_im = state['image']
        mean_color = state['mean_color']

        # If the label is not valid, there will be no loss for this frame.
        # However, the input image may still be processed.
        # In this case, adopt the previous rectangle as the "ground-truth".
        if self.mode in MODE_KEYS_SUPERVISED:
            gt_rect = tf.where(labels['valid'], labels['rect'], state['rect'])
        else:
            gt_rect = None
        # Use the previous rectangle.
        # This will be the ground-truth rect during training if `use_predictions` is false.
        prev_target_rect = state['rect']

        # Coerce the aspect ratio of the rectangle to construct the search area.
        # search_rect = self._context_rect(prev_target_rect, aspect, self.search_scale)
        base_rect = model_util.coerce_aspect(prev_target_rect, aspect,
                                             aspect_method=self.aspect_method)
        # Apply perturbation to aspect-coerced "previous" rect (may be current gt).
        if self.use_perturb and self.mode == tf.estimator.ModeKeys.TRAIN:
            base_rect = tf.cond(run_opts['is_training'],
                                lambda: siamfc.perturb(base_rect, **self.perturb_params),
                                lambda: base_rect)
        search_rect = geom.grow_rect(self.search_scale, base_rect)

        # Coerce the aspect ratio of the rectangle to construct the context area.
        # context_rect = self._context_rect(prev_target_rect, aspect, self.context_scale)
        context_rect = geom.grow_rect(self.context_scale, base_rect)
        # Extract same rectangle in past and current images and feed into conv-net.
        context_curr = self._crop(im, context_rect, self.context_size, mean_color)
        context_prev = self._crop(prev_im, context_rect, self.context_size, mean_color)
        with tf.name_scope('summary_context'):
            tf.summary.image('curr', context_curr)
            tf.summary.image('prev', context_prev)
        motion = [context_curr] if self.stateless else [context_curr, context_prev]
        motion = tf.stack(motion, axis=1)

        # How to obtain template from previous state?
        template_feat = state['template_init']

        # Extract an image pyramid (use 1 scale when not in tracking mode).
        mid_scale = (self.num_scales - 1) // 2
        if self.num_scales == 1:
            scales = tf.constant([1.0], dtype=tf.float32)
        else:
            scales = model_util.scale_range(tf.constant(self.num_scales),
                                            tf.to_float(self.log_scale_step))
        search_ims, search_rects = self._crop_pyr(
            im, search_rect, self.search_size, scales, mean_color)

        with tf.name_scope('summary'):
            _image_sequence_summary('search', search_ims, elem_name='scale')

        with tf.variable_scope('appearance', reuse=False) as appearance_scope:
            # Extract features, perform search, get receptive field of response wrt image.
            search_input = self._preproc(search_ims)
            search_input = cnn.as_tensor(search_input, add_to_set=True)
            with tf.variable_scope('embed', reuse=True):
                search_feat, search_layers, _ = self._embed_net(
                    search_input,
                    (False if not self.learn_appearance else run_opts['is_training']))
            rf_search = search_feat.fields[search_input.value]
            search_feat_size = search_feat.value.shape[-3:-1].as_list()
            receptive_field.assert_center_alignment(
                self.search_size, search_feat_size, rf_search)

            with tf.variable_scope('join', reuse=(self._num_frames >= 1)):
                join_fn = join_nets.BY_NAME[self.join_arch]
                if self.join_type == 'single':
                    response = join_fn(
                        template_feat, search_feat,
                        is_training=(False if not self.learn_appearance
                                     else run_opts['is_training']),
                        trainable=self.learn_appearance,
                        **self.join_params)
                elif self.join_type == 'multi':
                    response = join_fn(
                        template_feat, search_feat,
                        self.multi_join_layers,
                        template_layers, search_layers, search_input,
                        is_training=(False if not self.learn_appearance
                                     else run_opts['is_training']),
                        trainable=self.learn_appearance,
                        **self.join_params)
                else:
                    raise ValueError('unknown join type: "{}"'.format(self.join_type))
            rf_response = response.fields[search_input.value]
            response = cnn.get_value(response)
            response_size = response.shape[-3:-1].as_list()
            receptive_field.assert_center_alignment(
                self.search_size, response_size, rf_response)
            response = tf.verify_tensor_all_finite(
                response, 'output of xcorr is not finite')

        if self._num_frames == 0:
            # Define appearance model saver.
            if self.appearance_model_file:
                # Create the graph ops for the saver.
                var_list = appearance_scope.global_variables()
                var_list = {var.op.name: var for var in var_list}
                if self.appearance_scope_dst or self.appearance_scope_src:
                    # Replace 'dst' with 'src'.
                    # Caution: This string replacement is a little dangerous.
                    var_list = {
                        k.replace(self.appearance_scope_dst,
                                  self.appearance_scope_src, 1): v
                        for k, v in var_list.items()
                    }
                self._appearance_var_list = var_list
                self._appearance_saver = tf.train.Saver(var_list)

        # Post-process scores.
        with tf.variable_scope('output', reuse=(self._num_frames > 0)):
            if not self.learn_appearance:
                # TODO: Prevent batch-norm updates as well.
                # TODO: Set trainable=False for all variables above.
                response = tf.stop_gradient(response)

            # Regress response to translation and log(scale).
            output_shapes = {'translation': [2], 'log_scale': [1]}
            outputs = _output_net(response, motion, output_shapes,
                                  run_opts['is_training'],
                                  weight_decay=self.wd,
                                  use_response=self.output_use_response,
                                  use_images=self.output_use_images)

        _image_sequence_summary('response',
                                model_util.colormap(tf.sigmoid(response), _COLORMAP),
                                elem_name='scale')

        losses = {}
        if self.mode in MODE_KEYS_SUPERVISED:
            # Get ground-truth translation and scale relative to search window.
            gt_rect_in_search = geom.crop_rect(gt_rect, search_rect)
            gt_position, gt_rect_size = geom.rect_center_size(gt_rect_in_search)
            # Positions in real interval [0, 1] correspond to real interval [0, search_size].
            # Pixel centers range from 0.5 to search_size - 0.5 in [0, search_size].
            gt_translation = gt_position - 0.5  # Displacement relative to center.
            gt_size = helpers.scalar_size(gt_rect_size, self.aspect_method)
            target_size_in_search = self.target_size / self.search_size
            # size = target_size * scale
            gt_scale = gt_size / target_size_in_search
            gt_log_scale = tf.log(gt_scale)

            if self.appearance_loss:
                target_size_in_response = self.target_size / rf_response.stride
                loss_name, loss = siamfc.compute_loss(response[:, mid_scale],
                                                      target_size_in_response,
                                                      **self.appearance_loss_params)
                losses[loss_name] = loss

            loss_name, loss = regress.compute_loss_vector(outputs['translation'],
                                                          outputs['log_scale'],
                                                          gt_translation,
                                                          gt_log_scale,
                                                          **self.loss_params)
            losses[loss_name] = loss

            if reset_position:
                # TODO: Something better!
                # TODO: Keep appearance loss even when `reset_position` is true?
                losses = {k: tf.zeros_like(v) for k, v in losses.items()}

        translation = outputs['translation']  # [b, 2]
        scale = tf.exp(outputs['log_scale'])  # [b, 1]
        # Damp the scale update towards 1 (no change).
        # TODO: Should this be in log space?
        scale = self.scale_update_rate * scale + (1. - self.scale_update_rate) * 1.

        # Get rectangle in search image.
        prev_target_in_search = geom.crop_rect(prev_target_rect, search_rect)
        pred_in_search = _rect_translate_scale(prev_target_in_search, translation, scale)
        # Move from search back to original image.
        pred = geom.crop_rect(pred_in_search, geom.crop_inverse(search_rect))

        # Limit size of object.
        pred = _clip_rect_size(pred, min_size=0.001, max_size=10.0)

        # Rectangle to use in next frame for search area.
        # If using gt and rect not valid, use previous.
        if self.mode in MODE_KEYS_SUPERVISED:
            next_prev_rect = pred if self.use_predictions else gt_rect
        else:
            next_prev_rect = pred

        # outputs = {'rect': pred, 'score': confidence}
        outputs = {'rect': pred}
        state = {
            'run_opts': run_opts,
            'aspect': aspect,
            'image': im,
            'rect': next_prev_rect,
            'template_init': state['template_init'],
            'mean_color': state['mean_color'],
        }
        self._num_frames += 1
        return outputs, state, losses

def all_pixel_pairs(template, search, is_training,
                    trainable=True,
                    operation='mul',
                    reduce_channels=True,
                    use_mean=True,
                    use_batch_norm=False,
                    learn_gain=False,
                    gain_init=1,
                    scope='all_pixel_pairs'):
    '''
    Args:
        template: cnn.Tensor with shape [n, h_t, w_t, c]
        search: cnn.Tensor with shape [n, s, h_s, w_s, c]

    Returns:
        cnn.Tensor with shape [n, h_s, w_s, h_t * w_t]
    '''
    with tf.variable_scope(scope, 'all_pixel_pairs'):
        template = cnn.as_tensor(template)
        search = cnn.as_tensor(search)
        template_size = template.value.shape[-3:-1].as_list()
        num_channels = template.value.shape[-1].value

        # Break template into 1x1 patches.
        # Then "convolve" (multiply) each with the search image.
        t = template.value
        s = search.value
        # template becomes: [n, 1, ...,   1,   1, h_t, w_t, c]
        # search becomes:   [n, s, ..., h_s, w_s,   1,   1, c]
        t = tf.expand_dims(t, 1)
        t = helpers.expand_dims_n(t, -4, 2)
        s = helpers.expand_dims_n(s, -2, 2)
        if operation == 'mul':
            p = t * s
        elif operation == 'abs_diff':
            p = tf.abs(t - s)
        else:
            raise ValueError('unknown operation: "{}"'.format(operation))

        # if reduce_channels:
        #     if use_mean:
        #         p = tf.reduce_mean(p, axis=-1, keepdims=True)
        #     else:
        #         p = tf.reduce_sum(p, axis=-1, keepdims=True)
        # Merge the spatial dimensions of the template into features.
        # response becomes: [n, ..., h_s, w_s, h_t * w_t * c]
        p, _ = helpers.merge_dims(p, -3, None)
        pairs = cnn.Tensor(p, search.fields)

        # TODO: This initialization could be too small?
        normalizer = 1 / (np.prod(template_size) ** 2 * num_channels) if use_mean else 1
        weights_shape = template_size + [np.prod(template_size) * num_channels, 1]
        weights = tf.get_variable('weights', weights_shape, tf.float32,
                                  initializer=tf.constant_initializer(normalizer),
                                  trainable=trainable)
        # TODO: Support depthwise_conv2d (keep channels).
        pairs, restore = cnn.merge_batch_dims(pairs)
        response = cnn.nn_conv2d(pairs, weights, strides=[1, 1, 1, 1], padding='VALID')
        response = restore(response)

        return _calibrate(response, is_training, use_batch_norm, learn_gain, gain_init,
                          trainable=trainable)

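# Illustrative shape walk-through (the helper name and the 4x4 / 17x17 /
# 64-channel sizes are assumptions). With a 4x4 template, every search
# position carries h_t * w_t * c = 4 * 4 * 64 pairwise features, which the
# learned weights then reduce to a single response channel.
def _example_all_pixel_pairs():
    template = tf.placeholder(tf.float32, [None, 4, 4, 64])      # [n, h_t, w_t, c]
    search = tf.placeholder(tf.float32, [None, 3, 17, 17, 64])   # [n, s, h_s, w_s, c]
    is_training = tf.placeholder(tf.bool, ())
    return all_pixel_pairs(template, search, is_training, operation='mul')
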