Esempio n. 1
0
def abs_diff(template, search, is_training,
             trainable=True,
             use_pre_conv=True,
             pre_conv_output_dim=256,
             reduce_channels=True,
             use_mean=False,
             use_batch_norm=False,
             scope='abs_diff'):
    '''
    Compares a 1x1 template to every search position by absolute difference.

    The template must be 1x1 at the point where the difference is taken.
    When `use_pre_conv` is set, both inputs first pass through a shared
    (weight-tied) bnorm -> relu -> conv block whose kernel covers the whole
    template, so that the VALID conv reduces the template to 1x1.

    Args:
        template: [b, ht, wt, c]
        search: [b, s, hs, ws, c]
        is_training: Passed to batch-norm and to `_calibrate`.
        trainable: Passed to `_calibrate`.
        use_pre_conv: Apply the shared pre-conv block to both inputs.
        pre_conv_output_dim: Number of output channels of the pre-conv.
        reduce_channels: Sum the absolute difference over channels.
        use_mean: With `reduce_channels`, divide the sum by the channel count.
        use_batch_norm: Passed to `_calibrate`.
        scope: Variable scope name.

    Returns:
        Result of `_calibrate` applied to the (optionally reduced) difference.

    Raises:
        ValueError: If the template is not 1x1 when the difference is taken.
    '''
    with tf.variable_scope(scope, 'abs_diff'):
        template = cnn.as_tensor(template)
        search = cnn.as_tensor(search)

        if use_pre_conv:
            # The kernel spans the full template so the output is 1x1.
            full_template_kernel = template.value.shape[-3:-1].as_list()

            def shared_block(inputs):
                # Pre-activation, since the preceding layer had no activations.
                normed = cnn.pixelwise(
                    partial(slim.batch_norm, is_training=is_training), inputs)
                activated = cnn.pixelwise(tf.nn.relu, normed)
                flat, unflatten = cnn.merge_batch_dims(activated)
                conv_out = cnn.slim_conv2d(
                    flat, pre_conv_output_dim, full_template_kernel,
                    padding='VALID',
                    activation_fn=None,
                    normalizer_fn=slim.batch_norm,
                    normalizer_params=dict(is_training=is_training),
                    scope='conv')
                return unflatten(conv_out)

            # First call creates the variables, second call re-uses them.
            with tf.variable_scope('pre_conv', reuse=False):
                template = shared_block(template)
            with tf.variable_scope('pre_conv', reuse=True):
                search = shared_block(search)

        template_value = cnn.get_value(template)
        spatial_size = template_value.shape[-3:-1].as_list()
        if spatial_size != [1, 1]:
            raise ValueError('template shape is not [1, 1]: {}'.format(spatial_size))
        # Insert an axis after the batch axis so the template broadcasts
        # against the search tensor in the element-wise difference.
        template_value = tf.expand_dims(template_value, 1)
        delta = cnn.pixelwise(lambda x: tf.abs(x - template_value), search)
        if reduce_channels:
            delta = cnn.channel_sum(delta)
            if use_mean:
                num_channels = template_value.shape[-1].value
                delta = cnn.pixelwise(lambda x: (1 / tf.to_float(num_channels)) * x, delta)
        # TODO: No bias if attaching more layers?
        return _calibrate(delta, is_training, use_batch_norm,
                          learn_gain=False,
                          gain_init=1,
                          trainable=trainable)
Esempio n. 2
0
def concat_fc(template, search, is_training,
              trainable=True,
              join_dim=128,
              mlp_num_outputs=1,
              mlp_num_layers=2,
              mlp_num_hidden=128,
              mlp_kwargs=None,
              scope=None):
    '''
    Joins template and search by separate template-sized convs, a sum and an MLP.

    Rather than concatenating the template with every search window, each
    input gets its own linear conv with a template-sized kernel and the two
    results are added. Batch-norm and relu are applied after the sum, and the
    result is fed to `cnn.mlp`.

    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]
        is_training: Passed to the batch-norm applied after the sum.
        trainable: Passed to `cnn.mlp`.
        join_dim: Number of output channels of the two convs.
        mlp_num_outputs: `num_outputs` of the MLP.
        mlp_num_layers: `num_layers` of the MLP.
        mlp_num_hidden: `num_hidden` of the MLP.
        mlp_kwargs: Optional dict of extra keyword arguments for `cnn.mlp`.
        scope: Variable scope name (default name is 'concat_fc').

    Returns:
        Output of the MLP with the batch dimensions restored.
    '''
    with tf.variable_scope(scope, 'concat_fc'):
        template = cnn.as_tensor(template)
        search = cnn.as_tensor(search)

        # Kernel covers the whole template.
        template_hw = template.value.shape[-3:-1].as_list()
        # Linear convs: activation and normalizer are applied after the sum.
        linear_conv_args = {
            'padding': 'VALID',
            'activation_fn': None,
            'normalizer_fn': None,
            # No bias because batch-norm is performed later.
            'biases_initializer': None,
        }
        with tf.variable_scope('template'):
            template = cnn.slim_conv2d(template, join_dim, template_hw,
                                       scope='fc', **linear_conv_args)
        with tf.variable_scope('search'):
            flat_search, unflatten_search = cnn.merge_batch_dims(search)
            flat_search = cnn.slim_conv2d(flat_search, join_dim, template_hw,
                                          scope='fc', **linear_conv_args)
            search = unflatten_search(flat_search)

        template_value = tf.expand_dims(cnn.get_value(template), 1)
        # Broadcasting addition. Receptive field in template is not tracked.
        joined = cnn.pixelwise(lambda x: x + template_value, search)
        joined = cnn.pixelwise(partial(slim.batch_norm, is_training=is_training), joined)
        joined = cnn.pixelwise(tf.nn.relu, joined)

        extra_mlp_args = mlp_kwargs or {}
        flat_joined, unflatten_joined = cnn.merge_batch_dims(joined)
        scores = cnn.mlp(flat_joined,
                         num_layers=mlp_num_layers,
                         num_hidden=mlp_num_hidden,
                         num_outputs=mlp_num_outputs,
                         trainable=trainable, **extra_mlp_args)
        return unflatten_joined(scores)
Esempio n. 3
0
def _pre_conv(x, is_training,
              num_outputs=None,
              kernel_size=1,
              stride=1,
              padding='VALID',
              activation='linear',
              trainable=True,
              scope='preconv',
              reuse=None):
    '''
    Applies a single conv layer without batch-norm.

    Args:
        x: Input; converted with `cnn.as_tensor`.
        is_training: Not used in this function.
        num_outputs: If num_outputs is None (or otherwise falsy), the input
            dimension is used.
        kernel_size: Kernel size of the conv.
        stride: Stride of the conv.
        padding: Padding mode of the conv.
        activation: Name resolved through `helpers.get_act`.
        trainable: Passed to the conv.
        scope: Variable scope of the conv.
        reuse: Passed to the conv.

    Returns:
        Output of `cnn.slim_conv2d`.
    '''
    # TODO: Support multi-scale.
    x = cnn.as_tensor(x)
    # Fall back to the number of input channels.
    output_dim = num_outputs if num_outputs else x.value.shape[-1].value
    conv_args = dict(
        stride=stride,
        padding=padding,
        activation_fn=helpers.get_act(activation),
        normalizer_fn=None,  # No batch-norm.
        trainable=trainable,
        scope=scope,
        reuse=reuse,
    )
    return cnn.slim_conv2d(x, output_dim, kernel_size, **conv_args)
Esempio n. 4
0
def distance(template, search, is_training,
             trainable=True,
             use_mean=False,
             use_batch_norm=False,
             learn_gain=False,
             gain_init=1,
             scope='distance'):
    '''
    Euclidean distance between the template and each window of the search.

    Uses the expansion |x - y|^2 = |x|^2 - 2 <x, y> + |y|^2, where x is the
    template and y is a template-sized window of the search tensor, with all
    three terms summed over height, width and channels.

    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]
        is_training: Passed to `_calibrate`.
        trainable: Passed to `_calibrate`.
        use_mean: Divide the squared distance by the number of template
            elements (h * w * c) before the square root, giving the
            root-mean-square difference.
        use_batch_norm: Passed to `_calibrate`.
        learn_gain: Passed to `_calibrate`.
        gain_init: Passed to `_calibrate`.
        scope: Variable scope name.

    Returns:
        Result of `_calibrate` applied to the distance map.

    Raises:
        ValueError: If `search` does not have 5 dimensions.
    '''
    search = cnn.as_tensor(search)
    num_search_dims = len(search.value.shape)
    if num_search_dims != 5:
        raise ValueError('search should have 5 dims: {}'.format(num_search_dims))

    with tf.variable_scope(scope, 'distance'):
        # Discard receptive field of template and get underlying tf.Tensor.
        template = cnn.get_value(template)

        num_channels = template.shape[-1].value
        template_size = template.shape[-3:-1].as_list()
        # Filter of ones that sums a template-sized window over all channels.
        ones = tf.ones(template_size + [num_channels, 1], tf.float32)

        # Sum the cross term over channels *before* combining it with dot_xx
        # and dot_yy, which are already summed over channels. Combining the
        # per-channel cross term first and summing afterwards would count the
        # two squared-norm terms once per channel.
        dot_xy = cnn.channel_sum(cnn.diag_xcorr(search, template))
        dot_xx = tf.reduce_sum(tf.square(template), axis=(-3, -2, -1), keepdims=True)
        # Search is known to have 5 dims here: add the matching axis.
        dot_xx = tf.expand_dims(dot_xx, 1)
        sq_search = cnn.pixelwise(tf.square, search)
        sq_search, restore = cnn.merge_batch_dims(sq_search)
        dot_yy = cnn.nn_conv2d(sq_search, ones, strides=[1, 1, 1, 1], padding='VALID')
        dot_yy = restore(dot_yy)
        # (x - y)**2 = x**2 - 2 x y + y**2
        sq_dist = cnn.pixelwise_binary(
            lambda dot_xy, dot_yy: dot_xx - 2 * dot_xy + dot_yy, dot_xy, dot_yy)
        if use_mean:
            # Take root-mean-square of difference.
            num_elems = np.prod(template.shape[-3:].as_list())
            sq_dist = cnn.pixelwise(lambda sq_dist: (1 / tf.to_float(num_elems)) * sq_dist, sq_dist)
        # NOTE(review): sq_dist is a difference of sums and may come out
        # slightly negative through round-off, which would make sqrt return
        # NaN -- confirm whether a clamp is wanted here.
        dist = cnn.pixelwise(tf.sqrt, sq_dist)
        return _calibrate(dist, is_training, use_batch_norm, learn_gain, gain_init,
                          trainable=trainable)
Esempio n. 5
0
 def test_no_padding_by_default(self):
     '''Checks that each feature net, called with defaults, reports padding [0, 0].'''
     for arch_name in feature_nets.NAMES:
         sub_test = trySubTest(self, feature_arch=arch_name)
         with sub_test:
             # Build each network in a fresh graph.
             with tf.Graph().as_default():
                 build_features = feature_nets.BY_NAME[arch_name]
                 placeholder = tf.placeholder(
                     tf.float32, (None, None, None, 3), name='image')
                 image = cnn.as_tensor(placeholder, add_to_set=True)
                 feat, _ = build_features(image, is_training=True)
                 # Receptive field of the features with respect to the image.
                 image_field = feat.fields[image.value]
                 self.assertAllEqual(image_field.padding, [0, 0])
Esempio n. 6
0
def get_receptive_field(feature_fn):
    '''
    Builds `feature_fn` in a throw-away graph and returns its receptive field.

    Args:
        feature_fn: Function that maps (image, is_training) to (image, end_points).

    Returns:
        The entry of the output's `fields` for the input image.
    '''
    with tf.Graph().as_default():
        image_placeholder = tf.placeholder(tf.float32, (None, None, None, 3), name='image')
        training_flag = tf.placeholder(tf.bool, (), name='is_training')
        # Register the image so that its receptive field is tracked.
        image = cnn.as_tensor(image_placeholder, add_to_set=True)
        features, _ = feature_fn(image, training_flag)
        return features.fields[image.value]
Esempio n. 7
0
def _embed_net(
        x,
        is_training,
        trainable,
        variables_collections,
        weight_decay=0,
        name='embed',
        # Additional arguments:
        arch='alexnet',
        arch_params=None,
        extra_conv_enable=False,
        extra_conv_params=None):
    '''
    Computes features of an image with the named architecture.

    Inputs with more than 4 dimensions have their leading dimensions merged
    into one before the network is applied and restored afterwards.

    Args:
        x: Image of which to compute features. Shape [..., h, w, c]
        is_training: Passed to the feature function and the extra conv.
        trainable: Passed to the feature function and the extra conv.
        variables_collections: Passed to the feature function and the extra conv.
        weight_decay: Converted to float and passed to the feature function.
        name: Name scope for the ops.
        arch: Key into `feature_nets.BY_NAME`.
        arch_params: Optional dict of keyword arguments for the feature function.
        extra_conv_enable: Whether to append `_extra_conv` after the features.
        extra_conv_params: Optional dict of keyword arguments for `_extra_conv`.

    Returns:
        Output of network, intermediate layers, variable scope of feature net.
        The variables in the feature scope can be loaded from a pre-trained model.

    Raises:
        ValueError: If `arch` is not a known architecture.
    '''
    with tf.name_scope(name):
        feature_kwargs = arch_params or {}
        extra_kwargs = extra_conv_params or {}
        weight_decay = float(weight_decay)

        try:
            feature_fn = feature_nets.BY_NAME[arch]
        except KeyError:
            raise ValueError('unknown architecture: {}'.format(arch))

        x = cnn.as_tensor(x)
        num_dims = len(x.value.shape)
        has_extra_dims = num_dims > 4
        if has_extra_dims:
            # Collapse all leading dimensions into a single batch dimension.
            merged, unmerge = helpers.merge_dims(x.value, 0, num_dims - 3)
            x = cnn.Tensor(merged, x.fields)

        with tf.variable_scope('feature') as feature_vs:
            x, end_points = feature_fn(x,
                                       is_training,
                                       trainable,
                                       variables_collections,
                                       weight_decay=weight_decay,
                                       **feature_kwargs)

        if extra_conv_enable:
            with tf.variable_scope('extra'):
                x = _extra_conv(x, is_training, trainable,
                                variables_collections, **extra_kwargs)
        if has_extra_dims:
            # Restore the leading dimensions that were merged above.
            x = cnn.Tensor(unmerge(x.value, 0), x.fields)
        return x, end_points, feature_vs
Esempio n. 8
0
def cosine(template, search, is_training,
           trainable=True,
           use_batch_norm=False,
           gain_init=1,
           eps=1e-3,
           scope='cosine'):
    '''
    Cosine similarity between the template and each window of the search.

    Computes <x, y> / (sqrt(|x|^2 |y|^2) + eps), where x is the template and
    y is a template-sized window of the search tensor.

    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]
        is_training: Passed to `_calibrate`.
        trainable: Passed to `_calibrate`.
        use_batch_norm: Passed to `_calibrate`.
        gain_init: Initial value of the learned gain, passed to `_calibrate`.
        eps: Constant added to the denominator.
        scope: Variable scope name.

    Returns:
        Result of `_calibrate` (with a learned gain) applied to the similarity.

    Raises:
        ValueError: If `search` does not have 5 dimensions.
    '''
    search = cnn.as_tensor(search)
    num_search_dims = len(search.value.shape)
    if num_search_dims != 5:
        raise ValueError('search should have 5 dims: {}'.format(num_search_dims))

    with tf.variable_scope(scope, 'cosine'):
        # Discard receptive field of template and get underlying tf.Tensor.
        template_value = cnn.get_value(template)

        # Inner product of the template with each window.
        cross = cnn.diag_xcorr(search, template_value, padding='VALID')
        dot_xy = cnn.channel_sum(cross)
        # Squared norm of the template.
        dot_xx = tf.reduce_sum(
            tf.square(template_value), axis=(-3, -2, -1), keepdims=True)

        # Squared norm of each window: correlate the squared search with ones.
        search_squared = cnn.pixelwise(tf.square, search)
        window_mask = tf.ones_like(template_value)  # TODO: Faster and less memory to use sum.
        window_energy = cnn.diag_xcorr(search_squared, window_mask, padding='VALID')
        dot_yy = cnn.channel_sum(window_energy)

        dot_xx = tf.expand_dims(dot_xx, 1)
        non_negative_checks = [
            tf.assert_non_negative(dot_xx, message='assert dot_xx non negative'),
            tf.assert_non_negative(dot_yy.value, message='assert dot_yy non negative'),
        ]
        with tf.control_dependencies(non_negative_checks):
            denom = cnn.pixelwise(lambda yy: tf.sqrt(dot_xx * yy), dot_yy)
        similarity = cnn.pixelwise_binary(
            lambda xy, norm: xy / (norm + eps), dot_xy, denom)
        # Gain is necessary here because similarity is always in [-1, 1].
        return _calibrate(similarity, is_training, use_batch_norm,
                          learn_gain=True,
                          gain_init=gain_init,
                          trainable=trainable)
Esempio n. 9
0
def _branch_net_receptive_field(arch='alexnet',
                                arch_params=None,
                                extra_conv_enable=False,
                                extra_conv_params=None):
    '''
    Builds `_embed_net` in a throw-away graph and returns its receptive field.

    Args:
        arch: Architecture name passed to `_embed_net`.
        arch_params: Optional dict of architecture parameters.
        extra_conv_enable: Passed to `_embed_net`.
        extra_conv_params: Passed to `_embed_net`.

    Returns:
        The entry of the output's `fields` for the input image.
    '''
    arch_params = arch_params or {}

    with tf.Graph().as_default():
        image_placeholder = tf.placeholder(tf.float32, (None, None, None, 3), name='image')
        training_flag = tf.placeholder(tf.bool, (), name='is_training')
        # Register the image so that its receptive field is tracked.
        image = cnn.as_tensor(image_placeholder, add_to_set=True)
        embed_kwargs = dict(
            trainable=True,
            variables_collections=None,
            arch=arch,
            arch_params=arch_params,
            extra_conv_enable=extra_conv_enable,
            extra_conv_params=extra_conv_params,
        )
        # Only the first return value (the features) is needed.
        features = _embed_net(image, training_flag, **embed_kwargs)[0]
        return features.fields[image.value]
Esempio n. 10
0
    def start(self, features_init, run_opts, name=None):
        '''
        Builds the template features from the first frame.

        Args:
            features_init: Dict; reads ['image']['data'], ['aspect'] and ['rect'].
            run_opts: Dict; reads ['is_training'] when `self.learn_appearance`
                is set.
            name: Optional name scope (default name is 'start').

        Returns:
            State dict with keys 'run_opts', 'aspect', 'image', 'rect',
            'template_init' and 'mean_color'.
        '''
        with tf.name_scope(name, 'start'):
            first_image = features_init['image']['data']
            aspect = features_init['aspect']
            init_rect = features_init['rect']
            # Mean over the two spatial axes; used as fill value when cropping.
            mean_color = tf.reduce_mean(first_image, axis=(-3, -2), keepdims=True)

            with tf.variable_scope('appearance', reuse=False):
                crop_rect = self._context_rect(
                    init_rect, aspect, self.template_scale)
                template_im = self._crop(
                    first_image, crop_rect, self.template_size, mean_color)
                template_input = cnn.as_tensor(
                    self._preproc(template_im), add_to_set=True)
                with tf.variable_scope('embed', reuse=False):
                    # The embedding is fixed to inference mode unless the
                    # appearance model is being learned.
                    if self.learn_appearance:
                        embed_is_training = run_opts['is_training']
                    else:
                        embed_is_training = False
                    template_feat, template_layers, feature_scope = self._embed_net(
                        template_input, embed_is_training)
                rf_template = template_feat.fields[template_input.value]
                template_feat = cnn.get_value(template_feat)
                feat_size = template_feat.shape[-3:-1].as_list()
                receptive_field.assert_center_alignment(
                    self.template_size, feat_size, rf_template)

            with tf.name_scope('summary'):
                tf.summary.image('template', template_im)

            return {
                'run_opts': run_opts,
                'aspect': aspect,
                'image': first_image,
                'rect': tf.identity(init_rect),
                'template_init': tf.identity(template_feat),
                'mean_color': tf.identity(mean_color),
            }
Esempio n. 11
0
def _extra_conv(x,
                is_training,
                trainable,
                variables_collections,
                num_outputs=None,
                kernel_size=1,
                stride=1,
                padding='VALID',
                activation='linear'):
    '''
    Applies one additional conv layer on top of the features.

    Args:
        x: Input; converted with `cnn.as_tensor`.
        is_training: Set as the batch-norm default through `slim.arg_scope`.
        trainable: Must be true; non-trainable is not supported.
        variables_collections: Set as the conv default through `slim.arg_scope`.
        num_outputs: If None, the input dimension is used.
        kernel_size: Kernel size of the conv.
        stride: Stride of the conv.
        padding: Padding mode of the conv.
        activation: Name resolved through `helpers.get_act`.

    Returns:
        Output of `cnn.slim_conv2d`.

    Raises:
        NotImplementedError: If `trainable` is false.
    '''
    if not trainable:
        raise NotImplementedError('trainable not supported')

    x = cnn.as_tensor(x)
    # Default to preserving the number of channels.
    output_dim = x.value.shape[-1].value if num_outputs is None else num_outputs
    bnorm_defaults = slim.arg_scope([slim.batch_norm], is_training=is_training)
    with bnorm_defaults:
        conv_defaults = slim.arg_scope(
            [cnn.slim_conv2d], variables_collections=variables_collections)
        with conv_defaults:
            activation_fn = helpers.get_act(activation)
            return cnn.slim_conv2d(x, output_dim, kernel_size,
                                   stride=stride,
                                   padding=padding,
                                   activation_fn=activation_fn)
Esempio n. 12
0
    def next(self, features, labels, state, name=None, reset_position=False):
        '''
        Processes one frame: searches around the previous rectangle and
        regresses the new rectangle.

        Args:
            features: Dict; reads ['image']['data'].
            labels: Dict; ['valid'] and ['rect'] are read when `self.mode` is
                in MODE_KEYS_SUPERVISED.
            state: State dict as returned by `start` or by a previous call,
                with keys 'run_opts', 'aspect', 'image', 'rect',
                'template_init' and 'mean_color'.
            name: Optional name scope (default name is 'next_<frame index>').
            reset_position: Keep the appearance model but reset the position.
                If this is true, then features['rect'] must be present.
                In this method its only visible effect is to replace all
                losses with zeros.

        Returns:
            Tuple (outputs, state, losses). `outputs` is {'rect': prediction},
            `state` is the state dict for the next frame and `losses` is a
            dict of losses that is empty when the mode is not supervised.

        Raises:
            ValueError: If `self.join_type` is neither 'single' nor 'multi'.
        '''
        with tf.name_scope(name, 'next_{}'.format(self._num_frames)) as scope:
            im = features['image']['data']
            run_opts = state['run_opts']
            aspect = state['aspect']
            prev_im = state['image']
            mean_color = state['mean_color']

            # If the label is not valid, there will be no loss for this frame.
            # However, the input image may still be processed.
            # In this case, adopt the previous rectangle as the "ground-truth".
            if self.mode in MODE_KEYS_SUPERVISED:
                gt_rect = tf.where(labels['valid'], labels['rect'],
                                   state['rect'])
            else:
                gt_rect = None
            # Use the previous rectangle.
            # This will be the ground-truth rect during training if `use_predictions` is false.
            prev_target_rect = state['rect']

            # Coerce the aspect ratio of the rectangle to construct the search area.
            base_rect = model_util.coerce_aspect(
                prev_target_rect, aspect, aspect_method=self.aspect_method)
            # Apply perturbation to aspect-coerced "previous" rect (may be current gt).
            if self.use_perturb and self.mode == tf.estimator.ModeKeys.TRAIN:
                base_rect = tf.cond(
                    run_opts['is_training'],
                    lambda: siamfc.perturb(base_rect, **self.perturb_params),
                    lambda: base_rect)
            search_rect = geom.grow_rect(self.search_scale, base_rect)

            # Grow the same base rectangle to construct the context area.
            context_rect = geom.grow_rect(self.context_scale, base_rect)
            # Extract same rectangle in past and current images and feed into conv-net.
            context_curr = self._crop(im, context_rect, self.context_size,
                                      mean_color)
            context_prev = self._crop(prev_im, context_rect, self.context_size,
                                      mean_color)
            with tf.name_scope('summary_context'):
                tf.summary.image('curr', context_curr)
                # Log the crop of the previous image (not the current one
                # again) under 'prev'.
                tf.summary.image('prev', context_prev)
            motion = [context_curr
                      ] if self.stateless else [context_curr, context_prev]
            motion = tf.stack(motion, axis=1)

            # How to obtain template from previous state?
            template_feat = state['template_init']

            # Extract an image pyramid (use 1 scale when not in tracking mode).
            mid_scale = (self.num_scales - 1) // 2
            if self.num_scales == 1:
                scales = tf.constant([1.0], dtype=tf.float32)
            else:
                scales = model_util.scale_range(
                    tf.constant(self.num_scales),
                    tf.to_float(self.log_scale_step))
            search_ims, search_rects = self._crop_pyr(im, search_rect,
                                                      self.search_size, scales,
                                                      mean_color)

            with tf.name_scope('summary'):
                _image_sequence_summary('search',
                                        search_ims,
                                        elem_name='scale')

            with tf.variable_scope('appearance',
                                   reuse=False) as appearance_scope:
                # Extract features, perform search, get receptive field of response wrt image.
                search_input = self._preproc(search_ims)
                search_input = cnn.as_tensor(search_input, add_to_set=True)
                # Re-use the embedding variables created for the template.
                with tf.variable_scope('embed', reuse=True):
                    search_feat, search_layers, _ = self._embed_net(
                        search_input, (False if not self.learn_appearance else
                                       run_opts['is_training']))
                rf_search = search_feat.fields[search_input.value]
                search_feat_size = search_feat.value.shape[-3:-1].as_list()
                receptive_field.assert_center_alignment(
                    self.search_size, search_feat_size, rf_search)

                # Join variables are created on the first frame and re-used after.
                with tf.variable_scope('join', reuse=(self._num_frames >= 1)):
                    join_fn = join_nets.BY_NAME[self.join_arch]
                    if self.join_type == 'single':
                        response = join_fn(
                            template_feat,
                            search_feat,
                            is_training=(False if not self.learn_appearance
                                         else run_opts['is_training']),
                            trainable=self.learn_appearance,
                            **self.join_params)
                    elif self.join_type == 'multi':
                        # NOTE(review): `template_layers` is not assigned
                        # anywhere in this method; this branch looks like it
                        # would raise NameError -- confirm against the class.
                        response = join_fn(
                            template_feat,
                            search_feat,
                            self.multi_join_layers,
                            template_layers,
                            search_layers,
                            search_input,
                            is_training=(False if not self.learn_appearance
                                         else run_opts['is_training']),
                            trainable=self.learn_appearance,
                            **self.join_params)
                    else:
                        raise ValueError('unknown join type: "{}"'.format(
                            self.join_type))
                rf_response = response.fields[search_input.value]
                response = cnn.get_value(response)
                response_size = response.shape[-3:-1].as_list()
                receptive_field.assert_center_alignment(
                    self.search_size, response_size, rf_response)
                response = tf.verify_tensor_all_finite(
                    response, 'output of xcorr is not finite')

            if self._num_frames == 0:
                # Define appearance model saver.
                if self.appearance_model_file:
                    # Create the graph ops for the saver.
                    var_list = appearance_scope.global_variables()
                    var_list = {var.op.name: var for var in var_list}
                    if self.appearance_scope_dst or self.appearance_scope_src:
                        # Replace 'dst' with 'src'.
                        # Caution: This string replacement is a little dangerous.
                        var_list = {
                            k.replace(self.appearance_scope_dst,
                                      self.appearance_scope_src, 1): v
                            for k, v in var_list.items()
                        }
                    self._appearance_var_list = var_list
                    self._appearance_saver = tf.train.Saver(var_list)

            # Post-process scores.
            with tf.variable_scope('output', reuse=(self._num_frames > 0)):
                if not self.learn_appearance:
                    # TODO: Prevent batch-norm updates as well.
                    # TODO: Set trainable=False for all variables above.
                    response = tf.stop_gradient(response)

                # Regress response to translation and log(scale).
                output_shapes = {'translation': [2], 'log_scale': [1]}
                outputs = _output_net(response,
                                      motion,
                                      output_shapes,
                                      run_opts['is_training'],
                                      weight_decay=self.wd,
                                      use_response=self.output_use_response,
                                      use_images=self.output_use_images)

            _image_sequence_summary('response',
                                    model_util.colormap(
                                        tf.sigmoid(response), _COLORMAP),
                                    elem_name='scale')

            losses = {}
            if self.mode in MODE_KEYS_SUPERVISED:
                # Get ground-truth translation and scale relative to search window.
                gt_rect_in_search = geom.crop_rect(gt_rect, search_rect)
                gt_position, gt_rect_size = geom.rect_center_size(
                    gt_rect_in_search)
                # Positions in real interval [0, 1] correspond to real interval [0, search_size].
                # Pixel centers range from 0.5 to search_size - 0.5 in [0, search_size].
                gt_translation = gt_position - 0.5  # Displacement relative to center.
                gt_size = helpers.scalar_size(gt_rect_size, self.aspect_method)
                target_size_in_search = self.target_size / self.search_size
                # size = target_size * scale
                gt_scale = gt_size / target_size_in_search
                gt_log_scale = tf.log(gt_scale)

                if self.appearance_loss:
                    target_size_in_response = self.target_size / rf_response.stride
                    loss_name, loss = siamfc.compute_loss(
                        response[:, mid_scale], target_size_in_response,
                        **self.appearance_loss_params)
                    losses[loss_name] = loss

                loss_name, loss = regress.compute_loss_vector(
                    outputs['translation'], outputs['log_scale'],
                    gt_translation, gt_log_scale, **self.loss_params)
                losses[loss_name] = loss

                if reset_position:
                    # TODO: Something better!
                    # TODO: Keep appearance loss even when `reset_position` is true?
                    losses = {k: tf.zeros_like(v) for k, v in losses.items()}

            translation = outputs['translation']  # [b, 2]
            scale = tf.exp(outputs['log_scale'])  # [b, 1]

            # Damp the scale update towards 1 (no change).
            # TODO: Should this be in log space?
            scale = self.scale_update_rate * scale + (
                1. - self.scale_update_rate) * 1.
            # Get rectangle in search image.
            prev_target_in_search = geom.crop_rect(prev_target_rect,
                                                   search_rect)
            pred_in_search = _rect_translate_scale(prev_target_in_search,
                                                   translation, scale)
            # Move from search back to original image.
            pred = geom.crop_rect(pred_in_search,
                                  geom.crop_inverse(search_rect))

            # Limit size of object.
            pred = _clip_rect_size(pred, min_size=0.001, max_size=10.0)

            # Rectangle to use in next frame for search area.
            # If using gt and rect not valid, use previous.
            if self.mode in MODE_KEYS_SUPERVISED:
                next_prev_rect = pred if self.use_predictions else gt_rect
            else:
                next_prev_rect = pred

            outputs = {'rect': pred}
            state = {
                'run_opts': run_opts,
                'aspect': aspect,
                'image': im,
                'rect': next_prev_rect,
                'template_init': state['template_init'],
                'mean_color': state['mean_color'],
            }
            # Later frames re-use the 'join' and 'output' variables.
            self._num_frames += 1
            return outputs, state, losses
Esempio n. 13
0
def all_pixel_pairs(template, search, is_training,
                    trainable=True,
                    operation='mul',
                    reduce_channels=True,
                    use_mean=True,
                    use_batch_norm=False,
                    learn_gain=False,
                    gain_init=1,
                    scope='all_pixel_pairs'):
    '''
    Combines every template pixel with every search pixel, then applies a
    template-sized conv with a learned kernel.

    Args:
        template: cnn.Tensor with shape [n, h_t, w_t, c]
        search: cnn.Tensor with shape [n, s, h_s, w_s, c]
        is_training: Passed to `_calibrate`.
        trainable: Whether the conv kernel is trainable; also passed to
            `_calibrate`.
        operation: 'mul' for the product or 'abs_diff' for the absolute
            difference of each pixel pair.
        reduce_channels: Not used in this function.
        use_mean: Initialize the kernel to 1 / ((h_t * w_t)^2 * c) instead
            of 1.
        use_batch_norm: Passed to `_calibrate`.
        learn_gain: Passed to `_calibrate`.
        gain_init: Passed to `_calibrate`.
        scope: Variable scope name.

    Returns:
        Result of `_calibrate` applied to the conv output.
        NOTE(review): the conv kernel has a single output channel and uses
        VALID padding, so this is presumably one channel per search window
        rather than [n, h_s, w_s, h_t * w_t] -- confirm.

    Raises:
        ValueError: If `operation` is not recognized.
    '''
    with tf.variable_scope(scope, 'all_pixel_pairs'):
        template = cnn.as_tensor(template)
        search = cnn.as_tensor(search)
        template_size = template.value.shape[-3:-1].as_list()
        num_channels = template.value.shape[-1].value

        # Break template into 1x1 patches.
        # Then "convolve" (multiply) each with the search image.
        # template becomes: [n, 1, ...,   1,   1, h_t, w_t, c]
        # search becomes:   [n, s, ..., h_s, w_s,   1,   1, c]
        template_expanded = tf.expand_dims(template.value, 1)
        template_expanded = helpers.expand_dims_n(template_expanded, -4, 2)
        search_expanded = helpers.expand_dims_n(search.value, -2, 2)
        if operation == 'mul':
            pairwise = template_expanded * search_expanded
        elif operation == 'abs_diff':
            pairwise = tf.abs(template_expanded - search_expanded)
        else:
            raise ValueError('unknown operation: "{}"'.format(operation))

        # Merge the spatial dimensions of the template into features.
        # response becomes: [n, ..., h_s, w_s, h_t * w_t * c]
        pairwise, _ = helpers.merge_dims(pairwise, -3, None)
        pairs = cnn.Tensor(pairwise, search.fields)

        # TODO: This initialization could be too small?
        if use_mean:
            normalizer = 1 / (np.prod(template_size) ** 2 * num_channels)
        else:
            normalizer = 1
        num_pair_features = np.prod(template_size) * num_channels
        weights_shape = template_size + [num_pair_features, 1]
        weights = tf.get_variable('weights', weights_shape, tf.float32,
                                  initializer=tf.constant_initializer(normalizer),
                                  trainable=trainable)
        # TODO: Support depthwise_conv2d (keep channels).
        flat_pairs, unflatten = cnn.merge_batch_dims(pairs)
        response = unflatten(
            cnn.nn_conv2d(flat_pairs, weights, strides=[1, 1, 1, 1], padding='VALID'))

        return _calibrate(response, is_training, use_batch_norm, learn_gain, gain_init,
                          trainable=trainable)