Example #1
def get_exemplar_images(images, exemplar_size, targets_pos=None):
    """Crop exemplar image from input images"""
    with tf.name_scope('get_exemplar_image'):
        batch_size, x_height, x_width = images.get_shape().as_list()[:3]
        z_height, z_width = exemplar_size

        if targets_pos is None:
            # crop from the center
            target_pos_single = [[get_center(x_height), get_center(x_width)]]
            targets_pos_ = tf.tile(target_pos_single, [batch_size, 1])
        else:
            targets_pos_ = targets_pos

        # convert to top-left corner based coordinates
        top = tf.to_int32(tf.round(targets_pos_[:, 0] - get_center(z_height)))
        bottom = tf.to_int32(top + z_height)
        left = tf.to_int32(tf.round(targets_pos_[:, 1] - get_center(z_width)))
        right = tf.to_int32(left + z_width)

        def _slice(x):
            f, t, l, b, r = x
            c = f[t:b, l:r]
            return c

        exemplar_img = tf.map_fn(_slice, (images, top, left, bottom, right), dtype=images.dtype)
        exemplar_img.set_shape([batch_size, z_height, z_width, 3])
        return exemplar_img
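Note: every example on this page relies on a get_center helper that is not shown. A minimal sketch of the centering convention these crops appear to assume (zero-indexed center of an axis of length x) would be:

def get_center(x):
    """Zero-indexed center of an axis of length x, e.g. get_center(127) == 63.0."""
    return (x - 1.) / 2.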
Example #2
    def build_extract_crops(self):
        model_config = self.model_config
        track_config = self.track_config
        context_amount = 0.5

        size_z = model_config['z_image_size']
        size_x = model_config['x_image_size']

        num_scales = track_config['num_scales']

        scales = np.arange(num_scales) - get_center(num_scales)
        assert np.sum(scales) == 0, 'scales should be symmetric'
        assert track_config['scale_step'] >= 1.0, 'scale step should be >= 1.0'
        search_factors = [track_config['scale_step']**x for x in scales]

        frame_sz = tf.shape(self.image)
        target_yx = self.target_bbox_feed[0:2]
        target_size = self.target_bbox_feed[2:4]
        avg_chan = tf.reduce_mean(self.image, axis=(0, 1), name='avg_chan')

        # Compute base values
        base_z_size = target_size
        base_z_context_size = base_z_size + context_amount * tf.reduce_sum(
            base_z_size)
        base_s_z = tf.sqrt(
            tf.reduce_prod(base_z_context_size))  # Canonical size
        base_scale_z = tf.div(tf.to_float(size_z), base_s_z)
        d_search = (size_x - size_z) / 2.0
        base_pad = tf.div(d_search, base_scale_z)
        base_s_x = base_s_z + 2 * base_pad
        base_scale_x = tf.div(tf.to_float(size_x), base_s_x)

        boxes = []
        for factor in search_factors:
            s_x = factor * base_s_x
            frame_sz_1 = tf.to_float(frame_sz[0:2] - 1)
            topleft = tf.div(target_yx - get_center(s_x), frame_sz_1)
            bottomright = tf.div(target_yx + get_center(s_x), frame_sz_1)
            box = tf.concat([topleft, bottomright], axis=0)
            boxes.append(box)
        boxes = tf.stack(boxes)

        scale_xs = []
        for factor in search_factors:
            scale_x = base_scale_x / factor
            scale_xs.append(scale_x)
        self.scale_xs = tf.stack(scale_xs)

        image_minus_avg = tf.expand_dims(self.image - avg_chan, 0)
        image_cropped = tf.image.crop_and_resize(
            image_minus_avg,
            boxes,
            box_ind=tf.zeros((track_config['num_scales']), tf.int32),
            crop_size=[size_x, size_x])
        self.images = image_cropped + avg_chan
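For reference, the crop-size arithmetic in build_extract_crops follows the usual SiamFC convention: the exemplar square side is s_z = sqrt((h + c)(w + c)) with c = context_amount * (h + w), and the search square adds just enough padding so that resizing s_x to x_image_size keeps the exemplar at z_image_size. A small NumPy sketch with hypothetical numbers (not taken from the repo):

import numpy as np

size_z, size_x, context_amount = 127, 255, 0.5
target_h, target_w = 60.0, 40.0

context = context_amount * (target_h + target_w)              # extra context added per side
s_z = np.sqrt((target_h + context) * (target_w + context))    # canonical exemplar side (~99.5)
scale_z = size_z / s_z
d_search = (size_x - size_z) / 2.0
pad = d_search / scale_z
s_x = s_z + 2 * pad                                            # search-region side in frame pixels
scale_x = size_x / s_x                                         # frame pixels -> search-image pixels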
Example #3
def convert_bbox_format(bbox, to):
    x, y, target_width, target_height = bbox.x, bbox.y, bbox.width, bbox.height
    if to == 'top-left-based':
        x -= get_center(target_width)
        y -= get_center(target_height)
    elif to == 'center-based':
        y += get_center(target_height)
        x += get_center(target_width)
    else:
        raise ValueError("Bbox format: {} was not recognized".format(to))
    return Rectangle(x, y, target_width, target_height)
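A short usage sketch (Rectangle is assumed here to be a simple namedtuple holding x, y, width, height; get_center as sketched under Example #1):

from collections import namedtuple

Rectangle = namedtuple('Rectangle', ['x', 'y', 'width', 'height'])  # assumed shape of Rectangle

center_box = Rectangle(x=100.0, y=80.0, width=41.0, height=21.0)
topleft_box = convert_bbox_format(center_box, 'top-left-based')
# With get_center(w) == (w - 1) / 2, this gives topleft_box.x == 80.0 and topleft_box.y == 70.0.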
Example #4
def convert_bbox(bbox, to, offsetx, offsety):
    x, y, target_width, target_height = bbox.x, bbox.y, bbox.width, bbox.height
    if to == 'top-left-based':
        x -= get_center(target_width)
        y -= get_center(target_height)
    elif to == 'center-based':
        y += get_center(target_height)
        x += get_center(target_width)
        x += offsetx
        y += offsety

    else:
        raise NotImplementedError
    return Rectangle(x, y, target_width, target_height)
Example #5
    def build_template(self):
        model_config = self.model_config
        track_config = self.track_config
        examplar_images = get_exemplar_images(
            self.images,
            [model_config['z_image_size'], model_config['z_image_size']])
        templates = self.get_image_embedding(examplar_images, deform=False)
        center_scale = int(get_center(track_config['num_scales']))
        center_template = tf.identity(templates[center_scale])
        templates = tf.stack(
            [center_template for _ in range(model_config['batch_size'])])

        with tf.variable_scope('target_template'):
            template_fn = template_factory.get_network_fn(
                model_config['template_name'],
                weight_decay=model_config['weight_decay'],
                is_training=False)
            templates, _ = template_fn(templates)

            # Store template in Variable such that we don't have to feed this template.
            with tf.variable_scope('State'):
                state = tf.get_variable('exemplar',
                                        initializer=tf.zeros_like(templates),
                                        trainable=False)
                with tf.control_dependencies([templates]):
                    self.init = tf.assign(state,
                                          templates,
                                          validate_shape=True)
                self.templates = state
Example #6
    def __init__(self, model, model_config, track_config):
        """Initializes the tracker.

    Args:
      model: Object encapsulating a trained track model. Must have
        methods inference_step(). For example, an instance of
        InferenceWrapperBase.
      model_config: track model configurations.
      track_config: tracking configurations.
    """
        self.model = model
        self.model_config = model_config
        self.track_config = track_config

        self.z_image_size = model_config['z_image_size']
        self.x_image_size = model_config['x_image_size']
        self.r_embed_size = model_config['r_embed_size']
        self.r_image_size = model_config['u_image_size']

        self.num_scales = track_config['num_scales']
        self.log_level = track_config['log_level']
        logging.info('track num scales -- {}'.format(
            track_config['num_scales']))
        scales = np.arange(self.num_scales) - get_center(self.num_scales)
        self.search_factors = [
            self.track_config['scale_step']**x for x in scales
        ]

        # Cosine window
        window = np.dot(np.expand_dims(np.hanning(self.r_image_size), 1),
                        np.expand_dims(np.hanning(self.r_image_size), 0))
        self.window = window / np.sum(window)  # normalize window
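The scale pyramid built here is symmetric around a factor of 1.0. A quick check with hypothetical values num_scales = 3 and scale_step = 1.0375:

import numpy as np

num_scales, scale_step = 3, 1.0375                           # hypothetical values
scales = np.arange(num_scales) - (num_scales - 1) / 2.0      # [-1., 0., 1.]
search_factors = [scale_step ** s for s in scales]           # [~0.9639, 1.0, 1.0375]
assert np.isclose(np.sum(scales), 0.0)                       # symmetric, as the asserts above require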
Example #7
def extract_patch(inputs, patch_size, top_left=None):
    """Extract patch from inputs Tensor

  args:
    inputs: Tensor of shape [batch, height, width, feature_num]
    patch_size: [height, width]
    top_left: patch top_left positions in the input tensor, of shape [batch, 2]

  return:
    patches of shape [batch, height, width, feature_num]
  """
    with tf.name_scope('extract_patch'):
        batch_size, x_height, x_width, feat_num = inputs.get_shape().as_list()
        z_height, z_width = patch_size

        if top_left is None:
            pos_single = [[get_center(x_height), get_center(x_width)]]
            patch_center_ = tf.tile(pos_single, [batch_size, 1])

            # convert to top-left corner based coordinates
            top = tf.to_int32(
                tf.round(patch_center_[:, 0] - get_center(z_height)))
            left = tf.to_int32(
                tf.round(patch_center_[:, 1] - get_center(z_width)))
        else:
            top = tf.to_int32(top_left[:, 0])
            left = tf.to_int32(top_left[:, 1])

        bottom = tf.to_int32(top + z_height)
        right = tf.to_int32(left + z_width)

        def _slice(x):
            f, t, l, b, r = x
            c = f[t:b, l:r]
            return c

        patch = tf.map_fn(_slice, (inputs, top, left, bottom, right),
                          dtype=inputs.dtype)

        # Restore some shape
        patch.set_shape([batch_size, z_height, z_width, feat_num])
        return patch
Example #8
def get_subwindow_avg(im, pos, model_sz, original_sz):
    # avg_chans = np.mean(im, axis=(0, 1)) # This version is 3x slower
    avg_chans = [
        np.mean(im[:, :, 0]),
        np.mean(im[:, :, 1]),
        np.mean(im[:, :, 2])
    ]
    if not original_sz:
        original_sz = model_sz
    sz = original_sz
    im_sz = im.shape
    # make sure the size is not too small
    assert im_sz[0] > 2 and im_sz[1] > 2
    c = [get_center(s) for s in sz]

    # check out-of-bounds coordinates, and set them to avg_chans
    context_xmin = np.int(np.round(pos[1] - c[1]))
    context_xmax = np.int(context_xmin + sz[1] - 1)
    context_ymin = np.int(np.round(pos[0] - c[0]))
    context_ymax = np.int(context_ymin + sz[0] - 1)
    left_pad = np.int(np.maximum(0, -context_xmin))
    top_pad = np.int(np.maximum(0, -context_ymin))
    right_pad = np.int(np.maximum(0, context_xmax - im_sz[1] + 1))
    bottom_pad = np.int(np.maximum(0, context_ymax - im_sz[0] + 1))

    context_xmin = context_xmin + left_pad
    context_xmax = context_xmax + left_pad
    context_ymin = context_ymin + top_pad
    context_ymax = context_ymax + top_pad
    if top_pad > 0 or bottom_pad > 0 or left_pad > 0 or right_pad > 0:
        R = np.pad(im[:, :, 0], ((top_pad, bottom_pad), (left_pad, right_pad)),
                   'constant',
                   constant_values=(avg_chans[0]))
        G = np.pad(im[:, :, 1], ((top_pad, bottom_pad), (left_pad, right_pad)),
                   'constant',
                   constant_values=(avg_chans[1]))
        B = np.pad(im[:, :, 2], ((top_pad, bottom_pad), (left_pad, right_pad)),
                   'constant',
                   constant_values=(avg_chans[2]))

        im = np.stack((R, G, B), axis=2)

    im_patch_original = im[context_ymin:context_ymax + 1,
                           context_xmin:context_xmax + 1, :]
    if not np.array_equal(model_sz, original_sz):
        im_patch = imresize(im_patch_original, model_sz, interp='bilinear')
    else:
        im_patch = im_patch_original
    return im_patch, left_pad, top_pad, right_pad, bottom_pad
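The padding branch only triggers when the requested window runs off the image; a quick NumPy check with hypothetical numbers shows how the pads come out:

import numpy as np

# Hypothetical: a 64x64 window centered near the top-left corner of a 100x100 image.
pos = np.array([10.0, 10.0])                  # (y, x) window center
sz = np.array([64, 64])                       # (height, width) of the window
c = (sz - 1) / 2.0                            # same convention as get_center

context_ymin = int(np.round(pos[0] - c[0]))   # -22
context_xmin = int(np.round(pos[1] - c[1]))   # -22
top_pad = max(0, -context_ymin)               # 22 rows filled with avg_chans
left_pad = max(0, -context_xmin)              # 22 columns filled with avg_chans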
Example #9
    def build_search_image(self, image, bbox, scale_factor):
        context_amount = self.context_amount
        size_z = self.z_image_size
        size_x = self.x_image_size

        # image: [H,W,3]
        # bbox: [4], cy,cx,height,width
        frame_sz = tf.shape(image)
        target_yx = bbox[0:2] #y,x
        target_size = bbox[2:4] # height, width
        avg_chan = tf.reduce_mean(image, axis=(0, 1), name='avg_chan')

        # Compute base values
        base_z_size = target_size
        base_z_context_size = base_z_size + context_amount * tf.reduce_sum(base_z_size) # w+2p, h+2p
        base_s_z = tf.sqrt(tf.reduce_prod(base_z_context_size))  # Canonical size
        base_scale_z = tf.div(tf.to_float(size_z), base_s_z) # s = sqrt(A**2 / ((w+2p)(h+2p)))
        d_search = (size_x - size_z) / 2.0
        base_pad = tf.div(d_search, base_scale_z)
        base_s_x = base_s_z + 2 * base_pad
        base_scale_x = tf.div(tf.to_float(size_x), base_s_x)

        s_x = scale_factor * base_s_x
        frame_sz_1 = tf.to_float(frame_sz[0:2] - 1)
        topleft = tf.div(target_yx - get_center(s_x), frame_sz_1)
        bottomright = tf.div(target_yx + get_center(s_x), frame_sz_1)
        crop_box = tf.concat([topleft, bottomright], axis=0)
        scale_x = base_scale_x / scale_factor

        image_minus_avg = tf.expand_dims(image - avg_chan, 0)
        image_cropped = tf.image.crop_and_resize(image_minus_avg, crop_box[None],
                                                 box_ind=tf.zeros((1), tf.int32),
                                                 crop_size=[size_x, size_x])
        search_image = image_cropped + avg_chan
        search_image = search_image[0] # [1,H,W,3] --> [H,W,3]
        return search_image, scale_x, crop_box
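tf.image.crop_and_resize expects boxes as normalized [y1, x1, y2, x2] coordinates, which is why both corners above are divided by (frame size - 1). A NumPy sketch with hypothetical numbers:

import numpy as np

# Hypothetical: a 200-pixel search square centered at (y, x) = (120, 160) in a 480x640 frame.
frame_h, frame_w = 480.0, 640.0
target_yx = np.array([120.0, 160.0])
s_x = 200.0

half = (s_x - 1) / 2.0                               # same centering convention as get_center
norm = np.array([frame_h - 1, frame_w - 1])
topleft = (target_yx - half) / norm
bottomright = (target_yx + half) / norm
crop_box = np.concatenate([topleft, bottomright])    # normalized [y1, x1, y2, x2]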
Example #10
    def __init__(self, siamese_model, config):
        self.siamese_model = siamese_model
        self.config = config

        self.num_scales = self.config.num_scales
        logging.info('track num scales -- {}'.format(self.num_scales))
        scales = np.arange(self.num_scales) - get_center(self.num_scales)
        self.search_factors = [self.config.scale_step ** x for x in scales]

        self.x_image_size = self.config.x_image_size  # Search image size
        self.window = None  # Cosine window
        self.log_level = self.config.log_level

        if config.net_type == 'cfcf':
            self.update_template = True
        else:
            self.update_template = False
Example #11
    def track(self, sess, first_bbox, frames, logdir='/tmp', write_summary=True):
        """Runs tracking on a single image sequence."""
        # Get initial target bounding box and convert to center based
        bbox = convert_bbox_format(first_bbox, 'center-based')

        # Feed in the first frame image to set initial state.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        input_feed = [frames[0], bbox_feed]
        frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

        # Storing target state
        original_target_height = bbox.height
        original_target_width = bbox.width
        search_center = np.array([get_center(self.x_image_size),
                                  get_center(self.x_image_size)])
        current_target_state = TargetState(bbox=bbox,
                                           search_pos=search_center,
                                           scale_idx=int(get_center(self.num_scales)))

        include_first = False
        logging.info('Tracking include first -- {}'.format(include_first))

        if write_summary:
            summary_writer = tf.summary.FileWriter(
                    osp.join(logdir, 'summary'), graph=sess.graph)
            self.siamese_model.build_summary(summary_writer)

        # Run tracking loop
        reported_bboxs = []
        for i, filename in enumerate(frames):
            if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
                bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                             current_target_state.bbox.height, current_target_state.bbox.width]
                input_feed = [filename, bbox_feed]

                outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
                search_scale_list = outputs['scale_xs']
                response = outputs['response']
                response_size = response.shape[1]

                # Choose the scale whose response map has the highest peak
                if self.num_scales > 1:
                    response_max = np.max(response, axis=(1, 2))
                    penalties = self.config.scale_penalty * np.ones((self.num_scales))
                    current_scale_idx = int(get_center(self.num_scales))
                    penalties[current_scale_idx] = 1.0
                    response_penalized = response_max * penalties
                    best_scale = np.argmax(response_penalized)
                else:
                    best_scale = 0

                response = response[best_scale]

                if self.update_template:
                    mmr = outputs['MMRs'][best_scale]
                    if mmr > self.config.mmr_thresh:
                        print('update templates MMRs={}'.format(mmr))
                        self.siamese_model.update(sess, input_feed)

                with np.errstate(all='raise'):  # Raise error if something goes wrong
                    response = response - np.min(response)
                    response = response / np.sum(response)

                if self.window is None:
                    window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                                  np.expand_dims(np.hanning(response_size), 0))
                    self.window = window / np.sum(window)  # normalize window
                window_influence = self.config.window_influence
                response = (1 - window_influence) * response + window_influence * self.window

                # Find maximum response
                r_max, c_max = np.unravel_index(response.argmax(),
                                                response.shape)

                # Convert from crop-relative coordinates to frame coordinates
                p_coor = np.array([r_max, c_max])
                # displacement from the center in instance final representation ...
                disp_instance_final = p_coor - get_center(response_size)
                # ... in instance feature space ...
                upsample_factor = self.config.upsample_factor
                disp_instance_feat = disp_instance_final / upsample_factor
                # ... Avoid empty position ...
                r_radius = int(response_size / upsample_factor / 2)
                disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius)
                # ... in instance input ...
                disp_instance_input = disp_instance_feat * self.config.embed_stride
                # ... in instance original crop (in frame coordinates)
                disp_instance_frame = disp_instance_input / search_scale_list[best_scale]
                # Position within frame in frame coordinates
                y = current_target_state.bbox.y
                x = current_target_state.bbox.x
                y += disp_instance_frame[0]
                x += disp_instance_frame[1]

                # Target scale damping and saturation
                target_scale = current_target_state.bbox.height / original_target_height
                search_factor = self.search_factors[best_scale]
                scale_damp = self.config.scale_damp  # damping factor for scale update
                target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
                target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

                # Some book keeping
                height = original_target_height * target_scale
                width = original_target_width * target_scale
                current_target_state.bbox = Rectangle(x, y, width, height)
                current_target_state.scale_idx = best_scale
                current_target_state.search_pos = search_center + disp_instance_input

                assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
                  'target position in feature space should be no larger than input image size'
                assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
                  'target position in feature space should be no larger than input image size'

                if self.log_level > 0:
                    np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

                    # Select the image with the highest score scale and convert it to uint8
                    image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
                    # Note that imwrite in cv2 assumes the image is in BGR format.
                    # However, the cropped image returned by TensorFlow is RGB.
                    # Therefore, we convert color format using cv2.cvtColor
                    imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                          cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                    np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale])
                    np.save(osp.join(logdir, 'response{}.npy'.format(i)), response)

                    y_search, x_search = current_target_state.search_pos
                    search_scale = search_scale_list[best_scale]
                    target_height_search = height * search_scale
                    target_width_search = width * search_scale
                    bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search)
                    bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
                    np.save(osp.join(logdir, 'bbox{}.npy'.format(i)),
                          [bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height])
            reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based')
            reported_bboxs.append(reported_bbox)
        #--- END OF FRAME
        return reported_bboxs
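The displacement chain in the loop above maps the response peak back to frame coordinates in three steps: upsampled response to feature map (divide by upsample_factor), feature map to search image (multiply by embed_stride), search image to frame (divide by the per-scale search scale). A worked sketch with hypothetical numbers, only to trace the unit conversions:

import numpy as np

response_size, upsample_factor, embed_stride = 272, 16, 8   # hypothetical values
r_max, c_max = 150, 140
search_scale = 1.25

disp_final = np.array([r_max, c_max]) - (response_size - 1) / 2.0   # [14.5, 4.5] in response pixels
disp_feat = disp_final / upsample_factor                            # in feature-map cells
disp_input = disp_feat * embed_stride                               # in search-image pixels
disp_frame = disp_input / search_scale                              # in frame pixels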
Example #12
def _construct_gt_response(response_size, batch_size, stride, gt_config=None):
    """Construct a batch of 2D ground truth response

  Args:
    response_size: a list or tuple with two elements [ho, wo]
    batch_size: an integer e.g. 16
    stride: embedding stride e.g. 8
    gt_config: configurations for ground truth generation

  return:
    a float tensor of shape [batch_size] +  response_size
  """
    with tf.variable_scope('construct_gt') as ct_scope:
        ho = response_size[0]
        wo = response_size[1]
        y = tf.cast(tf.range(0, ho), dtype=tf.float32) - get_center(ho)
        x = tf.cast(tf.range(0, wo), dtype=tf.float32) - get_center(wo)
        [Y, X] = tf.meshgrid(y, x)

        gt_type = gt_config['type']
        if gt_type == 'gaussian':

            def _gaussian_2d(X, Y, sigma):
                x0, y0 = 0, 0  # the target position, i.e. the center
                return tf.exp(-0.5 * (((X - x0) / sigma)**2 +
                                      ((Y - y0) / sigma)**2))

            sigma = gt_config['rPos'] / stride / 3.0
            gt = _gaussian_2d(X, Y, sigma)
        elif gt_type == 'overlap':

            def _overlap_score(X, Y, stride, area):
                area_x, area_y = [tf.to_float(a) / stride for a in area]
                x_diff = (area_x - tf.abs(X))
                y_diff = (area_y - tf.abs(Y))

                # Intersection over union
                Z = x_diff * y_diff / (2 * area_x * area_y - x_diff * y_diff)

                # Remove negative intersections
                Z = tf.where(x_diff > 0, Z, tf.zeros_like(Z))
                Z = tf.where(y_diff > 0, Z, tf.zeros_like(Z))
                return Z

            area = [64, 64]
            logging.info('area are fixed for overlap gt type')
            gt = _overlap_score(X, Y, stride, area)
        elif gt_type == 'logistic':

            def _logistic_label(X, Y, rPos, rNeg):
                # dist_to_center = tf.sqrt(tf.square(X) + tf.square(Y))  # L2 dist
                dist_to_center = tf.abs(X) + tf.abs(Y)  # Block dist
                Z = tf.where(
                    dist_to_center <= rPos, tf.ones_like(X),
                    tf.where(dist_to_center < rNeg, 0.5 * tf.ones_like(X),
                             tf.zeros_like(X)))
                return Z

            rPos = gt_config['rPos'] / stride
            rNeg = gt_config['rNeg'] / stride
            gt = _logistic_label(X, Y, rPos, rNeg)
        else:
            raise NotImplementedError

        # Create a batch of ground truth response
        gt_expand = tf.reshape(gt, [1] + response_size)
        gt = tf.tile(gt_expand, [batch_size, 1, 1])
        return gt
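For the 'gaussian' branch, sigma = rPos / stride / 3, so the label peaks at 1.0 at the center and decays smoothly outward. A NumPy re-statement with hypothetical sizes:

import numpy as np

ho = wo = 17                 # hypothetical 17x17 response map
stride, rPos = 8, 16
sigma = rPos / stride / 3.0

y = np.arange(ho, dtype=np.float32) - (ho - 1) / 2.0
x = np.arange(wo, dtype=np.float32) - (wo - 1) / 2.0
X, Y = np.meshgrid(x, y)
gt = np.exp(-0.5 * ((X / sigma) ** 2 + (Y / sigma) ** 2))
print(gt.shape, gt.max())    # (17, 17), peak of 1.0 at the center cell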
Example #13
    def track(self, sess, handle, logdir='/tmp'):
        """Runs tracking on a single image sequence.

    Args:
      sess: TensorFlow Session object.
      handle: a handle which generates image files and target pos in 1st frame,
        which mimic the interface of VOT.
    Returns:
      A list of Trajectories sorted by descending score.
    """
        # Get initial target bounding box and convert to center based
        bbox = handle.region()
        bbox = convert_bbox(bbox, 'center-based')

        # Feed in the first frame image to set initial state.
        # Note we use different padding values for each image while the original implementation uses only the average value
        # of the first image for all images.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        input_feed = [handle.frame(), bbox_feed]
        frame2crop_scale = self.model.initialize(sess, input_feed)

        # Storing target state
        original_target_height = bbox.height
        original_target_width = bbox.width
        search_center = np.array(
            [get_center(self.x_image_size),
             get_center(self.x_image_size)])
        current_target_state = TargetState(bbox=bbox,
                                           search_pos=search_center,
                                           scale_idx=int(
                                               get_center(self.num_scales)))

        # If track first frame
        include_first = get(self.track_config, 'include_first', False)
        logging.info('tracking include first -- {}'.format(include_first))

        # Run tracking loop
        i = -1  # Processing the i th frame in image sequence,
        # note that we will use the first image twice in total.
        # 1. It is used to initialize the tracker
        # 2. It is used as a test example for tracker, the detected result won't affect the final metrics though.
        #    this is needed because both OTB and VOT benchmark require a list of tracking results equal to the
        #    length of the test image sequences including the first image.
        while True:
            # Read new image
            filename = handle.frame()
            if not filename:
                if self.log_level > 0:
                    np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])
                break  # All image files are processed, exiting while loop
            i += 1
            if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
                # Prepare input feed
                bbox_feed = [
                    current_target_state.bbox.y, current_target_state.bbox.x,
                    current_target_state.bbox.height,
                    current_target_state.bbox.width
                ]
                input_feed = [filename, bbox_feed]

                # Feed in input
                outputs, metadata = self.model.inference_step(sess, input_feed)
                search_scale_list = outputs['scale_xs']
                response = outputs['response']

                # Choose the scale whose response map has the highest peak
                if self.num_scales > 1:
                    current_scale_idx = int(get_center(self.num_scales))
                    best_scale = current_scale_idx
                    best_peak = -np.inf
                    for s in range(self.num_scales):
                        this_response = response[s]
                        this_peak = np.max(this_response[:])

                        # Penalize change of scale
                        if s != current_scale_idx:
                            this_peak *= self.track_config['scale_penalty']
                        if this_peak > best_peak:
                            best_peak = this_peak
                            best_scale = s
                else:
                    best_scale = 0

                response = response[best_scale]

                if self.log_level > 0:
                    np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)),
                            [best_scale])
                    np.save(osp.join(logdir, 'response{}.npy'.format(i)),
                            response)

                # Normalize response
                with np.errstate(
                        all='raise'):  # Raise error if something goes wrong
                    logging.debug('mean response: {}'.format(
                        np.mean(response)))
                    response = response - np.min(response)
                    response = response / np.sum(response)

                # Apply windowing
                window_influence = self.track_config['window_influence']
                response = (1 - window_influence
                            ) * response + window_influence * self.window
                if self.log_level > 0:
                    np.save(
                        osp.join(logdir, 'response_windowed{}.npy'.format(i)),
                        response)

                # Find maximum response
                r_max, c_max = np.unravel_index(response.argmax(),
                                                response.shape)

                # Convert from crop-relative coordinates to frame coordinates
                p_coor = np.array([r_max, c_max])
                # displacement from the center in instance final representation ...
                disp_instance_final = p_coor - get_center(self.r_image_size)
                # ... in instance feature space ...
                upsampling_factor = self.r_image_size / self.r_embed_size
                disp_instance_feat = disp_instance_final / upsampling_factor
                # ... Avoid empty position ...
                r_radius = int(self.r_embed_size / 2)
                disp_instance_feat = np.maximum(
                    np.minimum(disp_instance_feat, r_radius), -r_radius)
                # ... in instance input ...
                disp_instance_input = disp_instance_feat * self.model_config[
                    'stride']
                # ... in instance original crop (in frame coordinates)
                disp_instance_frame = disp_instance_input / search_scale_list[
                    best_scale]
                # Position within frame in frame coordinates
                y = current_target_state.bbox.y
                x = current_target_state.bbox.x
                y += disp_instance_frame[0]
                x += disp_instance_frame[1]

                # Target scale damping and saturation
                target_scale = current_target_state.bbox.height / original_target_height
                search_factor = self.search_factors[best_scale]
                scale_damp = self.track_config[
                    'scale_damp']  # damping factor for scale update
                target_scale *= ((1 - scale_damp) * 1.0 +
                                 scale_damp * search_factor)
                target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

                # Some book keeping
                height = original_target_height * target_scale
                width = original_target_width * target_scale
                current_target_state.bbox = Rectangle(x, y, width, height)
                current_target_state.scale_idx = best_scale
                current_target_state.search_pos = search_center + disp_instance_input

                assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
                       'target position in feature space should be no larger than input image size'
                assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
                       'target position in feature space should be no larger than input image size'
                logging.debug('search_position: {}'.format(
                    current_target_state.search_pos))

            # I used to put this at the beginning of the loop, which makes the code look better visually.
            # But it is also harder to really understand the logic behind that and more prone to bugs.
            # My opinion now is *easy is better than concise, if you can't have both.*
            # Record tracked target position
            reported_bbox = convert_bbox(current_target_state.bbox,
                                         'top-left-based')
            handle.report(reported_bbox)