Example #1
def construct_seg_score_maps(response_size, bboxes, im_size):
    """Construct a batch of groundtruth score maps

    Args:
      response_size: A list or tuple with two elements [ho, wo]
      bboxes: Labels for bounding boxes
      im_size: Image size

    Return:
      A float tensor of shape [batch_size] + response_size
    """
    with tf.name_scope('construct_gt'):
        ho = response_size[0]
        wo = response_size[1]
        y = tf.cast(tf.range(0, ho), dtype=tf.float32) - get_center(ho)
        x = tf.cast(tf.range(0, wo), dtype=tf.float32) - get_center(wo)
        [X, Y] = tf.meshgrid(x, y)

        def _logistic_label(Y, X, H, W):
            Y = tf.abs(Y)
            X = tf.abs(X)
            Z = tf.where(Y <= H * ho / im_size[0] / 2, tf.ones_like(Y),
                         tf.zeros_like(Y))
            Z = tf.where(X <= W * wo / im_size[1] / 2, Z, tf.zeros_like(X))
            return Z

        gt = tf.map_fn(lambda x: _logistic_label(Y, X, tf.to_float(x[0]),
                                                 tf.to_float(x[1])),
                       bboxes,
                       dtype=tf.float32)
        return gt
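For reference, the rectangular label built above can be reproduced in plain NumPy; this is a minimal sketch, assuming get_center(s) returns (s - 1) / 2 and that each bbox supplies a (height, width) pair, as the TF version suggests:

import numpy as np

def get_center(s):
    return (s - 1) / 2.0  # assumed helper

def seg_score_map_np(response_size, bbox_hw, im_size):
    """One groundtruth map for a single (H, W) box, mirroring _logistic_label."""
    ho, wo = response_size
    y = np.arange(ho, dtype=np.float32) - get_center(ho)
    x = np.arange(wo, dtype=np.float32) - get_center(wo)
    X, Y = np.meshgrid(x, y)
    H, W = bbox_hw
    inside_y = np.abs(Y) <= H * ho / im_size[0] / 2
    inside_x = np.abs(X) <= W * wo / im_size[1] / 2
    return (inside_y & inside_x).astype(np.float32)

# A 40x80 box in a 255x255 image projected onto a 17x17 response map.
print(seg_score_map_np([17, 17], (40, 80), (255, 255)))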
Example #2
def get_exemplar_images(images, exemplar_size, targets_pos=None):
    """Crop exemplar image from input images"""
    with tf.name_scope('get_exemplar_image'):
        batch_size, x_height, x_width = images.get_shape().as_list()[:3]
        z_height, z_width = exemplar_size

        if targets_pos is None:
            target_pos_single = [[get_center(x_height), get_center(x_width)]]
            targets_pos_ = tf.tile(target_pos_single, [batch_size, 1])
        else:
            targets_pos_ = targets_pos

        # convert to top-left corner based coordinates
        top = tf.to_int32(tf.round(targets_pos_[:, 0] - get_center(z_height)))
        bottom = tf.to_int32(top + z_height)
        left = tf.to_int32(tf.round(targets_pos_[:, 1] - get_center(z_width)))
        right = tf.to_int32(left + z_width)

        def _slice(x):
            f, t, l, b, r = x
            c = f[t:b, l:r]
            return c

        exemplar_img = tf.map_fn(_slice, (images, top, left, bottom, right),
                                 dtype=images.dtype)
        exemplar_img.set_shape([batch_size, z_height, z_width, 3])
        return exemplar_img
Example #3
def get_exemplar_images(images, exemplar_size, targets_pos=None):
  """Crop exemplar image from input images"""
  with tf.name_scope('get_exemplar_image'):
    batch_size, x_height, x_width = images.get_shape().as_list()[:3]
    z_height, z_width = exemplar_size

    if targets_pos is None:
      target_pos_single = [[get_center(x_height), get_center(x_width)]]
      targets_pos_ = tf.tile(target_pos_single, [batch_size, 1])
    else:
      targets_pos_ = targets_pos

    # convert to top-left corner based coordinates
    top = tf.to_int32(tf.round(targets_pos_[:, 0] - get_center(z_height)))
    bottom = tf.to_int32(top + z_height)
    left = tf.to_int32(tf.round(targets_pos_[:, 1] - get_center(z_width)))
    right = tf.to_int32(left + z_width)

    def _slice(x):
      f, t, l, b, r = x
      c = f[t:b, l:r]
      return c

    exemplar_img = tf.map_fn(_slice, (images, top, left, bottom, right), dtype=images.dtype)
    exemplar_img.set_shape([batch_size, z_height, z_width, 3])
    return exemplar_img
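Examples #2 and #3 list the same function; the corner arithmetic it relies on is easy to check with a small NumPy sketch (get_center is assumed to be (s - 1) / 2):

import numpy as np

def get_center(s):
    return (s - 1) / 2.0  # assumed helper

def crop_exemplar_np(image, exemplar_size, target_pos=None):
    """Crop a (z_height, z_width) patch centered on target_pos, given as (y, x)."""
    x_height, x_width = image.shape[:2]
    z_height, z_width = exemplar_size
    if target_pos is None:  # default: center of the search image
        target_pos = (get_center(x_height), get_center(x_width))
    top = int(round(target_pos[0] - get_center(z_height)))
    left = int(round(target_pos[1] - get_center(z_width)))
    return image[top:top + z_height, left:left + z_width]

patch = crop_exemplar_np(np.zeros((255, 255, 3)), (127, 127))
print(patch.shape)  # (127, 127, 3)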
Example #4
    def build_search_images(self):
        """Crop search images from the input image based on the last target position

        1. The input image is scaled such that the area of target&context takes up to (scale_factor * z_image_size) ^ 2
        2. Crop an image patch as large as x_image_size centered at the target center.
        3. If the cropped image region is beyond the boundary of the input image, mean values are padded.
        """

        size_z = 127
        size_x = 255
        context_amount = 0.5

        num_scales = 3
        scales = np.arange(num_scales) - get_center(num_scales)
        assert np.sum(scales) == 0, 'scales should be symmetric'
        search_factors = [1.0375**x for x in scales]

        frame_sz = tf.shape(self.image)
        target_yx = self.target_bbox_feed[0:2]
        target_size = self.target_bbox_feed[2:4]
        avg_chan = tf.reduce_mean(self.image, axis=(0, 1), name='avg_chan')

        # Compute base values
        base_z_size = target_size
        base_z_context_size = base_z_size + context_amount * tf.reduce_sum(
            base_z_size)
        base_s_z = tf.sqrt(
            tf.reduce_prod(base_z_context_size))  # Canonical size
        base_scale_z = tf.div(tf.to_float(size_z), base_s_z)
        d_search = (size_x - size_z) / 2.0
        base_pad = tf.div(d_search, base_scale_z)
        base_s_x = base_s_z + 2 * base_pad
        base_scale_x = tf.div(tf.to_float(size_x), base_s_x)

        boxes = []
        for factor in search_factors:
            s_x = factor * base_s_x
            frame_sz_1 = tf.to_float(frame_sz[0:2] - 1)
            topleft = tf.div(target_yx - get_center(s_x), frame_sz_1)
            bottomright = tf.div(target_yx + get_center(s_x), frame_sz_1)
            box = tf.concat([topleft, bottomright], axis=0)
            boxes.append(box)
        boxes = tf.stack(boxes)

        scale_xs = []
        for factor in search_factors:
            scale_x = base_scale_x / factor
            scale_xs.append(scale_x)
        self.scale_xs = tf.stack(scale_xs)

        # Note we use different padding values for each image
        # while the original implementation uses only the average value
        # of the first image for all images.
        image_minus_avg = tf.expand_dims(self.image - avg_chan, 0)
        image_cropped = tf.image.crop_and_resize(image_minus_avg,
                                                 boxes,
                                                 box_ind=tf.zeros((3),
                                                                  tf.int32),
                                                 crop_size=[size_x, size_x])
        self.search_images = image_cropped + avg_chan
Example #5
  def build_search_images(self):
    """Crop search images from the input image based on the last target position

    1. The input image is scaled such that the area of target&context takes up to (scale_factor * z_image_size) ^ 2
    2. Crop an image patch as large as x_image_size centered at the target center.
    3. If the cropped image region is beyond the boundary of the input image, mean values are padded.
    """
    model_config = self.model_config
    track_config = self.track_config

    size_z = model_config['z_image_size']
    size_x = track_config['x_image_size']
    context_amount = 0.5

    num_scales = track_config['num_scales']
    scales = np.arange(num_scales) - get_center(num_scales)
    assert np.sum(scales) == 0, 'scales should be symmetric'
    search_factors = [track_config['scale_step'] ** x for x in scales]

    frame_sz = tf.shape(self.image)
    target_yx = self.target_bbox_feed[0:2]
    target_size = self.target_bbox_feed[2:4]
    avg_chan = tf.reduce_mean(self.image, axis=(0, 1), name='avg_chan')

    # Compute base values
    base_z_size = target_size
    base_z_context_size = base_z_size + context_amount * tf.reduce_sum(base_z_size)
    base_s_z = tf.sqrt(tf.reduce_prod(base_z_context_size))  # Canonical size
    base_scale_z = tf.div(tf.to_float(size_z), base_s_z)
    d_search = (size_x - size_z) / 2.0
    base_pad = tf.div(d_search, base_scale_z)
    base_s_x = base_s_z + 2 * base_pad
    base_scale_x = tf.div(tf.to_float(size_x), base_s_x)

    boxes = []
    for factor in search_factors:
      s_x = factor * base_s_x
      frame_sz_1 = tf.to_float(frame_sz[0:2] - 1)
      topleft = tf.div(target_yx - get_center(s_x), frame_sz_1)
      bottomright = tf.div(target_yx + get_center(s_x), frame_sz_1)
      box = tf.concat([topleft, bottomright], axis=0)
      boxes.append(box)
    boxes = tf.stack(boxes)

    scale_xs = []
    for factor in search_factors:
      scale_x = base_scale_x / factor
      scale_xs.append(scale_x)
    self.scale_xs = tf.stack(scale_xs)

    # Note we use different padding values for each image
    # while the original implementation uses only the average value
    # of the first image for all images.
    image_minus_avg = tf.expand_dims(self.image - avg_chan, 0)
    image_cropped = tf.image.crop_and_resize(image_minus_avg, boxes,
                                             box_ind=tf.zeros((track_config['num_scales']), tf.int32),
                                             crop_size=[size_x, size_x])
    self.search_images = image_cropped + avg_chan
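The context and scale arithmetic shared by Examples #4 and #5 reduces to a few lines of NumPy; this sketch uses the 127/255 exemplar/search sizes and the 1.0375 scale step that Example #4 hard-codes, so treat the numbers as illustrative defaults:

import numpy as np

def search_region_sizes(target_h, target_w, size_z=127, size_x=255,
                        context_amount=0.5, scale_step=1.0375, num_scales=3):
    """Canonical exemplar size and per-scale search-region sizes in image pixels."""
    base_z_size = np.array([target_h, target_w], dtype=np.float64)
    base_z_context_size = base_z_size + context_amount * base_z_size.sum()
    base_s_z = np.sqrt(np.prod(base_z_context_size))  # canonical size
    base_scale_z = size_z / base_s_z
    d_search = (size_x - size_z) / 2.0
    base_pad = d_search / base_scale_z
    base_s_x = base_s_z + 2 * base_pad
    scales = np.arange(num_scales) - (num_scales - 1) / 2.0
    return base_s_z, [scale_step ** s * base_s_x for s in scales]

print(search_region_sizes(50.0, 80.0))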
Example #6
  def build_search_images(self):
    """Crop search images from the input image based on the last target position
    1. The input image is scaled such that the area of target&context takes up to (scale_factor * z_image_size) ^ 2
    2. Crop an image patch as large as x_image_size centered at the target center.
    3. If the cropped image region is beyond the boundary of the input image, mean values are padded.
    """
    model_config = self.model_config
    track_config = self.track_config
    ratio = self.target_bbox_feed[2] / self.target_bbox_feed[3]  

    size_z = model_config['z_image_size']
    size_x = self.size_x_feed
    context_amount = 0.3
    
    num_scales = track_config['num_scales']
    scales = np.arange(num_scales) - get_center(num_scales)
    assert np.sum(scales) == 0, 'scales should be symmetric'
    search_factors = tf.split(self.scale_feed, 3)

    frame_sz = tf.shape(self.image)
    target_yx = self.target_bbox_feed[0:2]
    target_size = self.target_bbox_feed[2:4]
    avg_chan = tf.reduce_mean(self.image, axis=(0, 1), name='avg_chan')
    
    # Compute base values
    base_z_size = target_size
    base_z_context_size = base_z_size + context_amount * tf.reduce_sum(base_z_size)
    base_s_z = tf.sqrt(tf.reduce_prod(base_z_context_size))  # Canonical size
    base_scale_z = tf.div(tf.to_float(size_z), base_s_z)
    d_search = tf.div(tf.to_float(size_x) - tf.to_float(size_z), 2.0)
    base_pad = tf.div(d_search, base_scale_z)
    base_s_x = base_s_z + 2 * base_pad
    base_scale_x = tf.div(tf.to_float(size_x), base_s_x)
    
    boxes = []
    for factor in search_factors:
      s_x = factor * base_s_x
      frame_sz_1 = tf.to_float(frame_sz[0:2] - 1)
      topleft = tf.div(target_yx - get_center(s_x), frame_sz_1)
      bottomright = tf.div(target_yx + get_center(s_x), frame_sz_1)
      box = tf.concat([topleft, bottomright], axis=0)
      boxes.append(box)
    boxes = tf.stack(boxes)
    
    self.target_size = target_size * 127. / base_s_z    
    
    scale_xs = []
    for factor in search_factors:
      scale_x = base_scale_x / factor
      scale_xs.append(scale_x)
    self.scale_xs = tf.stack(scale_xs)

    # Pad with average value of the image
    image_minus_avg = tf.expand_dims(self.image - avg_chan, 0)
    image_cropped = tf.image.crop_and_resize(image_minus_avg, boxes,
                                             box_ind=tf.zeros((track_config['num_scales']), tf.int32),
                                             crop_size=[size_x, size_x])
    self.search_images = image_cropped + avg_chan
Example #7
def convert_bbox_format(bbox, to):
  x, y, target_width, target_height = bbox.x, bbox.y, bbox.width, bbox.height
  if to == 'top-left-based':
    x -= get_center(target_width)
    y -= get_center(target_height)
  elif to == 'center-based':
    y += get_center(target_height)
    x += get_center(target_width)
  else:
    raise ValueError("Bbox format: {} was not recognized".format(to))
  return Rectangle(x, y, target_width, target_height)
Example #8
def convert_bbox_format(bbox, to):
    x, y, target_width, target_height = bbox.x, bbox.y, bbox.width, bbox.height
    if to == 'top-left-based':
        x -= get_center(target_width)
        y -= get_center(target_height)
    elif to == 'center-based':
        y += get_center(target_height)
        x += get_center(target_width)
    else:
        raise ValueError("Bbox format: {} was not recognized".format(to))
    return Rectangle(x, y, target_width, target_height)
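Examples #7 and #8 are the same conversion; here is a self-contained round trip, assuming Rectangle is an (x, y, width, height) namedtuple as the comment in Example #25 indicates and get_center is (s - 1) / 2:

from collections import namedtuple

Rectangle = namedtuple('Rectangle', ['x', 'y', 'width', 'height'])  # assumed layout

def get_center(s):
    return (s - 1) / 2.0  # assumed helper

def convert_bbox_format(bbox, to):
    x, y, w, h = bbox.x, bbox.y, bbox.width, bbox.height
    if to == 'top-left-based':
        x -= get_center(w)
        y -= get_center(h)
    elif to == 'center-based':
        x += get_center(w)
        y += get_center(h)
    else:
        raise ValueError("Bbox format: {} was not recognized".format(to))
    return Rectangle(x, y, w, h)

center = convert_bbox_format(Rectangle(10, 20, 50, 30), 'center-based')
print(center)                                         # Rectangle(x=34.5, y=34.5, width=50, height=30)
print(convert_bbox_format(center, 'top-left-based'))  # back to x=10.0, y=20.0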
Example #9
    def build_examplar_ini(self):
        model_config = self.model_config
        track_config = self.track_config

        # Exemplar image lies at the center of the search image in the first frame
        exemplar_images = get_exemplar_images(
            self.search_images,
            [model_config['z_image_size'], model_config['z_image_size']])
        [templates_ini, templates] = self.get_image_embedding(exemplar_images)
        center_scale = int(get_center(track_config['num_scales']))
        center_template = tf.identity(templates[center_scale])
        templates = tf.stack(
            [center_template for _ in range(track_config['num_scales'])])

        center_template_ini = tf.identity(templates_ini[center_scale])
        templates_ini = tf.stack(
            [center_template_ini for _ in range(track_config['num_scales'])])

        with tf.variable_scope('target_examplar'):
            # Store template in Variable such that we don't have to feed this template every time.
            with tf.variable_scope('State'):
                state = tf.get_variable(
                    'exemplar',
                    initializer=tf.zeros(templates_ini.get_shape().as_list(),
                                         dtype=templates_ini.dtype),
                    trainable=False)
                with tf.control_dependencies([templates_ini]):
                    self.init_examplar = tf.assign(state,
                                                   templates_ini,
                                                   validate_shape=True)
                self.examplar_ini = state
Example #10
def construct_gt_score_maps(response_size,
                            batch_size,
                            stride,
                            gt_config=None,
                            n_out=1):
    """Construct a batch of groundtruth score maps

    Args:
      response_size: A list or tuple with two elements [ho, wo]
      batch_size: An integer e.g., 16
      stride: Embedding stride e.g., 8
      gt_config: Configurations for groundtruth generation

    Return:
      A float tensor of shape [batch_size] + response_size
    """
    with tf.name_scope('construct_gt'):
        ho = response_size[0]
        wo = response_size[1]
        y = tf.cast(tf.range(0, ho), dtype=tf.float32) - get_center(ho)
        x = tf.cast(tf.range(0, wo), dtype=tf.float32) - get_center(wo)
        [Y, X] = tf.meshgrid(y, x)

        def _logistic_label(X, Y, rPos, rNeg):
            # dist_to_center = tf.sqrt(tf.square(X) + tf.square(Y))  # L2 metric
            dist_to_center = tf.abs(X) + tf.abs(Y)  # Block metric
            Z = tf.where(
                dist_to_center <= rPos, tf.ones_like(X),
                tf.where(dist_to_center < rNeg, 0.5 * tf.ones_like(X),
                         tf.zeros_like(X)))
            return Z

        rPos = gt_config['rPos'] / stride
        rNeg = gt_config['rNeg'] / stride
        gt = _logistic_label(X, Y, rPos, rNeg)

        # Duplicate a batch of maps
        if n_out > 1:
            gt_expand = tf.reshape(gt, [1] + response_size + [1])
            gt = tf.tile(gt_expand, [batch_size, 1, 1, n_out])
        else:
            gt_expand = tf.reshape(gt, [1] + response_size)
            gt = tf.tile(gt_expand, [batch_size, 1, 1])
        return gt
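The block-metric labelling above is easier to inspect in NumPy; a minimal sketch with get_center assumed to be (s - 1) / 2 (for the square response maps used here the meshgrid order does not matter):

import numpy as np

def get_center(s):
    return (s - 1) / 2.0  # assumed helper

def gt_score_map_np(response_size, stride, rPos, rNeg):
    """1 inside rPos/stride, 0.5 between rPos/stride and rNeg/stride, 0 outside."""
    ho, wo = response_size
    y = np.arange(ho, dtype=np.float32) - get_center(ho)
    x = np.arange(wo, dtype=np.float32) - get_center(wo)
    X, Y = np.meshgrid(x, y)
    dist_to_center = np.abs(X) + np.abs(Y)  # block metric
    rp, rn = rPos / stride, rNeg / stride
    return np.where(dist_to_center <= rp, 1.0, np.where(dist_to_center < rn, 0.5, 0.0))

print(gt_score_map_np([9, 9], stride=8, rPos=16, rNeg=40))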
Example #11
    def track_vot_init(self, sess, first_bbox, frame):
        # Get initial target bounding box and convert to center based
        bbox = convert_bbox_format(first_bbox, 'center-based')

        # Feed in the first frame image to set initial state.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        input_feed = [frame, bbox_feed]
        frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

        # Storing target state
        self.vot_original_target_height = bbox.height
        self.vot_original_target_width = bbox.width
        self.vot_search_center = np.array(
            [get_center(self.x_image_size),
             get_center(self.x_image_size)])
        self.vot_current_target_state = TargetState(
            bbox=bbox,
            search_pos=self.vot_search_center,
            scale_idx=int(get_center(self.num_scales)))
Example #12
    def init(self, sess, frame, first_bbox, logdir='/tmp'):
        # Get initial target bounding box and convert to center based
        self.i = 0
        first_bbox = Rectangle(first_bbox[0], first_bbox[1], first_bbox[2],
                               first_bbox[3])
        bbox = convert_bbox_format(first_bbox, 'center-based')

        # Feed in the first frame image to set initial state.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        input_feed = [
            frame, bbox_feed, self.x_image_size_init, self.search_factors_init
        ]
        frame2crop_scale, self.image_z = self.siamese_model.initialize(
            sess, input_feed)
        imwrite(osp.join(logdir, 'aimagez.jpg'),
                cv2.cvtColor(self.image_z, cv2.COLOR_RGB2BGR))

        # Storing target state
        self.original_target_height = bbox.height
        self.original_target_width = bbox.width
        self.search_center = np.array([
            get_center(self.x_image_size_init),
            get_center(self.x_image_size_init)
        ])
        self.current_target_state = TargetState(
            bbox=bbox,
            search_pos=self.search_center,
            scale_idx=int(get_center(self.num_scales)))

        self.store_thresh = 0.9
        self.conf_thresh = 0.7
        self.bound_thresh = 0.5
        self.sup_thresh = 0.1

        self.mem_count = 0
        self.update_delay = 0
        self.lost = 0
        self.x_image_size = self.x_image_size_init
        self.image_c = None
        self.moved2border = False
        self.prev_score = self.conf_thresh + 0.01
        return True
Example #13
    def initialize(self, sess, first_bbox, frame, logdir='/tmp'):
        """Runs tracking on a single image sequence."""
        # Get initial target bounding box and convert to center based
        bbox = convert_bbox_format(first_bbox, 'center-based')

        # Feed in the first frame image to set initial state.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        input_feed = [frame, bbox_feed]
        self.frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

        # Storing target state
        self.original_target_height = bbox.height
        self.original_target_width = bbox.width
        self.search_center = np.array([get_center(self.x_image_size),
                                       get_center(self.x_image_size)])
        self.current_target_state = TargetState(bbox=bbox,
                                                search_pos=self.search_center,
                                                scale_idx=int(get_center(self.num_scales)))
        self.logdir = logdir
        self.frame_cnt = 0
Example #14
    def __init__(self, siamese_model, model_config, track_config):
        self.siamese_model = siamese_model
        self.model_config = model_config
        self.track_config = track_config

        self.num_scales = track_config['num_scales']
        logging.info('track num scales -- {}'.format(self.num_scales))
        scales = np.arange(self.num_scales) - get_center(self.num_scales)
        self.search_factors_init = [
            self.track_config['scale_step']**x for x in scales
        ]
        scales_5 = np.arange(self.num_scales +
                             2) - get_center(self.num_scales + 2)
        self.search_factors_init5 = [
            self.track_config['scale_step']**x for x in scales_5
        ]

        self.x_image_size_init = track_config[
            'x_image_size']  # Search image size
        self.window = None  # Cosine window
        self.log_level = track_config['log_level']
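The scale pyramid built in this constructor is a symmetric geometric series around 1.0; for instance, with an assumed scale_step of 1.0375 (the real value comes from track_config):

import numpy as np

num_scales, scale_step = 3, 1.0375
scales = np.arange(num_scales) - (num_scales - 1) / 2.0   # [-1., 0., 1.]
search_factors = [scale_step ** x for x in scales]
print(search_factors)  # approximately [0.9639, 1.0, 1.0375]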
Example #15
    def build_examplar(self):
        model_config = self.model_config
        track_config = self.track_config

        # Exemplar image lies at the center of the search image in the first frame
        exemplar_images = get_exemplar_images(
            self.search_images,
            [model_config['z_image_size'], model_config['z_image_size']])
        [_, templates] = self.get_image_embedding(exemplar_images,
                                                  reuse=tf.AUTO_REUSE)
        center_scale = int(get_center(track_config['num_scales']))
        self.examplar = tf.identity(templates[center_scale])
        self.examplar = tf.nn.relu(self.examplar)
Example #16
  def __init__(self, siamese_model, model_config, track_config):
    self.siamese_model = siamese_model
    self.model_config = model_config
    self.track_config = track_config

    self.num_scales = track_config['num_scales']
    logging.info('track num scales -- {}'.format(self.num_scales))
    scales = np.arange(self.num_scales) - get_center(self.num_scales)
    self.search_factors = [self.track_config['scale_step'] ** x for x in scales]

    self.x_image_size = track_config['x_image_size']  # Search image size
    self.window = None  # Cosine window
    self.log_level = track_config['log_level']
Example #17
def construct_gt_score_maps(response_size, batch_size, stride, gt_config=None):
  """Construct a batch of groundtruth score maps

  Args:
    response_size: A list or tuple with two elements [ho, wo]
    batch_size: An integer e.g., 16
    stride: Embedding stride e.g., 8
    gt_config: Configurations for groundtruth generation

  Return:
    A float tensor of shape [batch_size] + response_size
  """
  with tf.name_scope('construct_gt'):
    ho = response_size[0]
    wo = response_size[1]
    y = tf.cast(tf.range(0, ho), dtype=tf.float32) - get_center(ho)
    x = tf.cast(tf.range(0, wo), dtype=tf.float32) - get_center(wo)
    [Y, X] = tf.meshgrid(y, x)

    def _logistic_label(X, Y, rPos, rNeg):
      # dist_to_center = tf.sqrt(tf.square(X) + tf.square(Y))  # L2 metric
      dist_to_center = tf.abs(X) + tf.abs(Y)  # Block metric
      Z = tf.where(dist_to_center <= rPos,
                   tf.ones_like(X),
                   tf.where(dist_to_center < rNeg,
                            0.5 * tf.ones_like(X),
                            tf.zeros_like(X)))
      return Z

    rPos = gt_config['rPos'] / stride
    rNeg = gt_config['rNeg'] / stride
    gt = _logistic_label(X, Y, rPos, rNeg)

    # Duplicate a batch of maps
    gt_expand = tf.reshape(gt, [1] + response_size)
    gt = tf.tile(gt_expand, [batch_size, 1, 1])
    return gt
Example #18
def get_subwindow_avg(im, pos, model_sz, original_sz):
    # avg_chans = np.mean(im, axis=(0, 1)) # This version is 3x slower
    avg_chans = [
        np.mean(im[:, :, 0]),
        np.mean(im[:, :, 1]),
        np.mean(im[:, :, 2])
    ]
    if not original_sz:
        original_sz = model_sz
    sz = original_sz
    im_sz = im.shape
    # make sure the size is not too small
    assert im_sz[0] > 2 and im_sz[1] > 2
    c = [get_center(s) for s in sz]

    # check out-of-bounds coordinates, and set them to avg_chans
    context_xmin = np.int(np.round(pos[1] - c[1]))
    context_xmax = np.int(context_xmin + sz[1] - 1)
    context_ymin = np.int(np.round(pos[0] - c[0]))
    context_ymax = np.int(context_ymin + sz[0] - 1)
    left_pad = np.int(np.maximum(0, -context_xmin))
    top_pad = np.int(np.maximum(0, -context_ymin))
    right_pad = np.int(np.maximum(0, context_xmax - im_sz[1] + 1))
    bottom_pad = np.int(np.maximum(0, context_ymax - im_sz[0] + 1))

    context_xmin = context_xmin + left_pad
    context_xmax = context_xmax + left_pad
    context_ymin = context_ymin + top_pad
    context_ymax = context_ymax + top_pad
    if top_pad > 0 or bottom_pad > 0 or left_pad > 0 or right_pad > 0:
        R = np.pad(im[:, :, 0], ((top_pad, bottom_pad), (left_pad, right_pad)),
                   'constant',
                   constant_values=(avg_chans[0]))
        G = np.pad(im[:, :, 1], ((top_pad, bottom_pad), (left_pad, right_pad)),
                   'constant',
                   constant_values=(avg_chans[1]))
        B = np.pad(im[:, :, 2], ((top_pad, bottom_pad), (left_pad, right_pad)),
                   'constant',
                   constant_values=(avg_chans[2]))

        im = np.stack((R, G, B), axis=2)

    im_patch_original = im[context_ymin:context_ymax + 1,
                           context_xmin:context_xmax + 1, :]
    if not (model_sz[0] == original_sz[0] and model_sz[1] == original_sz[1]):
        im_patch = resize(im_patch_original, tuple(model_sz))
    else:
        im_patch = im_patch_original
    return im_patch, left_pad, top_pad, right_pad, bottom_pad
Example #19
def get_subwindow_avg(im, pos, model_sz, original_sz):
  # avg_chans = np.mean(im, axis=(0, 1)) # This version is 3x slower
  avg_chans = [np.mean(im[:, :, 0]), np.mean(im[:, :, 1]), np.mean(im[:, :, 2])]
  if not original_sz:
    original_sz = model_sz
  sz = original_sz
  im_sz = im.shape
  # make sure the size is not too small
  assert im_sz[0] > 2 and im_sz[1] > 2
  c = [get_center(s) for s in sz]

  # check out-of-bounds coordinates, and set them to avg_chans
  context_xmin = np.int(np.round(pos[1] - c[1]))
  context_xmax = np.int(context_xmin + sz[1] - 1)
  context_ymin = np.int(np.round(pos[0] - c[0]))
  context_ymax = np.int(context_ymin + sz[0] - 1)
  left_pad = np.int(np.maximum(0, -context_xmin))
  top_pad = np.int(np.maximum(0, -context_ymin))
  right_pad = np.int(np.maximum(0, context_xmax - im_sz[1] + 1))
  bottom_pad = np.int(np.maximum(0, context_ymax - im_sz[0] + 1))

  context_xmin = context_xmin + left_pad
  context_xmax = context_xmax + left_pad
  context_ymin = context_ymin + top_pad
  context_ymax = context_ymax + top_pad
  if top_pad > 0 or bottom_pad > 0 or left_pad > 0 or right_pad > 0:
    R = np.pad(im[:, :, 0], ((top_pad, bottom_pad), (left_pad, right_pad)),
               'constant', constant_values=(avg_chans[0]))
    G = np.pad(im[:, :, 1], ((top_pad, bottom_pad), (left_pad, right_pad)),
               'constant', constant_values=(avg_chans[1]))
    B = np.pad(im[:, :, 2], ((top_pad, bottom_pad), (left_pad, right_pad)),
               'constant', constant_values=(avg_chans[2]))

    im = np.stack((R, G, B), axis=2)

  im_patch_original = im[context_ymin:context_ymax + 1,
                      context_xmin:context_xmax + 1, :]
  if not (model_sz[0] == original_sz[0] and model_sz[1] == original_sz[1]):
    im_patch = resize(im_patch_original, tuple(model_sz))
  else:
    im_patch = im_patch_original
  return im_patch, left_pad, top_pad, right_pad, bottom_pad
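Examples #18 and #19 share the same out-of-bounds bookkeeping; the padding amounts can be sketched in plain Python (get_center assumed to be (s - 1) / 2):

def get_center(s):
    return (s - 1) / 2.0  # assumed helper

def crop_pads(im_shape, pos, sz):
    """Padding needed for a (sz[0], sz[1]) crop centered at pos (y, x)."""
    c = [get_center(s) for s in sz]
    ymin = int(round(pos[0] - c[0]))
    ymax = int(ymin + sz[0] - 1)
    xmin = int(round(pos[1] - c[1]))
    xmax = int(xmin + sz[1] - 1)
    top_pad = max(0, -ymin)
    left_pad = max(0, -xmin)
    bottom_pad = max(0, ymax - im_shape[0] + 1)
    right_pad = max(0, xmax - im_shape[1] + 1)
    return left_pad, top_pad, right_pad, bottom_pad

# A 127x127 window centered near the top-left corner of a 480x640 image.
print(crop_pads((480, 640), pos=(20, 30), sz=(127, 127)))  # (33, 43, 0, 0)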
Example #20
    def __init__(self, siamese_model, model_config, track_config):
        self.siamese_model = siamese_model
        self.model_config = model_config
        self.track_config = track_config

        self.num_scales = track_config['num_scales']
        logging.info('track num scales -- {}'.format(self.num_scales))
        scales = np.arange(self.num_scales) - get_center(self.num_scales)
        self.search_factors = [
            self.track_config['scale_step']**x for x in scales
        ]

        self.x_image_size = track_config['x_image_size']  # Search image size
        self.window = None  # Cosine window
        self.log_level = track_config['log_level']

        self.vot_original_target_height = None
        self.vot_original_target_width = None
        self.vot_current_target_state = None
        self.vot_search_center = None
Example #21
def get_bg_images(images, exemplar_size, original_size):
    x_height, x_width = original_size
    z_height, z_width = exemplar_size
    exem_r = int(get_center(z_height))
    topleft = get_exemplar_images(images,
                                  exemplar_size,
                                  targets_pos=np.array([[exem_r, exem_r]]))
    topright = get_exemplar_images(images,
                                   exemplar_size,
                                   targets_pos=np.array(
                                       [[exem_r, x_width - exem_r - 1]]))
    bottomleft = get_exemplar_images(images,
                                     exemplar_size,
                                     targets_pos=np.array(
                                         [[x_height - exem_r - 1, exem_r]]))
    bottomright = get_exemplar_images(
        images,
        exemplar_size,
        targets_pos=np.array([[x_height - exem_r - 1, x_width - exem_r - 1]]))
    return tf.concat([topleft, topright, bottomleft, bottomright], 0)
Example #22
  def build_template(self):
    model_config = self.model_config
    track_config = self.track_config

    # Exemplar image lies at the center of the search image in the first frame
    exemplar_images = get_exemplar_images(self.search_images, [model_config['z_image_size'],
                                                               model_config['z_image_size']])
    templates = self.get_image_embedding(exemplar_images)
    center_scale = int(get_center(track_config['num_scales']))
    center_template = tf.identity(templates[center_scale])
    templates = tf.stack([center_template for _ in range(track_config['num_scales'])])

    with tf.variable_scope('target_template'):
      # Store template in Variable such that we don't have to feed this template every time.
      with tf.variable_scope('State'):
        state = tf.get_variable('exemplar',
                                initializer=tf.zeros(templates.get_shape().as_list(), dtype=templates.dtype),
                                trainable=False)
        with tf.control_dependencies([templates]):
          self.init = tf.assign(state, templates, validate_shape=True)
        self.templates = state
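The pattern in Examples #22 and #23 (compute the exemplar embedding once, store it in a non-trainable Variable, and read it back on later frames instead of feeding it every time) can be reduced to a minimal sketch; this uses tf.compat.v1 so it also runs on a TF2 install, and the shapes are illustrative only:

import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

template_feed = tf.placeholder(tf.float32, [1, 6, 6, 256], name='template_feed')
with tf.variable_scope('target_template'):
    with tf.variable_scope('State'):
        state = tf.get_variable('exemplar',
                                initializer=tf.zeros([1, 6, 6, 256], tf.float32),
                                trainable=False)
        init_op = tf.assign(state, template_feed, validate_shape=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # First frame: write the embedding into the Variable once.
    sess.run(init_op, feed_dict={template_feed: np.ones([1, 6, 6, 256], np.float32)})
    # Later frames: just read the cached template, no feed needed.
    print(sess.run(state).mean())  # 1.0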
Example #23
    def build_template(self):

        # Exemplar image lies at the center of the search image in the first frame
        exemplar_images = get_exemplar_images(self.search_images, [127, 127])
        templates = self.get_image_embedding(exemplar_images, self.classid)
        center_scale = int(get_center(3))
        center_template = tf.identity(templates[center_scale])
        templates = tf.stack([center_template for _ in range(3)])

        with tf.variable_scope('target_template'):
            # Store template in Variable such that we don't have to feed this template every time.
            with tf.variable_scope('State'):
                state = tf.get_variable('exemplar',
                                        initializer=tf.zeros(
                                            templates.get_shape().as_list(),
                                            dtype=templates.dtype),
                                        trainable=False)
                with tf.control_dependencies([templates]):
                    self.init = tf.assign(state,
                                          templates,
                                          validate_shape=True)
                self.templates = state
Example #24
    def build_template(self):
        model_config = self.model_config
        track_config = self.track_config
        size_z = model_config['z_image_size']
        ratio = self.target_size[0] / self.target_size[1]

        # Exemplar image lies at the center of the search image in the first frame
        center_scale = int(get_center(track_config['num_scales']))
        search_images = tf.expand_dims(self.search_images[center_scale], 0)
        exemplar_images = get_exemplar_images(
            search_images, [size_z, size_z],
            np.array([[
                get_center(track_config['x_image_size']),
                get_center(track_config['x_image_size'])
            ]]))

        def boundary_suppression(embeds, embeds2, ratio):
            offsets = tf.cond(
                tf.greater(ratio, 1.5), lambda: [0, 4, 0, 4],
                lambda: tf.cond(tf.less(ratio, 0.67), lambda: [4, 0, 4, 0],
                                lambda: [2, 2, 2, 2]))
            embeds = tf.image.resize_image_with_crop_or_pad(
                embeds, t_shape[1] - offsets[0], t_shape[2] - offsets[1])
            embeds = tf.image.resize_image_with_crop_or_pad(
                embeds, t_shape[1], t_shape[2])
            embeds2 = tf.image.resize_image_with_crop_or_pad(
                embeds2, t_shape2[1] - offsets[2], t_shape2[2] - offsets[3])
            embeds2 = tf.image.resize_image_with_crop_or_pad(
                embeds2, t_shape2[1], t_shape2[2])
            return embeds, embeds2

        def background_suppression(embeds, ratio):
            offsets = tf.cond(
                tf.greater(ratio, 1.5),  # 1.2 / 0.83; 1.5 / 0.67
                lambda: [1., 1.2 / ratio],
                lambda: tf.cond(
                    tf.less(ratio, 0.67), lambda: [1.2 * ratio, 1.], lambda: tf
                    .cond(
                        tf.greater(ratio, 1.2), lambda: [1., 1.1 / ratio],
                        lambda: tf.cond(tf.less(ratio, 0.83), lambda: [
                            1.1 * ratio, 1.
                        ], lambda: [0.7, 0.7]))))

            h = tf.cast(size_z * offsets[0], tf.int32)
            w = tf.cast(size_z * offsets[1], tf.int32)

            embeds_mean = tf.reduce_mean(embeds, axis=(0, 1), keepdims=True)
            embeds = embeds - embeds_mean
            embeds = tf.image.resize_image_with_crop_or_pad(embeds, h, w)
            embeds = tf.image.resize_image_with_crop_or_pad(
                embeds, size_z, size_z)
            return embeds + embeds_mean

        exemplar_images = tf.map_fn(
            lambda x: background_suppression(x[0], x[1]),
            (exemplar_images, tf.expand_dims(ratio, 0)),
            dtype=exemplar_images.dtype)
        self.exemplar_images = exemplar_images

        templates, templates2 = self.get_image_embedding(exemplar_images)
        t_shape = templates.get_shape().as_list()
        t_shape2 = templates2.get_shape().as_list()

        templates, templates2 = tf.map_fn(
            lambda x: boundary_suppression(x[0], x[1], x[2]),
            (templates, templates2, tf.expand_dims(ratio, 0)),
            dtype=(templates.dtype, templates2.dtype))

        templates = templates
        templates2 = templates2

        with tf.variable_scope('target_template'):
            # Store template in Variable such that we don't have to feed this template every time.
            with tf.variable_scope('State'):
                state = tf.get_variable('exemplar',
                                        initializer=tf.zeros(
                                            templates.get_shape().as_list(),
                                            dtype=templates.dtype),
                                        trainable=False)
                state2 = tf.get_variable('exemplar2',
                                         initializer=tf.zeros(
                                             templates2.get_shape().as_list(),
                                             dtype=templates2.dtype),
                                         trainable=False)
                with tf.control_dependencies([templates, templates2]):
                    self.init = tf.assign(state,
                                          templates,
                                          validate_shape=True)
                    self.init2 = tf.assign(state2,
                                           templates2,
                                           validate_shape=True)

                self.templates = state
                self.templates2 = state2

                # Store Pseudo Templates
                def _euc_distance(x, z):
                    z = tf.expand_dims(z, 0)
                    return tf.reduce_sum(tf.abs(x - z), -1)

                num_k = 3  # 3
                state_pseu = []
                state_pseu2 = []
                image_pseu = []
                self.init_pseu = []
                self.init2_pseu = []
                self.init_pseu_img = []
                for i in range(num_k):
                    state_pseu.append(
                        tf.get_variable('exemplar_pseu' + str(i),
                                        initializer=tf.zeros(
                                            templates.get_shape().as_list(),
                                            dtype=templates.dtype),
                                        trainable=False))
                    state_pseu2.append(
                        tf.get_variable('exemplar2_pseu' + str(i),
                                        initializer=tf.zeros(
                                            templates2.get_shape().as_list(),
                                            dtype=templates2.dtype),
                                        trainable=False))
                    image_pseu.append(
                        tf.get_variable(
                            'exemplar_pseu_image' + str(i),
                            initializer=tf.zeros(
                                exemplar_images.get_shape().as_list(),
                                dtype=exemplar_images.dtype),
                            trainable=False))
                    with tf.control_dependencies(
                        [templates, templates2, exemplar_images]):
                        self.init_pseu.append(
                            tf.assign(state_pseu[i],
                                      templates,
                                      validate_shape=True))
                        self.init2_pseu.append(
                            tf.assign(state_pseu2[i],
                                      templates2,
                                      validate_shape=True))
                        self.init_pseu_img.append(
                            tf.assign(image_pseu[i],
                                      exemplar_images,
                                      validate_shape=True))

                self.image_pseu = image_pseu
                self.pseu_temp = state_pseu
                self.pseu_temp2 = state_pseu2

                state_pseus = tf.concat([self.templates] + state_pseu +
                                        [templates], 0)
                sp_shape = state_pseus.get_shape().as_list()[0]
                state_pseus_c = tf.reshape(state_pseus, [sp_shape, -1])
                state_pseus_dis = tf.map_fn(
                    lambda x: _euc_distance(state_pseus_c, x),
                    state_pseus_c,
                    dtype=state_pseus_c.dtype)
                state_pseus_dis = tf.reshape(state_pseus_dis,
                                             [sp_shape, sp_shape])[1:, :]
                state_pseus_dis = tf.reduce_sum(state_pseus_dis, -1)
                self.state_pseus_dis = state_pseus_dis
                _, state_pseus_idx = tf.nn.top_k(state_pseus_dis,
                                                 k=len(state_pseu))

                image_pseu_extra = tf.concat(image_pseu + [exemplar_images], 0)
                state_pseus2 = tf.concat(state_pseu2 + [templates2], 0)
                self.up_img = []
                self.up_pseu = []
                self.up2_pseu = []
                for i in range(len(state_pseu)):
                    with tf.control_dependencies([
                            state_pseus_idx, image_pseu_extra, state_pseus,
                            state_pseus2
                    ]):
                        self.up_pseu.append(
                            tf.assign(state_pseu[i],
                                      tf.expand_dims(
                                          state_pseus[state_pseus_idx[i] + 1],
                                          0),
                                      validate_shape=True))
                        self.up2_pseu.append(
                            tf.assign(state_pseu2[i],
                                      tf.expand_dims(
                                          state_pseus2[state_pseus_idx[i]], 0),
                                      validate_shape=True))
                        self.up_img.append(
                            tf.assign(image_pseu[i],
                                      tf.expand_dims(
                                          image_pseu_extra[state_pseus_idx[i]],
                                          0),
                                      validate_shape=True))
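The nested tf.cond expressions in boundary_suppression and background_suppression above encode a small aspect-ratio lookup (ratio is target height over width); written as plain Python for readability, this is a sketch of the same branching, not part of the original graph code:

def boundary_offsets(ratio):
    """Crop offsets applied to the two embeddings, keyed by target aspect ratio."""
    if ratio > 1.5:        # much taller than wide
        return [0, 4, 0, 4]
    if ratio < 0.67:       # much wider than tall
        return [4, 0, 4, 0]
    return [2, 2, 2, 2]    # roughly square

def background_scale(ratio):
    """Fractions of size_z kept before padding back to size_z x size_z."""
    if ratio > 1.5:
        return [1.0, 1.2 / ratio]
    if ratio < 0.67:
        return [1.2 * ratio, 1.0]
    if ratio > 1.2:
        return [1.0, 1.1 / ratio]
    if ratio < 0.83:
        return [1.1 * ratio, 1.0]
    return [0.7, 0.7]

print(boundary_offsets(2.0), background_scale(2.0))  # [0, 4, 0, 4] [1.0, 0.6]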
Example #25
def main(_):
    # load model
    model_config, _, track_config = load_cfgs(CHECKPOINT)
    track_config["log_level"] = 0
    track_config["is_video"] = True

    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        restore_fn = model.build_graph_from_config(model_config, track_config,
                                                   CHECKPOINT)
    g.finalize()

    if not os.path.isdir(track_config['log_dir']):
        tf.logging.info('Creating inference directory: %s',
                        track_config['log_dir'])
        mkdir_p(track_config['log_dir'])

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)
    with tf.Session(graph=g, config=sess_config) as sess:
        restore_fn(sess)
        tracker = Tracker(model,
                          model_config=model_config,
                          track_config=track_config)
        video_name = os.path.basename(FLAGS.video_path)
        video_log_dir = os.path.join(track_config["log_dir"], video_name)
        mkdir_p(video_log_dir)

        if str(FLAGS.video_path) in ["0", "1"]:
            # read from camera
            video_path = int(FLAGS.video_path)
            with_camera = True
        else:
            # read from video
            video_path = glob(os.path.join(FLAGS.video_path, "*.mp4"))[0]
            with_camera = False

        video_capture = cv2.VideoCapture(video_path)

        bb = [-1, -1, -1, -1]
        cv2.namedWindow("template")
        cv2.setMouseCallback("template", draw_init_box, bb)

        trajectory = []
        f_count = 0
        f_rate = 0
        start_time = time.time()
        while True:
            # capture frame by frame
            ret_, frame = video_capture.read()
            if ret_ == False:
                continue
            f_width, f_height = [
                int(a) for a in FLAGS.video_resolution.split("*")
            ]
            try:
                o_frame = cv2.resize(frame, (f_width, f_height),
                                     interpolation=cv2.INTER_CUBIC)
            except:
                break
            i_frame = cv2.cvtColor(o_frame, cv2.COLOR_BGR2RGB)

            # cv2.imwrite("test.jpg",o_frame)
            # pdb.set_trace()

            if f_count == 0:  # initialize the tracker
                # wait for drawing init box
                while True:
                    init_frame = o_frame.copy()
                    cv2.imshow("template", init_frame)
                    k = cv2.waitKey(0)
                    if k == 32:  # space
                        cx = int((bb[0] + bb[2]) / 2)
                        cy = int((bb[1] + bb[3]) / 2)
                        w = int(bb[2] - bb[0])
                        h = int(bb[3] - bb[1])
                        # Rectangle: [x,y,width,height]
                        init_bb = Rectangle(cx - 1, cy - 1, w,
                                            h)  # 0-index in python
                        draw_box(init_frame, init_bb, "exemplar")
                        break

                first_box = convert_bbox_format(init_bb, "center-based")
                bbox_feed = [
                    first_box.y, first_box.x, first_box.height, first_box.width
                ]
                input_feed = [i_frame, bbox_feed]
                frame2crop_scale = tracker.siamese_model.initialize(
                    sess, input_feed)
                # Storing target state
                original_target_height = first_box.height
                original_target_width = first_box.width
                search_center = np.array([
                    get_center(tracker.x_image_size),
                    get_center(tracker.x_image_size)
                ])
                current_target_state = TargetState(
                    bbox=first_box,
                    search_pos=search_center,
                    scale_idx=int(get_center(tracker.num_scales)))
                # setup initialized params
                current_param = {
                    "original_target_width": original_target_width,
                    "original_target_height": original_target_height,
                    "search_center": search_center,
                    "current_target_state": current_target_state
                }

            bbox, current_param = tracker.track_frame(sess, i_frame,
                                                      current_param,
                                                      video_log_dir)
            # add overlays
            end_time = time.time()
            f_rate = int(1 / (end_time - start_time))
            start_time = time.time()
            draw_box(o_frame, bbox)
            cv2.putText(o_frame,
                        str(f_rate) + "fps", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        1, (0, 0, 255),
                        thickness=2,
                        lineType=2)

            trajectory.append(bbox)
            f_count += 1

            cv2.imshow("Real-time Ouput", o_frame)
            cv2.imshow("template", init_frame)
            # if f_count > 30:
            #     cv2.imwrite("test.jpg",o_frame)
            #     pdb.set_trace()
            if cv2.waitKey(1) & 0xFF == ord("q"):
                cv2.imwrite("./assets/instance.jpg", o_frame)
                cv2.imwrite("./assets/exemplar.jpg", init_frame)
                break

        video_capture.release()
        cv2.destroyAllWindows()

        # save track results
        # pdb.set_trace()
        with open(os.path.join(video_log_dir, "track_rect.txt"), "w") as f:
            for region in trajectory:
                rect_str = "{},{},{},{}\n".format(region.x + 1, region.y + 1,
                                                  region.width, region.height)
                f.write(rect_str)
Example #26
    def build_search_images(self):
        """Crop search images from the input image based on the last target position

        1. The input image is scaled such that the area of target&context takes up to (scale_factor * z_image_size) ^ 2
        2. Crop an image patch as large as x_image_size centered at the target center.
        3. If the cropped image region is beyond the boundary of the input image, mean values are padded.
        """
        model_config = self.model_config
        track_config = self.track_config

        size_z = model_config['z_image_size']
        size_x = track_config['x_image_size']
        context_amount = 0.5

        num_scales = track_config['num_scales']
        scales = np.arange(num_scales) - get_center(num_scales)
        assert np.sum(scales) == 0, 'scales should be symmetric'
        search_factors = [track_config['scale_step']**x for x in scales]

        frame_sz = tf.shape(self.image)
        target_yx = self.target_bbox_feed[0:2]
        target_size = self.target_bbox_feed[2:4]
        avg_chan = tf.reduce_mean(self.image, axis=(0, 1), name='avg_chan')

        # Compute base values
        base_z_size = target_size
        base_z_context_size = base_z_size + context_amount * tf.reduce_sum(
            base_z_size)
        base_s_z = tf.sqrt(
            tf.reduce_prod(base_z_context_size))  # Canonical size
        base_scale_z = tf.div(tf.to_float(size_z), base_s_z)
        d_search = (size_x - size_z) / 2.0
        base_pad = tf.div(d_search, base_scale_z)
        base_s_x = base_s_z + 2 * base_pad
        base_scale_x = tf.div(tf.to_float(size_x), base_s_x)

        # Note we use different padding values for each image
        # while the original implementation uses only the average value
        # of the first image for all images.
        image_minus_avg = self.image - avg_chan

        # for original implementation, fail on TX2
        #
        # image_minus_avg = tf.expand_dims(image_minus_avg, 0)
        # boxes = []
        # for factor in search_factors:
        #     s_x = factor * base_s_x
        #     frame_sz_1 = tf.to_float(frame_sz[0:2] - 1)
        #     topleft = tf.div(target_yx - get_center(s_x), frame_sz_1)
        #     bottomright = tf.div(target_yx + get_center(s_x), frame_sz_1)
        #     box = tf.concat([topleft, bottomright], axis=0)
        #     boxes.append(box)
        # boxes = tf.stack(boxes)
        # image_cropped = tf.image.crop_and_resize(image_minus_avg, boxes,
        #                                          box_ind=tf.zeros((track_config['num_scales']), tf.int32),
        #                                          crop_size=[size_x, size_x])

        def pad_frame(im, frame_sz, topleft, bottomright):
            xleft_pad = tf.maximum(0, -tf.cast(tf.round(topleft[1]), tf.int32))
            ytop_pad = tf.maximum(0, -tf.cast(tf.round(topleft[0]), tf.int32))
            xright_pad = tf.maximum(
                0,
                tf.cast(tf.round(bottomright[1]), tf.int32) - frame_sz[1])
            ybottom_pad = tf.maximum(
                0,
                tf.cast(tf.round(bottomright[0]), tf.int32) - frame_sz[0])
            npad = tf.reduce_max(
                [xleft_pad, ytop_pad, xright_pad, ybottom_pad])
            paddings = [[npad, npad], [npad, npad], [0, 0]]
            im_padded = im
            im_padded = tf.pad(im_padded,
                               paddings,
                               mode='CONSTANT',
                               constant_values=0)
            return im_padded, npad

        def extract_crops(im, npad, topleft, bottomright):
            # get top-right corner of bbox and consider padding
            tr_x = npad + tf.cast(tf.round(topleft[1]), tf.int32)
            # Compute size from rounded co-ords to ensure rectangle lies inside padding.
            tr_y = npad + tf.cast(tf.round(topleft[0]), tf.int32)
            width = tf.round(bottomright[1]) - tf.round(topleft[1])
            height = tf.round(bottomright[0]) - tf.round(topleft[0])
            crop = tf.image.crop_to_bounding_box(im, tf.cast(tr_y, tf.int32),
                                                 tf.cast(tr_x, tf.int32),
                                                 tf.cast(height, tf.int32),
                                                 tf.cast(width, tf.int32))
            # crop = tf.image.resize_images(crop, [sz_dst, sz_dst], method=tf.image.ResizeMethod.BILINEAR)
            # crops = tf.expand_dims(crop, axis=0)
            return crop

        image_cropped = []
        for factor in search_factors:
            s_x = factor * base_s_x
            frame_sz = tf.to_int32(frame_sz[0:2])
            topleft = target_yx - get_center(s_x)
            bottomright = target_yx + get_center(s_x)

            image_crop, npad = pad_frame(image_minus_avg, frame_sz, topleft,
                                         bottomright)
            image_crop = extract_crops(image_crop, npad, topleft, bottomright)
            image_crop = tf.image.resize_images(
                image_crop, [size_x, size_x],
                method=tf.image.ResizeMethod.BILINEAR)
            image_cropped.append(image_crop)

        image_cropped = tf.stack(image_cropped)

        scale_xs = []
        for factor in search_factors:
            scale_x = base_scale_x / factor
            scale_xs.append(scale_x)
        self.scale_xs = tf.stack(scale_xs, name='out_scale_xs')

        self.debug = image_cropped
        self.search_images = tf.add(image_cropped,
                                    avg_chan,
                                    name="out_search_images")
Example #27
    def build_template(self):
        model_config = self.model_config
        track_config = self.track_config

        # Exemplar image lies at the center of the search image in the first frame
        exemplar_images = get_exemplar_images(
            self.search_images,
            [model_config['z_image_size'], model_config['z_image_size']])
        templates_s_c5, templates_s_c4, templates_s_c3, templates_a_c5, templates_a_c4, templates_a_c3 = self.get_image_embedding(
            exemplar_images)
        # =============================================================================
        #     templates_s_c5, templates_s_c4, templates_s_c3 = self.get_image_embedding(exemplar_images)
        # =============================================================================
        # =============================================================================
        #     templates_a_c5, templates_a_c4, templates_a_c3 = self.get_image_embedding(exemplar_images)
        # =============================================================================
        center_scale = int(get_center(track_config['num_scales']))

        center_template_s_c5 = tf.identity(templates_s_c5[center_scale])
        center_template_s_c4 = tf.identity(templates_s_c4[center_scale])
        center_template_s_c3 = tf.identity(templates_s_c3[center_scale])
        templates_s_c5 = tf.stack(
            [center_template_s_c5 for _ in range(track_config['num_scales'])])
        templates_s_c4 = tf.stack(
            [center_template_s_c4 for _ in range(track_config['num_scales'])])
        templates_s_c3 = tf.stack(
            [center_template_s_c3 for _ in range(track_config['num_scales'])])

        center_template_a_c5 = tf.identity(templates_a_c5[center_scale])
        center_template_a_c4 = tf.identity(templates_a_c4[center_scale])
        center_template_a_c3 = tf.identity(templates_a_c3[center_scale])
        templates_a_c5 = tf.stack(
            [center_template_a_c5 for _ in range(track_config['num_scales'])])
        templates_a_c4 = tf.stack(
            [center_template_a_c4 for _ in range(track_config['num_scales'])])
        templates_a_c3 = tf.stack(
            [center_template_a_c3 for _ in range(track_config['num_scales'])])

        with tf.variable_scope('target_template'):
            # Store template in Variable such that we don't have to feed this template every time.
            with tf.variable_scope('State'):
                state_s_c5 = tf.get_variable(
                    'exemplar_s_c5',
                    initializer=tf.zeros(templates_s_c5.get_shape().as_list(),
                                         dtype=templates_s_c5.dtype),
                    trainable=False)
                state_s_c4 = tf.get_variable(
                    'exemplar_s_c4',
                    initializer=tf.zeros(templates_s_c4.get_shape().as_list(),
                                         dtype=templates_s_c4.dtype),
                    trainable=False)
                state_s_c3 = tf.get_variable(
                    'exemplar_s_c3',
                    initializer=tf.zeros(templates_s_c3.get_shape().as_list(),
                                         dtype=templates_s_c3.dtype),
                    trainable=False)

                state_a_c5 = tf.get_variable(
                    'exemplar_a_c5',
                    initializer=tf.zeros(templates_a_c5.get_shape().as_list(),
                                         dtype=templates_a_c5.dtype),
                    trainable=False)
                state_a_c4 = tf.get_variable(
                    'exemplar_a_c4',
                    initializer=tf.zeros(templates_a_c4.get_shape().as_list(),
                                         dtype=templates_a_c4.dtype),
                    trainable=False)
                state_a_c3 = tf.get_variable(
                    'exemplar_a_c3',
                    initializer=tf.zeros(templates_a_c3.get_shape().as_list(),
                                         dtype=templates_a_c3.dtype),
                    trainable=False)

                with tf.control_dependencies([templates_s_c5]):
                    self.init_s_c5 = tf.assign(state_s_c5,
                                               templates_s_c5,
                                               validate_shape=True)
                with tf.control_dependencies([templates_s_c4]):
                    self.init_s_c4 = tf.assign(state_s_c4,
                                               templates_s_c4,
                                               validate_shape=True)
                with tf.control_dependencies([templates_s_c3]):
                    self.init_s_c3 = tf.assign(state_s_c3,
                                               templates_s_c3,
                                               validate_shape=True)

                with tf.control_dependencies([templates_a_c5]):
                    self.init_a_c5 = tf.assign(state_a_c5,
                                               templates_a_c5,
                                               validate_shape=True)
                with tf.control_dependencies([templates_a_c4]):
                    self.init_a_c4 = tf.assign(state_a_c4,
                                               templates_a_c4,
                                               validate_shape=True)
                with tf.control_dependencies([templates_a_c3]):
                    self.init_a_c3 = tf.assign(state_a_c3,
                                               templates_a_c3,
                                               validate_shape=True)

                self.templates_s_c5 = state_s_c5
                self.templates_s_c4 = state_s_c4
                self.templates_s_c3 = state_s_c3

                self.templates_a_c5 = state_a_c5
                self.templates_a_c4 = state_a_c4
                self.templates_a_c3 = state_a_c3
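
The block above caches per-level exemplar embeddings in non-trainable Variables so they are computed once on the first frame and then read on every inference step instead of being re-fed. A minimal TF1-style sketch of that caching pattern in isolation (the placeholder name and shape are illustrative, not taken from the model):

import tensorflow as tf

templates = tf.placeholder(tf.float32, [3, 6, 6, 256], name='templates')  # hypothetical shape
with tf.variable_scope('cached_template'):
    state = tf.get_variable('exemplar',
                            initializer=tf.zeros([3, 6, 6, 256], dtype=tf.float32),
                            trainable=False)
    init_op = tf.assign(state, templates, validate_shape=True)
# Run init_op once when the tracker is initialized; later steps read `state` directly.
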
Ejemplo n.º 28
0
  def track(self, sess, first_bbox, frames, logdir='/tmp'):
    """Runs tracking on a single image sequence."""
    # Get initial target bounding box and convert to center based
    bbox = convert_bbox_format(first_bbox, 'center-based')

    # Feed in the first frame image to set initial state.
    bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
    input_feed = [frames[0], bbox_feed]
    frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

    # Storing target state
    original_target_height = bbox.height
    original_target_width = bbox.width
    search_center = np.array([get_center(self.x_image_size),
                              get_center(self.x_image_size)])
    current_target_state = TargetState(bbox=bbox,
                                       search_pos=search_center,
                                       scale_idx=int(get_center(self.num_scales)))

    include_first = get(self.track_config, 'include_first', False)
    logging.info('Tracking include first -- {}'.format(include_first))

    # Run tracking loop
    reported_bboxs = []
    for i, filename in enumerate(frames):
      if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
        bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                     current_target_state.bbox.height, current_target_state.bbox.width]
        input_feed = [filename, bbox_feed]

        outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
        search_scale_list = outputs['scale_xs']
        response = outputs['response']
        response_size = response.shape[1]

        # Choose the scale whose response map has the highest peak
        if self.num_scales > 1:
          response_max = np.max(response, axis=(1, 2))
          penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales))
          current_scale_idx = int(get_center(self.num_scales))
          penalties[current_scale_idx] = 1.0
          response_penalized = response_max * penalties
          best_scale = np.argmax(response_penalized)
        else:
          best_scale = 0

        response = response[best_scale]

        with np.errstate(all='raise'):  # Raise error if something goes wrong
          response = response - np.min(response)
          response = response / np.sum(response)

        if self.window is None:
          window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                          np.expand_dims(np.hanning(response_size), 0))
          self.window = window / np.sum(window)  # normalize window
        window_influence = self.track_config['window_influence']
        response = (1 - window_influence) * response + window_influence * self.window

        # Find maximum response
        r_max, c_max = np.unravel_index(response.argmax(),
                                        response.shape)

        # Convert from crop-relative coordinates to frame coordinates
        p_coor = np.array([r_max, c_max])
        # displacement from the center in instance final representation ...
        disp_instance_final = p_coor - get_center(response_size)
        # ... in instance feature space ...
        upsample_factor = self.track_config['upsample_factor']
        disp_instance_feat = disp_instance_final / upsample_factor
        # ... Avoid empty position ...
        r_radius = int(response_size / upsample_factor / 2)
        disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius)
        # ... in instance input ...
        disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride']
        # ... in instance original crop (in frame coordinates)
        disp_instance_frame = disp_instance_input / search_scale_list[best_scale]
        # Position within frame in frame coordinates
        y = current_target_state.bbox.y
        x = current_target_state.bbox.x
        y += disp_instance_frame[0]
        x += disp_instance_frame[1]

        # Target scale damping and saturation
        target_scale = current_target_state.bbox.height / original_target_height
        search_factor = self.search_factors[best_scale]
        scale_damp = self.track_config['scale_damp']  # damping factor for scale update
        target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
        target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

        # Some bookkeeping
        height = original_target_height * target_scale
        width = original_target_width * target_scale
        current_target_state.bbox = Rectangle(x, y, width, height)
        current_target_state.scale_idx = best_scale
        current_target_state.search_pos = search_center + disp_instance_input

        assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
          'target position in feature space should lie within the search image'
        assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
          'target position in feature space should lie within the search image'

        if self.log_level > 0:
          np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

          # Select the image at the best-scoring scale and convert it to uint8
          image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
          # Note that imwrite in cv2 assumes the image is in BGR format.
          # However, the cropped image returned by TensorFlow is RGB.
          # Therefore, we convert color format using cv2.cvtColor
          imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                  cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

          np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale])
          np.save(osp.join(logdir, 'response{}.npy'.format(i)), response)

          y_search, x_search = current_target_state.search_pos
          search_scale = search_scale_list[best_scale]
          target_height_search = height * search_scale
          target_width_search = width * search_scale
          bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search)
          bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
          np.save(osp.join(logdir, 'bbox{}.npy'.format(i)),
                  [bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height])

      reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based')
      reported_bboxs.append(reported_bbox)
    return reported_bboxs
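
The loop above min-shifts and sum-normalizes the chosen response map, then blends it with a normalized Hanning window to penalize large displacements before taking the argmax. A self-contained NumPy sketch of that step, with an assumed response size and window_influence value:

import numpy as np

response_size = 272          # assumed upsampled response size
window_influence = 0.176     # assumed track_config['window_influence']

response = np.random.rand(response_size, response_size)  # stand-in for the network output
response -= response.min()
response /= response.sum()

window = np.outer(np.hanning(response_size), np.hanning(response_size))
window /= window.sum()

response = (1 - window_influence) * response + window_influence * window
r_max, c_max = np.unravel_index(response.argmax(), response.shape)
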
Ejemplo n.º 29
0
    def build_template(self):
        model_config = self.model_config
        track_config = self.track_config
        size_z = model_config['z_image_size']
        ratio = self.target_bbox_feed[2] / self.target_bbox_feed[3]

        # Exemplar image lies at the center of the search image in the first frame
        search_images = self.search_images
        shape = search_images.get_shape().as_list()
        exemplar_images = get_exemplar_images(
            search_images, [size_z, size_z],
            tf.tile([[
                get_center(track_config['x_image_size']),
                get_center(track_config['x_image_size'])
            ]], [shape[0], 1]))
        center_scale = int(get_center(track_config['num_scales']))
        exemplar_images = tf.expand_dims(exemplar_images[center_scale], 0)

        def background_suppression(embeds, embeds2, ratio):
            offsets = tf.cond(
                tf.greater(ratio, 1.5), lambda: [0, 2, 0],
                lambda: tf.cond(tf.less(ratio, 0.66), lambda: [2, 0, 0],
                                lambda: [1, 1, 0]))
            embeds = tf.image.resize_image_with_crop_or_pad(
                embeds, t_shape[1] - offsets[0], t_shape[2] - offsets[1])
            embeds = tf.image.resize_image_with_crop_or_pad(
                embeds, t_shape[1], t_shape[2])
            embeds2 = tf.image.resize_image_with_crop_or_pad(
                embeds2, t_shape2[1] - offsets[0] * 2,
                t_shape2[2] - offsets[1] * 2)
            embeds2 = tf.image.resize_image_with_crop_or_pad(
                embeds2, t_shape2[1], t_shape2[2])
            return embeds, embeds2

        self.exemplar_images = exemplar_images
        templates, templates2 = self.get_image_embedding(exemplar_images)
        t_shape = templates.get_shape().as_list()
        t_shape2 = templates2.get_shape().as_list()

        templates, templates2 = tf.map_fn(
            lambda x: background_suppression(x[0], x[1], x[2]),
            (templates, templates2, tf.expand_dims(ratio, 0)),
            dtype=(templates.dtype, templates2.dtype))

        with tf.variable_scope('target_template'):
            # Store template in Variable such that we don't have to feed this template every time.
            with tf.variable_scope('State'):
                state = tf.get_variable('exemplar',
                                        initializer=tf.zeros(
                                            templates.get_shape().as_list(),
                                            dtype=templates.dtype),
                                        trainable=False)
                state2 = tf.get_variable('exemplar2',
                                         initializer=tf.zeros(
                                             templates2.get_shape().as_list(),
                                             dtype=templates2.dtype),
                                         trainable=False)
                with tf.control_dependencies([templates, templates2]):
                    self.init = tf.assign(state,
                                          templates,
                                          validate_shape=True)
                    self.init2 = tf.assign(state2,
                                           templates2,
                                           validate_shape=True)

                self.templates = state
                self.templates2 = state2

                # Store Pseudo Templates
                def _euc_distance(x, z):
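                    # Note: despite its name, this computes an L1 distance (sum of absolute differences).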
                    z = tf.expand_dims(z, 0)
                    return tf.reduce_sum(tf.abs(x - z), -1)

                n_mem = 5
                temp1 = tf.concat([templates for _ in range(n_mem)], 0)
                temp2 = tf.concat([templates2 for _ in range(n_mem)], 0)
                temp3 = tf.concat([exemplar_images for _ in range(n_mem)], 0)
                state_mem1 = tf.get_variable('exemplar_mem',
                                             initializer=tf.zeros(
                                                 temp1.get_shape().as_list(),
                                                 dtype=temp1.dtype),
                                             trainable=False)
                state_mem2 = tf.get_variable('exemplar2_mem',
                                             initializer=tf.zeros(
                                                 temp2.get_shape().as_list(),
                                                 dtype=temp2.dtype),
                                             trainable=False)
                image_mem = tf.get_variable('exemplar_image_mem',
                                            initializer=tf.zeros(
                                                temp3.get_shape().as_list(),
                                                dtype=temp3.dtype),
                                            trainable=False)
                with tf.control_dependencies([temp1, temp2, temp3]):
                    self.init_mem = tf.assign(state_mem1,
                                              temp1,
                                              validate_shape=True)
                    self.init_mem2 = tf.assign(state_mem2,
                                               temp2,
                                               validate_shape=True)
                    self.init_img_mem = tf.assign(image_mem,
                                                  temp3,
                                                  validate_shape=True)

                up_mem = tf.scatter_update(state_mem1, self.mem_id_feed,
                                           templates[0])
                up_mem2 = tf.scatter_update(state_mem2, self.mem_id_feed,
                                            templates2[0])
                up_img_mem = tf.scatter_update(image_mem, self.mem_id_feed,
                                               exemplar_images[0])
                with tf.control_dependencies([up_mem, up_mem2, up_img_mem]):
                    self.up_mem = up_mem
                    self.up_mem2 = up_mem2
                    self.up_img_mem = up_img_mem

                state_pseu = []
                state_pseu2 = []
                image_pseu = []
                self.init_pseu = []
                self.init2_pseu = []
                self.init_pseu_img = []
                for i in range(3):
                    state_pseu.append(
                        tf.get_variable('exemplar_pseu' + str(i),
                                        initializer=tf.zeros(
                                            templates.get_shape().as_list(),
                                            dtype=templates.dtype),
                                        trainable=False))
                    state_pseu2.append(
                        tf.get_variable('exemplar2_pseu' + str(i),
                                        initializer=tf.zeros(
                                            templates2.get_shape().as_list(),
                                            dtype=templates2.dtype),
                                        trainable=False))
                    image_pseu.append(
                        tf.get_variable(
                            'exemplar_pseu_image' + str(i),
                            initializer=tf.zeros(
                                exemplar_images.get_shape().as_list(),
                                dtype=exemplar_images.dtype),
                            trainable=False))
                    with tf.control_dependencies(
                        [templates, templates2, exemplar_images]):
                        self.init_pseu.append(
                            tf.assign(state_pseu[i],
                                      templates,
                                      validate_shape=True))
                        self.init2_pseu.append(
                            tf.assign(state_pseu2[i],
                                      templates2,
                                      validate_shape=True))
                        self.init_pseu_img.append(
                            tf.assign(image_pseu[i],
                                      exemplar_images,
                                      validate_shape=True))

                self.image_pseu = image_pseu
                self.pseu_temp = state_pseu
                self.pseu_temp2 = state_pseu2

                state_pseus = tf.concat([self.templates] + state_pseu +
                                        [state_mem1], 0)
                sp_shape = state_pseus.get_shape().as_list()[0]
                state_pseus_c = tf.reshape(state_pseus, [sp_shape, -1])
                state_pseus_dis = tf.map_fn(
                    lambda x: _euc_distance(state_pseus_c, x),
                    state_pseus_c,
                    dtype=state_pseus_c.dtype)
                state_pseus_dis = tf.reshape(state_pseus_dis,
                                             [sp_shape, sp_shape])[1:, :]
                state_pseus_dis = tf.reduce_sum(state_pseus_dis, -1)
                self.state_pseus_dis = state_pseus_dis
                _, state_pseus_idx = tf.nn.top_k(state_pseus_dis,
                                                 k=len(state_pseu))

                image_pseu_extra = tf.concat(image_pseu + [image_mem], 0)
                state_pseus2 = tf.concat(state_pseu2 + [state_mem2], 0)
                self.up_img = []
                self.up_pseu = []
                self.up2_pseu = []
                for i in range(len(state_pseu)):
                    with tf.control_dependencies([
                            state_pseus_idx, image_pseu_extra, state_pseus,
                            state_pseus2
                    ]):
                        self.up_pseu.append(
                            tf.assign(state_pseu[i],
                                      tf.expand_dims(
                                          state_pseus[state_pseus_idx[i] + 1],
                                          0),
                                      validate_shape=True))
                        self.up2_pseu.append(
                            tf.assign(state_pseu2[i],
                                      tf.expand_dims(
                                          state_pseus2[state_pseus_idx[i]], 0),
                                      validate_shape=True))
                        self.up_img.append(
                            tf.assign(image_pseu[i],
                                      tf.expand_dims(
                                          image_pseu_extra[state_pseus_idx[i]],
                                          0),
                                      validate_shape=True))
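
The pseudo-template bookkeeping above concatenates the current template, the pseudo templates and the memory slots, computes all pairwise distances with _euc_distance (an L1 distance), and keeps the candidates with the largest summed distance via tf.nn.top_k. A NumPy sketch of that diversity-based selection under assumed shapes:

import numpy as np

n_templates = 9                                    # 1 current + 3 pseudo + 5 memory slots (as above)
feats = np.random.rand(n_templates, 6 * 6 * 256)   # flattened embeddings; shape is assumed

pairwise = np.abs(feats[:, None, :] - feats[None, :, :]).sum(-1)   # L1 distances
totals = pairwise[1:].sum(-1)                      # one score per candidate, current template excluded

keep_idx = np.argsort(-totals)[:3]                 # the 3 most dissimilar candidates, as top_k does
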
Ejemplo n.º 30
0
    def track(self, sess, first_bbox, frames, logdir='/tmp'):
        """Runs tracking on a single image sequence."""
        # Get initial target bounding box and convert to center based
        bbox = convert_bbox_format(first_bbox, 'center-based')

        smooth_rate = self.track_config['smooth']
        update_interval = self.track_config['update_interval']
        feature_balance = self.track_config['feature_balance']

        # Feed in the first frame image to set initial state.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        input_feed = [frames[0], bbox_feed]
        frame2crop_scale = self.siamese_model.initialize(sess, input_feed)
        examplar = self.siamese_model.get_examplar(sess, input_feed)
        examplar_smooth = examplar
        st_template = []
        for i in range(self.siamese_model.train_config['time_range']):
            st_template.append(examplar)
        st_template_np = np.array(st_template)
        self.siamese_model.update_st_template_step(sess, st_template_np)

        # Storing target state
        original_target_height = bbox.height
        original_target_width = bbox.width
        search_center = np.array(
            [get_center(self.x_image_size),
             get_center(self.x_image_size)])
        current_target_state = TargetState(bbox=bbox,
                                           search_pos=search_center,
                                           scale_idx=int(
                                               get_center(self.num_scales)))

        include_first = get(self.track_config, 'include_first', False)
        logging.info('Tracking include first -- {}'.format(include_first))

        # Set padding for refining search region
        img = mpimg.imread(frames[0])
        context_amount = self.track_config['context_amount']
        size_z = self.model_config['z_image_size']
        size_x = self.track_config['x_image_size']
        padding_h = 10
        padding_w = 10

        if original_target_height / original_target_width > 2:  #2
            padding_h = 1.4  #1.4
            padding_w = 6

        # Run tracking loop
        reported_bboxs = []
        for i, filename in enumerate(frames):
            if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
                bbox_feed = [
                    current_target_state.bbox.y, current_target_state.bbox.x,
                    current_target_state.bbox.height,
                    current_target_state.bbox.width
                ]
                input_feed = [filename, bbox_feed]

                outputs, metadata = self.siamese_model.inference_step(
                    sess, input_feed)
                search_scale_list = outputs['scale_xs']
                response = outputs['response']
                response2 = outputs['response2']
                response_size = response.shape[1]

                # Choose the scale whose response map has the highest peak
                if self.num_scales > 1:
                    response_max = np.max(response2, axis=(1, 2))
                    penalties = self.track_config['scale_penalty'] * np.ones(
                        (self.num_scales))
                    current_scale_idx = int(get_center(self.num_scales))
                    penalties[current_scale_idx] = 1.0
                    response_penalized = response_max * penalties
                    best_scale = np.argmax(response_penalized)
                else:
                    best_scale = 0

                response = response[best_scale]
                response2 = response2[best_scale]
                response = feature_balance * response + (
                    1 - feature_balance) * response2
                with np.errstate(
                        all='raise'):  # Raise error if something goes wrong
                    response = response - np.min(response)
                    response = response / np.sum(response)

                if self.window is None:
                    window = np.dot(
                        np.expand_dims(np.hanning(response_size), 1),
                        np.expand_dims(np.hanning(response_size), 0))
                    self.window = window / np.sum(window)  # normalize window
                window_influence = self.track_config['window_influence']
                response = (1 - window_influence
                            ) * response + window_influence * self.window

                # Refine the response
                base_z_size = np.array([
                    current_target_state.bbox.height,
                    current_target_state.bbox.width
                ])
                base_z_context_size = base_z_size + context_amount * np.sum(
                    base_z_size)
                base_s_z = np.sqrt(
                    np.prod(base_z_context_size))  # Canonical size
                base_scale_z = size_z / base_s_z
                d_search = (size_x - size_z) / 2.0
                base_pad = d_search / base_scale_z
                base_s_x = base_s_z + 2 * base_pad

                if base_s_x / current_target_state.bbox.height > padding_h:
                    start_h = np.ceil(
                        response_size *
                        (base_s_x -
                         current_target_state.bbox.height * padding_h) /
                        (2 * base_s_x))
                    end_h = np.floor(response_size - start_h)
                    start_h = int(start_h)
                    end_h = int(end_h)
                    response[0:start_h, :] = 0
                    response[end_h:, :] = 0
                if base_s_x / current_target_state.bbox.width > padding_w:
                    start_w = np.ceil(
                        response_size *
                        (base_s_x -
                         current_target_state.bbox.width * padding_w) /
                        (2 * base_s_x))
                    end_w = np.floor(response_size - start_w)
                    start_w = int(start_w)
                    end_w = int(end_w)
                    response[:, :start_w] = 0
                    response[:, end_w:] = 0

                # Find maximum response
                r_max, c_max = np.unravel_index(response.argmax(),
                                                response.shape)

                # Convert from crop-relative coordinates to frame coordinates
                p_coor = np.array([r_max, c_max])
                # displacement from the center in instance final representation ...
                disp_instance_final = p_coor - get_center(response_size)
                # ... in instance feature space ...
                upsample_factor = self.track_config['upsample_factor']
                disp_instance_feat = disp_instance_final / upsample_factor
                # ... Avoid empty position ...
                r_radius = int(response_size / upsample_factor / 2)
                disp_instance_feat = np.maximum(
                    np.minimum(disp_instance_feat, r_radius), -r_radius)
                # ... in instance input ...
                disp_instance_input = disp_instance_feat * self.model_config[
                    'embed_config']['stride']
                # ... in instance original crop (in frame coordinates)
                disp_instance_frame = disp_instance_input / search_scale_list[
                    best_scale]
                # Position within frame in frame coordinates
                y = current_target_state.bbox.y
                x = current_target_state.bbox.x
                y += disp_instance_frame[0]
                x += disp_instance_frame[1]

                # Target scale damping and saturation
                target_scale = current_target_state.bbox.height / original_target_height
                search_factor = self.search_factors[best_scale]
                scale_damp = self.track_config[
                    'scale_damp']  # damping factor for scale update
                target_scale *= ((1 - scale_damp) * 1.0 +
                                 scale_damp * search_factor)
                target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

                # Some bookkeeping
                height = original_target_height * target_scale
                width = original_target_width * target_scale
                current_target_state.bbox = Rectangle(x, y, width, height)
                current_target_state.scale_idx = best_scale
                current_target_state.search_pos = search_center + disp_instance_input

                # Update the spatial-temporal template using gcn
                if i % update_interval == 0:
                    bbox_feed = [
                        current_target_state.bbox.y,
                        current_target_state.bbox.x,
                        current_target_state.bbox.height,
                        current_target_state.bbox.width
                    ]
                    input_feed = [filename, bbox_feed]
                    current_examplar = self.siamese_model.get_examplar(
                        sess, input_feed)
                    # examplar_smooth[2:4,2:4,:] = current_examplar[2:4,2:4,:]
                    examplar_smooth = current_examplar
                    current_examplar = smooth_rate * examplar_smooth + (
                        1 - smooth_rate) * examplar
                    st_template.pop(1)
                    st_template.append(current_examplar)
                    st_template_np = np.array(st_template)
                    self.siamese_model.update_st_template_step(
                        sess, st_template_np)

                assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
                  'target position in feature space should lie within the search image'
                assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
                  'target position in feature space should lie within the search image'

                if self.log_level > 0:
                    np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

                    # Select the image at the best-scoring scale and convert it to uint8
                    image_cropped = outputs['image_cropped'][
                        best_scale].astype(np.uint8)
                    # Note that imwrite in cv2 assumes the image is in BGR format.
                    # However, the cropped image returned by TensorFlow is RGB.
                    # Therefore, we convert color format using cv2.cvtColor
                    imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                            cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                    np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)),
                            [best_scale])
                    np.save(osp.join(logdir, 'response{}.npy'.format(i)),
                            response)

                    y_search, x_search = current_target_state.search_pos
                    search_scale = search_scale_list[best_scale]
                    target_height_search = height * search_scale
                    target_width_search = width * search_scale
                    bbox_search = Rectangle(x_search, y_search,
                                            target_width_search,
                                            target_height_search)
                    bbox_search = convert_bbox_format(bbox_search,
                                                      'top-left-based')
                    np.save(osp.join(logdir, 'bbox{}.npy'.format(i)), [
                        bbox_search.x, bbox_search.y, bbox_search.width,
                        bbox_search.height
                    ])

            reported_bbox = convert_bbox_format(current_target_state.bbox,
                                                'top-left-based')
            reported_bboxs.append(reported_bbox)
        return reported_bboxs
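
Every update_interval frames the tracker above extracts a fresh exemplar, blends it with the first-frame exemplar, and rotates it into the spatio-temporal template list fed to update_st_template_step. A minimal NumPy sketch of that smoothing and queue update, with assumed values and shapes:

import numpy as np

smooth_rate = 0.5                                  # assumed track_config['smooth']
time_range = 3                                     # assumed train_config['time_range']

initial_examplar = np.random.rand(6, 6, 256)       # first-frame embedding (shape assumed)
st_template = [initial_examplar.copy() for _ in range(time_range)]

current_examplar = np.random.rand(6, 6, 256)       # embedding extracted at the current frame
blended = smooth_rate * current_examplar + (1 - smooth_rate) * initial_examplar

st_template.pop(1)                                 # keep slot 0 as the first-frame anchor
st_template.append(blended)
st_template_np = np.array(st_template)             # what the tracker feeds to the model
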
Ejemplo n.º 31
0
    def track(self, sess, first_bbox, frames, logdir='/tmp'):
        """Runs tracking on a single image sequence."""
        # Get initial target bounding box and convert to center based
        bbox = convert_bbox_format(first_bbox, 'center-based')

        # Feed in the first frame image to set initial state.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        input_feed = [frames[0], bbox_feed]
        frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

        # Storing target state
        original_target_height = bbox.height
        original_target_width = bbox.width
        search_center = np.array(
            [get_center(self.x_image_size),
             get_center(self.x_image_size)])
        current_target_state = TargetState(bbox=bbox,
                                           search_pos=search_center,
                                           scale_idx=int(
                                               get_center(self.num_scales)))

        include_first = get(self.track_config, 'include_first', False)
        logging.info('Tracking include first -- {}'.format(include_first))

        # Run tracking loop
        reported_bboxs = []
        for i, filename in enumerate(frames):
            if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
                bbox_feed = [
                    current_target_state.bbox.y, current_target_state.bbox.x,
                    current_target_state.bbox.height,
                    current_target_state.bbox.width
                ]
                input_feed = [filename, bbox_feed]

                outputs, metadata = self.siamese_model.inference_step(
                    sess, input_feed)
                search_scale_list = outputs['scale_xs']
                response = outputs['response']
                response_size = response.shape[1]

                # Choose the scale whose response map has the highest peak
                if self.num_scales > 1:
                    response_max = np.max(response, axis=(1, 2))
                    penalties = self.track_config['scale_penalty'] * np.ones(
                        (self.num_scales))
                    current_scale_idx = int(get_center(self.num_scales))
                    penalties[current_scale_idx] = 1.0
                    response_penalized = response_max * penalties
                    best_scale = np.argmax(response_penalized)
                    if np.max(response_max) < 0:
                        logging.warning('MAX_RESPONSE LESS THAN ZERO!')
                        # best_scale = current_scale_idx
                else:
                    best_scale = 0

                response = response[best_scale]

                with np.errstate(
                        all='raise'):  # Raise error if something goes wrong
                    response = response - np.min(response)
                    response = response / np.sum(response)

                if self.window is None:
                    window = np.dot(
                        np.expand_dims(np.hanning(response_size), 1),
                        np.expand_dims(np.hanning(response_size), 0))
                    self.window = window / np.sum(window)  # normalize window
                window_influence = self.track_config['window_influence']
                response = (1 - window_influence
                            ) * response + window_influence * self.window

                # Find maximum response
                r_max, c_max = np.unravel_index(response.argmax(),
                                                response.shape)

                # Convert from crop-relative coordinates to frame coordinates
                p_coor = np.array([r_max, c_max])
                # displacement from the center in instance final representation ...
                disp_instance_final = p_coor - get_center(response_size)
                # ... in instance feature space ...
                upsample_factor = self.track_config['upsample_factor']
                disp_instance_feat = disp_instance_final / upsample_factor
                # ... Avoid empty position ...
                r_radius = int(response_size / upsample_factor / 2)
                disp_instance_feat = np.maximum(
                    np.minimum(disp_instance_feat, r_radius), -r_radius)
                # ... in instance input ...
                disp_instance_input = disp_instance_feat * self.model_config[
                    'embed_config']['stride']
                # ... in instance original crop (in frame coordinates)
                disp_instance_frame = disp_instance_input / search_scale_list[
                    best_scale]
                # Position within frame in frame coordinates
                y = current_target_state.bbox.y
                x = current_target_state.bbox.x
                y += disp_instance_frame[0]
                x += disp_instance_frame[1]

                # Target scale damping and saturation
                target_scale = current_target_state.bbox.height / original_target_height
                search_factor = self.search_factors[best_scale]
                scale_damp = self.track_config[
                    'scale_damp']  # damping factor for scale update
                target_scale *= ((1 - scale_damp) * 1.0 +
                                 scale_damp * search_factor)
                target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

                # Some bookkeeping
                height = original_target_height * target_scale
                width = original_target_width * target_scale
                current_target_state.bbox = Rectangle(x, y, width, height)
                current_target_state.scale_idx = best_scale
                current_target_state.search_pos = search_center + disp_instance_input

                assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
                  'target position in feature space should lie within the search image'
                assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
                  'target position in feature space should lie within the search image'

                if self.log_level > 0:
                    np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

                    # Select the image at the best-scoring scale and convert it to uint8
                    image_cropped = outputs['image_cropped'][
                        best_scale].astype(np.uint8)
                    # Note that imwrite in cv2 assumes the image is in BGR format.
                    # However, the cropped image returned by TensorFlow is RGB.
                    # Therefore, we convert color format using cv2.cvtColor
                    imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                            cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                    np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)),
                            [best_scale])
                    np.save(osp.join(logdir, 'response{}.npy'.format(i)),
                            response)

                    y_search, x_search = current_target_state.search_pos
                    search_scale = search_scale_list[best_scale]
                    target_height_search = height * search_scale
                    target_width_search = width * search_scale
                    bbox_search = Rectangle(x_search, y_search,
                                            target_width_search,
                                            target_height_search)
                    bbox_search = convert_bbox_format(bbox_search,
                                                      'top-left-based')
                    np.save(osp.join(logdir, 'bbox{}.npy'.format(i)), [
                        bbox_search.x, bbox_search.y, bbox_search.width,
                        bbox_search.height
                    ])

            reported_bbox = convert_bbox_format(current_target_state.bbox,
                                                'top-left-based')
            reported_bboxs.append(reported_bbox)
        return reported_bboxs
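
Scale search in the trackers above takes the peak of each scale's response map, multiplies non-centre scales by scale_penalty, and picks the argmax. A small NumPy sketch with an assumed penalty value:

import numpy as np

num_scales = 3
scale_penalty = 0.97                               # assumed track_config['scale_penalty']

response = np.random.rand(num_scales, 272, 272)    # per-scale response maps (size assumed)
response_max = response.max(axis=(1, 2))

penalties = scale_penalty * np.ones(num_scales)
penalties[num_scales // 2] = 1.0                   # the current (centre) scale is not penalized
best_scale = int(np.argmax(response_max * penalties))
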
Ejemplo n.º 32
0
    def track(self, sess, first_bbox, frames, logdir='/tmp'):
        """Runs tracking on a single image sequence."""
        # Get initial target bounding box and convert to center based
        bbox = convert_bbox_format(first_bbox, 'center-based')

        # Feed in the first frame image to set initial state.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        input_feed = [
            frames[0], bbox_feed, self.x_image_size, self.search_factors
        ]
        frame2crop_scale, image_z = self.siamese_model.initialize(
            sess, input_feed)
        imwrite(osp.join(logdir, 'aimagez.jpg'),
                cv2.cvtColor(image_z, cv2.COLOR_RGB2BGR))

        # Storing target state
        original_target_height = bbox.height
        original_target_width = bbox.width
        search_center = np.array(
            [get_center(self.x_image_size),
             get_center(self.x_image_size)])
        current_target_state = TargetState(bbox=bbox,
                                           search_pos=search_center,
                                           scale_idx=int(
                                               get_center(self.num_scales)))

        include_first = get(self.track_config, 'include_first', False)
        logging.info('Tracking include first -- {}'.format(include_first))

        # Run tracking loop
        reported_bboxs = []
        image_c = None
        x_image_size = self.x_image_size
        lost = 0
        moved2border = False
        bound_flag = False  # ensure it is defined before the first loop iteration

        conf_thresh = 0.2  # 0.2
        bound_thresh = 0.2  # 0.2
        sup_thresh = 0.15  # 0.15
        prev_score = conf_thresh + 0.01
        upsample_factor = self.track_config['upsample_factor']
        search_factors = self.search_factors

        for i, filename in enumerate(frames):
            if i > 0 or include_first:
                bbox_feed = [
                    current_target_state.bbox.y, current_target_state.bbox.x,
                    current_target_state.bbox.height,
                    current_target_state.bbox.width
                ]

                if prev_score > bound_thresh:
                    lost = 0
                else:
                    lost += 1

                if prev_score > 0.9:
                    self.siamese_model.update(sess, [
                        frames[i - 1], bbox_feed, self.x_image_size,
                        search_factors
                    ])

                with open(filename, 'rb') as f:
                    wi, hi = GetWidthAndHeight(f)
                t_i_ratio = max([
                    current_target_state.bbox.height / hi,
                    current_target_state.bbox.width / wi
                ])

                if prev_score < conf_thresh:
                    x_image_size += 100
                    #x_image_size = min(x_image_size, ((1. - t_i_ratio) * 1.6 + 1.) * self.x_image_size_init)
                    if t_i_ratio < 0.05:
                        x_image_size = min(x_image_size, 555)
                    elif t_i_ratio < 0.25:
                        x_image_size = min(x_image_size, 455)
                    elif t_i_ratio > 0.5:
                        x_image_size = min(x_image_size, 255)
                    else:
                        x_image_size = min(x_image_size, 355)
                else:
                    x_image_size = self.x_image_size

                if i > 1:
                    top = (current_target_state.bbox.y -
                           (current_target_state.bbox.height / 2) < 10)
                    left = (current_target_state.bbox.x -
                            (current_target_state.bbox.width / 2) < 10)
                    bottom = (current_target_state.bbox.y +
                              (current_target_state.bbox.height / 2) > hi - 10)
                    right = (current_target_state.bbox.x +
                             (current_target_state.bbox.width / 2) > wi - 10)
                    bound_flag = top or left or bottom or right
                    #if top or left or bottom or right:
                    #if not prev_score < bound_thresh:
                    #moved2border = True
                    #if not moved2border:
                    #current_target_state.bbox = Rectangle(wi / 2, hi / 2,
                    #current_target_state.bbox.width,
                    #current_target_state.bbox.height)
                    #bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                    #current_target_state.bbox.height, current_target_state.bbox.width]
                    #else:
                    #if not prev_score < bound_thresh:
                    #moved2border = False

                if lost > 5 and bound_flag:
                    lost = 0
                    diffy = hi * 0.5 - bbox_feed[0]
                    diffx = wi * 0.5 - bbox_feed[1]
                    bbox_feed = [
                        diffy * 0.25 + bbox_feed[0],
                        diffx * 0.25 + bbox_feed[1], bbox_feed[2], bbox_feed[3]
                    ]

                current_target_state.bbox = Rectangle(bbox_feed[1],
                                                      bbox_feed[0],
                                                      bbox_feed[3],
                                                      bbox_feed[2])

                input_feed = [
                    filename, bbox_feed, x_image_size, search_factors
                ]
                outputs, metadata = self.siamese_model.inference_step(
                    sess, input_feed)
                search_scale_list = outputs['scale_xs']
                response = outputs['response']
                response_size = response.shape[1]
                re_out = np.around(1 / (1 + np.exp(-response)), 2)

                if np.max(re_out) < conf_thresh and not t_i_ratio > 0.5:
                    x_image_sizeb4 = x_image_size
                    x_image_size += 100
                    #x_image_size_l = ((1. - t_i_ratio) * 1.6 + 1.) * self.x_image_size_init
                    if t_i_ratio < 0.05:
                        x_image_size_l = 555
                    elif t_i_ratio < 0.25:
                        x_image_size_l = 455
                    elif t_i_ratio > 0.5:
                        x_image_size_l = 255
                    else:
                        x_image_size_l = 355

                    if not x_image_size > x_image_size_l:
                        input_feed = [
                            filename, bbox_feed, x_image_size, search_factors
                        ]
                        outputs, metadata = self.siamese_model.inference_step(
                            sess, input_feed)
                        search_scale_list = outputs['scale_xs']
                        response = outputs['response']
                        response_size = response.shape[1]
                        re_out = np.around(1 / (1 + np.exp(-response)), 2)
                    else:
                        x_image_size = x_image_sizeb4

                # Choose the scale whose response map has the highest peak
                if self.num_scales > 1:
                    response_max = np.max(response * (re_out > sup_thresh),
                                          axis=(1, 2))
                    penalties = self.track_config['scale_penalty'] * np.ones(
                        (self.num_scales))
                    current_scale_idx = int(get_center(self.num_scales))
                    penalties[current_scale_idx] = 1.0
                    response_penalized = response_max * penalties
                    if max(response_penalized) == 0.:
                        best_scale = 1
                    else:
                        best_scale = np.argmax(response_penalized)
                else:
                    best_scale = 0

                response = response[best_scale]
                re_out = re_out[best_scale]

                with np.errstate(
                        all='raise'):  # Raise error if something goes wrong
                    response = response - np.min(response)
                    response = response / np.sum(response)
                    response = response * (re_out > sup_thresh)

                window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                                np.expand_dims(np.hanning(response_size), 0))
                self.window = window / np.sum(window)  # normalize window
                window_influence = self.track_config['window_influence']
                response = (1 - window_influence
                            ) * response + window_influence * self.window

                if np.max(re_out) < sup_thresh:
                    r_max, c_max = response.shape
                    r_max, c_max = int(r_max / 2), int(c_max / 2)
                    disp_instance_input = [0, 0]
                    disp_instance_frame = [0, 0]
                else:
                    # Find maximum response
                    r_max, c_max = np.unravel_index(response.argmax(),
                                                    response.shape)

                    # Convert from crop-relative coordinates to frame coordinates
                    p_coor = np.array([r_max, c_max])
                    # displacement from the center in instance final representation ...
                    disp_instance_final = p_coor - get_center(response_size)
                    # ... in instance feature space ...
                    disp_instance_feat = disp_instance_final / upsample_factor
                    # ... Avoid empty position ...
                    r_radius = int(response_size / upsample_factor / 2)
                    disp_instance_feat = np.maximum(
                        np.minimum(disp_instance_feat, r_radius), -r_radius)
                    # ... in instance input ...
                    disp_instance_input = disp_instance_feat * self.model_config[
                        'embed_config']['stride']
                    # ... in instance original crop (in frame coordinates)
                    disp_instance_frame = disp_instance_input / search_scale_list[
                        best_scale]

                # Position within frame in frame coordinates
                y = current_target_state.bbox.y
                x = current_target_state.bbox.x
                y += disp_instance_frame[0]
                x += disp_instance_frame[1]
                y = np.round(y)
                x = np.round(x)
                prev_score = re_out[r_max, c_max]

                # Target scale damping and saturation
                target_scale = current_target_state.bbox.height / original_target_height
                search_factor = search_factors[best_scale]
                scale_damp = self.track_config[
                    'scale_damp']  # damping factor for scale update
                target_scale *= ((1 - scale_damp) * 1.0 +
                                 scale_damp * search_factor)

                # Some bookkeeping
                search_center = np.array(
                    [get_center(x_image_size),
                     get_center(x_image_size)])
                height = original_target_height * target_scale
                width = original_target_width * target_scale
                current_target_state.bbox = Rectangle(x, y, width, height)
                current_target_state.scale_idx = best_scale
                current_target_state.search_pos = search_center + disp_instance_input

                assert 0 <= current_target_state.search_pos[0] < x_image_size, \
                  'target position in feature space should lie within the search image'
                assert 0 <= current_target_state.search_pos[1] < x_image_size, \
                  'target position in feature space should lie within the search image'

                if self.log_level > 0:
                    # Select the image at the best-scoring scale and convert it to uint8
                    image_cropped = outputs['image_cropped'][
                        best_scale].astype(np.uint8)

                    y_search, x_search = current_target_state.search_pos
                    search_scale = search_scale_list[best_scale]
                    target_height_search = height * search_scale
                    target_width_search = width * search_scale
                    bbox_search = Rectangle(x_search, y_search,
                                            target_width_search,
                                            target_height_search)
                    bbox_search = convert_bbox_format(bbox_search,
                                                      'top-left-based')

                    # Add score colormap
                    image_cropped = outputs['image_cropped'][
                        best_scale].astype(np.uint8)
                    #im_shape = image_cropped.shape
                    #re_shape = response_size / upsample_factor * self.model_config['embed_config']['stride']
                    #pad = int((im_shape[0] - re_shape) / 2)
                    #response_crop = imresize(re_out, [im_shape[0]-2*pad, im_shape[1]-2*pad])
                    #response_crop = np.pad(response_crop, ((pad, pad), (pad, pad)), 'constant')
                    #response_crop = response_crop / response_crop.max()
                    #response_crop = np.uint8(response_crop * 255)
                    #cmap = cv2.cvtColor(cv2.applyColorMap(response_crop, cv2.COLORMAP_JET), cv2.COLOR_BGR2RGB)
                    #image_cropped = cv2.addWeighted(cmap, 0.3, image_cropped, 0.5, 0)

                    xmin = bbox_search.x.astype(np.int32)
                    ymin = bbox_search.y.astype(np.int32)
                    xmax = xmin + bbox_search.width.astype(np.int32)
                    ymax = ymin + bbox_search.height.astype(np.int32)
                    cv2.rectangle(image_cropped, (xmin, ymin), (xmax, ymax),
                                  (255, 0, 0), 2)
                    text = str(prev_score)
                    cv2.putText(image_cropped,
                                text, (xmin, ymin),
                                cv2.FONT_HERSHEY_SIMPLEX,
                                1.0, (255, 0, 0),
                                lineType=cv2.LINE_AA)
                    imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                            cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                    #if image_c is not None:
                    #his_dir = logdir + '_his'
                    #if not osp.exists(his_dir):
                    #os.mkdir(his_dir)
                    #image_c_p = np.concatenate([np.expand_dims(image_z, 0)] + image_c, 2)[0]
                    #image_c_p = np.uint8(image_c_p)
                    #imwrite(osp.join(his_dir, 'image{}.jpg'.format(i)),
                    #cv2.cvtColor(image_c_p, cv2.COLOR_RGB2BGR))

            reported_bbox = convert_bbox_format(current_target_state.bbox,
                                                'top-left-based')
            reported_bboxs.append(reported_bbox)
        return reported_bboxs
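
This variant turns the raw response into a confidence map with a sigmoid and, whenever the previous frame's confidence falls below conf_thresh, grows the search crop in 100-pixel steps up to a cap that depends on the target-to-image ratio. A sketch of that re-detection schedule using the thresholds appearing above:

import numpy as np

def next_search_size(prev_score, x_image_size, x_image_size_init, t_i_ratio, conf_thresh=0.2):
    """Grow the search region while the tracker is not confident; caps mirror the code above."""
    if prev_score >= conf_thresh:
        return x_image_size_init
    x_image_size += 100
    if t_i_ratio < 0.05:
        cap = 555
    elif t_i_ratio < 0.25:
        cap = 455
    elif t_i_ratio > 0.5:
        cap = 255
    else:
        cap = 355
    return min(x_image_size, cap)

score_map = np.random.rand(3, 272, 272) * 4 - 2              # stand-in logits
confidence = np.around(1.0 / (1.0 + np.exp(-score_map)), 2)  # sigmoid gate, like re_out above
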
Ejemplo n.º 33
0
    def track_vot(self, sess, frame):
        bbox_feed = [
            self.vot_current_target_state.bbox.y,
            self.vot_current_target_state.bbox.x,
            self.vot_current_target_state.bbox.height,
            self.vot_current_target_state.bbox.width
        ]
        input_feed = [frame, bbox_feed]

        outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
        search_scale_list = outputs['scale_xs']
        response_s_c5 = outputs['response_s_c5']
        response_s_c4 = outputs['response_s_c4']
        response_s_c3 = outputs['response_s_c3']
        response_a_c5 = outputs['response_a_c5']
        response_a_c4 = outputs['response_a_c4']
        response_a_c3 = outputs['response_a_c3']
        response_size = response_s_c5.shape[1]

        # Choose the scale whose response map has the highest peak
        if self.num_scales > 1:

            response_a_c5_max = np.max(response_a_c5)
            response_a_c4_max = np.max(response_a_c4)
            response_a_c3_max = np.max(response_a_c3)
            response_a_c5 = response_a_c5 / response_a_c5_max
            response_a_c4 = response_a_c4 / response_a_c4_max
            response_a_c3 = response_a_c3 / response_a_c3_max

            response_s_all = 0.7 * response_s_c5 + 0.3 * response_s_c4 + 0.1 * response_s_c3

            response_a_all = 0.3 * response_a_c5 + 0.6 * response_a_c4 + 0.1 * response_a_c3

            response_s_all_max = np.max(response_s_all)
            response_s_all = response_s_all / response_s_all_max

            response_a_all_max = np.max(response_a_all)
            response_a_all = response_a_all / response_a_all_max
            response = 0.3 * response_s_all + 0.7 * response_a_all

            response_max = np.max(response, axis=(1, 2))
            penalties = self.track_config['scale_penalty'] * np.ones(
                (self.num_scales))
            current_scale_idx = int(get_center(self.num_scales))
            penalties[current_scale_idx] = 1.0
            response_penalized = response_max * penalties
            best_scale = np.argmax(response_penalized)
        else:
            ## TODO combine siamfc and alexnet
            best_scale = 0

        response = response[best_scale]

        with np.errstate(all='raise'):  # Raise error if something goes wrong
            response = response - np.min(response)
            response = response / np.sum(response)

        if self.window is None:
            window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                            np.expand_dims(np.hanning(response_size), 0))
            self.window = window / np.sum(window)  # normalize window
        window_influence = self.track_config['window_influence']
        response = (
            1 - window_influence) * response + window_influence * self.window

        # Find maximum response
        r_max, c_max = np.unravel_index(response.argmax(), response.shape)

        # Convert from crop-relative coordinates to frame coordinates
        p_coor = np.array([r_max, c_max])
        # displacement from the center in instance final representation ...
        disp_instance_final = p_coor - get_center(response_size)
        # ... in instance feature space ...
        upsample_factor = self.track_config['upsample_factor']
        disp_instance_feat = disp_instance_final / upsample_factor
        # ... Avoid empty position ...
        r_radius = int(response_size / upsample_factor / 2)
        disp_instance_feat = np.maximum(
            np.minimum(disp_instance_feat, r_radius), -r_radius)
        # ... in instance input ...
        disp_instance_input = disp_instance_feat * 8  # hard-coded stride (self.model_config['embed_config']['stride'])
        # ... in instance original crop (in frame coordinates)
        disp_instance_frame = disp_instance_input / search_scale_list[
            best_scale]
        # Position within frame in frame coordinates
        y = self.vot_current_target_state.bbox.y
        x = self.vot_current_target_state.bbox.x
        y += disp_instance_frame[0]
        x += disp_instance_frame[1]

        # Target scale damping and saturation
        target_scale = self.vot_current_target_state.bbox.height / self.vot_original_target_height
        search_factor = self.search_factors[best_scale]
        scale_damp = self.track_config[
            'scale_damp']  # damping factor for scale update
        target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
        target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

        # Some bookkeeping
        height = self.vot_original_target_height * target_scale
        width = self.vot_original_target_width * target_scale
        self.vot_current_target_state.bbox = Rectangle(x, y, width, height)
        self.vot_current_target_state.scale_idx = best_scale
        self.vot_current_target_state.search_pos = self.vot_search_center + disp_instance_input

        reported_bbox = convert_bbox_format(self.vot_current_target_state.bbox,
                                            'top-left-based')
        return reported_bbox
Example No. 34
0
    def track(self, sess, frame, logdir='/tmp'):
        """Runs tracking on a single image."""
        i = self.i = self.i + 1
        current_target_state = self.current_target_state
        original_target_height = self.original_target_height
        original_target_width = self.original_target_width
        search_center = self.search_center

        mem_count = self.mem_count
        moved2border = self.moved2border
        update_delay = self.update_delay + 1
        lost = self.lost + 1
        image_c = self.image_c
        x_image_size = self.x_image_size
        search_factors = self.search_factors_init
        conf_thresh = self.conf_thresh
        bound_thresh = self.bound_thresh
        sup_thresh = self.sup_thresh
        prev_score = self.prev_score

        hi, wi, _ = frame.shape
        h_ratio = current_target_state.bbox.height / hi
        w_ratio = current_target_state.bbox.width / wi
        t_i_ratio = max([h_ratio, w_ratio])

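        # Low confidence in the previous frame: grow the search image in steps
        # of 100 px, capped according to the target-to-image size ratio.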
        if prev_score < conf_thresh:
            x_image_size += 100
            #x_image_size = min(x_image_size, ((1. - t_i_ratio) * 1.8 + 1.) * self.x_image_size_init)
            if t_i_ratio < 0.05:
                x_image_size = min(x_image_size, 555)
            elif t_i_ratio > 0.6:
                x_image_size = min(x_image_size, 255)
            elif t_i_ratio > 0.4:
                x_image_size = min(x_image_size, 355)
            else:
                x_image_size = min(x_image_size, 455)
        else:
            x_image_size = self.x_image_size_init

        num_scales = len(search_factors)
        bbx = current_target_state.bbox.x
        bby = current_target_state.bbox.y
        bbw = current_target_state.bbox.width
        bbh = current_target_state.bbox.height
        bbox_feed = [bby, bbx, bbh, bbw]

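        # Border handling (within 10 px of the frame edge): a confident score
        # marks the target as genuinely at the border; otherwise the bounding
        # box is re-centered on the frame before building the search crop.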
        if i > 1:
            top = (current_target_state.bbox.y -
                   (current_target_state.bbox.height / 2) < 10)
            left = (current_target_state.bbox.x -
                    (current_target_state.bbox.width / 2) < 10)
            bottom = (current_target_state.bbox.y +
                      (current_target_state.bbox.height / 2) > hi - 10)
            right = (current_target_state.bbox.x +
                     (current_target_state.bbox.width / 2) > wi - 10)
            if top or left or bottom or right:
                if prev_score >= bound_thresh:
                    moved2border = True
                if not moved2border:
                    current_target_state.bbox = Rectangle(
                        wi / 2, hi / 2, current_target_state.bbox.width,
                        current_target_state.bbox.height)
                    bbox_feed = [
                        current_target_state.bbox.y,
                        current_target_state.bbox.x,
                        current_target_state.bbox.height,
                        current_target_state.bbox.width
                    ]
            else:
                if prev_score >= bound_thresh:
                    moved2border = False

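        # A small target lost for more than 5 frames: drift the search window a
        # quarter of the way back toward the image center.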
        if t_i_ratio < 0.3 and lost > 5:
            lost = 0
            diffy = hi * 0.5 - bbox_feed[0]
            diffx = wi * 0.5 - bbox_feed[1]
            bbox_feed = [
                diffy * 0.25 + bbox_feed[0], diffx * 0.25 + bbox_feed[1],
                bbox_feed[2], bbox_feed[3]
            ]

        current_target_state.bbox = Rectangle(bbox_feed[1], bbox_feed[0],
                                              bbox_feed[3], bbox_feed[2])

        input_feed = [frame, bbox_feed, x_image_size, search_factors]
        outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
        search_scale_list = outputs['scale_xs']
        response = outputs['response']
        response_size = response.shape[1]
        re_out = np.around(1 / (1 + np.exp(-response)), 2)

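        # If even the best sigmoid score is below the confidence threshold,
        # retry inference once with a larger search image, unless the crop has
        # already reached its size limit for this target-to-image ratio.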
        if np.max(re_out) < conf_thresh:
            x_image_sizeb4 = x_image_size
            x_image_size += 100
            #x_image_size_l = ((1. - t_i_ratio) * 1.8 + 1.) * self.x_image_size_init
            if t_i_ratio < 0.05:
                x_image_size_l = 555
            elif t_i_ratio > 0.6:
                x_image_size_l = 255
            elif t_i_ratio > 0.4:
                x_image_size_l = 355
            else:
                x_image_size_l = 455

            if x_image_size <= x_image_size_l:
                input_feed = [frame, bbox_feed, x_image_size, search_factors]
                outputs, metadata = self.siamese_model.inference_step(
                    sess, input_feed)
                search_scale_list = outputs['scale_xs']
                response = outputs['response']
                response_size = response.shape[1]
                re_out = np.around(1 / (1 + np.exp(-response)), 2)
            else:
                x_image_size = x_image_sizeb4

        # Choose the scale whose response map has the highest peak
        if num_scales > 1:
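            # Zero out locations whose sigmoid score does not exceed sup_thresh
            # before picking a scale; fall back to scale index 1 if nothing survives.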
            response_max = np.max(response * (re_out > sup_thresh),
                                  axis=(1, 2))
            penalties = self.track_config['scale_penalty'] * np.ones(
                (num_scales))
            current_scale_idx = int(get_center(num_scales))
            penalties[current_scale_idx] = 1.0
            response_penalized = response_max * penalties
            if max(response_penalized) == 0.:
                best_scale = 1
            else:
                best_scale = np.argmax(response_penalized)
        else:
            best_scale = 0

        response = response[best_scale]
        re_out = re_out[best_scale]

        with np.errstate(all='raise'):  # Raise error if something goes wrong
            response = response - np.min(response)
            response = response / np.sum(response)
            response = response * (re_out > sup_thresh)

        window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                        np.expand_dims(np.hanning(response_size), 0))
        self.window = window / np.sum(window)  # normalize window
        window_influence = self.track_config['window_influence']
        response = (
            1 - window_influence) * response + window_influence * self.window

        # Find maximum response
        r_max, c_max = np.unravel_index(response.argmax(), response.shape)
        prev_score = re_out[r_max, c_max]

        # Convert from crop-relative coordinates to frame coordinates
        p_coor = np.array([r_max, c_max])
        # displacement from the center in instance final representation ...
        disp_instance_final = p_coor - get_center(response_size)
        # ... in instance feature space ...
        upsample_factor = self.track_config['upsample_factor']
        disp_instance_feat = disp_instance_final / upsample_factor
        # ... Avoid empty position ...
        r_radius = int(response_size / upsample_factor / 2)
        disp_instance_feat = np.maximum(
            np.minimum(disp_instance_feat, r_radius), -r_radius)
        # ... in instance input ...
        disp_instance_input = disp_instance_feat * self.model_config[
            'embed_config']['stride']
        # ... in instance original crop (in frame coordinates)
        disp_instance_frame = disp_instance_input / search_scale_list[
            best_scale]
        # Position within frame in frame coordinates
        y = current_target_state.bbox.y
        x = current_target_state.bbox.x
        y += disp_instance_frame[0]
        x += disp_instance_frame[1]
        y = np.round(y)
        x = np.round(x)

        # Target scale damping and saturation
        target_scale = current_target_state.bbox.height / original_target_height
        search_factor = search_factors[best_scale]
        scale_damp = self.track_config[
            'scale_damp']  # damping factor for scale update
        target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)

        # Some bookkeeping
        search_center = np.array(
            [get_center(x_image_size),
             get_center(x_image_size)])
        height = original_target_height * target_scale
        width = original_target_width * target_scale
        current_target_state.bbox = Rectangle(x, y, width, height)
        current_target_state.scale_idx = best_scale
        current_target_state.search_pos = search_center + disp_instance_input

        assert 0 <= current_target_state.search_pos[0] < x_image_size, \
          'target position in feature space should be no larger than input image size'
        assert 0 <= current_target_state.search_pos[1] < x_image_size, \
          'target position in feature space should be no larger than input image size'

        if self.log_level > 0:
            # Select the image crop at the best-scoring scale and convert it to uint8
            image_cropped = outputs['image_cropped'][best_scale].astype(
                np.uint8)

            y_search, x_search = current_target_state.search_pos
            search_scale = search_scale_list[best_scale]
            target_height_search = height * search_scale
            target_width_search = width * search_scale
            bbox_search = Rectangle(x_search, y_search, target_width_search,
                                    target_height_search)
            bbox_search = convert_bbox_format(bbox_search, 'top-left-based')

            xmin = bbox_search.x.astype(np.int32)
            ymin = bbox_search.y.astype(np.int32)
            xmax = xmin + bbox_search.width.astype(np.int32)
            ymax = ymin + bbox_search.height.astype(np.int32)
            cv2.rectangle(image_cropped, (xmin, ymin), (xmax, ymax),
                          (255, 0, 0), 2)
            text = str(prev_score)
            cv2.putText(image_cropped,
                        text, (xmin, ymin),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        1.0, (255, 0, 0),
                        lineType=cv2.LINE_AA)
            imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                    cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

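        # Memory update: store confident predictions as new exemplars, then
        # fold them into the template once enough have accumulated or after a
        # few frames have passed since the last update.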
        if prev_score > self.store_thresh:
            bbox_feed = [
                current_target_state.bbox.y, current_target_state.bbox.x,
                current_target_state.bbox.height,
                current_target_state.bbox.width
            ]
            self.siamese_model.update_mem(sess, [
                frame, bbox_feed, self.x_image_size_init,
                self.search_factors_init, mem_count
            ])
            mem_count += 1

        if mem_count > 4 or (mem_count > 0 and update_delay > 5):
            self.siamese_model.update(sess)
            mem_count = 0
            update_delay = 0

        if prev_score > bound_thresh:
            lost = 0

        self.mem_count = mem_count
        self.update_delay = update_delay
        self.moved2border = moved2border
        self.lost = lost
        self.x_image_size = x_image_size
        self.prev_score = prev_score
        self.current_target_state = current_target_state
        reported_bbox = convert_bbox_format(current_target_state.bbox,
                                            'top-left-based')
        #return prev_score>0.4, reported_bbox, prev_score
        return prev_score > 0.4, reported_bbox
Example No. 35
0
    def track(self, sess, frame):
        bbox_feed = [
            self.current_target_state.bbox.y, self.current_target_state.bbox.x,
            self.current_target_state.bbox.height,
            self.current_target_state.bbox.width
        ]
        input_feed = [frame, bbox_feed]

        outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
        search_scale_list = outputs['scale_xs']
        response = outputs['response']
        response_size = response.shape[1]

        # Choose the scale whose response map has the highest peak
        if self.num_scales > 1:
            response_max = np.max(response, axis=(1, 2))
            penalties = self.track_config['scale_penalty'] * np.ones(
                (self.num_scales))
            current_scale_idx = int(get_center(self.num_scales))
            penalties[current_scale_idx] = 1.0
            response_penalized = response_max * penalties
            best_scale = np.argmax(response_penalized)
        else:
            best_scale = 0

        response = response[best_scale]

        with np.errstate(all='raise'):  # Raise error if something goes wrong
            response = response - np.min(response)
            response = response / np.sum(response)

        if self.window is None:
            window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                            np.expand_dims(np.hanning(response_size), 0))
            self.window = window / np.sum(window)  # normalize window
        window_influence = self.track_config['window_influence']
        response = (
            1 - window_influence) * response + window_influence * self.window

        # Find maximum response
        r_max, c_max = np.unravel_index(response.argmax(), response.shape)

        # Convert from crop-relative coordinates to frame coordinates
        p_coor = np.array([r_max, c_max])
        # displacement from the center in instance final representation ...
        disp_instance_final = p_coor - get_center(response_size)
        # ... in instance feature space ...
        upsample_factor = self.track_config['upsample_factor']
        disp_instance_feat = disp_instance_final / upsample_factor
        # ... Avoid empty position ...
        r_radius = int(response_size / upsample_factor / 2)
        disp_instance_feat = np.maximum(
            np.minimum(disp_instance_feat, r_radius), -r_radius)
        # ... in instance input ...
        disp_instance_input = disp_instance_feat * self.model_config[
            'embed_config']['stride']
        # ... in instance original crop (in frame coordinates)
        disp_instance_frame = disp_instance_input / search_scale_list[
            best_scale]
        # Position within frame in frame coordinates
        y = self.current_target_state.bbox.y
        x = self.current_target_state.bbox.x
        y += disp_instance_frame[0]
        x += disp_instance_frame[1]

        # Target scale damping and saturation
        target_scale = self.current_target_state.bbox.height / self.original_target_height
        search_factor = self.search_factors[best_scale]
        scale_damp = self.track_config[
            'scale_damp']  # damping factor for scale update
        target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
        target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

        # Some bookkeeping
        height = self.original_target_height * target_scale
        width = self.original_target_width * target_scale
        self.current_target_state.bbox = Rectangle(x, y, width, height)
        self.current_target_state.scale_idx = best_scale
        self.current_target_state.search_pos = self.search_center + disp_instance_input

        assert 0 <= self.current_target_state.search_pos[0] < self.x_image_size, \
            'target position in feature space should be no larger than input image size'
        assert 0 <= self.current_target_state.search_pos[1] < self.x_image_size, \
            'target position in feature space should be no larger than input image size'

        reported_bbox = convert_bbox_format(self.current_target_state.bbox,
                                            'top-left-based')

        self.frame_cnt += 1
        if self.log_level > 0:
            np.save(osp.join(self.logdir, 'num_frames.npy'), [self.frame_cnt])

            # Select the image crop at the best-scoring scale and convert it to uint8
            image_cropped = outputs['image_cropped'][best_scale].astype(
                np.uint8)
            # Note that imwrite in cv2 assumes the image is in BGR format.
            # However, the cropped image returned by TensorFlow is RGB.
            # Therefore, we convert color format using cv2.cvtColor
            cv2.imwrite(
                osp.join(self.logdir,
                         'image_cropped{}.jpg'.format(self.frame_cnt)),
                cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))
            cv2.imwrite(
                osp.join(self.logdir,
                         'image_origin{}.jpg'.format(self.frame_cnt)),
                cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

            np.save(
                osp.join(self.logdir,
                         'best_scale{}.npy'.format(self.frame_cnt)),
                [best_scale])
            np.save(
                osp.join(self.logdir, 'response{}.npy'.format(self.frame_cnt)),
                response)

            y_search, x_search = self.current_target_state.search_pos
            search_scale = search_scale_list[best_scale]
            target_height_search = height * search_scale
            target_width_search = width * search_scale
            bbox_search = Rectangle(x_search, y_search, target_width_search,
                                    target_height_search)
            bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
            np.save(osp.join(self.logdir, 'bbox{}.npy'.format(self.frame_cnt)),
                    [
                        bbox_search.x, bbox_search.y, bbox_search.width,
                        bbox_search.height
                    ])
            with open(osp.join(self.logdir, 'track_rect.txt'), 'a') as f:
                rect_str = '{},{},{},{}\n'.format(int(reported_bbox[0]),
                                                  int(reported_bbox[1]),
                                                  int(reported_bbox[2]),
                                                  int(reported_bbox[3]))
                f.write(rect_str)

        return reported_bbox
Example No. 36
0
    def build_detection(self):
        track_config = self.track_config
        [self.embeds_ini,
         self.embeds] = self.get_image_embedding(self.search_images,
                                                 reuse=True)
        center_scale = int(get_center(track_config['num_scales']))
        new_template = tf.identity(self.templates[center_scale])
        x_size = self.embeds.get_shape().as_list()
        hw_size = x_size[2]
        c_size = x_size[3]
        z_size = new_template.get_shape().as_list()
        temp_size = z_size[1]
        temp_c_size = z_size[-1]
        final_temp_c_size = self.gcn_config['g2_output']
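        # Adapt the exemplar template: pool a global context vector from the
        # search embedding, add it to the template, derive an attention/support
        # matrix from the sum, and refine the template with a GCN before the
        # cross-correlation in the detection scope below.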
        with tf.variable_scope('instance_gcn_layer',
                               'instance_gcn_layer',
                               reuse=tf.AUTO_REUSE):
            with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                                padding='VALID'):
                x_region_merged = tf.nn.relu(self.embeds)
                x_region_merged = slim.conv2d(
                    x_region_merged,
                    temp_c_size,
                    [3, 3],
                    1,
                    scope='conv_att',
                    padding='SAME',
                )
                x_region_merged = slim.max_pool2d(x_region_merged,
                                                  [hw_size, hw_size], 1)
                x_region_merged = tf.reshape(x_region_merged,
                                             [-1, c_size, 1, 1])
                x_region_merged = slim.conv2d(x_region_merged,
                                              temp_size * temp_size, [1, 1],
                                              1,
                                              scope='conv_s')
                x_region_merged = tf.identity(x_region_merged[center_scale])
                x_region_merged = tf.transpose(x_region_merged, perm=[2, 0, 1])
                x_region_merged = tf.reshape(x_region_merged,
                                             [temp_size, temp_size, c_size])
                z_merged = tf.add(new_template, x_region_merged)
                z_merged = tf.expand_dims(z_merged, 0)
                support_att = attention(z_merged, c_size)
                self.support_att = support_att
                new_template = tf.reshape(new_template,
                                          [temp_size * temp_size, temp_c_size])
                new_template = gcn_tracking2(gcn_config=self.gcn_config,
                                             inputs=tf.squeeze(new_template),
                                             supports=support_att)
                new_template = tf.reshape(
                    new_template, [temp_size, temp_size, final_temp_c_size])
                self.templates_final = tf.stack(
                    [new_template for _ in range(track_config['num_scales'])])
        with tf.variable_scope('detection'):

            def _translation_match(x, z):
                x = tf.expand_dims(
                    x, 0)  # [batch, in_height, in_width, in_channels]
                z = tf.expand_dims(
                    z, -1
                )  # [filter_height, filter_width, in_channels, out_channels]
                return tf.nn.conv2d(x,
                                    z,
                                    strides=[1, 1, 1, 1],
                                    padding='VALID',
                                    name='translation_match')

            output = tf.map_fn(
                lambda x: _translation_match(x[0], x[1]),
                (self.embeds, self.templates_final),
                dtype=self.embeds.dtype)  # of shape [3, 1, 17, 17, 1]
            output = tf.squeeze(output, [1, 4])  # of shape e.g. [3, 17, 17]

            bias = tf.get_variable('biases', [1],
                                   dtype=tf.float32,
                                   initializer=tf.constant_initializer(
                                       0.0, dtype=tf.float32),
                                   trainable=False)
            response = self.model_config['adjust_response_config'][
                'scale'] * output + bias
            self.response = response
Example No. 37
0
  def track(self, sess, first_bbox, frames, logdir='/tmp'):
    """Runs tracking on a single image sequence."""
    # Get initial target bounding box and convert to center based
    bbox = convert_bbox_format(first_bbox, 'center-based')
    print(frames)
    # Feed in the first frame image to set initial state.
    bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
    input_feed = [frames[0], bbox_feed]
    frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

    # Storing target state
    original_target_height = bbox.height
    original_target_width = bbox.width
    search_center = np.array([get_center(self.x_image_size),
                              get_center(self.x_image_size)])
    current_target_state = TargetState(bbox=bbox,
                                       search_pos=search_center,
                                       scale_idx=int(get_center(self.num_scales)))

    include_first = get(self.track_config, 'include_first', False)
    logging.info('Tracking include first -- {}'.format(include_first))

    # Run tracking loop
    reported_bboxs = []
    output_json = {}  # dump all bboxes in this output file

    for i, filename in enumerate(frames):
      if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
        bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                     current_target_state.bbox.height, current_target_state.bbox.width]
        input_feed = [filename, bbox_feed]

        outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
        search_scale_list = outputs['scale_xs']
        response = outputs['response']
        
        response_size = response.shape[1]

        # Choose the scale whose response map has the highest peak
        if self.num_scales > 1:
          response_max = np.max(response, axis=(1, 2))
          penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales))
          current_scale_idx = int(get_center(self.num_scales))
          penalties[current_scale_idx] = 1.0
          response_penalized = response_max * penalties
          best_scale = np.argmax(response_penalized)
        else:
          best_scale = 0

        response = response[best_scale]
        #print(response)
        

        with np.errstate(all='raise'):  # Raise error if something goes wrong
          response = response - np.min(response)
          response = response / np.sum(response)

        if self.window is None:
          window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                          np.expand_dims(np.hanning(response_size), 0))
          self.window = window / np.sum(window)  # normalize window
        window_influence = self.track_config['window_influence']
        response = (1 - window_influence) * response + window_influence * self.window

        # Find maximum response
        srtd = response.argsort(axis=None)  # flat indices sorted by response (unused below)
        r_max, c_max = np.unravel_index(response.argmax(), response.shape)


        if not osp.exists(osp.join(logdir, "Intermediate")):
          os.mkdir(osp.join(logdir, "Intermediate"))

        to_save = np.interp(response, (response.min(), response.max()), (0, 255))
        cv2.imwrite(osp.join(logdir, "Intermediate", f"response_{i}.png"), to_save)

        to_save = to_save.reshape(to_save.shape[0], to_save.shape[1], 1)
        ret, thresh1 = cv2.threshold(to_save, 185, 255, cv2.THRESH_BINARY)
        cv2.imwrite(osp.join(logdir, "Intermediate", f"response_{i}_thresh.png"), thresh1)

        image = np.uint8(thresh1.copy())
        cnts = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        cnts = imutils.grab_contours(cnts)
        backtorgb = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        image = cv2.drawContours(backtorgb, cnts, -1, (0, 255, 0), 2)
        cv2.imwrite(osp.join(logdir, "Intermediate", f"response_{i}_cntrs.png"), image)
        
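        # Candidate target centers: contour centroids from the thresholded
        # response (flagged False, dumped to the JSON output only) plus the
        # global argmax peak (flagged True, reported as the primary detection).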
        centres=[]
        for c in cnts:
          M = cv2.moments(c)
          if M["m00"] == 0:
            continue  # skip degenerate contours to avoid division by zero
          cX = int(M["m10"] / M["m00"])
          cY = int(M["m01"] / M["m00"])
          centres.append((cY, cX, False))
        centres.append((r_max, c_max, True))
        #print(centres)

        #cts_copy = copy(current_target_state)
        #cts_copy2 = copy(current_target_state)
        output_json[filename] = []

        for (r_max,c_max,to_deep_copy) in centres:
          if to_deep_copy:
            cts_copy = deepcopy(current_target_state)
          else:
            cts_copy = copy(current_target_state)
          # Convert from crop-relative coordinates to frame coordinates
          p_coor = np.array([r_max, c_max])
          # displacement from the center in instance final representation ...
          disp_instance_final = p_coor - get_center(response_size)
          # ... in instance feature space ...
          upsample_factor = self.track_config['upsample_factor']
          disp_instance_feat = disp_instance_final / upsample_factor
          # ... Avoid empty position ...
          r_radius = int(response_size / upsample_factor / 2)
          disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius)
          # ... in instance input ...
          disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride']
          # ... in instance original crop (in frame coordinates)
          disp_instance_frame = disp_instance_input / search_scale_list[best_scale]
          # Position within frame in frame coordinates
          y = cts_copy.bbox.y
          x = cts_copy.bbox.x
          y += disp_instance_frame[0]
          x += disp_instance_frame[1]

          # Target scale damping and saturation
          target_scale = cts_copy.bbox.height / original_target_height
          search_factor = self.search_factors[best_scale]
          scale_damp = self.track_config['scale_damp']  # damping factor for scale update
          target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
          target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

          # Some bookkeeping
          height = original_target_height * target_scale
          width = original_target_width * target_scale
          
          cts_copy.bbox = Rectangle(x, y, width, height)
          cts_copy.scale_idx = best_scale
          cts_copy.search_pos = search_center + disp_instance_input

          assert 0 <= cts_copy.search_pos[0] < self.x_image_size, \
            'target position in feature space should be no larger than input image size'
          assert 0 <= cts_copy.search_pos[1] < self.x_image_size, \
            'target position in feature space should be no larger than input image size'

          if self.log_level > 0 and to_deep_copy:
            np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

            # Select the image crop at the best-scoring scale and convert it to uint8
            image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
            # Note that imwrite in cv2 assumes the image is in BGR format.
            # However, the cropped image returned by TensorFlow is RGB.
            # Therefore, we convert color format using cv2.cvtColor
            imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                    cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

            np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale])
            np.save(osp.join(logdir, 'response{}.npy'.format(i)), response)

            y_search, x_search = cts_copy.search_pos
            search_scale = search_scale_list[best_scale]
            target_height_search = height * search_scale
            target_width_search = width * search_scale
            bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search)
            bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
            np.save(osp.join(logdir, 'bbox{}.npy'.format(i)),
                    [bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height])

          reported_bbox = convert_bbox_format(cts_copy.bbox, 'top-left-based')
          #print(f"reported bbox {reported_bbox}")
          if to_deep_copy:
            reported_bboxs.append(reported_bbox)
          else:
            rect_str = '{},{},{},{}\n'.format(reported_bbox.x + 1, reported_bbox.y + 1,
                                              reported_bbox.width, reported_bbox.height)
            arr = output_json[filename]
            arr.append(rect_str)


    
    with open(osp.join(logdir, 'bboxes.json'), 'w') as f:
      json.dump(output_json, f, indent=4)
    return reported_bboxs