Example #1
    def prepare_data(self):
        # Prepare parameters
        dataset_dir = self.dataSet_dir
        input_dir = self.config['input_dir']
        output_dir = self.config['output_dir']
        crop_h = self.config['crop_h']
        crop_w = self.config['crop_w']
        threads = self.config['prefetch_threads']
        img_mean = get(self.config, 'img_mean', None)
        preprocess_name = get(self.config, 'preprocessing_name', None)
        random_scale = get(self.config, 'random_scale', False)
        random_mirror = get(self.config, 'random_mirror', True)
        batch_size = get(self.config, 'batch_size', 8)

        input_names = []
        output_names = []
        input_root = osp.join(dataset_dir, input_dir)
        output_root = osp.join(dataset_dir, output_dir)
        for file in os.listdir(input_root):
            input_names.append(osp.abspath(osp.join(input_root, file)))
        for file in os.listdir(output_root):
            output_names.append(osp.abspath(osp.join(output_root, file)))

        input_names.sort()
        output_names.sort()

        dataset = tf.data.Dataset.from_tensor_slices(
            (input_names, output_names))
        dataset = dataset.map(
            lambda x, y: _parse_function(x, y, img_mean, self.class_dict),
            num_parallel_calls=threads)

        logging.info('preprocess -- {}'.format(preprocess_name))
        if preprocess_name == 'augment':
            if random_mirror:
                dataset = dataset.map(_image_mirroring,
                                      num_parallel_calls=threads)
            if random_scale:
                dataset = dataset.map(_image_scaling,
                                      num_parallel_calls=threads)

            dataset = dataset.map(
                lambda x, y: _random_crop_and_pad_image_and_labels(
                    x, y, crop_h, crop_w),
                num_parallel_calls=threads)
            dataset = dataset.map(
                lambda image, label: _apply_with_random_selector(
                    image,
                    lambda x, ordering: _distort_color(
                        x, ordering, fast_mode=True),
                    num_cases=4,
                    label=label))

        dataset = dataset.map(
            lambda image, label: _check_size(image, label, crop_h, crop_w))
        dataset = dataset.shuffle(buffer_size=100)
        dataset = dataset.batch(batch_size)
        dataset = dataset.repeat()
        self.dataset = dataset
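
A minimal consumption sketch for the pipeline above, assuming TF 1.x semantics; `loader` stands in for an instance of this class and is hypothetical:

# Sketch only: iterate the dataset built by prepare_data() (assumes TF 1.x).
import tensorflow as tf

loader.prepare_data()                                  # populates loader.dataset
iterator = loader.dataset.make_one_shot_iterator()
images, labels = iterator.get_next()                   # [batch, crop_h, crop_w, C]

with tf.Session() as sess:
    image_batch, label_batch = sess.run([images, labels])
    print(image_batch.shape, label_batch.shape)
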
Example #2
  def __init__(self, config, is_training):
    self.config = config
    self.is_training = is_training

    preprocess_name = get(config, 'preprocessing_name', None)
    logging.info('preprocess -- {}'.format(preprocess_name))

    if preprocess_name == 'siamese_fc_color':
      self.v_transform = None
      # TODO: use a single operation (tf.image.crop_and_resize) to achieve all transformations ?
      self.z_transform = Compose([RandomStretch(),
                                  CenterCrop((255 - 8, 255 - 8)),
                                  RandomCrop(255 - 2 * 8),
                                  CenterCrop((127, 127))])
      self.x_transform = Compose([RandomStretch(),
                                  CenterCrop((255 - 8, 255 - 8)),
                                  RandomCrop(255 - 2 * 8), ])
    elif preprocess_name == 'siamese_fc_gray':
      self.v_transform = RandomGray()
      self.z_transform = Compose([RandomStretch(),
                                  CenterCrop((255 - 8, 255 - 8)),
                                  RandomCrop(255 - 2 * 8),
                                  CenterCrop((127, 127))])
      self.x_transform = Compose([RandomStretch(),
                                  CenterCrop((255 - 8, 255 - 8)),
                                  RandomCrop(255 - 2 * 8), ])
    elif preprocess_name == 'None':
      self.v_transform = None
      self.z_transform = CenterCrop((127, 127))
      self.x_transform = CenterCrop((255, 255))
    else:
      raise ValueError('Preprocessing name {} was not recognized.'.format(preprocess_name))

    self.dataset_py = VID(config['input_imdb'], config['max_frame_dist'])
    self.sampler = Sampler(self.dataset_py, shuffle=is_training)
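
The crop sizes in these transforms follow a simple arithmetic; a quick illustrative computation of the constants used above (8 is the embedding stride, per the comment in Example #4):

# Illustrative crop-size arithmetic for the siamese_fc_* transforms above.
stride_margin = 8                                   # embedding stride
after_center_crop = 255 - stride_margin             # 247
after_random_crop = 255 - 2 * stride_margin         # 239 -> final search (x) size
exemplar_size = 127                                 # final exemplar (z) size
print(after_center_crop, after_random_crop, exemplar_size)  # 247 239 127
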
Example #3
def convolutional_alexnet_gn_arg_scope(embed_config,
                                       trainable=True,
                                       is_training=True):
    is_model_training = trainable and is_training
    if get(embed_config, 'use_gn', True):
        norm_params = {
            "trainable": trainable,
        }
        normalizer_fn = group_norm
    else:
        norm_params = {}
        normalizer_fn = None

    weight_decay = get(embed_config, 'weight_decay', 1e-4)
    if trainable:
        weights_regularizer = slim.l2_regularizer(weight_decay)
    else:
        weights_regularizer = None

    init_method = get(embed_config, 'init_method', 'kaiming_normal')
    if is_model_training:
        logging.info('embedding init method -- {}'.format(init_method))
    if init_method == 'kaiming_normal':
        # The same setting as siamese-fc
        initializer = slim.variance_scaling_initializer(factor=2.0,
                                                        mode='FAN_OUT',
                                                        uniform=False)
    else:
        initializer = slim.xavier_initializer()

    with slim.arg_scope([slim.conv2d],
                        weights_regularizer=weights_regularizer,
                        weights_initializer=initializer,
                        padding='VALID',
                        trainable=trainable,
                        activation_fn=tf.nn.relu,
                        normalizer_fn=normalizer_fn,
                        normalizer_params=norm_params):
        with slim.arg_scope([group_norm], **norm_params) as arg_sc:
            return arg_sc
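
A minimal usage sketch for the scope above; `convolutional_alexnet_gn` is a hypothetical slim-based network builder, and `images` and `embed_config` are assumed to exist:

# Sketch only: build an embedding network under the returned arg scope.
arg_sc = convolutional_alexnet_gn_arg_scope(embed_config,
                                            trainable=True,
                                            is_training=True)
with slim.arg_scope(arg_sc):
    # Any slim.conv2d / group_norm call inside this block inherits the
    # regularizer, initializer, padding and normalizer defaults set above.
    net, end_points = convolutional_alexnet_gn(images, reuse=False)
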
Example #4
    def __init__(self, config, is_training):
        self.config = config
        self.is_training = is_training

        preprocess_name = get(config, 'preprocessing_name', None)
        logging.info('preprocess -- {}'.format(preprocess_name))

        if preprocess_name == 'siamese_fc_color':
            self.v_transform = None
            # TODO: use a single operation (tf.image.crop_and_resize) to achieve all transformations ?
            self.z_transform = Compose([
                RandomStretch(),
                CenterCrop((255 - 8, 255 - 8)),
                RandomCrop(255 - 2 * 8),
                CenterCrop((127, 127))
            ])
            self.x_transform = Compose([
                RandomStretch(),
                CenterCrop((255 - 8, 255 - 8)),
                RandomCrop(255 - 2 * 8),
            ])
        elif preprocess_name == 'siamese_fc_gray':
            self.v_transform = RandomGray()
            self.z_transform = Compose([
                RandomStretch(),
                CenterCrop(
                    (255 - 8, 255 - 8)
                ),  # embedding stride: 8; for robustness this should also be a center crop
                RandomCrop(255 - 2 * 8),
                CenterCrop((127, 127))
            ])
            self.x_transform = Compose([
                RandomStretch(),
                CenterCrop((255 - 8, 255 - 8)),
                RandomCrop(255 - 2 * 8),
            ])
        elif preprocess_name == 'None':
            self.v_transform = None
            self.z_transform = CenterCrop((127, 127))
            self.x_transform = CenterCrop((255, 255))
        else:
            raise ValueError(
                'Preprocessing name {} was not recognized.'.format(
                    preprocess_name))

        self.dataset_py = VID(config['input_imdb'], config['max_frame_dist'])
        self.sampler = Sampler(self.dataset_py, shuffle=is_training)
Example #5
def sa_siam_arg_scope(embed_config,
                      trainable=True,
                      is_training=False):
  """Defines the default arg scope.

  Args:
    embed_config: A dictionary which contains configurations for the embedding function.
    trainable: If the weights in the embedding function are trainable.
    is_training: If the embedding function is built for training.

  Returns:
    An `arg_scope` to use for the SA-Siam models.
  """
  # Only consider the model to be in training mode if it's trainable.
  # This is vital for batch_norm since moving_mean and moving_variance
  # will get updated even if not trainable.
  is_model_training = trainable and is_training

  if get(embed_config, 'use_bn', True):
    batch_norm_scale = get(embed_config, 'bn_scale', True)
    batch_norm_decay = 1 - get(embed_config, 'bn_momentum', 3e-4)
    batch_norm_epsilon = get(embed_config, 'bn_epsilon', 1e-6)
    batch_norm_params = {
      "scale": batch_norm_scale,
      # Decay for the moving averages.
      "decay": batch_norm_decay,
      # Epsilon to prevent 0s in variance.
      "epsilon": batch_norm_epsilon,
      "trainable": trainable,
      "is_training": is_model_training,
      # Collection containing the moving mean and moving variance.
      "variables_collections": {
        "beta": None,
        "gamma": None,
        "moving_mean": ["moving_vars"],
        "moving_variance": ["moving_vars"],
      },
      'updates_collections': None,  # Ensure that updates are done within a frame
    }
    normalizer_fn = slim.batch_norm
  else:
    batch_norm_params = {}
    normalizer_fn = None

  weight_decay = get(embed_config, 'weight_decay', 5e-4)
  if trainable:
    weights_regularizer = slim.l2_regularizer(weight_decay)
  else:
    weights_regularizer = None

  init_method = get(embed_config, 'init_method', None)
  if is_model_training:
    logging.info('embedding init method -- {}'.format(init_method))
  if init_method == 'kaiming_normal':
    # The same setting as siamese-fc
    initializer = slim.variance_scaling_initializer(factor=2.0, mode='FAN_OUT', uniform=False)
  else:
    initializer = slim.xavier_initializer()

  with slim.arg_scope(
      [slim.conv2d],
      weights_regularizer=weights_regularizer,
      weights_initializer=initializer,
      padding='VALID',
      trainable=trainable,
      activation_fn=tf.nn.relu,
      normalizer_fn=normalizer_fn,
      normalizer_params=batch_norm_params):
    with slim.arg_scope([slim.batch_norm], **batch_norm_params):
      with slim.arg_scope([slim.batch_norm], is_training=is_model_training) as arg_sc:
        return arg_sc
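
The keys read through `get()` above, together with their defaults, suggest an `embed_config` of roughly the following shape (illustrative only, not a canonical config):

# Illustrative embed_config covering the keys sa_siam_arg_scope reads.
embed_config = {
    'use_bn': True,
    'bn_scale': True,
    'bn_momentum': 3e-4,        # decay = 1 - bn_momentum
    'bn_epsilon': 1e-6,
    'weight_decay': 5e-4,
    'init_method': 'kaiming_normal',
}
arg_sc = sa_siam_arg_scope(embed_config, trainable=True, is_training=True)
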
Example #6
def sa_siam(inputs,
            is_example,
            sa_siam_config={},
            reuse=None,
            scope='sa_siam'):
  en_appearance = get(sa_siam_config, 'en_appearance', False)
  en_semantic = get(sa_siam_config, 'en_semantic', False)
  n_out = get(sa_siam_config, 'n_out', 256)
  all_combine_layers_appearance = get(sa_siam_config, 'all_combine_layers_appearance', {'conv5':1.0})
  all_combine_layers_semantic = get(sa_siam_config, 'all_combine_layers_semantic', {'conv5':1.0, 'conv4':0.1})
  sz_conv5_z = get(sa_siam_config, 'sz_conv5_z', 6)
  en_semantic_att = get(sa_siam_config, 'en_semantic_att', True)

  with tf.variable_scope(scope, 'sa_siam', [inputs], reuse=reuse) as sc:
    end_points_collection = sc.name + '_end_points'
    with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                        outputs_collections=end_points_collection):
      def proc_raw_all_feat(feat, is_appearance, n_out_cur, all_combine_layers):
        res = []
        max_feat_size = 0
        for l in range(1,6):
          for k in all_combine_layers.keys():
            if k.find(str(l)) != -1:
              if shape_of(feat[l-1])[3] is None:
                res.append(feat[l-1])
                break
              if l == 5 and is_appearance and abs(n_out_cur - shape_of(feat[l-1])[3]) < 0.1:
                res.append(feat[l-1])
              else:
                if not is_appearance:
                  feat[l-1] *= all_combine_layers[k]  # scale the features to aid convergence during training
                with slim.arg_scope([slim.conv2d],activation_fn=None, normalizer_fn=None):
                  c1x1 = slim.conv2d(feat[l-1], n_out_cur, [1,1], 1, scope='c1x1_' + k)
                res.append(c1x1)
              logging.info('Keep {} .. is_appearance={} shape={}'.format(k,is_appearance,shape_of(res[-1])))
        return res
      def re_weight_crop(feat, all_combine_layers, only_crop=False):
        feat_shape = list(map(shape_of, feat))
        res = []
        for l in range(1,6): # proc layers from 1 to 5 in order
          for k in all_combine_layers.keys(): # find the corresponding layer in all layers
            if k.find(str(l)) != -1:
              logging.info('For layer {} ...'.format(k))
              cur_ly_idx = l - 1
              if feat_shape[cur_ly_idx][2] is None and feat_shape[4][2] is None:
                res.append(feat[cur_ly_idx])
                break
              pad_val = feat_shape[cur_ly_idx][2] - feat_shape[4][2]
              sz_conv5_z_cur = pad_val + sz_conv5_z
              sz_conv5_x_cur = feat_shape[cur_ly_idx][2]
              n_left = int((sz_conv5_x_cur - sz_conv5_z_cur) / 2 + 0.5)
              div_left_st = [0, n_left, n_left + sz_conv5_z_cur, sz_conv5_x_cur]
              logging.info('.. Crop as {}'.format(div_left_st))  # crop 9 patches and max-pool each patch
              if not only_crop:
                all_max = []
                for j in [0,1,2]:
                  for i in [0,1,2]:
                    l_crop = div_left_st[i]
                    r_crop = div_left_st[i + 1]
                    u_crop = div_left_st[j]
                    d_crop = div_left_st[j+1]
                    max_patch = tf.reduce_max(feat[cur_ly_idx][:, u_crop:d_crop, l_crop:r_crop, :], axis=[1, 2]) #shape = [n, c]
                    all_max.append(max_patch)
                max_map = tf.stack(all_max, axis=2) #shape = [n, c, 9]
                logging.info('.. Max_map.shape = {}'.format(max_map.shape))
                max_map = slim.fully_connected(max_map, 9, scope='att_fc1_' + k)  # fully_connected is applied only to the last dim
                logging.info('.. Max_map_fc1.shape = {}'.format(max_map.shape))
                max_map = slim.fully_connected(max_map, 1, scope='att_fc2_' + k, activation_fn=None, normalizer_fn=None,)
                logging.info('.. Max_map_fc2.shape = {}'.format(max_map.shape)) # shape = [n, c, 1]
                att_map = tf.reshape(max_map, [-1, 1, 1, feat_shape[cur_ly_idx][3]])
                logging.info('.. att_map.shape = {}'.format(att_map.shape))
                att_map = tf.sigmoid(att_map) + 0.5  # important bias: shifts attention to (0.5, 1.5) so features are not suppressed too much
                feat[cur_ly_idx] = att_map * feat[cur_ly_idx]
              feat[cur_ly_idx] = feat[cur_ly_idx][:, div_left_st[1]:div_left_st[2], div_left_st[1]:div_left_st[2], :] # crop center feat
              res.append(feat[cur_ly_idx])
              break
          else:
            res.append(None)
        return res
      layer_cur = inputs
      if en_appearance:
        n_out_appearance = n_out // len(all_combine_layers_appearance.keys())  # integer channel count
        with tf.variable_scope('appearance_net'):
          _, feat_appearance_all = appearance_net(layer_cur)
          if is_example:
            feat_appearance_all = re_weight_crop(feat_appearance_all, all_combine_layers_appearance, only_crop=True)
          net_appearance = proc_raw_all_feat(feat_appearance_all, is_appearance=True, n_out_cur=n_out_appearance, all_combine_layers=all_combine_layers_appearance)
      if en_semantic:
        n_out_semantic = n_out // len(all_combine_layers_semantic.keys())  # integer channel count
        with tf.variable_scope('semantic_net'):
          _, feat_semantic_all = semantic_net(layer_cur)
          if is_example:
            feat_semantic_all = re_weight_crop(feat_semantic_all, all_combine_layers_semantic, only_crop=not en_semantic_att)
          net_semantic = proc_raw_all_feat(feat_semantic_all, is_appearance=False, n_out_cur=n_out_semantic, all_combine_layers=all_combine_layers_semantic)
      if en_appearance and en_semantic:
        layer_cur = combine_sa_net(net_appearance, net_semantic)
      elif en_appearance:
        layer_cur = combine_sa_net(net_appearance, [])
      elif en_semantic:
        layer_cur = combine_sa_net(net_semantic, [])
      else:
        raise ValueError('At least one of the appearance or semantic branches must be enabled!')
      # Convert end_points_collection into a dictionary of end_points.
      end_points = slim.utils.convert_collection_to_dict(end_points_collection)
      return layer_cur, end_points
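
The channel attention in `re_weight_crop` max-pools nine patches whose boundaries come from `div_left_st`; a minimal NumPy sketch of that pooling step, with illustrative sizes (22x22 conv5 map, sz_conv5_z = 6):

# Sketch only: the 9-patch max pooling used to build max_map in re_weight_crop.
import numpy as np

feat = np.random.rand(1, 22, 22, 256).astype(np.float32)   # [n, h, w, c]
sz_conv5_x_cur, sz_conv5_z_cur = 22, 6
n_left = int((sz_conv5_x_cur - sz_conv5_z_cur) / 2 + 0.5)  # 8
div_left_st = [0, n_left, n_left + sz_conv5_z_cur, sz_conv5_x_cur]  # [0, 8, 14, 22]

all_max = []
for j in range(3):
    for i in range(3):
        patch = feat[:, div_left_st[j]:div_left_st[j + 1],
                        div_left_st[i]:div_left_st[i + 1], :]
        all_max.append(patch.max(axis=(1, 2)))              # [n, c]
max_map = np.stack(all_max, axis=2)                         # [n, c, 9]
print(max_map.shape)                                        # (1, 256, 9)
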
Example #7
    def track(self, sess, first_bbox, frames, logdir='/tmp'):
        """Runs tracking on a single image sequence."""
        # Get initial target bounding box and convert to center based
        bbox = convert_bbox_format(first_bbox, 'center-based')

        # Feed in the first frame image to set initial state.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        input_feed = [frames[0], bbox_feed]
        frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

        # Storing target state
        original_target_height = bbox.height
        original_target_width = bbox.width
        search_center = np.array(
            [get_center(self.x_image_size),
             get_center(self.x_image_size)])
        current_target_state = TargetState(bbox=bbox,
                                           search_pos=search_center,
                                           scale_idx=int(
                                               get_center(self.num_scales)))

        include_first = get(self.track_config, 'include_first', False)
        logging.info('Tracking include first -- {}'.format(include_first))

        # Run tracking loop
        reported_bboxs = []
        for i, filename in enumerate(frames):
            if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
                bbox_feed = [
                    current_target_state.bbox.y, current_target_state.bbox.x,
                    current_target_state.bbox.height,
                    current_target_state.bbox.width
                ]
                input_feed = [filename, bbox_feed]

                outputs, metadata = self.siamese_model.inference_step(
                    sess, input_feed)
                search_scale_list = outputs['scale_xs']
                response = outputs['response']
                response_size = response.shape[1]

                # Choose the scale whose response map has the highest peak
                if self.num_scales > 1:
                    response_max = np.max(response, axis=(1, 2))
                    penalties = self.track_config['scale_penalty'] * np.ones(
                        (self.num_scales))
                    current_scale_idx = int(get_center(self.num_scales))
                    penalties[current_scale_idx] = 1.0
                    response_penalized = response_max * penalties
                    best_scale = np.argmax(response_penalized)
                    if np.max(response_max) < 0:
                        logging.warning('MAX_RESPONSE LESS THAN ZERO!')
                        # best_scale = current_scale_idx
                else:
                    best_scale = 0

                response = response[best_scale]

                with np.errstate(
                        all='raise'):  # Raise error if something goes wrong
                    response = response - np.min(response)
                    response = response / np.sum(response)

                if self.window is None:
                    window = np.dot(
                        np.expand_dims(np.hanning(response_size), 1),
                        np.expand_dims(np.hanning(response_size), 0))
                    self.window = window / np.sum(window)  # normalize window
                window_influence = self.track_config['window_influence']
                response = (1 - window_influence
                            ) * response + window_influence * self.window

                # Find maximum response
                r_max, c_max = np.unravel_index(response.argmax(),
                                                response.shape)

                # Convert from crop-relative coordinates to frame coordinates
                p_coor = np.array([r_max, c_max])
                # displacement from the center in instance final representation ...
                disp_instance_final = p_coor - get_center(response_size)
                # ... in instance feature space ...
                upsample_factor = self.track_config['upsample_factor']
                disp_instance_feat = disp_instance_final / upsample_factor
                # ... Avoid empty position ...
                r_radius = int(response_size / upsample_factor / 2)
                disp_instance_feat = np.maximum(
                    np.minimum(disp_instance_feat, r_radius), -r_radius)
                # ... in instance input ...
                disp_instance_input = disp_instance_feat * self.model_config[
                    'embed_config']['stride']
                # ... in instance original crop (in frame coordinates)
                disp_instance_frame = disp_instance_input / search_scale_list[
                    best_scale]
                # Position within frame in frame coordinates
                y = current_target_state.bbox.y
                x = current_target_state.bbox.x
                y += disp_instance_frame[0]
                x += disp_instance_frame[1]

                # Target scale damping and saturation
                target_scale = current_target_state.bbox.height / original_target_height
                search_factor = self.search_factors[best_scale]
                scale_damp = self.track_config[
                    'scale_damp']  # damping factor for scale update
                target_scale *= ((1 - scale_damp) * 1.0 +
                                 scale_damp * search_factor)
                target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

                # Some book keeping
                height = original_target_height * target_scale
                width = original_target_width * target_scale
                current_target_state.bbox = Rectangle(x, y, width, height)
                current_target_state.scale_idx = best_scale
                current_target_state.search_pos = search_center + disp_instance_input

                assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
                  'target position in feature space should be no larger than input image size'
                assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
                  'target position in feature space should be no larger than input image size'

                if self.log_level > 0:
                    np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

                    # Select the image with the highest score scale and convert it to uint8
                    image_cropped = outputs['image_cropped'][
                        best_scale].astype(np.uint8)
                    # Note that imwrite in cv2 assumes the image is in BGR format.
                    # However, the cropped image returned by TensorFlow is RGB.
                    # Therefore, we convert color format using cv2.cvtColor
                    imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                            cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                    np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)),
                            [best_scale])
                    np.save(osp.join(logdir, 'response{}.npy'.format(i)),
                            response)

                    y_search, x_search = current_target_state.search_pos
                    search_scale = search_scale_list[best_scale]
                    target_height_search = height * search_scale
                    target_width_search = width * search_scale
                    bbox_search = Rectangle(x_search, y_search,
                                            target_width_search,
                                            target_height_search)
                    bbox_search = convert_bbox_format(bbox_search,
                                                      'top-left-based')
                    np.save(osp.join(logdir, 'bbox{}.npy'.format(i)), [
                        bbox_search.x, bbox_search.y, bbox_search.width,
                        bbox_search.height
                    ])

            reported_bbox = convert_bbox_format(current_target_state.bbox,
                                                'top-left-based')
            reported_bboxs.append(reported_bbox)
        return reported_bboxs
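
The chain of displacement conversions above is easier to follow with concrete numbers; a small sketch with assumed values (stride, upsample factor and crop scale are illustrative, and `get_center(x)` is taken to be `(x - 1) / 2`):

# Sketch only: crop-relative peak -> frame-coordinate displacement.
import numpy as np

response_size = 272        # upsampled response size (assumed)
upsample_factor = 16       # track_config['upsample_factor'] (assumed)
stride = 8                 # embed_config['stride'] (assumed)
search_scale = 0.5         # scale_xs for the best scale (assumed)

r_max, c_max = 150, 140                                    # peak location
p_coor = np.array([r_max, c_max])
disp_final = p_coor - (response_size - 1) / 2.0            # from response center
disp_feat = disp_final / upsample_factor                   # feature-space units
r_radius = int(response_size / upsample_factor / 2)
disp_feat = np.clip(disp_feat, -r_radius, r_radius)        # clamp to the crop
disp_input = disp_feat * stride                            # instance-input pixels
disp_frame = disp_input / search_scale                     # frame pixels
print(disp_frame)                                          # [14.5  4.5]
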
Example #8
    def prepare_data(self):
        # Prepare parameters
        dataset_dir = self.dataSet_dir
        crop_h = self.config['crop_h']
        crop_w = self.config['crop_w']
        threads = self.config['prefetch_threads']
        img_mean = get(self.config, 'img_mean', None)
        preprocess_name = get(self.config, 'preprocessing_name', None)
        random_scale = get(self.config, 'random_scale', False)
        random_mirror = get(self.config, 'random_mirror', True)
        batch_size = get(self.config, 'batch_size', 8)
        batch_size //= get(self.config, 'num_gpus', 1)

        if self.data_name == "CamVid":
            input_dir = self.config['input_dir']
            output_dir = self.config['output_dir']
            input_names = []
            output_names = []
            input_root = osp.join(dataset_dir, input_dir)
            output_root = osp.join(dataset_dir, output_dir)
            for file in os.listdir(input_root):
                input_names.append(osp.abspath(osp.join(input_root, file)))
            for file in os.listdir(output_root):
                output_names.append(osp.abspath(osp.join(output_root, file)))

            input_names.sort()
            output_names.sort()

            dataset = tf.data.Dataset.from_tensor_slices(
                (input_names, output_names))
            dataset = dataset.map(
                lambda x, y: _parse_function(x, y, img_mean, self.class_dict),
                num_parallel_calls=threads)
        elif self.data_name == "MVD":
            split_name = self.config['split']
            file_pattern = os.path.join(dataset_dir, "%s-*" % split_name)
            tf_record_files = tf.gfile.Glob(file_pattern)
        elif self.data_name == "AVMP":
            split_path_map = dict({
                "train": "%s/train_db.txt" % dataset_dir,
                "valid": "%s/test_db.txt" % dataset_dir
            })
            split_name = self.config['split']
            image_file_list, label_file_list = [], []
            with open(split_path_map[split_name], 'rt') as f:
                for line in f:
                    image_file, label_file = line.split(" ")
                    image_file_list.append(dataset_dir + image_file.strip())
                    label_file_list.append(dataset_dir + label_file.strip())

            dataset = tf.data.Dataset.from_tensor_slices(
                (image_file_list, label_file_list))
            dataset = dataset.map(lambda x, y: _parse_function_avm(
                x, y, img_mean, self.class_dict),
                                  num_parallel_calls=threads)

        logging.info('preprocess -- {}'.format(preprocess_name))
        if preprocess_name == 'augment':
            if random_mirror:
                dataset = dataset.map(_image_mirroring,
                                      num_parallel_calls=threads)
            if random_scale:
                dataset = dataset.map(_image_scaling,
                                      num_parallel_calls=threads)

            dataset = dataset.map(
                lambda x, y: _random_crop_and_pad_image_and_labels(
                    x, y, crop_h, crop_w),
                num_parallel_calls=threads)
            dataset = dataset.map(
                lambda image, label: _apply_with_random_selector(
                    image,
                    lambda x, ordering: _distort_color(
                        x, ordering, fast_mode=True),
                    num_cases=4,
                    label=label))

        dataset = dataset.map(
            lambda image, label: _check_size(image, label, crop_h, crop_w))
        dataset = dataset.shuffle(buffer_size=100)
        dataset = dataset.batch(batch_size)
        dataset = dataset.repeat()
        self.dataset = dataset
Example #9
  def track(self, sess, first_bbox, frames, logdir='/tmp'):
    """Runs tracking on a single image sequence."""
    # Get initial target bounding box and convert to center based
    bbox = convert_bbox_format(first_bbox, 'center-based')

    # Feed in the first frame image to set initial state.
    bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
    input_feed = [frames[0], bbox_feed]
    frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

    # Storing target state
    original_target_height = bbox.height
    original_target_width = bbox.width
    search_center = np.array([get_center(self.x_image_size),
                              get_center(self.x_image_size)])
    current_target_state = TargetState(bbox=bbox,
                                       search_pos=search_center,
                                       scale_idx=int(get_center(self.num_scales)))

    include_first = get(self.track_config, 'include_first', False)
    logging.info('Tracking include first -- {}'.format(include_first))

    # Run tracking loop
    reported_bboxs = []
    for i, filename in enumerate(frames):
      if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
        bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                     current_target_state.bbox.height, current_target_state.bbox.width]
        input_feed = [filename, bbox_feed]

        outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
        search_scale_list = outputs['scale_xs']
        response = outputs['response']
        response_size = response.shape[1]

        # Choose the scale whose response map has the highest peak
        if self.num_scales > 1:
          response_max = np.max(response, axis=(1, 2))
          penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales))
          current_scale_idx = int(get_center(self.num_scales))
          penalties[current_scale_idx] = 1.0
          response_penalized = response_max * penalties
          best_scale = np.argmax(response_penalized)
        else:
          best_scale = 0

        response = response[best_scale]

        with np.errstate(all='raise'):  # Raise error if something goes wrong
          response = response - np.min(response)
          response = response / np.sum(response)

        if self.window is None:
          window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                          np.expand_dims(np.hanning(response_size), 0))
          self.window = window / np.sum(window)  # normalize window
        window_influence = self.track_config['window_influence']
        response = (1 - window_influence) * response + window_influence * self.window

        # Find maximum response
        r_max, c_max = np.unravel_index(response.argmax(),
                                        response.shape)

        # Convert from crop-relative coordinates to frame coordinates
        p_coor = np.array([r_max, c_max])
        # displacement from the center in instance final representation ...
        disp_instance_final = p_coor - get_center(response_size)
        # ... in instance feature space ...
        upsample_factor = self.track_config['upsample_factor']
        disp_instance_feat = disp_instance_final / upsample_factor
        # ... Avoid empty position ...
        r_radius = int(response_size / upsample_factor / 2)
        disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius)
        # ... in instance input ...
        disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride']
        # ... in instance original crop (in frame coordinates)
        disp_instance_frame = disp_instance_input / search_scale_list[best_scale]
        # Position within frame in frame coordinates
        y = current_target_state.bbox.y
        x = current_target_state.bbox.x
        y += disp_instance_frame[0]
        x += disp_instance_frame[1]

        # Target scale damping and saturation
        target_scale = current_target_state.bbox.height / original_target_height
        search_factor = self.search_factors[best_scale]
        scale_damp = self.track_config['scale_damp']  # damping factor for scale update
        target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
        target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

        # Some book keeping
        height = original_target_height * target_scale
        width = original_target_width * target_scale
        current_target_state.bbox = Rectangle(x, y, width, height)
        current_target_state.scale_idx = best_scale
        current_target_state.search_pos = search_center + disp_instance_input

        assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
          'target position in feature space should be no larger than input image size'
        assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
          'target position in feature space should be no larger than input image size'

        if self.log_level > 0:
          np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

          # Select the image with the highest score scale and convert it to uint8
          image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
          # Note that imwrite in cv2 assumes the image is in BGR format.
          # However, the cropped image returned by TensorFlow is RGB.
          # Therefore, we convert color format using cv2.cvtColor
          imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                  cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

          np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale])
          np.save(osp.join(logdir, 'response{}.npy'.format(i)), response)

          y_search, x_search = current_target_state.search_pos
          search_scale = search_scale_list[best_scale]
          target_height_search = height * search_scale
          target_width_search = width * search_scale
          bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search)
          bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
          np.save(osp.join(logdir, 'bbox{}.npy'.format(i)),
                  [bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height])

      reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based')
      reported_bboxs.append(reported_bbox)
    return reported_bboxs
Example #10
def convolutional_alexnet_arg_scope(embed_config,
                                    trainable=True,
                                    is_training=False):
  """Defines the default arg scope.

  Args:
    embed_config: A dictionary which contains configurations for the embedding function.
    trainable: If the weights in the embedding function are trainable.
    is_training: If the embedding function is built for training.

  Returns:
    An `arg_scope` to use for the convolutional_alexnet models.
  """
  # Only consider the model to be in training mode if it's trainable.
  # This is vital for batch_norm since moving_mean and moving_variance
  # will get updated even if not trainable.
  is_model_training = trainable and is_training

  if get(embed_config, 'use_bn', True):
    batch_norm_scale = get(embed_config, 'bn_scale', True)
    batch_norm_decay = 1 - get(embed_config, 'bn_momentum', 3e-4)
    batch_norm_epsilon = get(embed_config, 'bn_epsilon', 1e-6)
    batch_norm_params = {
      "scale": batch_norm_scale,
      # Decay for the moving averages.
      "decay": batch_norm_decay,
      # Epsilon to prevent 0s in variance.
      "epsilon": batch_norm_epsilon,
      "trainable": trainable,
      "is_training": is_model_training,
      # Collection containing the moving mean and moving variance.
      "variables_collections": {
        "beta": None,
        "gamma": None,
        "moving_mean": ["moving_vars"],
        "moving_variance": ["moving_vars"],
      },
      'updates_collections': None,  # Ensure that updates are done within a frame
    }
    normalizer_fn = slim.batch_norm
  else:
    batch_norm_params = {}
    normalizer_fn = None

  weight_decay = get(embed_config, 'weight_decay', 5e-4)
  if trainable:
    weights_regularizer = slim.l2_regularizer(weight_decay)
  else:
    weights_regularizer = None

  init_method = get(embed_config, 'init_method', 'kaiming_normal')
  if is_model_training:
    logging.info('embedding init method -- {}'.format(init_method))
  if init_method == 'kaiming_normal':
    # The same setting as siamese-fc
    initializer = slim.variance_scaling_initializer(factor=2.0, mode='FAN_OUT', uniform=False)
  else:
    initializer = slim.xavier_initializer()

  with slim.arg_scope(
      [slim.conv2d],
      weights_regularizer=weights_regularizer,
      weights_initializer=initializer,
      padding='VALID',
      trainable=trainable,
      activation_fn=tf.nn.relu,
      normalizer_fn=normalizer_fn,
      normalizer_params=batch_norm_params):
    with slim.arg_scope([slim.batch_norm], **batch_norm_params):
      with slim.arg_scope([slim.batch_norm], is_training=is_model_training) as arg_sc:
        return arg_sc
Example #11
    def build_detection(self):
        self.embeds = self.get_image_embedding(
            self.search_images,
            reuse=True,
            is_example=False,
            sa_siam_config=self.model_config['sa_siam_config'])
        with tf.variable_scope('detection'):

            def _get_mask_any(shape_mask, _u, _d, _l, _r):
                _mask = np.zeros(shape_mask, dtype='float32')
                _mask[_u:_d, _l:_r] = 1.0
                return _mask

            def _get_center_mask(shape_mask,
                                 _sz):  # mask center a _sz x _sz patch
                _u = int((shape_mask[0] - _sz) / 2)
                _d = _u + _sz
                _l = int((shape_mask[1] - _sz) / 2)
                _r = _l + _sz
                return _get_mask_any(shape_mask, _u, _d, _l, _r)

            def _translation_match(x,
                                   z,
                                   mask_center=np.array([[1.0]],
                                                        dtype='float32')):
                x = tf.expand_dims(
                    x, 0)  # [batch, in_height, in_width, in_channels]
                z = tf.expand_dims(
                    z, -1
                )  # [filter_height, filter_width, in_channels, out_channels]
                mask_center = tf.expand_dims(mask_center, -1)
                mask_center = tf.expand_dims(mask_center, -1)
                return tf.nn.conv2d(x,
                                    z * mask_center,
                                    strides=[1, 1, 1, 1],
                                    padding='VALID',
                                    name='translation_match')

            logging.info('Shape of templates: {}'.format(self.templates.shape))
            logging.info('Shape of embeds: {}'.format(self.embeds.shape))
            en_appearance = get(self.model_config['sa_siam_config'],
                                'en_appearance', False)
            en_semantic = get(self.model_config['sa_siam_config'],
                              'en_semantic', False)
            if en_appearance and en_semantic:
                c_appearance = get(self.model_config['sa_siam_config'],
                                   'c_appearance', 0.3)
                out_scale = self.model_config['adjust_response_config'][
                    'scale']
                temp_appearance, temp_semantic = tf.split(self.templates, 2, 3)
                inst_appearance, inst_semantic = tf.split(self.embeds, 2, 3)
                bias_semantic = tf.get_variable(
                    'biases_semantic', [1],
                    dtype=tf.float32,
                    initializer=tf.constant_initializer(0.0, dtype=tf.float32),
                    trainable=False)
                bias_appearance = tf.get_variable(
                    'biases_appearance', [1],
                    dtype=tf.float32,
                    initializer=tf.constant_initializer(0.0, dtype=tf.float32),
                    trainable=False)
                sz_feat = shape_of(temp_appearance)[1:3]  # [h,w]
                self.mask_all = {'keep_all': 1 - _get_center_mask(sz_feat, 0)}
                self.response_all = {}
                for k in sorted(self.mask_all.keys()):
                    logging.info('Make match: {}'.format(k))
                    match_k = lambda x: _translation_match(
                        x[0], x[1], mask_center=self.mask_all[k])
                    out_appearance_mask_k = tf.map_fn(
                        match_k, (inst_appearance, temp_appearance),
                        dtype=inst_appearance.dtype)
                    out_semantic_mask_k = tf.map_fn(
                        match_k, (inst_semantic, temp_semantic),
                        dtype=inst_semantic.dtype)

                    out_appearance_mask_k = tf.squeeze(out_appearance_mask_k,
                                                       [1, 4])
                    out_semantic_mask_k = tf.squeeze(out_semantic_mask_k,
                                                     [1, 4])

                    response_appearance_mask_k = out_scale * out_appearance_mask_k
                    response_semantic_mask_k = out_scale * out_semantic_mask_k

                    self.response_all[k] = (
                        response_appearance_mask_k + bias_appearance
                    ) * c_appearance + (response_semantic_mask_k +
                                        bias_semantic) * (1 - c_appearance)
                response = self.response_all['keep_all']
            else:
                output = tf.map_fn(
                    lambda x: _translation_match(x[0], x[1]),
                    (self.embeds, self.templates),
                    dtype=self.embeds.dtype)  # of shape [16, 1, 17, 17, 1]
                output = tf.squeeze(output,
                                    [1, 4])  # of shape e.g. [16, 17, 17]
                bias = tf.get_variable('biases', [1],
                                       dtype=tf.float32,
                                       initializer=tf.constant_initializer(
                                           0.0, dtype=tf.float32),
                                       trainable=False)
                response = (
                    self.model_config['adjust_response_config']['scale'] *
                    output + bias)
            self.response = response
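
`_translation_match` scores the search embedding against the template with a plain cross-correlation via `tf.nn.conv2d`; a shape-only sketch with illustrative sizes (22x22 search embedding, 6x6 template, 256 channels), consistent with the `[16, 1, 17, 17, 1]` comment above:

# Sketch only: shapes flowing through _translation_match.
import tensorflow as tf

x = tf.zeros([22, 22, 256])              # one search embedding
z = tf.zeros([6, 6, 256])                # one template

x4 = tf.expand_dims(x, 0)                # [1, 22, 22, 256]  (batch, h, w, c)
z4 = tf.expand_dims(z, -1)               # [6, 6, 256, 1]    (fh, fw, c_in, c_out)
response = tf.nn.conv2d(x4, z4, strides=[1, 1, 1, 1], padding='VALID')
print(response.shape)                    # (1, 17, 17, 1)
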
Example #12
    def track(self, sess, first_bbox, frames, logdir='/tmp'):
        """Runs tracking on a single image sequence."""
        # Get initial target bounding box and convert to center based
        bbox = convert_bbox_format(first_bbox, 'center-based')

        smooth_rate = self.track_config['smooth']
        update_interval = self.track_config['update_interval']
        feature_balance = self.track_config['feature_balance']

        # Feed in the first frame image to set initial state.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        input_feed = [frames[0], bbox_feed]
        frame2crop_scale = self.siamese_model.initialize(sess, input_feed)
        examplar = self.siamese_model.get_examplar(sess, input_feed)
        examplar_smooth = examplar
        st_template = []
        for i in range(self.siamese_model.train_config['time_range']):
            st_template.append(examplar)
        st_template_np = np.array(st_template)
        self.siamese_model.update_st_template_step(sess, st_template_np)

        # Storing target state
        original_target_height = bbox.height
        original_target_width = bbox.width
        search_center = np.array(
            [get_center(self.x_image_size),
             get_center(self.x_image_size)])
        current_target_state = TargetState(bbox=bbox,
                                           search_pos=search_center,
                                           scale_idx=int(
                                               get_center(self.num_scales)))

        include_first = get(self.track_config, 'include_first', False)
        logging.info('Tracking include first -- {}'.format(include_first))

        # Set padding for refining search region
        img = mpimg.imread(frames[0])
        context_amount = self.track_config['context_amount']
        size_z = self.model_config['z_image_size']
        size_x = self.track_config['x_image_size']
        padding_h = 10
        padding_w = 10

        if original_target_height / original_target_width > 2:  #2
            padding_h = 1.4  #1.4
            padding_w = 6

        # Run tracking loop
        reported_bboxs = []
        for i, filename in enumerate(frames):
            if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
                bbox_feed = [
                    current_target_state.bbox.y, current_target_state.bbox.x,
                    current_target_state.bbox.height,
                    current_target_state.bbox.width
                ]
                input_feed = [filename, bbox_feed]

                outputs, metadata = self.siamese_model.inference_step(
                    sess, input_feed)
                search_scale_list = outputs['scale_xs']
                response = outputs['response']
                response2 = outputs['response2']
                response_size = response.shape[1]

                # Choose the scale whose response map has the highest peak
                if self.num_scales > 1:
                    response_max = np.max(response2, axis=(1, 2))
                    penalties = self.track_config['scale_penalty'] * np.ones(
                        (self.num_scales))
                    current_scale_idx = int(get_center(self.num_scales))
                    penalties[current_scale_idx] = 1.0
                    response_penalized = response_max * penalties
                    best_scale = np.argmax(response_penalized)
                else:
                    best_scale = 0

                response = response[best_scale]
                response2 = response2[best_scale]
                response = feature_balance * response + (
                    1 - feature_balance) * response2
                with np.errstate(
                        all='raise'):  # Raise error if something goes wrong
                    response = response - np.min(response)
                    response = response / np.sum(response)

                if self.window is None:
                    window = np.dot(
                        np.expand_dims(np.hanning(response_size), 1),
                        np.expand_dims(np.hanning(response_size), 0))
                    self.window = window / np.sum(window)  # normalize window
                window_influence = self.track_config['window_influence']
                response = (1 - window_influence
                            ) * response + window_influence * self.window

                # Refine the response
                base_z_size = np.array([
                    current_target_state.bbox.height,
                    current_target_state.bbox.width
                ])
                base_z_context_size = base_z_size + context_amount * np.sum(
                    base_z_size)
                base_s_z = np.sqrt(
                    np.prod(base_z_context_size))  # Canonical size
                base_scale_z = size_z / base_s_z
                d_search = (size_x - size_z) / 2.0
                base_pad = d_search / base_scale_z
                base_s_x = base_s_z + 2 * base_pad

                if base_s_x / current_target_state.bbox.height > padding_h:
                    start_h = np.ceil(
                        response_size *
                        (base_s_x -
                         current_target_state.bbox.height * padding_h) /
                        (2 * base_s_x))
                    end_h = np.floor(response_size - start_h)
                    start_h = int(start_h)
                    end_h = int(end_h)
                    response[0:start_h, :] = 0
                    response[end_h:, :] = 0
                if base_s_x / current_target_state.bbox.width > padding_w:
                    start_w = np.ceil(
                        response_size *
                        (base_s_x -
                         current_target_state.bbox.width * padding_w) /
                        (2 * base_s_x))
                    end_w = np.floor(response_size - start_w)
                    start_w = int(start_w)
                    end_w = int(end_w)
                    response[:, :start_w] = 0
                    response[:, end_w:] = 0

                # Find maximum response
                r_max, c_max = np.unravel_index(response.argmax(),
                                                response.shape)

                # Convert from crop-relative coordinates to frame coordinates
                p_coor = np.array([r_max, c_max])
                # displacement from the center in instance final representation ...
                disp_instance_final = p_coor - get_center(response_size)
                # ... in instance feature space ...
                upsample_factor = self.track_config['upsample_factor']
                disp_instance_feat = disp_instance_final / upsample_factor
                # ... Avoid empty position ...
                r_radius = int(response_size / upsample_factor / 2)
                disp_instance_feat = np.maximum(
                    np.minimum(disp_instance_feat, r_radius), -r_radius)
                # ... in instance input ...
                disp_instance_input = disp_instance_feat * self.model_config[
                    'embed_config']['stride']
                # ... in instance original crop (in frame coordinates)
                disp_instance_frame = disp_instance_input / search_scale_list[
                    best_scale]
                # Position within frame in frame coordinates
                y = current_target_state.bbox.y
                x = current_target_state.bbox.x
                y += disp_instance_frame[0]
                x += disp_instance_frame[1]

                # Target scale damping and saturation
                target_scale = current_target_state.bbox.height / original_target_height
                search_factor = self.search_factors[best_scale]
                scale_damp = self.track_config[
                    'scale_damp']  # damping factor for scale update
                target_scale *= ((1 - scale_damp) * 1.0 +
                                 scale_damp * search_factor)
                target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

                # Some book keeping
                height = original_target_height * target_scale
                width = original_target_width * target_scale
                current_target_state.bbox = Rectangle(x, y, width, height)
                current_target_state.scale_idx = best_scale
                current_target_state.search_pos = search_center + disp_instance_input

                # Update the spatial-temporal template using gcn
                if i % update_interval == 0:
                    bbox_feed = [
                        current_target_state.bbox.y,
                        current_target_state.bbox.x,
                        current_target_state.bbox.height,
                        current_target_state.bbox.width
                    ]
                    input_feed = [filename, bbox_feed]
                    current_examplar = self.siamese_model.get_examplar(
                        sess, input_feed)
                    # examplar_smooth[2:4,2:4,:] = current_examplar[2:4,2:4,:]
                    examplar_smooth = current_examplar
                    current_examplar = smooth_rate * examplar_smooth + (
                        1 - smooth_rate) * examplar
                    st_template.pop(1)
                    st_template.append(current_examplar)
                    st_template_np = np.array(st_template)
                    self.siamese_model.update_st_template_step(
                        sess, st_template_np)

                assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
                  'target position in feature space should be no larger than input image size'
                assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
                  'target position in feature space should be no larger than input image size'

                if self.log_level > 0:
                    np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

                    # Select the image with the highest score scale and convert it to uint8
                    image_cropped = outputs['image_cropped'][
                        best_scale].astype(np.uint8)
                    # Note that imwrite in cv2 assumes the image is in BGR format.
                    # However, the cropped image returned by TensorFlow is RGB.
                    # Therefore, we convert color format using cv2.cvtColor
                    imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                            cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                    np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)),
                            [best_scale])
                    np.save(osp.join(logdir, 'response{}.npy'.format(i)),
                            response)

                    y_search, x_search = current_target_state.search_pos
                    search_scale = search_scale_list[best_scale]
                    target_height_search = height * search_scale
                    target_width_search = width * search_scale
                    bbox_search = Rectangle(x_search, y_search,
                                            target_width_search,
                                            target_height_search)
                    bbox_search = convert_bbox_format(bbox_search,
                                                      'top-left-based')
                    np.save(osp.join(logdir, 'bbox{}.npy'.format(i)), [
                        bbox_search.x, bbox_search.y, bbox_search.width,
                        bbox_search.height
                    ])

            reported_bbox = convert_bbox_format(current_target_state.bbox,
                                                'top-left-based')
            reported_bboxs.append(reported_bbox)
        return reported_bboxs
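
The spatial-temporal template update above keeps a fixed-length list of exemplars, blends the newest embedding with the initial one, and rotates the list; a minimal NumPy sketch of that rule (shapes, `smooth_rate` and `time_range` are illustrative):

# Sketch only: the template blend + rotation performed every update_interval frames.
import numpy as np

smooth_rate = 0.5                        # track_config['smooth'] (assumed)
time_range = 3                           # train_config['time_range'] (assumed)
examplar = np.random.rand(6, 6, 256)     # initial exemplar embedding

st_template = [examplar.copy() for _ in range(time_range)]

current_examplar = np.random.rand(6, 6, 256)         # freshly extracted exemplar
blended = smooth_rate * current_examplar + (1 - smooth_rate) * examplar
st_template.pop(1)                                   # keep slot 0 (initial exemplar)
st_template.append(blended)
st_template_np = np.array(st_template)               # stacked templates for the model
print(st_template_np.shape)                          # (3, 6, 6, 256)
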
Example #13
    def track(self, sess, first_bbox, frames, logdir='/tmp'):
        """Runs tracking on a single image sequence."""
        # Get initial target bounding box and convert to center based
        bbox = convert_bbox_format(first_bbox, 'center-based')

        # Feed in the first frame image to set initial state.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        input_feed = [
            frames[0], bbox_feed, self.x_image_size, self.search_factors
        ]
        frame2crop_scale, image_z = self.siamese_model.initialize(
            sess, input_feed)
        imwrite(osp.join(logdir, 'aimagez.jpg'),
                cv2.cvtColor(image_z, cv2.COLOR_RGB2BGR))

        # Storing target state
        original_target_height = bbox.height
        original_target_width = bbox.width
        search_center = np.array(
            [get_center(self.x_image_size),
             get_center(self.x_image_size)])
        current_target_state = TargetState(bbox=bbox,
                                           search_pos=search_center,
                                           scale_idx=int(
                                               get_center(self.num_scales)))

        include_first = get(self.track_config, 'include_first', False)
        logging.info('Tracking include first -- {}'.format(include_first))

        # Run tracking loop
        reported_bboxs = []
        image_c = None
        x_image_size = self.x_image_size
        lost = 0
        moved2border = False
        bound_flag = False  # set by the border check inside the loop

        conf_thresh = 0.2  # below this score the search area is enlarged
        bound_thresh = 0.2  # below this score the frame counts as "lost"
        sup_thresh = 0.15  # responses with scores below this are suppressed
        prev_score = conf_thresh + 0.01
        upsample_factor = self.track_config['upsample_factor']
        search_factors = self.search_factors

        for i, filename in enumerate(frames):
            if i > 0 or include_first:
                bbox_feed = [
                    current_target_state.bbox.y, current_target_state.bbox.x,
                    current_target_state.bbox.height,
                    current_target_state.bbox.width
                ]

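                # Count how many consecutive frames the score has stayed below
                # bound_thresh; a long "lost" streak (plus a border hit) later
                # triggers re-centering toward the image center.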
                if prev_score > bound_thresh:
                    lost = 0
                else:
                    lost += 1

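                # Only update the Siamese model with the previous frame when it
                # was matched with very high confidence.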
                if prev_score > 0.9:
                    self.siamese_model.update(sess, [
                        frames[i - 1], bbox_feed, self.x_image_size,
                        search_factors
                    ])

                with open(filename, 'rb') as f:
                    wi, hi = GetWidthAndHeight(f)
                t_i_ratio = max([
                    current_target_state.bbox.height / hi,
                    current_target_state.bbox.width / wi
                ])

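                # Low previous score: grow the search crop by 100 px per frame,
                # capped according to the target-to-image size ratio; otherwise
                # fall back to the default crop size.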
                if prev_score < conf_thresh:
                    x_image_size += 100
                    #x_image_size = min(x_image_size, ((1. - t_i_ratio) * 1.6 + 1.) * self.x_image_size_init)
                    if t_i_ratio < 0.05:
                        x_image_size = min(x_image_size, 555)
                    elif t_i_ratio < 0.25:
                        x_image_size = min(x_image_size, 455)
                    elif t_i_ratio > 0.5:
                        x_image_size = min(x_image_size, 255)
                    else:
                        x_image_size = min(x_image_size, 355)
                else:
                    x_image_size = self.x_image_size

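                # Check whether the current box lies within 10 px of any image
                # border.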
                if i > 1:
                    top = (current_target_state.bbox.y -
                           (current_target_state.bbox.height / 2) < 10)
                    left = (current_target_state.bbox.x -
                            (current_target_state.bbox.width / 2) < 10)
                    bottom = (current_target_state.bbox.y +
                              (current_target_state.bbox.height / 2) > hi - 10)
                    right = (current_target_state.bbox.x +
                             (current_target_state.bbox.width / 2) > wi - 10)
                    bound_flag = top or left or bottom or right
                    #if top or left or bottom or right:
                    #if not prev_score < bound_thresh:
                    #moved2border = True
                    #if not moved2border:
                    #current_target_state.bbox = Rectangle(wi / 2, hi / 2,
                    #current_target_state.bbox.width,
                    #current_target_state.bbox.height)
                    #bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                    #current_target_state.bbox.height, current_target_state.bbox.width]
                    #else:
                    #if not prev_score < bound_thresh:
                    #moved2border = False

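                # If the target has been lost for several frames near a border,
                # nudge the predicted position a quarter of the way back toward
                # the image center before searching again.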
                if lost > 5 and bound_flag:
                    lost = 0
                    diffy = hi * 0.5 - bbox_feed[0]
                    diffx = wi * 0.5 - bbox_feed[1]
                    bbox_feed = [
                        diffy * 0.25 + bbox_feed[0],
                        diffx * 0.25 + bbox_feed[1], bbox_feed[2], bbox_feed[3]
                    ]

                current_target_state.bbox = Rectangle(bbox_feed[1],
                                                      bbox_feed[0],
                                                      bbox_feed[3],
                                                      bbox_feed[2])

                input_feed = [
                    filename, bbox_feed, x_image_size, search_factors
                ]
                outputs, metadata = self.siamese_model.inference_step(
                    sess, input_feed)
                search_scale_list = outputs['scale_xs']
                response = outputs['response']
                response_size = response.shape[1]
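                # Squash the raw response with a sigmoid (rounded to two
                # decimals) so it is comparable to the thresholds above.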
                re_out = np.around(1 / (1 + np.exp(-response)), 2)

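                # Re-detection pass: if even the best score is below conf_thresh
                # (and the target is not very large relative to the frame), retry
                # inference once with a larger search crop, capped by a limit
                # derived from t_i_ratio.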
                if np.max(re_out) < conf_thresh and not t_i_ratio > 0.5:
                    x_image_sizeb4 = x_image_size
                    x_image_size += 100
                    #x_image_size_l = ((1. - t_i_ratio) * 1.6 + 1.) * self.x_image_size_init
                    if t_i_ratio < 0.05:
                        x_image_size_l = 555
                    elif t_i_ratio < 0.25:
                        x_image_size_l = 455
                    elif t_i_ratio > 0.5:
                        x_image_size_l = 255
                    else:
                        x_image_size_l = 355

                    if not x_image_size > x_image_size_l:
                        input_feed = [
                            filename, bbox_feed, x_image_size, search_factors
                        ]
                        outputs, metadata = self.siamese_model.inference_step(
                            sess, input_feed)
                        search_scale_list = outputs['scale_xs']
                        response = outputs['response']
                        response_size = response.shape[1]
                        re_out = np.around(1 / (1 + np.exp(-response)), 2)
                    else:
                        x_image_size = x_image_sizeb4

                # Choose the scale whose response map has the highest peak
                if self.num_scales > 1:
                    response_max = np.max(response * (re_out > sup_thresh),
                                          axis=(1, 2))
                    penalties = self.track_config['scale_penalty'] * np.ones(
                        (self.num_scales))
                    current_scale_idx = int(get_center(self.num_scales))
                    penalties[current_scale_idx] = 1.0
                    response_penalized = response_max * penalties
                    if max(response_penalized) == 0.:
                        best_scale = 1
                    else:
                        best_scale = np.argmax(response_penalized)
                else:
                    best_scale = 0

                response = response[best_scale]
                re_out = re_out[best_scale]

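                # Normalize the selected response map and zero out positions
                # whose sigmoid score falls below sup_thresh.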
                with np.errstate(
                        all='raise'):  # Raise error if something goes wrong
                    response = response - np.min(response)
                    response = response / np.sum(response)
                    response = response * (re_out > sup_thresh)

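                # Blend the response with a normalized 2-D Hanning (cosine)
                # window to penalize large displacements from the crop center.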
                window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                                np.expand_dims(np.hanning(response_size), 0))
                self.window = window / np.sum(window)  # normalize window
                window_influence = self.track_config['window_influence']
                response = (1 - window_influence
                            ) * response + window_influence * self.window

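                # If every score is below the suppression threshold, keep the
                # previous position (zero displacement); otherwise back-project
                # the response peak into original-frame coordinates.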
                if np.max(re_out) < sup_thresh:
                    r_max, c_max = response.shape
                    r_max, c_max = int(r_max / 2), int(c_max / 2)
                    disp_instance_input = [0, 0]
                    disp_instance_frame = [0, 0]
                else:
                    # Find maximum response
                    r_max, c_max = np.unravel_index(response.argmax(),
                                                    response.shape)

                    # Convert from crop-relative coordinates to frame coordinates
                    p_coor = np.array([r_max, c_max])
                    # displacement from the center in instance final representation ...
                    disp_instance_final = p_coor - get_center(response_size)
                    # ... in instance feature space ...
                    disp_instance_feat = disp_instance_final / upsample_factor
                    # ... Avoid empty position ...
                    r_radius = int(response_size / upsample_factor / 2)
                    disp_instance_feat = np.maximum(
                        np.minimum(disp_instance_feat, r_radius), -r_radius)
                    # ... in instance input ...
                    disp_instance_input = disp_instance_feat * self.model_config[
                        'embed_config']['stride']
                    # ... in instance original crop (in frame coordinates)
                    disp_instance_frame = disp_instance_input / search_scale_list[
                        best_scale]

                # Position within frame in frame coordinates
                y = current_target_state.bbox.y
                x = current_target_state.bbox.x
                y += disp_instance_frame[0]
                x += disp_instance_frame[1]
                y = np.round(y)
                x = np.round(x)
                prev_score = re_out[r_max, c_max]

                # Target scale damping and saturation
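                # i.e. new_scale = old_scale * ((1 - scale_damp) + scale_damp * search_factor)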
                target_scale = current_target_state.bbox.height / original_target_height
                search_factor = search_factors[best_scale]
                scale_damp = self.track_config[
                    'scale_damp']  # damping factor for scale update
                target_scale *= ((1 - scale_damp) * 1.0 +
                                 scale_damp * search_factor)

                # Some book keeping
                search_center = np.array(
                    [get_center(x_image_size),
                     get_center(x_image_size)])
                height = original_target_height * target_scale
                width = original_target_width * target_scale
                current_target_state.bbox = Rectangle(x, y, width, height)
                current_target_state.scale_idx = best_scale
                current_target_state.search_pos = search_center + disp_instance_input

                assert 0 <= current_target_state.search_pos[0] < x_image_size, \
                  'target position in feature space should be no larger than input image size'
                assert 0 <= current_target_state.search_pos[1] < x_image_size, \
                  'target position in feature space should be no larger than input image size'

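                # Optional debug output: draw the predicted box and score on the
                # cropped search image and save it.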
                if self.log_level > 0:
                    # Select the image crop at the best-scoring scale and convert it to uint8
                    image_cropped = outputs['image_cropped'][
                        best_scale].astype(np.uint8)

                    y_search, x_search = current_target_state.search_pos
                    search_scale = search_scale_list[best_scale]
                    target_height_search = height * search_scale
                    target_width_search = width * search_scale
                    bbox_search = Rectangle(x_search, y_search,
                                            target_width_search,
                                            target_height_search)
                    bbox_search = convert_bbox_format(bbox_search,
                                                      'top-left-based')

                    # Score colormap overlay (currently disabled; the steps are
                    # kept commented out below for reference)
                    image_cropped = outputs['image_cropped'][
                        best_scale].astype(np.uint8)
                    #im_shape = image_cropped.shape
                    #re_shape = response_size / upsample_factor * self.model_config['embed_config']['stride']
                    #pad = int((im_shape[0] - re_shape) / 2)
                    #response_crop = imresize(re_out, [im_shape[0]-2*pad, im_shape[1]-2*pad])
                    #response_crop = np.pad(response_crop, ((pad, pad), (pad, pad)), 'constant')
                    #response_crop = response_crop / response_crop.max()
                    #response_crop = np.uint8(response_crop * 255)
                    #cmap = cv2.cvtColor(cv2.applyColorMap(response_crop, cv2.COLORMAP_JET), cv2.COLOR_BGR2RGB)
                    #image_cropped = cv2.addWeighted(cmap, 0.3, image_cropped, 0.5, 0)

                    xmin = bbox_search.x.astype(np.int32)
                    ymin = bbox_search.y.astype(np.int32)
                    xmax = xmin + bbox_search.width.astype(np.int32)
                    ymax = ymin + bbox_search.height.astype(np.int32)
                    cv2.rectangle(image_cropped, (xmin, ymin), (xmax, ymax),
                                  (255, 0, 0), 2)
                    text = str(prev_score)
                    cv2.putText(image_cropped,
                                text, (xmin, ymin),
                                cv2.FONT_HERSHEY_SIMPLEX,
                                1.0, (255, 0, 0),
                                lineType=cv2.LINE_AA)
                    imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                            cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                    #if image_c is not None:
                    #his_dir = logdir + '_his'
                    #if not osp.exists(his_dir):
                    #os.mkdir(his_dir)
                    #image_c_p = np.concatenate([np.expand_dims(image_z, 0)] + image_c, 2)[0]
                    #image_c_p = np.uint8(image_c_p)
                    #imwrite(osp.join(his_dir, 'image{}.jpg'.format(i)),
                    #cv2.cvtColor(image_c_p, cv2.COLOR_RGB2BGR))

            reported_bbox = convert_bbox_format(current_target_state.bbox,
                                                'top-left-based')
            reported_bboxs.append(reported_bbox)
        return reported_bboxs
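The displacement chain used in the trackers above and below (response peak -> feature map -> search crop -> original frame) can be isolated into a small helper. The sketch below is not part of the snippets: response_size, upsample_factor, stride and scale_x are illustrative placeholder values, and the map center is assumed to follow the usual (size - 1) / 2 convention of get_center.

import numpy as np

def peak_to_frame_displacement(r_max, c_max, response_size=272,
                               upsample_factor=16, stride=8, scale_x=0.7):
    """Map a peak (r_max, c_max) on the upsampled response map to a
    (dy, dx) displacement in original-frame pixels."""
    center = (response_size - 1) / 2.0                    # assumed center convention
    disp_final = np.array([r_max, c_max]) - center        # in upsampled response units
    disp_feat = disp_final / upsample_factor              # in score-map (feature) units
    r_radius = int(response_size / upsample_factor / 2)
    disp_feat = np.clip(disp_feat, -r_radius, r_radius)   # clamp to the map
    disp_input = disp_feat * stride                       # in search-crop pixels
    return disp_input / scale_x                           # in original-frame pixels

# A peak 20 rows below and 8 columns right of the map center:
print(peak_to_frame_displacement(155.5, 143.5))           # ~[14.29  5.71]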
Ejemplo n.º 14
0
  def track(self, sess, first_bbox, frames, logdir='/tmp'):
    """Runs tracking on a single image sequence."""
    # Get initial target bounding box and convert to center based
    bbox = convert_bbox_format(first_bbox, 'center-based')
    print(frames)
    # Feed in the first frame image to set initial state.
    bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
    input_feed = [frames[0], bbox_feed]
    frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

    # Storing target state
    original_target_height = bbox.height
    original_target_width = bbox.width
    search_center = np.array([get_center(self.x_image_size),
                              get_center(self.x_image_size)])
    current_target_state = TargetState(bbox=bbox,
                                       search_pos=search_center,
                                       scale_idx=int(get_center(self.num_scales)))

    include_first = get(self.track_config, 'include_first', False)
    logging.info('Tracking include first -- {}'.format(include_first))

    # Run tracking loop
    reported_bboxs = []
    output_json = {}  # dump all bboxes into this output file

    for i, filename in enumerate(frames):
      if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
        bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                     current_target_state.bbox.height, current_target_state.bbox.width]
        input_feed = [filename, bbox_feed]

        outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
        search_scale_list = outputs['scale_xs']
        response = outputs['response']
        
        response_size = response.shape[1]

        # Choose the scale whose response map has the highest peak
        if self.num_scales > 1:
          response_max = np.max(response, axis=(1, 2))
          penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales))
          current_scale_idx = int(get_center(self.num_scales))
          penalties[current_scale_idx] = 1.0
          response_penalized = response_max * penalties
          best_scale = np.argmax(response_penalized)
        else:
          best_scale = 0

        response = response[best_scale]
        #print(response)
        

        with np.errstate(all='raise'):  # Raise error if something goes wrong
          response = response - np.min(response)
          response = response / np.sum(response)

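        # Build the 2-D Hanning window once and cache it; it is reused for
        # every subsequent frame.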
        if self.window is None:
          window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                          np.expand_dims(np.hanning(response_size), 0))
          self.window = window / np.sum(window)  # normalize window
        window_influence = self.track_config['window_influence']
        response = (1 - window_influence) * response + window_influence * self.window

        # Find maximum response
        srtd = response.argsort(axis=None)  # full ranking of responses (currently unused)
        v = response.argmax()
        r_max, c_max = np.unravel_index(v, response.shape)


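        # Save intermediate visualizations (response map, its binarized
        # version, and the detected contours) under logdir/Intermediate.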
        if not osp.exists(osp.join(logdir, "Intermediate")):
          os.mkdir(osp.join(logdir, "Intermediate"))

        # Rescale the response to [0, 255] and save it as an 8-bit image.
        to_save = np.interp(response, (response.min(), response.max()),
                            (0, 255)).astype(np.uint8)
        cv2.imwrite(osp.join(logdir, "Intermediate", f"response_{i}.png"), to_save)

        # Binarize the response; blobs above the threshold become candidate regions.
        to_save = to_save.reshape(to_save.shape[0], to_save.shape[1], 1)
        ret, thresh1 = cv2.threshold(to_save, 185, 255, cv2.THRESH_BINARY)

        cv2.imwrite(osp.join(logdir, "Intermediate", f"response_{i}_thresh.png"), thresh1)
        image = np.uint8(thresh1.copy())
        
        cnts = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        cnts = imutils.grab_contours(cnts)
        backtorgb = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        image = cv2.drawContours(backtorgb, cnts, -1, (0, 255, 0), 2)
        cv2.imwrite(osp.join(logdir, "Intermediate", f"response_{i}_cntrs.png"), image)
        
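        # Candidate peak locations: the centroid of each thresholded blob
        # (flagged False) plus the global argmax (flagged True).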
        centres = []
        for c in cnts:
          M = cv2.moments(c)
          if M["m00"] == 0:  # skip degenerate contours to avoid division by zero
            continue
          cX = int(M["m10"] / M["m00"])
          cY = int(M["m01"] / M["m00"])
          centres.append((cY, cX, False))
        centres.append((r_max, c_max, True))
        #print(centres)

        #cts_copy = copy(current_target_state)
        #cts_copy2 = copy(current_target_state)
        output_json[filename]=[]

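        # For every candidate peak, repeat the displacement and scale update on
        # a copy of the target state; only the argmax candidate
        # (to_deep_copy=True) is appended to reported_bboxs, the others are
        # only written to bboxes.json.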
        for (r_max,c_max,to_deep_copy) in centres:
          if to_deep_copy:
            cts_copy = deepcopy(current_target_state)
          else:
            cts_copy = copy(current_target_state)
          # Convert from crop-relative coordinates to frame coordinates
          p_coor = np.array([r_max, c_max])
          # displacement from the center in instance final representation ...
          disp_instance_final = p_coor - get_center(response_size)
          # ... in instance feature space ...
          upsample_factor = self.track_config['upsample_factor']
          disp_instance_feat = disp_instance_final / upsample_factor
          # ... Avoid empty position ...
          r_radius = int(response_size / upsample_factor / 2)
          disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius)
          # ... in instance input ...
          disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride']
          # ... in instance original crop (in frame coordinates)
          disp_instance_frame = disp_instance_input / search_scale_list[best_scale]
          # Position within frame in frame coordinates
          y = cts_copy.bbox.y
          x = cts_copy.bbox.x
          y += disp_instance_frame[0]
          x += disp_instance_frame[1]

          # Target scale damping and saturation
          target_scale = cts_copy.bbox.height / original_target_height
          search_factor = self.search_factors[best_scale]
          scale_damp = self.track_config['scale_damp']  # damping factor for scale update
          target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
          target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

          # Some book keeping
          height = original_target_height * target_scale
          width = original_target_width * target_scale
          
          cts_copy.bbox = Rectangle(x, y, width, height)
          cts_copy.scale_idx = best_scale
          cts_copy.search_pos = search_center + disp_instance_input

          assert 0 <= cts_copy.search_pos[0] < self.x_image_size, \
            'target position in feature space should be no larger than input image size'
          assert 0 <= cts_copy.search_pos[1] < self.x_image_size, \
            'target position in feature space should be no larger than input image size'

          if self.log_level > 0 and to_deep_copy:
            np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

            # Select the image crop at the best-scoring scale and convert it to uint8
            image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
            # Note that imwrite in cv2 expects a BGR image, while the cropped
            # image returned by TensorFlow is RGB, so we convert the color
            # format with cv2.cvtColor.
            imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                    cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

            np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale])
            np.save(osp.join(logdir, 'response{}.npy'.format(i)), response)

            y_search, x_search = cts_copy.search_pos
            search_scale = search_scale_list[best_scale]
            target_height_search = height * search_scale
            target_width_search = width * search_scale
            bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search)
            bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
            np.save(osp.join(logdir, 'bbox{}.npy'.format(i)),
                    [bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height])

          reported_bbox = convert_bbox_format(cts_copy.bbox, 'top-left-based')
          #print(f"reported bbox {reported_bbox}")
          if to_deep_copy:
            reported_bboxs.append(reported_bbox)
          else:
            rect_str = '{},{},{},{}\n'.format(reported_bbox.x + 1, reported_bbox.y + 1,
                                              reported_bbox.width, reported_bbox.height)
            arr = output_json[filename]
            arr.append(rect_str)


    
    # Dump the secondary candidate boxes collected for every frame.
    with open(osp.join(logdir, 'bboxes.json'), 'w') as f:
      json.dump(output_json, f, indent=4)
    return reported_bboxs