def prepare_data(self):
    # Parameter prepare
    dataset_dir = self.dataSet_dir
    input_dir = self.config['input_dir']
    output_dir = self.config['output_dir']
    crop_h = self.config['crop_h']
    crop_w = self.config['crop_w']
    threads = self.config['prefetch_threads']
    img_mean = get(self.config, 'img_mean', None)
    preprocess_name = get(self.config, 'preprocessing_name', None)
    random_scale = get(self.config, 'random_scale', False)
    random_mirror = get(self.config, 'random_mirror', True)
    batch_size = get(self.config, 'batch_size', 8)

    input_names = []
    output_names = []
    for file in os.listdir(osp.join(dataset_dir, input_dir)):
        cwd = os.getcwd()
        input_names.append(cwd + "/" + osp.join(dataset_dir, input_dir) + "/" + file)
    for file in os.listdir(osp.join(dataset_dir, output_dir)):
        cwd = os.getcwd()
        output_names.append(cwd + "/" + osp.join(dataset_dir, output_dir) + "/" + file)
    input_names.sort(), output_names.sort()

    dataset = tf.data.Dataset.from_tensor_slices((input_names, output_names))
    dataset = dataset.map(
        lambda x, y: _parse_function(x, y, img_mean, self.class_dict),
        num_parallel_calls=threads)

    logging.info('preprocess -- {}'.format(preprocess_name))
    if preprocess_name == 'augment':
        if random_mirror:
            dataset = dataset.map(_image_mirroring, num_parallel_calls=threads)
        if random_scale:
            dataset = dataset.map(_image_scaling, num_parallel_calls=threads)
        dataset = dataset.map(
            lambda x, y: _random_crop_and_pad_image_and_labels(x, y, crop_h, crop_w),
            num_parallel_calls=threads)
        dataset = dataset.map(
            lambda image, label: _apply_with_random_selector(
                image,
                lambda x, ordering: _distort_color(x, ordering, fast_mode=True),
                num_cases=4,
                label=label))

    dataset = dataset.map(lambda image, label: _check_size(image, label, crop_h, crop_w))
    dataset = dataset.shuffle(buffer_size=100)
    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat()
    self.dataset = dataset
def __init__(self, config, is_training):
    self.config = config
    self.is_training = is_training

    preprocess_name = get(config, 'preprocessing_name', None)
    logging.info('preprocess -- {}'.format(preprocess_name))

    if preprocess_name == 'siamese_fc_color':
        self.v_transform = None
        # TODO: use a single operation (tf.image.crop_and_resize) to achieve all transformations?
        self.z_transform = Compose([RandomStretch(),
                                    CenterCrop((255 - 8, 255 - 8)),
                                    RandomCrop(255 - 2 * 8),
                                    CenterCrop((127, 127))])
        self.x_transform = Compose([RandomStretch(),
                                    CenterCrop((255 - 8, 255 - 8)),
                                    RandomCrop(255 - 2 * 8), ])
    elif preprocess_name == 'siamese_fc_gray':
        self.v_transform = RandomGray()
        self.z_transform = Compose([RandomStretch(),
                                    CenterCrop((255 - 8, 255 - 8)),
                                    RandomCrop(255 - 2 * 8),
                                    CenterCrop((127, 127))])
        self.x_transform = Compose([RandomStretch(),
                                    CenterCrop((255 - 8, 255 - 8)),
                                    RandomCrop(255 - 2 * 8), ])
    elif preprocess_name == 'None':
        self.v_transform = None
        self.z_transform = CenterCrop((127, 127))
        self.x_transform = CenterCrop((255, 255))
    else:
        raise ValueError('Preprocessing name {} was not recognized.'.format(preprocess_name))

    self.dataset_py = VID(config['input_imdb'], config['max_frame_dist'])
    self.sampler = Sampler(self.dataset_py, shuffle=is_training)
def convolutional_alexnet_gn_arg_scope(embed_config, trainable=True, is_training=True):
    is_model_training = trainable and is_training

    if get(embed_config, 'use_gn', True):
        norm_params = {
            "trainable": trainable,
        }
        normalizer_fn = group_norm
    else:
        norm_params = {}
        normalizer_fn = None

    weight_decay = get(embed_config, 'weight_decay', 1e-4)
    if trainable:
        weights_regularizer = slim.l2_regularizer(weight_decay)
    else:
        weights_regularizer = None

    init_method = get(embed_config, 'init_method', 'kaiming_normal')
    if is_model_training:
        logging.info('embedding init method -- {}'.format(init_method))
    if init_method == 'kaiming_normal':
        # The same setting as siamese-fc
        initializer = slim.variance_scaling_initializer(factor=2.0, mode='FAN_OUT', uniform=False)
    else:
        initializer = slim.xavier_initializer()

    with slim.arg_scope([slim.conv2d],
                        weights_regularizer=weights_regularizer,
                        weights_initializer=initializer,
                        padding='VALID',
                        trainable=trainable,
                        activation_fn=tf.nn.relu,
                        normalizer_fn=normalizer_fn,
                        normalizer_params=norm_params):
        with slim.arg_scope([group_norm], **norm_params):
            with slim.arg_scope([group_norm]) as arg_sc:
                return arg_sc
def __init__(self, config, is_training):
    self.config = config
    self.is_training = is_training

    preprocess_name = get(config, 'preprocessing_name', None)
    logging.info('preprocess -- {}'.format(preprocess_name))

    if preprocess_name == 'siamese_fc_color':
        self.v_transform = None
        # TODO: use a single operation (tf.image.crop_and_resize) to achieve all transformations?
        self.z_transform = Compose([
            RandomStretch(),
            CenterCrop((255 - 8, 255 - 8)),
            RandomCrop(255 - 2 * 8),
            CenterCrop((127, 127))
        ])
        self.x_transform = Compose([
            RandomStretch(),
            CenterCrop((255 - 8, 255 - 8)),
            RandomCrop(255 - 2 * 8),
        ])
    elif preprocess_name == 'siamese_fc_gray':
        self.v_transform = RandomGray()
        self.z_transform = Compose([
            RandomStretch(),
            CenterCrop((255 - 8, 255 - 8)),  # embedding stride: 8; for robustness, should also be centered
            RandomCrop(255 - 2 * 8),
            CenterCrop((127, 127))
        ])
        self.x_transform = Compose([
            RandomStretch(),
            CenterCrop((255 - 8, 255 - 8)),
            RandomCrop(255 - 2 * 8),
        ])
    elif preprocess_name == 'None':
        self.v_transform = None
        self.z_transform = CenterCrop((127, 127))
        self.x_transform = CenterCrop((255, 255))
    else:
        raise ValueError('Preprocessing name {} was not recognized.'.format(preprocess_name))

    self.dataset_py = VID(config['input_imdb'], config['max_frame_dist'])
    self.sampler = Sampler(self.dataset_py, shuffle=is_training)
def sa_siam_arg_scope(embed_config, trainable=True, is_training=False):
    """Defines the default arg scope.

    Args:
      embed_config: A dictionary which contains configurations for the embedding function.
      trainable: If the weights in the embedding function are trainable.
      is_training: If the embedding function is built for training.

    Returns:
      An `arg_scope` to use for the SA-Siam models.
    """
    # Only consider the model to be in training mode if it's trainable.
    # This is vital for batch_norm since moving_mean and moving_variance
    # will get updated even if not trainable.
    is_model_training = trainable and is_training

    if get(embed_config, 'use_bn', True):
        batch_norm_scale = get(embed_config, 'bn_scale', True)
        batch_norm_decay = 1 - get(embed_config, 'bn_momentum', 3e-4)
        batch_norm_epsilon = get(embed_config, 'bn_epsilon', 1e-6)
        batch_norm_params = {
            "scale": batch_norm_scale,
            # Decay for the moving averages.
            "decay": batch_norm_decay,
            # Epsilon to prevent 0s in variance.
            "epsilon": batch_norm_epsilon,
            "trainable": trainable,
            "is_training": is_model_training,
            # Collection containing the moving mean and moving variance.
            "variables_collections": {
                "beta": None,
                "gamma": None,
                "moving_mean": ["moving_vars"],
                "moving_variance": ["moving_vars"],
            },
            'updates_collections': None,  # Ensure that updates are done within a frame
        }
        normalizer_fn = slim.batch_norm
    else:
        batch_norm_params = {}
        normalizer_fn = None

    weight_decay = get(embed_config, 'weight_decay', 5e-4)
    if trainable:
        weights_regularizer = slim.l2_regularizer(weight_decay)
    else:
        weights_regularizer = None

    init_method = get(embed_config, 'init_method', None)
    if is_model_training:
        logging.info('embedding init method -- {}'.format(init_method))
    if init_method == 'kaiming_normal':
        # The same setting as siamese-fc
        initializer = slim.variance_scaling_initializer(factor=2.0, mode='FAN_OUT', uniform=False)
    else:
        initializer = slim.xavier_initializer()

    with slim.arg_scope(
            [slim.conv2d],
            weights_regularizer=weights_regularizer,
            weights_initializer=initializer,
            padding='VALID',
            trainable=trainable,
            activation_fn=tf.nn.relu,
            normalizer_fn=normalizer_fn,
            normalizer_params=batch_norm_params):
        with slim.arg_scope([slim.batch_norm], **batch_norm_params):
            with slim.arg_scope([slim.batch_norm], is_training=is_model_training) as arg_sc:
                return arg_sc
def sa_siam(inputs, is_example, sa_siam_config={}, reuse=None, scope='sa_siam'):
    en_appearance = get(sa_siam_config, 'en_appearance', False)
    en_semantic = get(sa_siam_config, 'en_semantic', False)
    n_out = get(sa_siam_config, 'n_out', 256)
    all_combine_layers_appearance = get(sa_siam_config, 'all_combine_layers_appearance', {'conv5': 1.0})
    all_combine_layers_semantic = get(sa_siam_config, 'all_combine_layers_semantic', {'conv5': 1.0, 'conv4': 0.1})
    sz_conv5_z = get(sa_siam_config, 'sz_conv5_z', 6)
    en_semantic_att = get(sa_siam_config, 'en_semantic_att', True)

    with tf.variable_scope(scope, 'sa_siam', [inputs], reuse=reuse) as sc:
        end_points_collection = sc.name + '_end_points'
        with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                            outputs_collections=end_points_collection):

            def proc_raw_all_feat(feat, is_appearance, n_out_cur, all_combine_layers):
                res = []
                max_feat_size = 0
                for l in range(1, 6):
                    for k in all_combine_layers.keys():
                        if k.find(str(l)) != -1:
                            if shape_of(feat[l - 1])[3] is None:
                                res.append(feat[l - 1])
                                break
                            if l == 5 and is_appearance and abs(n_out_cur - shape_of(feat[l - 1])[3]) < 0.1:
                                res.append(feat[l - 1])
                            else:
                                if not is_appearance:
                                    feat[l - 1] *= all_combine_layers[k]  # Multiply by layer scale for convergence during training
                                with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None):
                                    c1x1 = slim.conv2d(feat[l - 1], n_out_cur, [1, 1], 1, scope='c1x1_' + k)
                                res.append(c1x1)
                            logging.info('Keep {} .. is_appearance={} shape={}'.format(
                                k, is_appearance, shape_of(res[-1])))
                return res

            def re_weight_crop(feat, all_combine_layers, only_crop=False):
                feat_shape = list(map(shape_of, feat))
                res = []
                for l in range(1, 6):  # proc layers from 1 to 5 in order
                    for k in all_combine_layers.keys():  # find the corresponding layer in all layers
                        if k.find(str(l)) != -1:
                            logging.info('For layer {} ...'.format(k))
                            cur_ly_idx = l - 1
                            if feat_shape[cur_ly_idx][2] is None and feat_shape[4][2] is None:
                                res.append(feat[cur_ly_idx])
                                break
                            pad_val = feat_shape[cur_ly_idx][2] - feat_shape[4][2]
                            sz_conv5_z_cur = pad_val + sz_conv5_z
                            sz_conv5_x_cur = feat_shape[cur_ly_idx][2]
                            n_left = int((sz_conv5_x_cur - sz_conv5_z_cur) / 2 + 0.5)
                            div_left_st = [0, n_left, n_left + sz_conv5_z_cur, sz_conv5_x_cur]
                            logging.info('.. Crop as {}'.format(div_left_st))
                            # crop 9 patches and max pool each patch
                            if not only_crop:
                                all_max = []
                                for j in [0, 1, 2]:
                                    for i in [0, 1, 2]:
                                        l_crop = div_left_st[i]
                                        r_crop = div_left_st[i + 1]
                                        u_crop = div_left_st[j]
                                        d_crop = div_left_st[j + 1]
                                        max_patch = tf.reduce_max(
                                            feat[cur_ly_idx][:, u_crop:d_crop, l_crop:r_crop, :],
                                            axis=[1, 2])  # shape = [n, c]
                                        all_max.append(max_patch)
                                max_map = tf.stack(all_max, axis=2)  # shape = [n, c, 9]
                                logging.info('.. Max_map.shape = {}'.format(max_map.shape))
                                # fully_connected is applied only to the last dim
                                max_map = slim.fully_connected(max_map, 9, scope='att_fc1_' + k)
                                logging.info('.. Max_map_fc1.shape = {}'.format(max_map.shape))
                                max_map = slim.fully_connected(max_map, 1, scope='att_fc2_' + k,
                                                               activation_fn=None, normalizer_fn=None)
                                logging.info('.. Max_map_fc2.shape = {}'.format(max_map.shape))  # shape = [n, c, 1]
                                att_map = tf.reshape(max_map, [-1, 1, 1, feat_shape[cur_ly_idx][3]])
                                logging.info('.. att_map.shape = {}'.format(att_map.shape))
                                att_map = tf.sigmoid(att_map) + 0.5  # important bias to avoid losing too much
                                feat[cur_ly_idx] = att_map * feat[cur_ly_idx]
                            # crop center feat
                            feat[cur_ly_idx] = feat[cur_ly_idx][:, div_left_st[1]:div_left_st[2],
                                                                div_left_st[1]:div_left_st[2], :]
                            res.append(feat[cur_ly_idx])
                            break
                    else:
                        res.append(None)
                return res

            layer_cur = inputs
            if en_appearance:
                n_out_appearance = n_out / len(all_combine_layers_appearance.keys())
                with tf.variable_scope('appearance_net'):
                    _, feat_appearance_all = appearance_net(layer_cur)
                    if is_example:
                        feat_appearance_all = re_weight_crop(feat_appearance_all,
                                                             all_combine_layers_appearance,
                                                             only_crop=True)
                    net_appearance = proc_raw_all_feat(feat_appearance_all,
                                                       is_appearance=True,
                                                       n_out_cur=n_out_appearance,
                                                       all_combine_layers=all_combine_layers_appearance)
            if en_semantic:
                n_out_semantic = n_out / len(all_combine_layers_semantic.keys())
                with tf.variable_scope('semantic_net'):
                    _, feat_semantic_all = semantic_net(layer_cur)
                    if is_example:
                        feat_semantic_all = re_weight_crop(feat_semantic_all,
                                                           all_combine_layers_semantic,
                                                           only_crop=not en_semantic_att)
                    net_semantic = proc_raw_all_feat(feat_semantic_all,
                                                     is_appearance=False,
                                                     n_out_cur=n_out_semantic,
                                                     all_combine_layers=all_combine_layers_semantic)

            if en_appearance and en_semantic:
                layer_cur = combine_sa_net(net_appearance, net_semantic)
            elif en_appearance:
                layer_cur = combine_sa_net(net_appearance, [])
            elif en_semantic:
                layer_cur = combine_sa_net(net_semantic, [])
            else:
                raise ValueError('At least one of the semantic or appearance branches must be enabled!')

            # Convert end_points_collection into a dictionary of end_points.
            end_points = slim.utils.convert_collection_to_dict(end_points_collection)
            return layer_cur, end_points
def track(self, sess, first_bbox, frames, logdir='/tmp'):
    """Runs tracking on a single image sequence."""
    # Get initial target bounding box and convert to center based
    bbox = convert_bbox_format(first_bbox, 'center-based')

    # Feed in the first frame image to set initial state.
    bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
    input_feed = [frames[0], bbox_feed]
    frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

    # Storing target state
    original_target_height = bbox.height
    original_target_width = bbox.width
    search_center = np.array([get_center(self.x_image_size),
                              get_center(self.x_image_size)])
    current_target_state = TargetState(bbox=bbox,
                                       search_pos=search_center,
                                       scale_idx=int(get_center(self.num_scales)))

    include_first = get(self.track_config, 'include_first', False)
    logging.info('Tracking include first -- {}'.format(include_first))

    # Run tracking loop
    reported_bboxs = []
    for i, filename in enumerate(frames):
        if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
            bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                         current_target_state.bbox.height, current_target_state.bbox.width]
            input_feed = [filename, bbox_feed]

            outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
            search_scale_list = outputs['scale_xs']
            response = outputs['response']
            response_size = response.shape[1]

            # Choose the scale whose response map has the highest peak
            if self.num_scales > 1:
                response_max = np.max(response, axis=(1, 2))
                penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales))
                current_scale_idx = int(get_center(self.num_scales))
                penalties[current_scale_idx] = 1.0
                response_penalized = response_max * penalties
                best_scale = np.argmax(response_penalized)
                if np.max(response_max) < 0:
                    logging.warning('MAX_RESPONSE LESS THAN ZERO!')
                    # best_scale = current_scale_idx
            else:
                best_scale = 0

            response = response[best_scale]

            with np.errstate(all='raise'):  # Raise error if something goes wrong
                response = response - np.min(response)
                response = response / np.sum(response)

            if self.window is None:
                window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                                np.expand_dims(np.hanning(response_size), 0))
                self.window = window / np.sum(window)  # normalize window
            window_influence = self.track_config['window_influence']
            response = (1 - window_influence) * response + window_influence * self.window

            # Find maximum response
            r_max, c_max = np.unravel_index(response.argmax(), response.shape)

            # Convert from crop-relative coordinates to frame coordinates
            p_coor = np.array([r_max, c_max])
            # displacement from the center in instance final representation ...
            disp_instance_final = p_coor - get_center(response_size)
            # ... in instance feature space ...
            upsample_factor = self.track_config['upsample_factor']
            disp_instance_feat = disp_instance_final / upsample_factor
            # ... Avoid empty position ...
            r_radius = int(response_size / upsample_factor / 2)
            disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius)
            # ... in instance input ...
            disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride']
            # ... in instance original crop (in frame coordinates)
            disp_instance_frame = disp_instance_input / search_scale_list[best_scale]

            # Position within frame in frame coordinates
            y = current_target_state.bbox.y
            x = current_target_state.bbox.x
            y += disp_instance_frame[0]
            x += disp_instance_frame[1]

            # Target scale damping and saturation
            target_scale = current_target_state.bbox.height / original_target_height
            search_factor = self.search_factors[best_scale]
            scale_damp = self.track_config['scale_damp']  # damping factor for scale update
            target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
            target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

            # Some bookkeeping
            height = original_target_height * target_scale
            width = original_target_width * target_scale
            current_target_state.bbox = Rectangle(x, y, width, height)
            current_target_state.scale_idx = best_scale
            current_target_state.search_pos = search_center + disp_instance_input

            assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
                'target position in feature space should be no larger than input image size'
            assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
                'target position in feature space should be no larger than input image size'

            if self.log_level > 0:
                np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

                # Select the image with the highest score scale and convert it to uint8
                image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
                # Note that imwrite in cv2 assumes the image is in BGR format.
                # However, the cropped image returned by TensorFlow is RGB.
                # Therefore, we convert color format using cv2.cvtColor
                imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                        cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale])
                np.save(osp.join(logdir, 'response{}.npy'.format(i)), response)

                y_search, x_search = current_target_state.search_pos
                search_scale = search_scale_list[best_scale]
                target_height_search = height * search_scale
                target_width_search = width * search_scale
                bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search)
                bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
                np.save(osp.join(logdir, 'bbox{}.npy'.format(i)),
                        [bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height])

        reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based')
        reported_bboxs.append(reported_bbox)
    return reported_bboxs
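# --- Usage sketch (not part of the tracker code above) ---
# A minimal sketch of driving a track() call like the one above under a TF1 session.
# The `tracker` object, glob pattern, and initialization step are illustrative
# assumptions, not names taken from this codebase.
import os.path as osp
from glob import glob

import tensorflow as tf

def run_sequence(tracker, sequence_dir, first_bbox, logdir='/tmp'):
    frames = sorted(glob(osp.join(sequence_dir, '*.jpg')))  # ordered frame paths
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())  # or restore a trained checkpoint instead
        # Returns one top-left-based Rectangle per frame.
        return tracker.track(sess, first_bbox, frames, logdir=logdir)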
def prepare_data(self):
    # Parameter prepare
    dataset_dir = self.dataSet_dir
    crop_h = self.config['crop_h']
    crop_w = self.config['crop_w']
    threads = self.config['prefetch_threads']
    img_mean = get(self.config, 'img_mean', None)
    preprocess_name = get(self.config, 'preprocessing_name', None)
    random_scale = get(self.config, 'random_scale', False)
    random_mirror = get(self.config, 'random_mirror', True)
    batch_size = get(self.config, 'batch_size', 8)
    batch_size //= get(self.config, 'num_gpus', 1)

    if self.data_name == "CamVid":
        input_dir = self.config['input_dir']
        output_dir = self.config['output_dir']
        input_names = []
        output_names = []
        for file in os.listdir(osp.join(dataset_dir, input_dir)):
            cwd = os.getcwd()
            input_names.append(cwd + "/" + osp.join(dataset_dir, input_dir) + "/" + file)
        for file in os.listdir(osp.join(dataset_dir, output_dir)):
            cwd = os.getcwd()
            output_names.append(cwd + "/" + osp.join(dataset_dir, output_dir) + "/" + file)
        input_names.sort(), output_names.sort()

        dataset = tf.data.Dataset.from_tensor_slices((input_names, output_names))
        dataset = dataset.map(
            lambda x, y: _parse_function(x, y, img_mean, self.class_dict),
            num_parallel_calls=threads)
    elif self.data_name == "MVD":
        split_name = self.config['split']
        file_pattern = os.path.join(dataset_dir, "%s-*" % split_name)
        tf_record_files = tf.gfile.Glob(file_pattern)
    elif self.data_name == "AVMP":
        split_path_map = dict({
            "train": "%s/train_db.txt" % dataset_dir,
            "valid": "%s/test_db.txt" % dataset_dir
        })
        split_name = self.config['split']
        image_file_list, label_file_list = [], []
        with open(split_path_map[split_name], 'rt') as f:
            for line in f:
                image_file, label_file = line.split(" ")
                image_file_list.append(dataset_dir + image_file.strip())
                label_file_list.append(dataset_dir + label_file.strip())
        dataset = tf.data.Dataset.from_tensor_slices((image_file_list, label_file_list))
        dataset = dataset.map(
            lambda x, y: _parse_function_avm(x, y, img_mean, self.class_dict),
            num_parallel_calls=threads)

    logging.info('preprocess -- {}'.format(preprocess_name))
    if preprocess_name == 'augment':
        if random_mirror:
            dataset = dataset.map(_image_mirroring, num_parallel_calls=threads)
        if random_scale:
            dataset = dataset.map(_image_scaling, num_parallel_calls=threads)
        dataset = dataset.map(
            lambda x, y: _random_crop_and_pad_image_and_labels(x, y, crop_h, crop_w),
            num_parallel_calls=threads)
        dataset = dataset.map(
            lambda image, label: _apply_with_random_selector(
                image,
                lambda x, ordering: _distort_color(x, ordering, fast_mode=True),
                num_cases=4,
                label=label))

    dataset = dataset.map(lambda image, label: _check_size(image, label, crop_h, crop_w))
    dataset = dataset.shuffle(buffer_size=100)
    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat()
    self.dataset = dataset
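# --- Usage sketch (not part of prepare_data above) ---
# A minimal sketch of draining the dataset built above with the TF1 tf.data API.
# `loader` is a hypothetical instance of the class that owns prepare_data().
loader.prepare_data()                                # builds loader.dataset
iterator = loader.dataset.make_one_shot_iterator()   # TF1-style one-shot iterator
images, labels = iterator.get_next()                 # batched image / label tensors

with tf.Session() as sess:
    for _ in range(10):                              # pull a few batches
        img_batch, label_batch = sess.run([images, labels])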
def track(self, sess, first_bbox, frames, logdir='/tmp'):
    """Runs tracking on a single image sequence."""
    # Get initial target bounding box and convert to center based
    bbox = convert_bbox_format(first_bbox, 'center-based')

    # Feed in the first frame image to set initial state.
    bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
    input_feed = [frames[0], bbox_feed]
    frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

    # Storing target state
    original_target_height = bbox.height
    original_target_width = bbox.width
    search_center = np.array([get_center(self.x_image_size),
                              get_center(self.x_image_size)])
    current_target_state = TargetState(bbox=bbox,
                                       search_pos=search_center,
                                       scale_idx=int(get_center(self.num_scales)))

    include_first = get(self.track_config, 'include_first', False)
    logging.info('Tracking include first -- {}'.format(include_first))

    # Run tracking loop
    reported_bboxs = []
    for i, filename in enumerate(frames):
        if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
            bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                         current_target_state.bbox.height, current_target_state.bbox.width]
            input_feed = [filename, bbox_feed]

            outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
            search_scale_list = outputs['scale_xs']
            response = outputs['response']
            response_size = response.shape[1]

            # Choose the scale whose response map has the highest peak
            if self.num_scales > 1:
                response_max = np.max(response, axis=(1, 2))
                penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales))
                current_scale_idx = int(get_center(self.num_scales))
                penalties[current_scale_idx] = 1.0
                response_penalized = response_max * penalties
                best_scale = np.argmax(response_penalized)
            else:
                best_scale = 0

            response = response[best_scale]

            with np.errstate(all='raise'):  # Raise error if something goes wrong
                response = response - np.min(response)
                response = response / np.sum(response)

            if self.window is None:
                window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                                np.expand_dims(np.hanning(response_size), 0))
                self.window = window / np.sum(window)  # normalize window
            window_influence = self.track_config['window_influence']
            response = (1 - window_influence) * response + window_influence * self.window

            # Find maximum response
            r_max, c_max = np.unravel_index(response.argmax(), response.shape)

            # Convert from crop-relative coordinates to frame coordinates
            p_coor = np.array([r_max, c_max])
            # displacement from the center in instance final representation ...
            disp_instance_final = p_coor - get_center(response_size)
            # ... in instance feature space ...
            upsample_factor = self.track_config['upsample_factor']
            disp_instance_feat = disp_instance_final / upsample_factor
            # ... Avoid empty position ...
            r_radius = int(response_size / upsample_factor / 2)
            disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius)
            # ... in instance input ...
            disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride']
            # ... in instance original crop (in frame coordinates)
            disp_instance_frame = disp_instance_input / search_scale_list[best_scale]

            # Position within frame in frame coordinates
            y = current_target_state.bbox.y
            x = current_target_state.bbox.x
            y += disp_instance_frame[0]
            x += disp_instance_frame[1]

            # Target scale damping and saturation
            target_scale = current_target_state.bbox.height / original_target_height
            search_factor = self.search_factors[best_scale]
            scale_damp = self.track_config['scale_damp']  # damping factor for scale update
            target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
            target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

            # Some bookkeeping
            height = original_target_height * target_scale
            width = original_target_width * target_scale
            current_target_state.bbox = Rectangle(x, y, width, height)
            current_target_state.scale_idx = best_scale
            current_target_state.search_pos = search_center + disp_instance_input

            assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
                'target position in feature space should be no larger than input image size'
            assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
                'target position in feature space should be no larger than input image size'

            if self.log_level > 0:
                np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

                # Select the image with the highest score scale and convert it to uint8
                image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
                # Note that imwrite in cv2 assumes the image is in BGR format.
                # However, the cropped image returned by TensorFlow is RGB.
                # Therefore, we convert color format using cv2.cvtColor
                imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                        cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale])
                np.save(osp.join(logdir, 'response{}.npy'.format(i)), response)

                y_search, x_search = current_target_state.search_pos
                search_scale = search_scale_list[best_scale]
                target_height_search = height * search_scale
                target_width_search = width * search_scale
                bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search)
                bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
                np.save(osp.join(logdir, 'bbox{}.npy'.format(i)),
                        [bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height])

        reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based')
        reported_bboxs.append(reported_bbox)
    return reported_bboxs
def convolutional_alexnet_arg_scope(embed_config, trainable=True, is_training=False):
    """Defines the default arg scope.

    Args:
      embed_config: A dictionary which contains configurations for the embedding function.
      trainable: If the weights in the embedding function are trainable.
      is_training: If the embedding function is built for training.

    Returns:
      An `arg_scope` to use for the convolutional_alexnet models.
    """
    # Only consider the model to be in training mode if it's trainable.
    # This is vital for batch_norm since moving_mean and moving_variance
    # will get updated even if not trainable.
    is_model_training = trainable and is_training

    if get(embed_config, 'use_bn', True):
        batch_norm_scale = get(embed_config, 'bn_scale', True)
        batch_norm_decay = 1 - get(embed_config, 'bn_momentum', 3e-4)
        batch_norm_epsilon = get(embed_config, 'bn_epsilon', 1e-6)
        batch_norm_params = {
            "scale": batch_norm_scale,
            # Decay for the moving averages.
            "decay": batch_norm_decay,
            # Epsilon to prevent 0s in variance.
            "epsilon": batch_norm_epsilon,
            "trainable": trainable,
            "is_training": is_model_training,
            # Collection containing the moving mean and moving variance.
            "variables_collections": {
                "beta": None,
                "gamma": None,
                "moving_mean": ["moving_vars"],
                "moving_variance": ["moving_vars"],
            },
            'updates_collections': None,  # Ensure that updates are done within a frame
        }
        normalizer_fn = slim.batch_norm
    else:
        batch_norm_params = {}
        normalizer_fn = None

    weight_decay = get(embed_config, 'weight_decay', 5e-4)
    if trainable:
        weights_regularizer = slim.l2_regularizer(weight_decay)
    else:
        weights_regularizer = None

    init_method = get(embed_config, 'init_method', 'kaiming_normal')
    if is_model_training:
        logging.info('embedding init method -- {}'.format(init_method))
    if init_method == 'kaiming_normal':
        # The same setting as siamese-fc
        initializer = slim.variance_scaling_initializer(factor=2.0, mode='FAN_OUT', uniform=False)
    else:
        initializer = slim.xavier_initializer()

    with slim.arg_scope(
            [slim.conv2d],
            weights_regularizer=weights_regularizer,
            weights_initializer=initializer,
            padding='VALID',
            trainable=trainable,
            activation_fn=tf.nn.relu,
            normalizer_fn=normalizer_fn,
            normalizer_params=batch_norm_params):
        with slim.arg_scope([slim.batch_norm], **batch_norm_params):
            with slim.arg_scope([slim.batch_norm], is_training=is_model_training) as arg_sc:
                return arg_sc
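# --- Usage sketch (not part of the arg scope above) ---
# The returned arg_scope only records defaults; it takes effect when the embedding network
# is built inside it. `convolutional_alexnet` is assumed to be the companion network builder
# in this codebase, and `images` / `embed_config` are placeholders for the caller's tensors
# and configuration.
arg_sc = convolutional_alexnet_arg_scope(embed_config, trainable=True, is_training=True)
with slim.arg_scope(arg_sc):
    # slim.conv2d / slim.batch_norm calls inside inherit the regularizer, initializer,
    # padding, and normalizer settings configured above.
    embedding, end_points = convolutional_alexnet(images, reuse=False)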
def build_detection(self):
    self.embeds = self.get_image_embedding(
        self.search_images,
        reuse=True,
        is_example=False,
        sa_siam_config=self.model_config['sa_siam_config'])

    with tf.variable_scope('detection'):
        def _get_mask_any(shape_mask, _u, _d, _l, _r):
            _mask = np.zeros(shape_mask, dtype='float32')
            _mask[_u:_d, _l:_r] = 1.0
            return _mask

        def _get_center_mask(shape_mask, _sz):
            # mask center a _sz x _sz patch
            _u = int((shape_mask[0] - _sz) / 2)
            _d = _u + _sz
            _l = int((shape_mask[1] - _sz) / 2)
            _r = _l + _sz
            return _get_mask_any(shape_mask, _u, _d, _l, _r)

        def _translation_match(x, z, mask_center=np.array([[1.0]], dtype='float32')):
            x = tf.expand_dims(x, 0)   # [batch, in_height, in_width, in_channels]
            z = tf.expand_dims(z, -1)  # [filter_height, filter_width, in_channels, out_channels]
            mask_center = tf.expand_dims(mask_center, -1)
            mask_center = tf.expand_dims(mask_center, -1)
            return tf.nn.conv2d(x, z * mask_center, strides=[1, 1, 1, 1],
                                padding='VALID', name='translation_match')

        logging.info('Shape of templates: {}'.format(self.templates.shape))
        logging.info('Shape of embeds: {}'.format(self.embeds.shape))

        en_appearance = get(self.model_config['sa_siam_config'], 'en_appearance', False)
        en_semantic = get(self.model_config['sa_siam_config'], 'en_semantic', False)
        if en_appearance and en_semantic:
            c_appearance = get(self.model_config['sa_siam_config'], 'c_appearance', 0.3)
            out_scale = self.model_config['adjust_response_config']['scale']
            temp_appearance, temp_semantic = tf.split(self.templates, 2, 3)
            inst_appearance, inst_semantic = tf.split(self.embeds, 2, 3)
            bias_semantic = tf.get_variable('biases_semantic', [1],
                                            dtype=tf.float32,
                                            initializer=tf.constant_initializer(0.0, dtype=tf.float32),
                                            trainable=False)
            bias_appearance = tf.get_variable('biases_appearance', [1],
                                              dtype=tf.float32,
                                              initializer=tf.constant_initializer(0.0, dtype=tf.float32),
                                              trainable=False)
            sz_feat = shape_of(temp_appearance)[1:3]  # [h, w]
            self.mask_all = {'keep_all': 1 - _get_center_mask(sz_feat, 0)}
            self.response_all = {}
            for k in sorted(self.mask_all.keys()):
                logging.info('Make match: {}'.format(k))
                match_k = lambda x: _translation_match(x[0], x[1], mask_center=self.mask_all[k])
                out_appearance_mask_k = tf.map_fn(match_k,
                                                  (inst_appearance, temp_appearance),
                                                  dtype=inst_appearance.dtype)
                out_semantic_mask_k = tf.map_fn(match_k,
                                                (inst_semantic, temp_semantic),
                                                dtype=inst_semantic.dtype)
                out_appearance_mask_k = tf.squeeze(out_appearance_mask_k, [1, 4])
                out_semantic_mask_k = tf.squeeze(out_semantic_mask_k, [1, 4])
                response_appearance_mask_k = out_scale * out_appearance_mask_k
                response_semantic_mask_k = out_scale * out_semantic_mask_k
                self.response_all[k] = (
                    (response_appearance_mask_k + bias_appearance) * c_appearance +
                    (response_semantic_mask_k + bias_semantic) * (1 - c_appearance))
            response = self.response_all['keep_all']
        else:
            output = tf.map_fn(
                lambda x: _translation_match(x[0], x[1]),
                (self.embeds, self.templates),
                dtype=self.embeds.dtype)  # of shape [16, 1, 17, 17, 1]
            output = tf.squeeze(output, [1, 4])  # of shape e.g. [16, 17, 17]
            bias = tf.get_variable('biases', [1],
                                   dtype=tf.float32,
                                   initializer=tf.constant_initializer(0.0, dtype=tf.float32),
                                   trainable=False)
            response = (self.model_config['adjust_response_config']['scale'] * output + bias)
        self.response = response
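# --- Illustration (not part of build_detection above) ---
# The core of build_detection is _translation_match: the exemplar embedding is used as a
# convolution filter slid over the search embedding (SiamFC-style cross-correlation).
# The shapes below are illustrative assumptions, not values taken from the config.
import tensorflow as tf

search = tf.random_normal([1, 22, 22, 256])   # search embedding   [batch, h, w, c]
exemplar = tf.random_normal([6, 6, 256])      # exemplar embedding [h, w, c]

score_map = tf.nn.conv2d(search,
                         tf.expand_dims(exemplar, -1),  # filter: [6, 6, 256, 1]
                         strides=[1, 1, 1, 1],
                         padding='VALID')               # response map: [1, 17, 17, 1]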
def track(self, sess, first_bbox, frames, logdir='/tmp'):
    """Runs tracking on a single image sequence."""
    # Get initial target bounding box and convert to center based
    bbox = convert_bbox_format(first_bbox, 'center-based')
    smooth_rate = self.track_config['smooth']
    update_interval = self.track_config['update_interval']
    feature_balance = self.track_config['feature_balance']

    # Feed in the first frame image to set initial state.
    bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
    input_feed = [frames[0], bbox_feed]
    frame2crop_scale = self.siamese_model.initialize(sess, input_feed)
    examplar = self.siamese_model.get_examplar(sess, input_feed)
    examplar_smooth = examplar
    st_template = []
    for i in range(self.siamese_model.train_config['time_range']):
        st_template.append(examplar)
    st_template_np = np.array(st_template)
    self.siamese_model.update_st_template_step(sess, st_template_np)

    # Storing target state
    original_target_height = bbox.height
    original_target_width = bbox.width
    search_center = np.array([get_center(self.x_image_size),
                              get_center(self.x_image_size)])
    current_target_state = TargetState(bbox=bbox,
                                       search_pos=search_center,
                                       scale_idx=int(get_center(self.num_scales)))

    include_first = get(self.track_config, 'include_first', False)
    logging.info('Tracking include first -- {}'.format(include_first))

    # Set padding for refining search region
    img = mpimg.imread(frames[0])
    context_amount = self.track_config['context_amount']
    size_z = self.model_config['z_image_size']
    size_x = self.track_config['x_image_size']
    padding_h = 10
    padding_w = 10
    if original_target_height / original_target_width > 2:  # 2
        padding_h = 1.4  # 1.4
        padding_w = 6

    # Run tracking loop
    reported_bboxs = []
    for i, filename in enumerate(frames):
        if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
            bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                         current_target_state.bbox.height, current_target_state.bbox.width]
            input_feed = [filename, bbox_feed]

            outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
            search_scale_list = outputs['scale_xs']
            response = outputs['response']
            response2 = outputs['response2']
            response_size = response.shape[1]

            # Choose the scale whose response map has the highest peak
            if self.num_scales > 1:
                response_max = np.max(response2, axis=(1, 2))
                penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales))
                current_scale_idx = int(get_center(self.num_scales))
                penalties[current_scale_idx] = 1.0
                response_penalized = response_max * penalties
                best_scale = np.argmax(response_penalized)
            else:
                best_scale = 0

            response = response[best_scale]
            response2 = response2[best_scale]
            response = feature_balance * response + (1 - feature_balance) * response2

            with np.errstate(all='raise'):  # Raise error if something goes wrong
                response = response - np.min(response)
                response = response / np.sum(response)

            if self.window is None:
                window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                                np.expand_dims(np.hanning(response_size), 0))
                self.window = window / np.sum(window)  # normalize window
            window_influence = self.track_config['window_influence']
            response = (1 - window_influence) * response + window_influence * self.window

            # Refine the response
            base_z_size = np.array([current_target_state.bbox.height,
                                    current_target_state.bbox.width])
            base_z_context_size = base_z_size + context_amount * np.sum(base_z_size)
            base_s_z = np.sqrt(np.prod(base_z_context_size))  # Canonical size
            base_scale_z = size_z / base_s_z
            d_search = (size_x - size_z) / 2.0
            base_pad = d_search / base_scale_z
            base_s_x = base_s_z + 2 * base_pad
            if base_s_x / current_target_state.bbox.height > padding_h:
                start_h = np.ceil(response_size *
                                  (base_s_x - current_target_state.bbox.height * padding_h) /
                                  (2 * base_s_x))
                end_h = np.floor(response_size - start_h)
                start_h = int(start_h)
                end_h = int(end_h)
                response[0:start_h, :] = 0
                response[end_h:-1, :] = 0
            if base_s_x / current_target_state.bbox.width > padding_w:
                start_w = np.ceil(response_size *
                                  (base_s_x - current_target_state.bbox.width * padding_w) /
                                  (2 * base_s_x))
                end_w = np.floor(response_size - start_w)
                start_w = int(start_w)
                end_w = int(end_w)
                response[:, :start_w] = 0
                response[:, end_w:] = 0

            # Find maximum response
            r_max, c_max = np.unravel_index(response.argmax(), response.shape)

            # Convert from crop-relative coordinates to frame coordinates
            p_coor = np.array([r_max, c_max])
            # displacement from the center in instance final representation ...
            disp_instance_final = p_coor - get_center(response_size)
            # ... in instance feature space ...
            upsample_factor = self.track_config['upsample_factor']
            disp_instance_feat = disp_instance_final / upsample_factor
            # ... Avoid empty position ...
            r_radius = int(response_size / upsample_factor / 2)
            disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius)
            # ... in instance input ...
            disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride']
            # ... in instance original crop (in frame coordinates)
            disp_instance_frame = disp_instance_input / search_scale_list[best_scale]

            # Position within frame in frame coordinates
            y = current_target_state.bbox.y
            x = current_target_state.bbox.x
            y += disp_instance_frame[0]
            x += disp_instance_frame[1]

            # Target scale damping and saturation
            target_scale = current_target_state.bbox.height / original_target_height
            search_factor = self.search_factors[best_scale]
            scale_damp = self.track_config['scale_damp']  # damping factor for scale update
            target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
            target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

            # Some bookkeeping
            height = original_target_height * target_scale
            width = original_target_width * target_scale
            current_target_state.bbox = Rectangle(x, y, width, height)
            current_target_state.scale_idx = best_scale
            current_target_state.search_pos = search_center + disp_instance_input

            # Update the spatial-temporal template using gcn
            if i % update_interval == 0:
                bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                             current_target_state.bbox.height, current_target_state.bbox.width]
                input_feed = [filename, bbox_feed]
                current_examplar = self.siamese_model.get_examplar(sess, input_feed)
                # examplar_smooth[2:4, 2:4, :] = current_examplar[2:4, 2:4, :]
                examplar_smooth = current_examplar
                current_examplar = smooth_rate * examplar_smooth + (1 - smooth_rate) * examplar
                st_template.pop(1)
                st_template.append(current_examplar)
                st_template_np = np.array(st_template)
                self.siamese_model.update_st_template_step(sess, st_template_np)

            assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
                'target position in feature space should be no larger than input image size'
            assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
                'target position in feature space should be no larger than input image size'

            if self.log_level > 0:
                np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

                # Select the image with the highest score scale and convert it to uint8
                image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
                # Note that imwrite in cv2 assumes the image is in BGR format.
                # However, the cropped image returned by TensorFlow is RGB.
                # Therefore, we convert color format using cv2.cvtColor
                imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                        cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale])
                np.save(osp.join(logdir, 'response{}.npy'.format(i)), response)

                y_search, x_search = current_target_state.search_pos
                search_scale = search_scale_list[best_scale]
                target_height_search = height * search_scale
                target_width_search = width * search_scale
                bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search)
                bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
                np.save(osp.join(logdir, 'bbox{}.npy'.format(i)),
                        [bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height])

        reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based')
        reported_bboxs.append(reported_bbox)
    return reported_bboxs
def track(self, sess, first_bbox, frames, logdir='/tmp'):
    """Runs tracking on a single image sequence."""
    # Get initial target bounding box and convert to center based
    bbox = convert_bbox_format(first_bbox, 'center-based')

    # Feed in the first frame image to set initial state.
    bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
    input_feed = [frames[0], bbox_feed, self.x_image_size, self.search_factors]
    frame2crop_scale, image_z = self.siamese_model.initialize(sess, input_feed)
    imwrite(osp.join(logdir, 'aimagez.jpg'),
            cv2.cvtColor(image_z, cv2.COLOR_RGB2BGR))

    # Storing target state
    original_target_height = bbox.height
    original_target_width = bbox.width
    search_center = np.array([get_center(self.x_image_size),
                              get_center(self.x_image_size)])
    current_target_state = TargetState(bbox=bbox,
                                       search_pos=search_center,
                                       scale_idx=int(get_center(self.num_scales)))

    include_first = get(self.track_config, 'include_first', False)
    logging.info('Tracking include first -- {}'.format(include_first))

    # Run tracking loop
    reported_bboxs = []
    image_c = None
    x_image_size = self.x_image_size
    lost = 0
    moved2border = False
    conf_thresh = 0.2    # 0.2
    bound_thresh = 0.2   # 0.2
    sup_thresh = 0.15    # 0.15
    prev_score = conf_thresh + 0.01
    upsample_factor = self.track_config['upsample_factor']
    search_factors = self.search_factors
    for i, filename in enumerate(frames):
        if i > 0 or include_first:
            bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                         current_target_state.bbox.height, current_target_state.bbox.width]
            if prev_score > bound_thresh:
                lost = 0
            else:
                lost += 1
            if prev_score > 0.9:
                self.siamese_model.update(
                    sess, [frames[i - 1], bbox_feed, self.x_image_size, search_factors])
            with open(filename, 'rb') as f:
                wi, hi = GetWidthAndHeight(f)
            t_i_ratio = max([current_target_state.bbox.height / hi,
                             current_target_state.bbox.width / wi])
            if prev_score < conf_thresh:
                x_image_size += 100
                # x_image_size = min(x_image_size, ((1. - t_i_ratio) * 1.6 + 1.) * self.x_image_size_init)
                if t_i_ratio < 0.05:
                    x_image_size = min(x_image_size, 555)
                elif t_i_ratio < 0.25:
                    x_image_size = min(x_image_size, 455)
                elif t_i_ratio > 0.5:
                    x_image_size = min(x_image_size, 255)
                else:
                    x_image_size = min(x_image_size, 355)
            else:
                x_image_size = self.x_image_size

            if i > 1:
                top = (current_target_state.bbox.y - (current_target_state.bbox.height / 2) < 10)
                left = (current_target_state.bbox.x - (current_target_state.bbox.width / 2) < 10)
                bottom = (current_target_state.bbox.y + (current_target_state.bbox.height / 2) > hi - 10)
                right = (current_target_state.bbox.x + (current_target_state.bbox.width / 2) > wi - 10)
                bound_flag = top or left or bottom or right
                # if top or left or bottom or right:
                #     if not prev_score < bound_thresh:
                #         moved2border = True
                #     if not moved2border:
                #         current_target_state.bbox = Rectangle(wi / 2, hi / 2,
                #                                               current_target_state.bbox.width,
                #                                               current_target_state.bbox.height)
                #         bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                #                      current_target_state.bbox.height, current_target_state.bbox.width]
                # else:
                #     if not prev_score < bound_thresh:
                #         moved2border = False
                if lost > 5 and bound_flag:
                    lost = 0
                    diffy = hi * 0.5 - bbox_feed[0]
                    diffx = wi * 0.5 - bbox_feed[1]
                    bbox_feed = [diffy * 0.25 + bbox_feed[0], diffx * 0.25 + bbox_feed[1],
                                 bbox_feed[2], bbox_feed[3]]
                    current_target_state.bbox = Rectangle(bbox_feed[1], bbox_feed[0],
                                                          bbox_feed[3], bbox_feed[2])

            input_feed = [filename, bbox_feed, x_image_size, search_factors]
            outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
            search_scale_list = outputs['scale_xs']
            response = outputs['response']
            response_size = response.shape[1]
            re_out = np.around(1 / (1 + np.exp(-response)), 2)
            if np.max(re_out) < conf_thresh and not t_i_ratio > 0.5:
                x_image_sizeb4 = x_image_size
                x_image_size += 100
                # x_image_size_l = ((1. - t_i_ratio) * 1.6 + 1.) * self.x_image_size_init
                if t_i_ratio < 0.05:
                    x_image_size_l = 555
                elif t_i_ratio < 0.25:
                    x_image_size_l = 455
                elif t_i_ratio > 0.5:
                    x_image_size_l = 255
                else:
                    x_image_size_l = 355
                if not x_image_size > x_image_size_l:
                    input_feed = [filename, bbox_feed, x_image_size, search_factors]
                    outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
                    search_scale_list = outputs['scale_xs']
                    response = outputs['response']
                    response_size = response.shape[1]
                    re_out = np.around(1 / (1 + np.exp(-response)), 2)
                else:
                    x_image_size = x_image_sizeb4

            # Choose the scale whose response map has the highest peak
            if self.num_scales > 1:
                response_max = np.max(response * (re_out > sup_thresh), axis=(1, 2))
                penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales))
                current_scale_idx = int(get_center(self.num_scales))
                penalties[current_scale_idx] = 1.0
                response_penalized = response_max * penalties
                if max(response_penalized) == 0.:
                    best_scale = 1
                else:
                    best_scale = np.argmax(response_penalized)
            else:
                best_scale = 0

            response = response[best_scale]
            re_out = re_out[best_scale]

            with np.errstate(all='raise'):  # Raise error if something goes wrong
                response = response - np.min(response)
                response = response / np.sum(response)

            response = response * (re_out > sup_thresh)
            window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                            np.expand_dims(np.hanning(response_size), 0))
            self.window = window / np.sum(window)  # normalize window
            window_influence = self.track_config['window_influence']
            response = (1 - window_influence) * response + window_influence * self.window

            if np.max(re_out) < sup_thresh:
                r_max, c_max = response.shape
                r_max, c_max = int(r_max / 2), int(c_max / 2)
                disp_instance_input = [0, 0]
                disp_instance_frame = [0, 0]
            else:
                # Find maximum response
                r_max, c_max = np.unravel_index(response.argmax(), response.shape)

                # Convert from crop-relative coordinates to frame coordinates
                p_coor = np.array([r_max, c_max])
                # displacement from the center in instance final representation ...
                disp_instance_final = p_coor - get_center(response_size)
                # ... in instance feature space ...
                disp_instance_feat = disp_instance_final / upsample_factor
                # ... Avoid empty position ...
                r_radius = int(response_size / upsample_factor / 2)
                disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius)
                # ... in instance input ...
                disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride']
                # ... in instance original crop (in frame coordinates)
                disp_instance_frame = disp_instance_input / search_scale_list[best_scale]

            # Position within frame in frame coordinates
            y = current_target_state.bbox.y
            x = current_target_state.bbox.x
            y += disp_instance_frame[0]
            x += disp_instance_frame[1]
            y = np.round(y)
            x = np.round(x)
            prev_score = re_out[r_max, c_max]

            # Target scale damping and saturation
            target_scale = current_target_state.bbox.height / original_target_height
            search_factor = search_factors[best_scale]
            scale_damp = self.track_config['scale_damp']  # damping factor for scale update
            target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)

            # Some bookkeeping
            search_center = np.array([get_center(x_image_size), get_center(x_image_size)])
            height = original_target_height * target_scale
            width = original_target_width * target_scale
            current_target_state.bbox = Rectangle(x, y, width, height)
            current_target_state.scale_idx = best_scale
            current_target_state.search_pos = search_center + disp_instance_input

            assert 0 <= current_target_state.search_pos[0] < x_image_size, \
                'target position in feature space should be no larger than input image size'
            assert 0 <= current_target_state.search_pos[1] < x_image_size, \
                'target position in feature space should be no larger than input image size'

            if self.log_level > 0:
                # Select the image with the highest score scale and convert it to uint8
                image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
                y_search, x_search = current_target_state.search_pos
                search_scale = search_scale_list[best_scale]
                target_height_search = height * search_scale
                target_width_search = width * search_scale
                bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search)
                bbox_search = convert_bbox_format(bbox_search, 'top-left-based')

                # Add score colormap
                image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
                # im_shape = image_cropped.shape
                # re_shape = response_size / upsample_factor * self.model_config['embed_config']['stride']
                # pad = int((im_shape[0] - re_shape) / 2)
                # response_crop = imresize(re_out, [im_shape[0] - 2 * pad, im_shape[1] - 2 * pad])
                # response_crop = np.pad(response_crop, ((pad, pad), (pad, pad)), 'constant')
                # response_crop = response_crop / response_crop.max()
                # response_crop = np.uint8(response_crop * 255)
                # cmap = cv2.cvtColor(cv2.applyColorMap(response_crop, cv2.COLORMAP_JET), cv2.COLOR_BGR2RGB)
                # image_cropped = cv2.addWeighted(cmap, 0.3, image_cropped, 0.5, 0)
                xmin = bbox_search.x.astype(np.int32)
                ymin = bbox_search.y.astype(np.int32)
                xmax = xmin + bbox_search.width.astype(np.int32)
                ymax = ymin + bbox_search.height.astype(np.int32)
                cv2.rectangle(image_cropped, (xmin, ymin), (xmax, ymax), (255, 0, 0), 2)
                text = str(prev_score)
                cv2.putText(image_cropped, text, (xmin, ymin),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 0, 0),
                            lineType=cv2.LINE_AA)
                imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                        cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))
                # if image_c is not None:
                #     his_dir = logdir + '_his'
                #     if not osp.exists(his_dir):
                #         os.mkdir(his_dir)
                #     image_c_p = np.concatenate([np.expand_dims(image_z, 0)] + image_c, 2)[0]
                #     image_c_p = np.uint8(image_c_p)
                #     imwrite(osp.join(his_dir, 'image{}.jpg'.format(i)),
                #             cv2.cvtColor(image_c_p, cv2.COLOR_RGB2BGR))

        reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based')
        reported_bboxs.append(reported_bbox)
    return reported_bboxs
def track(self, sess, first_bbox, frames, logdir='/tmp'):
    """Runs tracking on a single image sequence."""
    # Get initial target bounding box and convert to center based
    bbox = convert_bbox_format(first_bbox, 'center-based')
    print(frames)

    # Feed in the first frame image to set initial state.
    bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
    input_feed = [frames[0], bbox_feed]
    frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

    # Storing target state
    original_target_height = bbox.height
    original_target_width = bbox.width
    search_center = np.array([get_center(self.x_image_size),
                              get_center(self.x_image_size)])
    current_target_state = TargetState(bbox=bbox,
                                       search_pos=search_center,
                                       scale_idx=int(get_center(self.num_scales)))

    include_first = get(self.track_config, 'include_first', False)
    logging.info('Tracking include first -- {}'.format(include_first))

    # Run tracking loop
    reported_bboxs = []
    output_json = {}  # dump all bboxes into this output file
    for i, filename in enumerate(frames):
        if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
            bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                         current_target_state.bbox.height, current_target_state.bbox.width]
            input_feed = [filename, bbox_feed]

            outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
            search_scale_list = outputs['scale_xs']
            response = outputs['response']
            response_size = response.shape[1]

            # Choose the scale whose response map has the highest peak
            if self.num_scales > 1:
                response_max = np.max(response, axis=(1, 2))
                penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales))
                current_scale_idx = int(get_center(self.num_scales))
                penalties[current_scale_idx] = 1.0
                response_penalized = response_max * penalties
                best_scale = np.argmax(response_penalized)
            else:
                best_scale = 0

            response = response[best_scale]
            # print(response)
            with np.errstate(all='raise'):  # Raise error if something goes wrong
                response = response - np.min(response)
                response = response / np.sum(response)

            if self.window is None:
                window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                                np.expand_dims(np.hanning(response_size), 0))
                self.window = window / np.sum(window)  # normalize window
            window_influence = self.track_config['window_influence']
            response = (1 - window_influence) * response + window_influence * self.window

            # Find maximum response
            srtd = response.argsort(axis=None)
            v = response.argmax()
            r_max, c_max = np.unravel_index(v, response.shape)

            if not osp.exists(osp.join(logdir, "Intermediate")):
                os.mkdir(osp.join(logdir, "Intermediate"))
            to_save = np.interp(response, (response.min(), response.max()), (0, 255))
            cv2.imwrite(osp.join(logdir, "Intermediate", f"response_{i}.png"), to_save)
            to_save = to_save.reshape(to_save.shape[0], to_save.shape[1], 1)
            ret, thresh1 = cv2.threshold(to_save, 185, 255, cv2.THRESH_BINARY)
            cv2.imwrite(osp.join(logdir, "Intermediate", f"response_{i}_thresh.png"), thresh1)
            image = np.uint8(thresh1.copy())
            cnts = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            cnts = imutils.grab_contours(cnts)
            backtorgb = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
            image = cv2.drawContours(backtorgb, cnts, -1, (0, 255, 0), 2)
            cv2.imwrite(osp.join(logdir, "Intermediate", f"response_{i}_cntrs.png"), image)

            centres = []
            for c in cnts:
                M = cv2.moments(c)
                cX = int(M["m10"] / M["m00"])
                cY = int(M["m01"] / M["m00"])
                centres.append((cY, cX, False))
            centres.append((r_max, c_max, True))
            # print(centres)
            # cts_copy = copy(current_target_state)
            # cts_copy2 = copy(current_target_state)
            output_json[filename] = []
            for (r_max, c_max, to_deep_copy) in centres:
                if to_deep_copy:
                    cts_copy = deepcopy(current_target_state)
                else:
                    cts_copy = copy(current_target_state)

                # Convert from crop-relative coordinates to frame coordinates
                p_coor = np.array([r_max, c_max])
                # displacement from the center in instance final representation ...
                disp_instance_final = p_coor - get_center(response_size)
                # ... in instance feature space ...
                upsample_factor = self.track_config['upsample_factor']
                disp_instance_feat = disp_instance_final / upsample_factor
                # ... Avoid empty position ...
                r_radius = int(response_size / upsample_factor / 2)
                disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius)
                # ... in instance input ...
                disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride']
                # ... in instance original crop (in frame coordinates)
                disp_instance_frame = disp_instance_input / search_scale_list[best_scale]

                # Position within frame in frame coordinates
                y = cts_copy.bbox.y
                x = cts_copy.bbox.x
                y += disp_instance_frame[0]
                x += disp_instance_frame[1]

                # Target scale damping and saturation
                target_scale = cts_copy.bbox.height / original_target_height
                search_factor = self.search_factors[best_scale]
                scale_damp = self.track_config['scale_damp']  # damping factor for scale update
                target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
                target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

                # Some bookkeeping
                height = original_target_height * target_scale
                width = original_target_width * target_scale
                cts_copy.bbox = Rectangle(x, y, width, height)
                cts_copy.scale_idx = best_scale
                cts_copy.search_pos = search_center + disp_instance_input

                assert 0 <= cts_copy.search_pos[0] < self.x_image_size, \
                    'target position in feature space should be no larger than input image size'
                assert 0 <= cts_copy.search_pos[1] < self.x_image_size, \
                    'target position in feature space should be no larger than input image size'

                if self.log_level > 0 and to_deep_copy:
                    np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

                    # Select the image with the highest score scale and convert it to uint8
                    image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
                    # Note that imwrite in cv2 assumes the image is in BGR format.
                    # However, the cropped image returned by TensorFlow is RGB.
                    # Therefore, we convert color format using cv2.cvtColor
                    imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                            cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                    np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale])
                    np.save(osp.join(logdir, 'response{}.npy'.format(i)), response)

                    y_search, x_search = cts_copy.search_pos
                    search_scale = search_scale_list[best_scale]
                    target_height_search = height * search_scale
                    target_width_search = width * search_scale
                    bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search)
                    bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
                    np.save(osp.join(logdir, 'bbox{}.npy'.format(i)),
                            [bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height])

                reported_bbox = convert_bbox_format(cts_copy.bbox, 'top-left-based')
                # print(f"reported bbox {reported_bbox}")
                if to_deep_copy:
                    reported_bboxs.append(reported_bbox)
                else:
                    rect_str = '{},{},{},{}\n'.format(reported_bbox.x + 1, reported_bbox.y + 1,
                                                      reported_bbox.width, reported_bbox.height)
                    arr = output_json[filename]
                    arr.append(rect_str)

    with open(osp.join(logdir, 'bboxes.json'), 'w') as f:
        json.dump(output_json, f, indent=4)
    return reported_bboxs
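# --- Output format illustration (not part of track() above) ---
# Sketch of the bboxes.json structure written at the end of track(): each frame path maps
# to the extra candidate rectangles found via contour centres, each formatted as an
# "x,y,w,h\n" string. The paths and numbers below are made up for illustration.
#
# {
#     "/data/seq01/0002.jpg": [
#         "153.0,88.0,40.0,62.0\n",
#         "201.0,95.0,40.0,62.0\n"
#     ],
#     "/data/seq01/0003.jpg": []
# }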