def run_SiamRPN_OPF(seq, rp, bSaveImage):
    os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu()
    config_name = "SiamRPN_ftall"
    CHECKPOINT = '/home/lab-xiong.jiangfeng/Projects/SiameseRPN/Logs/%s/track_model_checkpoints/%s' % (
        config_name, config_name)
    logging.info('Evaluating {}...'.format(CHECKPOINT))

    # Read configurations from json
    model_config, _, track_config = load_cfgs(CHECKPOINT)
    track_config['log_level'] = 0  # Skip verbose logging for speed

    np.random.seed(1234)
    tf.set_random_seed(1234)

    g = tf.Graph()
    with g.as_default():
        model = get_model(model_config['Model'])(model_config=model_config, mode='inference')
        model.build(reuse=tf.AUTO_REUSE)
        model.online_net = OnlineNet(online_config, is_training=True, reuse=False)
        model.online_valnet = OnlineNet(online_config, is_training=False, reuse=True)
        global_variables_init_op = tf.global_variables_initializer()

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)
    sess_config.gpu_options.per_process_gpu_memory_fraction = 0.2

    with tf.Session(graph=g, config=sess_config) as sess:
        sess.run(global_variables_init_op)
        model.restore_weights_from_checkpoint(sess, 605000)
        tracker = OnlineTracker(sess, model, track_config, online_config, show_video=0)

        tic = time.clock()
        frames = seq.s_frames
        init_rect = seq.init_rect
        x, y, width, height = init_rect  # OTB format
        init_bb = Rectangle(x - 1, y - 1, width, height)
        trajectory_py = tracker.track(init_bb, frames, bSaveImage, rp)
        # print(trajectory_py)
        trajectory = [
            Rectangle(val.x + 1, val.y + 1, val.width, val.height)
            for val in trajectory_py
        ]  # x, y add one to match OTB format
        duration = time.clock() - tic

        result = dict()
        result['res'] = trajectory
        result['type'] = 'rect'
        result['fps'] = round(seq.len / duration, 3)
        return result
def run_SA_Siam(seq, rp, bSaveImage, epoch=30):
    iter_ckpt = epoch * 6650 - 1
    checkpoint_appearance_path = CHECKPOINT_APPEARANCE.format(iter_ckpt=iter_ckpt)
    logging.info('Evaluating {}...'.format(checkpoint_appearance_path))
    checkpoint_semantic_path = CHECKPOINT_SEMANTIC.format(iter_ckpt=iter_ckpt)
    logging.info('Evaluating {}...'.format(checkpoint_semantic_path))

    # Read configurations from json
    model_config, _, track_config = load_cfgs(CHECKPOINT_SA_SIAM)
    track_config['log_level'] = 0  # Skip verbose logging for speed

    # Build the inference graph.
    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        model.build_model(model_config, track_config)
        saver_loader_semantic = get_saver('', removes=[':0', '_semantic'],
                                          excepts=['appearance', 'State'])
        saver_loader_appearance = get_saver('', removes=[':0', '_appearance'],
                                            excepts=['semantic', 'State'])
    g.finalize()

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(graph=g, config=sess_config) as sess:
        # Load the model from checkpoint.
        # restore_fn(sess)
        saver_loader_semantic.restore(sess, checkpoint_semantic_path)
        saver_loader_appearance.restore(sess, checkpoint_appearance_path)

        tracker = Tracker(model, model_config, track_config)

        tic = time.clock()
        frames = seq.s_frames
        init_rect = seq.init_rect
        x, y, width, height = init_rect  # OTB format
        init_bb = Rectangle(x - 1, y - 1, width, height)
        trajectory_py = tracker.track(sess, init_bb, frames)
        trajectory = [
            Rectangle(val.x + 1, val.y + 1, val.width, val.height)
            for val in trajectory_py
        ]  # x, y add one to match OTB format
        duration = time.clock() - tic

        result = dict()
        result['res'] = trajectory
        result['type'] = 'rect'
        result['fps'] = round(seq.len / duration, 3)
        return result
def run_iSiam_otb(seq, rp, bSaveImage):
    checkpoint_path = CHECKPOINT
    logging.info('Evaluating {}...'.format(checkpoint_path))

    # Read configurations from json
    model_config, _, track_config = load_cfgs(checkpoint_path)
    track_config['log_level'] = 1  # Enable per-frame logging
    track_config['scale_step'] = 1.021  # 1.023*, 1.021*
    track_config['scale_damp'] = 1.
    track_config['window_influence'] = 0.21  # 0.21*
    # track_config['x_image_size'] = 273

    # Build the inference graph.
    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        restore_fn = model.build_graph_from_config(model_config, track_config, checkpoint_path)
    g.finalize()

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(graph=g, config=sess_config) as sess:
        # Load the model from checkpoint.
        restore_fn(sess)
        tracker = Tracker(model, model_config, track_config)

        tic = time.time()
        frames = seq.s_frames
        init_rect = seq.init_rect
        x, y, width, height = init_rect  # OTB format
        init_bb = Rectangle(x - 1, y - 1, width, height)
        # init_bb = Rectangle(x, y, width, height)

        first_name = frames[0]
        first_split = first_name.split('/')
        dir_name = os.path.join('/home/william/tracker_benchmark/results/samples',
                                first_split[-3])
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)

        trajectory_py = tracker.track(sess, init_bb, frames, logdir=dir_name)
        trajectory = [
            Rectangle(val.x + 1, val.y + 1, val.width, val.height)
            for val in trajectory_py
        ]  # x, y add one to match OTB format
        duration = time.time() - tic

        result = dict()
        result['res'] = trajectory
        result['type'] = 'rect'
        result['fps'] = round(seq.len / duration, 3)
        return result
def main(checkpoint, input_files):
    os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu()

    model_config, _, track_config = load_cfgs(checkpoint)
    track_config['log_level'] = 1
    track_config["is_video"] = False

    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        restore_fn = model.build_graph_from_config(model_config, track_config, checkpoint)
    g.finalize()

    if not osp.isdir(track_config['log_dir']):
        logging.info('Creating inference directory: %s', track_config['log_dir'])
        mkdir_p(track_config['log_dir'])

    video_dirs = []
    for file_pattern in input_files.split(","):
        video_dirs.extend(glob(file_pattern))
    logging.info("Running tracking on %d videos matching %s", len(video_dirs), input_files)

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(graph=g, config=sess_config) as sess:
        restore_fn(sess)
        tracker = Tracker(model, model_config=model_config, track_config=track_config)

        for video_dir in video_dirs:
            if not osp.isdir(video_dir):
                logging.warning('{} is not a directory, skipping...'.format(video_dir))
                continue

            video_name = osp.basename(video_dir)
            video_log_dir = osp.join(track_config['log_dir'], video_name)
            mkdir_p(video_log_dir)

            filenames = sort_nicely(glob(video_dir + '/img/*.jpg'))
            first_line = open(video_dir + '/groundtruth_rect.txt').readline()
            bb = [int(v) for v in first_line.strip().split(',')]
            # Rectangle: [x, y, width, height]; 0-indexed in Python
            init_bb = Rectangle(bb[0] - 1, bb[1] - 1, bb[2], bb[3])

            trajectory = tracker.track(sess, init_bb, filenames, video_log_dir)
            with open(osp.join(video_log_dir, 'track_rect.txt'), 'w') as f:
                for region in trajectory:
                    rect_str = '{},{},{},{}\n'.format(region.x + 1, region.y + 1,
                                                      region.width, region.height)
                    f.write(rect_str)
def parser_txt_anno(video_dir, video_id, txt_anno, track_save_dir):
    subfix = ".jpg"
    if len(os.listdir(track_save_dir)) == len(os.listdir(video_dir)):
        return

    with open(txt_anno, 'r') as f:
        for index, line in enumerate(f):
            img_name = str(index)
            img_file = os.path.join(video_dir, img_name + subfix)
            # assert os.path.exists(img_file), img_file
            if not os.path.exists(img_file):
                continue

            img = imread(img_file)
            line_list = line.split(",")
            bbox = [float(x) for x in line_list]
            target_box = convert_bbox_format(Rectangle(*bbox), 'center-based')
            crop, scale, new_sizes = get_crops(img, target_box,
                                               size_z=127, size_x=255,
                                               context_amount=0.5)
            savename = osp.join(
                track_save_dir,
                '{}.w.{}.h.{}.jpg'.format(img_name,
                                          int(np.rint(new_sizes[0])),
                                          int(np.rint(new_sizes[1]))))
            if osp.exists(savename):
                continue
            imwrite(savename, crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
def __init__(self, model, sess, image, selection, model_config, track_config):
    selection = Rectangle(selection.x, selection.y, selection.width, selection.height)
    self.sess = sess
    self._tracker = Tracker(model, model_config=model_config, track_config=track_config)
    self._tracker.track_init(sess, selection, image)
def run_MFST(seq, rp, bSaveImage):
    checkpoint_path = CHECKPOINT
    logging.info('Evaluating {}...'.format(checkpoint_path))

    # Read configurations from json
    model_config, _, track_config = load_cfgs(checkpoint_path)
    track_config['log_level'] = 0  # Skip verbose logging for speed

    # Build the inference graph.
    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        restore_fn = model.build_graph_from_config(model_config, track_config, checkpoint_path)
    # g.finalize()

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(graph=g, config=sess_config) as sess:
        ## Used for initializing AlexNet parameters; the global initializer
        ## must be run before restoring from the checkpoint.
        init_global = tf.global_variables_initializer()
        sess.run(init_global)

        # Load the model from checkpoint.
        restore_fn(sess)
        tracker = Tracker(model, model_config, track_config)

        tic = time.clock()
        frames = seq.s_frames
        init_rect = seq.init_rect
        x, y, width, height = init_rect  # OTB format
        init_bb = Rectangle(x - 1, y - 1, width, height)
        trajectory_py = tracker.track(sess, init_bb, frames)
        trajectory = [
            Rectangle(val.x + 1, val.y + 1, val.width, val.height)
            for val in trajectory_py
        ]  # x, y add one to match OTB format
        duration = time.clock() - tic

        result = dict()
        result['res'] = trajectory
        result['type'] = 'rect'
        result['fps'] = round(seq.len / duration, 3)
        return result
def parser_txt_anno(video_dir, video_id, txt_anno, track_save_dir):
    if not osp.exists(track_save_dir):
        os.makedirs(track_save_dir)
        have_croped_list = []
    else:
        count = count_out_of_view_frame(txt_anno)
        saved_list = glob(track_save_dir + "/*.jpg")
        crop_imgs_size = len(saved_list)
        origin_img_size = len(glob(video_dir + "/*.jpg"))
        if (crop_imgs_size + count) == origin_img_size:
            print("video already cropped, skip this video")
            return
        else:
            print("crop_imgs_size: %d, origin_img_size: %d, out-of-view: %d"
                  % (crop_imgs_size, origin_img_size, count))
            # return
        have_croped_list = [i.split('.')[0] + '.jpg' for i in saved_list]

    img_files = glob(video_dir + "/*.jpg")
    img_files.sort()

    with open(txt_anno, 'r') as f:
        for index, line in enumerate(tqdm(f)):
            if img_files[index] in have_croped_list:
                print("img %s has been cropped, skip" % (img_files[index]))
                continue

            img = imread(img_files[index])
            if isinstance(img, type(None)):
                continue

            line_list = line.split(",")
            bbox = [int(float(x)) for x in line_list]

            # Skip out-of-view frames
            if bbox[2] == 0 or bbox[3] == 0:
                print("found out-of-view frame, skip this frame")
                continue

            # Convert from 1-based to 0-based coordinates
            bbox[0] = bbox[0] - 1
            bbox[1] = bbox[1] - 1
            target_box = convert_bbox_format(Rectangle(*bbox), 'center-based')
            # target_box = Rectangle(*bbox)
            if target_box.width <= 0 or target_box.height <= 0:
                print("target_box error in", txt_anno, index)
                continue

            crop, scale, new_sizes = get_crops(img, target_box,
                                               size_z=127, size_x=255,
                                               context_amount=0.5)
            img_id = img_files[index].split('/')[-1].split('.')[0]
            savename = osp.join(track_save_dir,
                                '{}.w.{}.h.{}.jpg'.format(img_id,
                                                          int(np.rint(new_sizes[0])),
                                                          int(np.rint(new_sizes[1]))))
            # print(savename)
            if osp.exists(savename):
                continue
            imwrite(savename, crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
def run_SiamRPN(seq, rp, bSaveImage):
    CHECKPOINT = '/home/lab-xiong.jiangfeng/Projects/SiameseRPN/Logs/%s/track_model_checkpoints/%s' % (
        tracker_name, tracker_name)
    logging.info('Evaluating {}...'.format(CHECKPOINT))

    # Read configurations from json
    model_config, _, track_config = load_cfgs(CHECKPOINT)
    track_config['log_level'] = 0  # Skip verbose logging for speed

    g = tf.Graph()
    with g.as_default():
        model = SiamRPN(model_config=model_config, mode='inference')
        model.build(reuse=tf.AUTO_REUSE)
        global_variables_init_op = tf.global_variables_initializer()

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(graph=g, config=sess_config) as sess:
        sess.run(global_variables_init_op)
        model.restore_weights_from_checkpoint(sess)
        tracker = Tracker(sess, model, track_config)

        tic = time.clock()
        frames = seq.s_frames
        init_rect = seq.init_rect
        x, y, width, height = init_rect  # OTB format
        init_bb = Rectangle(x - 1, y - 1, width, height)
        trajectory_py = tracker.track(init_bb, frames, bSaveImage, rp)
        trajectory = [
            Rectangle(val.x + 1, val.y + 1, val.width, val.height)
            for val in trajectory_py
        ]  # x, y add one to match OTB format
        duration = time.clock() - tic

        result = dict()
        result['res'] = trajectory
        result['type'] = 'rect'
        result['fps'] = round(seq.len / duration, 3)
        return result
def process_split(root_dir, save_dir, split, subdir=''):
    data_dir = osp.join(root_dir, 'Data', 'VID', split)
    anno_dir = osp.join(root_dir, 'Annotations', 'VID', split, subdir)
    video_names = os.listdir(anno_dir)

    for idx, video in enumerate(video_names):
        print('{split}-{subdir} ({idx}/{total}): Processing {video}...'.format(
            split=split, subdir=subdir, idx=idx, total=len(video_names), video=video))
        video_path = osp.join(anno_dir, video)
        xml_files = glob(osp.join(video_path, '*.xml'))

        for xml in xml_files:
            tree = ET.parse(xml)
            root = tree.getroot()

            folder = root.find('folder').text
            filename = root.find('filename').text

            # Read image
            img_file = osp.join(data_dir, folder, filename + '.JPEG')
            img = None

            # Get all object bounding boxes
            bboxs = []
            for object in root.iter('object'):
                bbox = object.find('bndbox')
                xmax = float(bbox.find('xmax').text)
                xmin = float(bbox.find('xmin').text)
                ymax = float(bbox.find('ymax').text)
                ymin = float(bbox.find('ymin').text)
                width = xmax - xmin + 1
                height = ymax - ymin + 1
                bboxs.append([xmin, ymin, width, height])

            for idx, object in enumerate(root.iter('object')):
                id = object.find('trackid').text
                class_name = object.find('name').text
                track_save_dir = get_track_save_directory(save_dir, 'train', subdir, video)
                mkdir_p(track_save_dir)
                savename = osp.join(track_save_dir,
                                    '{}.{:02d}.crop.x.jpg'.format(filename, int(id)))
                if osp.isfile(savename):
                    continue  # skip existing images

                if img is None:
                    img = imread(img_file)

                # Get crop
                target_box = convert_bbox_format(Rectangle(*bboxs[idx]), 'center-based')
                crop, _ = get_crops(img, target_box,
                                    size_z=127, size_x=255, context_amount=0.01)
                imwrite(savename, crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
def process_split(root_dir, save_dir, split):
    data_dir = osp.join(root_dir, split)
    video_names = os.listdir(data_dir)
    video_names = [vn for vn in video_names if '.txt' not in vn]

    for idx, video in enumerate(video_names):
        print('{split} ({idx}/{total}): Processing {video}...'.format(
            split=split, idx=idx, total=len(video_names), video=video))
        video_path = osp.join(data_dir, video)
        jpg_files = glob(osp.join(video_path, '*.jpg'))
        with open(osp.join(video_path, 'groundtruth.txt')) as f:
            ann_content = f.readlines()

        for jpg in jpg_files:
            # Read image
            img_file = jpg.split('/')[-1]
            img = None

            # Get the object bounding box for this frame
            jpgidx = img_file.split('.')[0]
            jpgidx = int(jpgidx) - 1
            ann = ann_content[jpgidx]
            ann = ann.strip()
            bbox = ann.split(',')
            bbox = [int(float(bb)) for bb in bbox]  # [xmin, ymin, w, h]

            track_save_dir = osp.join(save_dir, split, video)
            mkdir_p(track_save_dir)
            savename = osp.join(track_save_dir, '{}.crop.x.jpg'.format(img_file))
            if osp.isfile(savename):
                try:
                    im = Image.open(savename)
                    continue  # skip existing images
                except IOError:
                    os.remove(savename)

            if img is None:
                img = imread(jpg)

            # Get crop
            target_box = convert_bbox_format(Rectangle(*bbox), 'center-based')
            crop, _ = get_crops(img, target_box,
                                size_z=127, size_x=255, context_amount=0.5)
            imwrite(savename, crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
def init(self, sess, frame, first_bbox, logdir='/tmp'):
    # Get initial target bounding box and convert to center based
    self.i = 0
    first_bbox = Rectangle(first_bbox[0], first_bbox[1], first_bbox[2], first_bbox[3])
    bbox = convert_bbox_format(first_bbox, 'center-based')

    # Feed in the first frame image to set initial state.
    bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
    input_feed = [frame, bbox_feed, self.x_image_size_init, self.search_factors_init]
    frame2crop_scale, self.image_z = self.siamese_model.initialize(sess, input_feed)
    imwrite(osp.join(logdir, 'aimagez.jpg'),
            cv2.cvtColor(self.image_z, cv2.COLOR_RGB2BGR))

    # Storing target state
    self.original_target_height = bbox.height
    self.original_target_width = bbox.width
    self.search_center = np.array([
        get_center(self.x_image_size_init),
        get_center(self.x_image_size_init)
    ])
    self.current_target_state = TargetState(
        bbox=bbox,
        search_pos=self.search_center,
        scale_idx=int(get_center(self.num_scales)))

    self.store_thresh = 0.9
    self.conf_thresh = 0.7
    self.bound_thresh = 0.5
    self.sup_thresh = 0.1
    self.mem_count = 0
    self.update_delay = 0
    self.lost = 0
    self.x_image_size = self.x_image_size_init
    self.image_c = None
    self.moved2border = False
    self.prev_score = self.conf_thresh + 0.01
    return True
def parser_xml_anno(img_file, xml_anno, track_save_dir):
    tree = ET.parse(xml_anno)
    root = tree.getroot()
    img = None

    # Get all object bounding boxes
    bboxs = []
    for object in root.iter('object'):
        bbox = object.find('bndbox')
        xmax = float(bbox.find('xmax').text)
        xmin = float(bbox.find('xmin').text)
        ymax = float(bbox.find('ymax').text)
        ymin = float(bbox.find('ymin').text)
        width = xmax - xmin + 1
        height = ymax - ymin + 1
        bboxs.append([xmin, ymin, width, height])

    for idx, object in enumerate(root.iter('object')):
        # id = object.find('trackid').text
        if img is None:
            img = cv2.imread(img_file)

        target_box = convert_bbox_format(Rectangle(*bboxs[idx]), 'center-based')
        crop, scale, new_sizes = get_crops(img, target_box,
                                           size_z=127, size_x=255,
                                           context_amount=0.5)
        index_sub = "_" + str(idx) if idx > 0 else ""
        save_dir = track_save_dir + index_sub
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        savename = os.path.join(
            save_dir,
            '0.w.{}.h.{}.jpg'.format(int(np.rint(new_sizes[0])),
                                     int(np.rint(new_sizes[1]))))
        if osp.exists(savename):
            continue
        imwrite(savename, crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
def track(self, sess, frame):
    bbox_feed = [
        self.current_target_state.bbox.y, self.current_target_state.bbox.x,
        self.current_target_state.bbox.height, self.current_target_state.bbox.width
    ]
    input_feed = [frame, bbox_feed]

    outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
    search_scale_list = outputs['scale_xs']
    response = outputs['response']
    response_size = response.shape[1]

    # Choose the scale whose response map has the highest peak
    if self.num_scales > 1:
        response_max = np.max(response, axis=(1, 2))
        penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales))
        current_scale_idx = int(get_center(self.num_scales))
        penalties[current_scale_idx] = 1.0
        response_penalized = response_max * penalties
        best_scale = np.argmax(response_penalized)
    else:
        best_scale = 0

    response = response[best_scale]

    with np.errstate(all='raise'):  # Raise error if something goes wrong
        response = response - np.min(response)
        response = response / np.sum(response)

    if self.window is None:
        window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                        np.expand_dims(np.hanning(response_size), 0))
        self.window = window / np.sum(window)  # normalize window
    window_influence = self.track_config['window_influence']
    response = (1 - window_influence) * response + window_influence * self.window

    # Find maximum response
    r_max, c_max = np.unravel_index(response.argmax(), response.shape)

    # Convert from crop-relative coordinates to frame coordinates
    p_coor = np.array([r_max, c_max])
    # displacement from the center in instance final representation ...
    disp_instance_final = p_coor - get_center(response_size)
    # ... in instance feature space ...
    upsample_factor = self.track_config['upsample_factor']
    disp_instance_feat = disp_instance_final / upsample_factor
    # ... avoid empty position ...
    r_radius = int(response_size / upsample_factor / 2)
    disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius)
    # ... in instance input ...
    disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride']
    # ... in instance original crop (in frame coordinates)
    disp_instance_frame = disp_instance_input / search_scale_list[best_scale]

    # Position within frame in frame coordinates
    y = self.current_target_state.bbox.y
    x = self.current_target_state.bbox.x
    y += disp_instance_frame[0]
    x += disp_instance_frame[1]

    # Target scale damping and saturation
    target_scale = self.current_target_state.bbox.height / self.original_target_height
    search_factor = self.search_factors[best_scale]
    scale_damp = self.track_config['scale_damp']  # damping factor for scale update
    target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
    target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

    # Some bookkeeping
    height = self.original_target_height * target_scale
    width = self.original_target_width * target_scale
    self.current_target_state.bbox = Rectangle(x, y, width, height)
    self.current_target_state.scale_idx = best_scale
    self.current_target_state.search_pos = self.search_center + disp_instance_input

    assert 0 <= self.current_target_state.search_pos[0] < self.x_image_size, \
        'target position in feature space should be no larger than input image size'
    assert 0 <= self.current_target_state.search_pos[1] < self.x_image_size, \
        'target position in feature space should be no larger than input image size'

    reported_bbox = convert_bbox_format(self.current_target_state.bbox, 'top-left-based')

    self.frame_cnt += 1
    if self.log_level > 0:
        np.save(osp.join(self.logdir, 'num_frames.npy'), [self.frame_cnt])

        # Select the image of the best-scoring scale and convert it to uint8
        image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
        # Note that imwrite in cv2 assumes the image is in BGR format.
        # However, the cropped image returned by TensorFlow is RGB.
        # Therefore, we convert the color format using cv2.cvtColor.
        cv2.imwrite(osp.join(self.logdir, 'image_cropped{}.jpg'.format(self.frame_cnt)),
                    cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))
        cv2.imwrite(osp.join(self.logdir, 'image_origin{}.jpg'.format(self.frame_cnt)),
                    cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        np.save(osp.join(self.logdir, 'best_scale{}.npy'.format(self.frame_cnt)), [best_scale])
        np.save(osp.join(self.logdir, 'response{}.npy'.format(self.frame_cnt)), response)

        y_search, x_search = self.current_target_state.search_pos
        search_scale = search_scale_list[best_scale]
        target_height_search = height * search_scale
        target_width_search = width * search_scale
        bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search)
        bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
        np.save(osp.join(self.logdir, 'bbox{}.npy'.format(self.frame_cnt)),
                [bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height])

    with open(osp.join(self.logdir, 'track_rect.txt'), 'a') as f:
        rect_str = '{},{},{},{}\n'.format(int(reported_bbox[0]), int(reported_bbox[1]),
                                          int(reported_bbox[2]), int(reported_bbox[3]))
        f.write(rect_str)

    return reported_bbox
def track_vot(self, sess, frame):
    bbox_feed = [
        self.vot_current_target_state.bbox.y, self.vot_current_target_state.bbox.x,
        self.vot_current_target_state.bbox.height, self.vot_current_target_state.bbox.width
    ]
    input_feed = [frame, bbox_feed]

    outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
    search_scale_list = outputs['scale_xs']
    response_s_c5 = outputs['response_s_c5']
    response_s_c4 = outputs['response_s_c4']
    response_s_c3 = outputs['response_s_c3']
    response_a_c5 = outputs['response_a_c5']
    response_a_c4 = outputs['response_a_c4']
    response_a_c3 = outputs['response_a_c3']
    response_size = response_s_c5.shape[1]

    # Choose the scale whose response map has the highest peak
    if self.num_scales > 1:
        response_a_c5_max = np.max(response_a_c5)
        response_a_c4_max = np.max(response_a_c4)
        response_a_c3_max = np.max(response_a_c3)
        response_a_c5 = response_a_c5 / response_a_c5_max
        response_a_c4 = response_a_c4 / response_a_c4_max
        response_a_c3 = response_a_c3 / response_a_c3_max

        response_s_all = 0.7 * response_s_c5 + 0.3 * response_s_c4 + 0.1 * response_s_c3
        response_a_all = 0.3 * response_a_c5 + 0.6 * response_a_c4 + 0.1 * response_a_c3
        response_s_all_max = np.max(response_s_all)
        response_s_all = response_s_all / response_s_all_max
        response_a_all_max = np.max(response_a_all)
        response_a_all = response_a_all / response_a_all_max
        response = 0.3 * response_s_all + 0.7 * response_a_all

        response_max = np.max(response, axis=(1, 2))
        penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales))
        current_scale_idx = int(get_center(self.num_scales))
        penalties[current_scale_idx] = 1.0
        response_penalized = response_max * penalties
        best_scale = np.argmax(response_penalized)
    else:
        ## TODO: combine siamfc and alexnet
        best_scale = 0

    response = response[best_scale]

    with np.errstate(all='raise'):  # Raise error if something goes wrong
        response = response - np.min(response)
        response = response / np.sum(response)

    if self.window is None:
        window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                        np.expand_dims(np.hanning(response_size), 0))
        self.window = window / np.sum(window)  # normalize window
    window_influence = self.track_config['window_influence']
    response = (1 - window_influence) * response + window_influence * self.window

    # Find maximum response
    r_max, c_max = np.unravel_index(response.argmax(), response.shape)

    # Convert from crop-relative coordinates to frame coordinates
    p_coor = np.array([r_max, c_max])
    # displacement from the center in instance final representation ...
    disp_instance_final = p_coor - get_center(response_size)
    # ... in instance feature space ...
    upsample_factor = self.track_config['upsample_factor']
    disp_instance_feat = disp_instance_final / upsample_factor
    # ... avoid empty position ...
    r_radius = int(response_size / upsample_factor / 2)
    disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius)
    # ... in instance input ...
    disp_instance_input = disp_instance_feat * 8  # self.model_config['embed_config']['stride']
    # ... in instance original crop (in frame coordinates)
    disp_instance_frame = disp_instance_input / search_scale_list[best_scale]

    # Position within frame in frame coordinates
    y = self.vot_current_target_state.bbox.y
    x = self.vot_current_target_state.bbox.x
    y += disp_instance_frame[0]
    x += disp_instance_frame[1]

    # Target scale damping and saturation
    target_scale = self.vot_current_target_state.bbox.height / self.vot_original_target_height
    search_factor = self.search_factors[best_scale]
    scale_damp = self.track_config['scale_damp']  # damping factor for scale update
    target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
    target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

    # Some bookkeeping
    height = self.vot_original_target_height * target_scale
    width = self.vot_original_target_width * target_scale
    self.vot_current_target_state.bbox = Rectangle(x, y, width, height)
    self.vot_current_target_state.scale_idx = best_scale
    self.vot_current_target_state.search_pos = self.vot_search_center + disp_instance_input

    reported_bbox = convert_bbox_format(self.vot_current_target_state.bbox, 'top-left-based')
    return reported_bbox
def main(_):
    # Load model
    model_config, _, track_config = load_cfgs(CHECKPOINT)
    track_config["log_level"] = 0
    track_config["is_video"] = True

    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        restore_fn = model.build_graph_from_config(model_config, track_config, CHECKPOINT)
    g.finalize()

    if not os.path.isdir(track_config['log_dir']):
        tf.logging.info('Creating inference directory: %s', track_config['log_dir'])
        mkdir_p(track_config['log_dir'])

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(graph=g, config=sess_config) as sess:
        restore_fn(sess)
        tracker = Tracker(model, model_config=model_config, track_config=track_config)

        video_name = os.path.basename(FLAGS.video_path)
        video_log_dir = os.path.join(track_config["log_dir"], video_name)
        mkdir_p(video_log_dir)

        if str(FLAGS.video_path) in ["0", "1"]:
            # Read from camera
            video_path = int(FLAGS.video_path)
            with_camera = True
        else:
            # Read from video file
            video_path = glob(os.path.join(FLAGS.video_path, "*.mp4"))[0]
            with_camera = False
        video_capture = cv2.VideoCapture(video_path)

        bb = [-1, -1, -1, -1]
        cv2.namedWindow("template")
        cv2.setMouseCallback("template", draw_init_box, bb)

        trajectory = []
        f_count = 0
        f_rate = 0
        start_time = time.time()

        while True:
            # Capture frame by frame
            ret_, frame = video_capture.read()
            if ret_ == False:
                continue
            f_width, f_height = [int(a) for a in FLAGS.video_resolution.split("*")]
            try:
                o_frame = cv2.resize(frame, (f_width, f_height),
                                     interpolation=cv2.INTER_CUBIC)
            except:
                break
            i_frame = cv2.cvtColor(o_frame, cv2.COLOR_BGR2RGB)
            # cv2.imwrite("test.jpg", o_frame)
            # pdb.set_trace()

            if f_count == 0:
                # Initialize the tracker: wait for the user to draw the init box
                while True:
                    init_frame = o_frame.copy()
                    cv2.imshow("template", init_frame)
                    k = cv2.waitKey(0)
                    if k == 32:  # space
                        cx = int((bb[0] + bb[2]) / 2)
                        cy = int((bb[1] + bb[3]) / 2)
                        w = int(bb[2] - bb[0])
                        h = int(bb[3] - bb[1])
                        # Rectangle: [x, y, width, height]; 0-indexed in Python
                        init_bb = Rectangle(cx - 1, cy - 1, w, h)
                        draw_box(init_frame, init_bb, "exemplar")
                        break

                first_box = convert_bbox_format(init_bb, "center-based")
                bbox_feed = [first_box.y, first_box.x, first_box.height, first_box.width]
                input_feed = [i_frame, bbox_feed]
                frame2crop_scale = tracker.siamese_model.initialize(sess, input_feed)

                # Storing target state
                original_target_height = first_box.height
                original_target_width = first_box.width
                search_center = np.array([
                    get_center(tracker.x_image_size),
                    get_center(tracker.x_image_size)
                ])
                current_target_state = TargetState(
                    bbox=first_box,
                    search_pos=search_center,
                    scale_idx=int(get_center(tracker.num_scales)))

                # Set up initialized params
                current_param = {
                    "original_target_width": original_target_width,
                    "original_target_height": original_target_height,
                    "search_center": search_center,
                    "current_target_state": current_target_state
                }

            bbox, current_param = tracker.track_frame(sess, i_frame, current_param,
                                                      video_log_dir)

            # Add overlays
            end_time = time.time()
            f_rate = int(1 / (end_time - start_time))
            start_time = time.time()
            draw_box(o_frame, bbox)
            cv2.putText(o_frame, str(f_rate) + "fps", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255),
                        thickness=2, lineType=2)

            trajectory.append(bbox)
            f_count += 1

            cv2.imshow("Real-time Output", o_frame)
            cv2.imshow("template", init_frame)
            # if f_count > 30:
            #     cv2.imwrite("test.jpg", o_frame)
            #     pdb.set_trace()
            if cv2.waitKey(1) & 0xFF == ord("q"):
                cv2.imwrite("./assets/instance.jpg", o_frame)
                cv2.imwrite("./assets/exemplar.jpg", init_frame)
                break

        video_capture.release()
        cv2.destroyAllWindows()

        # Save track results
        # pdb.set_trace()
        with open(os.path.join(video_log_dir, "track_rect.txt"), "w") as f:
            for region in trajectory:
                rect_str = "{},{},{},{}\n".format(region.x + 1, region.y + 1,
                                                  region.width, region.height)
                f.write(rect_str)
def process_split(root_dir, save_dir, split):
    data_dir = osp.join(root_dir, split)
    video_names = os.listdir(data_dir)
    video_names = [vn for vn in video_names if '.txt' not in vn]

    for idx, video in enumerate(video_names):
        print('{split} ({idx}/{total}): Processing {video}...'.format(
            split=split, idx=idx, total=len(video_names), video=video))
        video_path = osp.join(data_dir, video)
        jpg_files = glob(osp.join(video_path, '*.jpg'))
        with open(osp.join(video_path, 'groundtruth.txt')) as f:
            ann_content = f.readlines()

        track_save_dir = osp.join(save_dir, split, video)
        mkdir_p(track_save_dir)
        fw = open(osp.join(track_save_dir, 'groundtruth.txt'), 'w')
        copyfile(osp.join(video_path, 'absence.label'),
                 osp.join(track_save_dir, 'absence.label'))
        copyfile(osp.join(video_path, 'cover.label'),
                 osp.join(track_save_dir, 'cover.label'))
        copyfile(osp.join(video_path, 'cut_by_image.label'),
                 osp.join(track_save_dir, 'cut_by_image.label'))
        copyfile(osp.join(video_path, 'meta_info.ini'),
                 osp.join(track_save_dir, 'meta_info.ini'))

        for i, jpg in enumerate(jpg_files):
            # Read image
            img_file = jpg.split('/')[-1]
            img = None

            # Get the object bounding box for this frame
            jpgidx = img_file.split('.')[0]
            jpgidx = int(jpgidx) - 1
            ann = ann_content[jpgidx]
            ann = ann.strip()
            bbox = ann.split(',')
            bbox = [int(float(bb)) for bb in bbox]  # [xmin, ymin, w, h]

            ## bbox ####
            annk = ann_content[i]
            annk = annk.strip()
            bboxk = annk.split(',')
            bboxk = [int(float(bb)) for bb in bboxk]  # [xmin, ymin, w, h]
            w = bboxk[2]
            h = bboxk[3]
            context_amount = 0.5
            size_z = 127
            size_x = 271
            wc_z = w + context_amount * (w + h)
            hc_z = h + context_amount * (w + h)
            s_z = np.sqrt(wc_z * hc_z)
            scale_z = size_z / s_z
            d_search = (size_x - size_z) / 2
            pad = d_search / scale_z
            s_x = s_z + 2 * pad
            wn = int(w * size_x / s_x)
            hn = int(h * size_x / s_x)
            # if wn < 1 or hn < 1:
            #     if wn == 0:
            #         wn = 1
            #     if hn == 0:
            #         hn = 1
            #     ratio = wn / hn
            #     if ratio > 1.:
            #         newbb = [int(135 - wn / 2), int(135 - hn / 2), 85, int(85. / ratio)]
            #     else:
            #         newbb = [int(135 - wn / 2), int(135 - hn / 2), int(85. * ratio), 85]
            # else:
            #     newbb = [int(135 - wn / 2), int(135 - hn / 2), wn, hn]
            if wn < 1:
                wn = 1
            if hn < 1:
                hn = 1
            newbb = [int(135 - wn / 2), int(135 - hn / 2), wn, hn]
            fw.write(','.join(str(e) + '.0000' for e in newbb) + '\n')
            ## bbox ####

            savename = osp.join(track_save_dir, '{}.jpg'.format(img_file))
            if osp.isfile(savename):
                try:
                    im = Image.open(savename)
                    continue  # skip existing images
                except IOError:
                    os.remove(savename)

            if img is None:
                img = imread(jpg)

            # Get crop
            target_box = convert_bbox_format(Rectangle(*bbox), 'center-based')
            crop, _ = get_crops(img, target_box,
                                size_z=127, size_x=271, context_amount=0.5)
            imwrite(savename, crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
        fw.close()
def track(self, sess, frame, logdir='/tmp'):
    """Runs tracking on a single image."""
    i = self.i = self.i + 1
    current_target_state = self.current_target_state
    original_target_height = self.original_target_height
    original_target_width = self.original_target_width
    search_center = self.search_center
    mem_count = self.mem_count
    moved2border = self.moved2border
    update_delay = self.update_delay + 1
    lost = self.lost + 1
    image_c = self.image_c
    x_image_size = self.x_image_size
    search_factors = self.search_factors_init
    conf_thresh = self.conf_thresh
    bound_thresh = self.bound_thresh
    sup_thresh = self.sup_thresh
    prev_score = self.prev_score

    hi, wi, _ = frame.shape
    h_ratio = current_target_state.bbox.height / hi
    w_ratio = current_target_state.bbox.width / wi
    t_i_ratio = max([h_ratio, w_ratio])

    if prev_score < conf_thresh:
        x_image_size += 100
        # x_image_size = min(x_image_size, ((1. - t_i_ratio) * 1.8 + 1.) * self.x_image_size_init)
        if t_i_ratio < 0.05:
            x_image_size = min(x_image_size, 555)
        elif t_i_ratio > 0.6:
            x_image_size = min(x_image_size, 255)
        elif t_i_ratio > 0.4:
            x_image_size = min(x_image_size, 355)
        else:
            x_image_size = min(x_image_size, 455)
    else:
        x_image_size = self.x_image_size_init

    num_scales = len(search_factors)
    bbx = current_target_state.bbox.x
    bby = current_target_state.bbox.y
    bbw = current_target_state.bbox.width
    bbh = current_target_state.bbox.height
    bbox_feed = [bby, bbx, bbh, bbw]

    if i > 1:
        top = (current_target_state.bbox.y - (current_target_state.bbox.height / 2) < 10)
        left = (current_target_state.bbox.x - (current_target_state.bbox.width / 2) < 10)
        bottom = (current_target_state.bbox.y + (current_target_state.bbox.height / 2) > hi - 10)
        right = (current_target_state.bbox.x + (current_target_state.bbox.width / 2) > wi - 10)
        if top or left or bottom or right:
            if not prev_score < bound_thresh:
                moved2border = True
            if not moved2border:
                current_target_state.bbox = Rectangle(
                    wi / 2, hi / 2,
                    current_target_state.bbox.width,
                    current_target_state.bbox.height)
                bbox_feed = [
                    current_target_state.bbox.y, current_target_state.bbox.x,
                    current_target_state.bbox.height, current_target_state.bbox.width
                ]
        else:
            if not prev_score < bound_thresh:
                moved2border = False

        if t_i_ratio < 0.3 and lost > 5:
            lost = 0
            diffy = hi * 0.5 - bbox_feed[0]
            diffx = wi * 0.5 - bbox_feed[1]
            bbox_feed = [
                diffy * 0.25 + bbox_feed[0], diffx * 0.25 + bbox_feed[1],
                bbox_feed[2], bbox_feed[3]
            ]
            current_target_state.bbox = Rectangle(bbox_feed[1], bbox_feed[0],
                                                  bbox_feed[3], bbox_feed[2])

    input_feed = [frame, bbox_feed, x_image_size, search_factors]
    outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
    search_scale_list = outputs['scale_xs']
    response = outputs['response']
    response_size = response.shape[1]
    re_out = np.around(1 / (1 + np.exp(-response)), 2)

    if np.max(re_out) < conf_thresh:
        x_image_sizeb4 = x_image_size
        x_image_size += 100
        # x_image_size_l = ((1. - t_i_ratio) * 1.8 + 1.) * self.x_image_size_init
        if t_i_ratio < 0.05:
            x_image_size_l = 555
        elif t_i_ratio > 0.6:
            x_image_size_l = 255
        elif t_i_ratio > 0.4:
            x_image_size_l = 355
        else:
            x_image_size_l = 455
        if not x_image_size > x_image_size_l:
            input_feed = [frame, bbox_feed, x_image_size, search_factors]
            outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
            search_scale_list = outputs['scale_xs']
            response = outputs['response']
            response_size = response.shape[1]
            re_out = np.around(1 / (1 + np.exp(-response)), 2)
        else:
            x_image_size = x_image_sizeb4

    # Choose the scale whose response map has the highest peak
    if num_scales > 1:
        response_max = np.max(response * (re_out > sup_thresh), axis=(1, 2))
        penalties = self.track_config['scale_penalty'] * np.ones((num_scales))
        current_scale_idx = int(get_center(num_scales))
        penalties[current_scale_idx] = 1.0
        response_penalized = response_max * penalties
        if max(response_penalized) == 0.:
            best_scale = 1
        else:
            best_scale = np.argmax(response_penalized)
    else:
        best_scale = 0

    response = response[best_scale]
    re_out = re_out[best_scale]

    with np.errstate(all='raise'):  # Raise error if something goes wrong
        response = response - np.min(response)
        response = response / np.sum(response)
    response = response * (re_out > sup_thresh)

    window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                    np.expand_dims(np.hanning(response_size), 0))
    self.window = window / np.sum(window)  # normalize window
    window_influence = self.track_config['window_influence']
    response = (1 - window_influence) * response + window_influence * self.window

    # Find maximum response
    r_max, c_max = np.unravel_index(response.argmax(), response.shape)
    prev_score = re_out[r_max, c_max]

    # Convert from crop-relative coordinates to frame coordinates
    p_coor = np.array([r_max, c_max])
    # displacement from the center in instance final representation ...
    disp_instance_final = p_coor - get_center(response_size)
    # ... in instance feature space ...
    upsample_factor = self.track_config['upsample_factor']
    disp_instance_feat = disp_instance_final / upsample_factor
    # ... avoid empty position ...
    r_radius = int(response_size / upsample_factor / 2)
    disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius)
    # ... in instance input ...
    disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride']
    # ... in instance original crop (in frame coordinates)
    disp_instance_frame = disp_instance_input / search_scale_list[best_scale]

    # Position within frame in frame coordinates
    y = current_target_state.bbox.y
    x = current_target_state.bbox.x
    y += disp_instance_frame[0]
    x += disp_instance_frame[1]
    y = np.round(y)
    x = np.round(x)

    # Target scale damping and saturation
    target_scale = current_target_state.bbox.height / original_target_height
    search_factor = search_factors[best_scale]
    scale_damp = self.track_config['scale_damp']  # damping factor for scale update
    target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)

    # Some bookkeeping
    search_center = np.array([get_center(x_image_size), get_center(x_image_size)])
    height = original_target_height * target_scale
    width = original_target_width * target_scale
    current_target_state.bbox = Rectangle(x, y, width, height)
    current_target_state.scale_idx = best_scale
    current_target_state.search_pos = search_center + disp_instance_input

    assert 0 <= current_target_state.search_pos[0] < x_image_size, \
        'target position in feature space should be no larger than input image size'
    assert 0 <= current_target_state.search_pos[1] < x_image_size, \
        'target position in feature space should be no larger than input image size'

    if self.log_level > 0:
        # Select the image of the best-scoring scale and convert it to uint8
        image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
        y_search, x_search = current_target_state.search_pos
        search_scale = search_scale_list[best_scale]
        target_height_search = height * search_scale
        target_width_search = width * search_scale
        bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search)
        bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
        xmin = bbox_search.x.astype(np.int32)
        ymin = bbox_search.y.astype(np.int32)
        xmax = xmin + bbox_search.width.astype(np.int32)
        ymax = ymin + bbox_search.height.astype(np.int32)
        cv2.rectangle(image_cropped, (xmin, ymin), (xmax, ymax), (255, 0, 0), 2)
        text = str(prev_score)
        cv2.putText(image_cropped, text, (xmin, ymin),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 0, 0),
                    lineType=cv2.LINE_AA)
        imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

    if prev_score > self.store_thresh:
        bbox_feed = [
            current_target_state.bbox.y, current_target_state.bbox.x,
            current_target_state.bbox.height, current_target_state.bbox.width
        ]
        self.siamese_model.update_mem(sess, [
            frame, bbox_feed, self.x_image_size_init, self.search_factors_init, mem_count
        ])
        mem_count += 1

    if mem_count > 4 or (mem_count > 0 and update_delay > 5):
        self.siamese_model.update(sess)
        mem_count = 0
        update_delay = 0

    if prev_score > bound_thresh:
        lost = 0

    self.mem_count = mem_count
    self.update_delay = update_delay
    self.moved2border = moved2border
    self.lost = lost
    self.x_image_size = x_image_size
    self.prev_score = prev_score
    self.current_target_state = current_target_state

    reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based')
    # return prev_score > 0.4, reported_bbox, prev_score
    return prev_score > 0.4, reported_bbox
def track(self, sess, first_bbox, frames, logdir='/tmp'):
    """Runs tracking on a single image sequence."""
    # Get initial target bounding box and convert to center based
    bbox = convert_bbox_format(first_bbox, 'center-based')

    # Feed in the first frame image to set initial state.
    bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
    input_feed = [frames[0], bbox_feed, self.x_image_size, self.search_factors]
    frame2crop_scale, image_z = self.siamese_model.initialize(sess, input_feed)
    imwrite(osp.join(logdir, 'aimagez.jpg'),
            cv2.cvtColor(image_z, cv2.COLOR_RGB2BGR))

    # Storing target state
    original_target_height = bbox.height
    original_target_width = bbox.width
    search_center = np.array([get_center(self.x_image_size), get_center(self.x_image_size)])
    current_target_state = TargetState(bbox=bbox,
                                       search_pos=search_center,
                                       scale_idx=int(get_center(self.num_scales)))

    include_first = get(self.track_config, 'include_first', False)
    logging.info('Tracking include first -- {}'.format(include_first))

    # Run tracking loop
    reported_bboxs = []
    image_c = None
    x_image_size = self.x_image_size
    lost = 0
    moved2border = False
    conf_thresh = 0.2   # 0.2
    bound_thresh = 0.2  # 0.2
    sup_thresh = 0.15   # 0.15
    prev_score = conf_thresh + 0.01
    upsample_factor = self.track_config['upsample_factor']
    search_factors = self.search_factors

    for i, filename in enumerate(frames):
        if i > 0 or include_first:
            bbox_feed = [
                current_target_state.bbox.y, current_target_state.bbox.x,
                current_target_state.bbox.height, current_target_state.bbox.width
            ]
            if prev_score > bound_thresh:
                lost = 0
            else:
                lost += 1
            if prev_score > 0.9:
                self.siamese_model.update(sess, [
                    frames[i - 1], bbox_feed, self.x_image_size, search_factors
                ])

            with open(filename, 'rb') as f:
                wi, hi = GetWidthAndHeight(f)
            t_i_ratio = max([
                current_target_state.bbox.height / hi,
                current_target_state.bbox.width / wi
            ])

            if prev_score < conf_thresh:
                x_image_size += 100
                # x_image_size = min(x_image_size, ((1. - t_i_ratio) * 1.6 + 1.) * self.x_image_size_init)
                if t_i_ratio < 0.05:
                    x_image_size = min(x_image_size, 555)
                elif t_i_ratio < 0.25:
                    x_image_size = min(x_image_size, 455)
                elif t_i_ratio > 0.5:
                    x_image_size = min(x_image_size, 255)
                else:
                    x_image_size = min(x_image_size, 355)
            else:
                x_image_size = self.x_image_size

            if i > 1:
                top = (current_target_state.bbox.y - (current_target_state.bbox.height / 2) < 10)
                left = (current_target_state.bbox.x - (current_target_state.bbox.width / 2) < 10)
                bottom = (current_target_state.bbox.y + (current_target_state.bbox.height / 2) > hi - 10)
                right = (current_target_state.bbox.x + (current_target_state.bbox.width / 2) > wi - 10)
                bound_flag = top or left or bottom or right
                # if top or left or bottom or right:
                #     if not prev_score < bound_thresh:
                #         moved2border = True
                #     if not moved2border:
                #         current_target_state.bbox = Rectangle(wi / 2, hi / 2,
                #                                               current_target_state.bbox.width,
                #                                               current_target_state.bbox.height)
                #         bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                #                      current_target_state.bbox.height, current_target_state.bbox.width]
                # else:
                #     if not prev_score < bound_thresh:
                #         moved2border = False
                if lost > 5 and bound_flag:
                    lost = 0
                    diffy = hi * 0.5 - bbox_feed[0]
                    diffx = wi * 0.5 - bbox_feed[1]
                    bbox_feed = [
                        diffy * 0.25 + bbox_feed[0], diffx * 0.25 + bbox_feed[1],
                        bbox_feed[2], bbox_feed[3]
                    ]
                    current_target_state.bbox = Rectangle(bbox_feed[1], bbox_feed[0],
                                                          bbox_feed[3], bbox_feed[2])

            input_feed = [filename, bbox_feed, x_image_size, search_factors]
            outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
            search_scale_list = outputs['scale_xs']
            response = outputs['response']
            response_size = response.shape[1]
            re_out = np.around(1 / (1 + np.exp(-response)), 2)

            if np.max(re_out) < conf_thresh and not t_i_ratio > 0.5:
                x_image_sizeb4 = x_image_size
                x_image_size += 100
                # x_image_size_l = ((1. - t_i_ratio) * 1.6 + 1.) * self.x_image_size_init
                if t_i_ratio < 0.05:
                    x_image_size_l = 555
                elif t_i_ratio < 0.25:
                    x_image_size_l = 455
                elif t_i_ratio > 0.5:
                    x_image_size_l = 255
                else:
                    x_image_size_l = 355
                if not x_image_size > x_image_size_l:
                    input_feed = [filename, bbox_feed, x_image_size, search_factors]
                    outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
                    search_scale_list = outputs['scale_xs']
                    response = outputs['response']
                    response_size = response.shape[1]
                    re_out = np.around(1 / (1 + np.exp(-response)), 2)
                else:
                    x_image_size = x_image_sizeb4

            # Choose the scale whose response map has the highest peak
            if self.num_scales > 1:
                response_max = np.max(response * (re_out > sup_thresh), axis=(1, 2))
                penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales))
                current_scale_idx = int(get_center(self.num_scales))
                penalties[current_scale_idx] = 1.0
                response_penalized = response_max * penalties
                if max(response_penalized) == 0.:
                    best_scale = 1
                else:
                    best_scale = np.argmax(response_penalized)
            else:
                best_scale = 0

            response = response[best_scale]
            re_out = re_out[best_scale]

            with np.errstate(all='raise'):  # Raise error if something goes wrong
                response = response - np.min(response)
                response = response / np.sum(response)
            response = response * (re_out > sup_thresh)

            window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                            np.expand_dims(np.hanning(response_size), 0))
            self.window = window / np.sum(window)  # normalize window
            window_influence = self.track_config['window_influence']
            response = (1 - window_influence) * response + window_influence * self.window

            if np.max(re_out) < sup_thresh:
                r_max, c_max = response.shape
                r_max, c_max = int(r_max / 2), int(c_max / 2)
                disp_instance_input = [0, 0]
                disp_instance_frame = [0, 0]
            else:
                # Find maximum response
                r_max, c_max = np.unravel_index(response.argmax(), response.shape)

                # Convert from crop-relative coordinates to frame coordinates
                p_coor = np.array([r_max, c_max])
                # displacement from the center in instance final representation ...
                disp_instance_final = p_coor - get_center(response_size)
                # ... in instance feature space ...
                disp_instance_feat = disp_instance_final / upsample_factor
                # ... avoid empty position ...
                r_radius = int(response_size / upsample_factor / 2)
                disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius),
                                                -r_radius)
                # ... in instance input ...
                disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride']
                # ... in instance original crop (in frame coordinates)
                disp_instance_frame = disp_instance_input / search_scale_list[best_scale]

            # Position within frame in frame coordinates
            y = current_target_state.bbox.y
            x = current_target_state.bbox.x
            y += disp_instance_frame[0]
            x += disp_instance_frame[1]
            y = np.round(y)
            x = np.round(x)
            prev_score = re_out[r_max, c_max]

            # Target scale damping and saturation
            target_scale = current_target_state.bbox.height / original_target_height
            search_factor = search_factors[best_scale]
            scale_damp = self.track_config['scale_damp']  # damping factor for scale update
            target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)

            # Some bookkeeping
            search_center = np.array([get_center(x_image_size), get_center(x_image_size)])
            height = original_target_height * target_scale
            width = original_target_width * target_scale
            current_target_state.bbox = Rectangle(x, y, width, height)
            current_target_state.scale_idx = best_scale
            current_target_state.search_pos = search_center + disp_instance_input

            assert 0 <= current_target_state.search_pos[0] < x_image_size, \
                'target position in feature space should be no larger than input image size'
            assert 0 <= current_target_state.search_pos[1] < x_image_size, \
                'target position in feature space should be no larger than input image size'

            if self.log_level > 0:
                # Select the image of the best-scoring scale and convert it to uint8
                image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
                y_search, x_search = current_target_state.search_pos
                search_scale = search_scale_list[best_scale]
                target_height_search = height * search_scale
                target_width_search = width * search_scale
                bbox_search = Rectangle(x_search, y_search,
                                        target_width_search, target_height_search)
                bbox_search = convert_bbox_format(bbox_search, 'top-left-based')

                # Add score colormap
                image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
                # im_shape = image_cropped.shape
                # re_shape = response_size / upsample_factor * self.model_config['embed_config']['stride']
                # pad = int((im_shape[0] - re_shape) / 2)
                # response_crop = imresize(re_out, [im_shape[0] - 2 * pad, im_shape[1] - 2 * pad])
                # response_crop = np.pad(response_crop, ((pad, pad), (pad, pad)), 'constant')
                # response_crop = response_crop / response_crop.max()
                # response_crop = np.uint8(response_crop * 255)
                # cmap = cv2.cvtColor(cv2.applyColorMap(response_crop, cv2.COLORMAP_JET), cv2.COLOR_BGR2RGB)
                # image_cropped = cv2.addWeighted(cmap, 0.3, image_cropped, 0.5, 0)

                xmin = bbox_search.x.astype(np.int32)
                ymin = bbox_search.y.astype(np.int32)
                xmax = xmin + bbox_search.width.astype(np.int32)
                ymax = ymin + bbox_search.height.astype(np.int32)
                cv2.rectangle(image_cropped, (xmin, ymin), (xmax, ymax), (255, 0, 0), 2)
                text = str(prev_score)
                cv2.putText(image_cropped, text, (xmin, ymin),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 0, 0),
                            lineType=cv2.LINE_AA)
                imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                        cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                # if image_c is not None:
                #     his_dir = logdir + '_his'
                #     if not osp.exists(his_dir):
                #         os.mkdir(his_dir)
                #     image_c_p = np.concatenate([np.expand_dims(image_z, 0)] + image_c, 2)[0]
                #     image_c_p = np.uint8(image_c_p)
                #     imwrite(osp.join(his_dir, 'image{}.jpg'.format(i)),
                #             cv2.cvtColor(image_c_p, cv2.COLOR_RGB2BGR))

        reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based')
        reported_bboxs.append(reported_bbox)
    return reported_bboxs
def track(self, sess, first_bbox, frames, logdir='/tmp'):
    """Runs tracking on a single image sequence."""
    # Get initial target bounding box and convert to center based
    bbox = convert_bbox_format(first_bbox, 'center-based')
    print(frames)

    # Feed in the first frame image to set initial state.
    bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
    input_feed = [frames[0], bbox_feed]
    frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

    # Storing target state
    original_target_height = bbox.height
    original_target_width = bbox.width
    search_center = np.array([get_center(self.x_image_size), get_center(self.x_image_size)])
    current_target_state = TargetState(bbox=bbox,
                                       search_pos=search_center,
                                       scale_idx=int(get_center(self.num_scales)))

    include_first = get(self.track_config, 'include_first', False)
    logging.info('Tracking include first -- {}'.format(include_first))

    # Run tracking loop
    reported_bboxs = []
    output_json = {}  # dump all bboxes in this output file
    for i, filename in enumerate(frames):
        if i > 0 or include_first:
            # We don't really want to process the first image unless intended to do so.
            bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                         current_target_state.bbox.height, current_target_state.bbox.width]
            input_feed = [filename, bbox_feed]

            outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
            search_scale_list = outputs['scale_xs']
            response = outputs['response']
            response_size = response.shape[1]

            # Choose the scale whose response map has the highest peak
            if self.num_scales > 1:
                response_max = np.max(response, axis=(1, 2))
                penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales))
                current_scale_idx = int(get_center(self.num_scales))
                penalties[current_scale_idx] = 1.0
                response_penalized = response_max * penalties
                best_scale = np.argmax(response_penalized)
            else:
                best_scale = 0

            response = response[best_scale]
            # print(response)
            with np.errstate(all='raise'):  # Raise error if something goes wrong
                response = response - np.min(response)
                response = response / np.sum(response)

            if self.window is None:
                window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                                np.expand_dims(np.hanning(response_size), 0))
                self.window = window / np.sum(window)  # normalize window
            window_influence = self.track_config['window_influence']
            response = (1 - window_influence) * response + window_influence * self.window

            # Find maximum response
            srtd = response.argsort(axis=None)
            v = response.argmax()
            r_max, c_max = np.unravel_index(v, response.shape)

            if not osp.exists(osp.join(logdir, "Intermediate")):
                os.mkdir(osp.join(logdir, "Intermediate"))
            to_save = np.interp(response, (response.min(), response.max()), (0, 255))
            cv2.imwrite(osp.join(logdir, "Intermediate", f"response_{i}.png"), to_save)
            to_save = to_save.reshape(to_save.shape[0], to_save.shape[1], 1)
            ret, thresh1 = cv2.threshold(to_save, 185, 255, cv2.THRESH_BINARY)
            cv2.imwrite(osp.join(logdir, "Intermediate", f"response_{i}_thresh.png"), thresh1)

            image = np.uint8(thresh1.copy())
            cnts = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            cnts = imutils.grab_contours(cnts)
            backtorgb = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
            image = cv2.drawContours(backtorgb, cnts, -1, (0, 255, 0), 2)
            cv2.imwrite(osp.join(logdir, "Intermediate", f"response_{i}_cntrs.png"), image)

            centres = []
            for c in cnts:
                M = cv2.moments(c)
                cX = int(M["m10"] / M["m00"])
                cY = int(M["m01"] / M["m00"])
                centres.append((cY, cX, False))
            centres.append((r_max, c_max, True))
            # print(centres)
            # cts_copy = copy(current_target_state)
            # cts_copy2 = copy(current_target_state)

            output_json[filename] = []
            for (r_max, c_max, to_deep_copy) in centres:
                if to_deep_copy:
                    cts_copy = deepcopy(current_target_state)
                else:
                    cts_copy = copy(current_target_state)

                # Convert from crop-relative coordinates to frame coordinates
                p_coor = np.array([r_max, c_max])
                # displacement from the center in instance final representation ...
                disp_instance_final = p_coor - get_center(response_size)
                # ... in instance feature space ...
                upsample_factor = self.track_config['upsample_factor']
                disp_instance_feat = disp_instance_final / upsample_factor
                # ... avoid empty position ...
                r_radius = int(response_size / upsample_factor / 2)
                disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius),
                                                -r_radius)
                # ... in instance input ...
                disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride']
                # ... in instance original crop (in frame coordinates)
                disp_instance_frame = disp_instance_input / search_scale_list[best_scale]

                # Position within frame in frame coordinates
                y = cts_copy.bbox.y
                x = cts_copy.bbox.x
                y += disp_instance_frame[0]
                x += disp_instance_frame[1]

                # Target scale damping and saturation
                target_scale = cts_copy.bbox.height / original_target_height
                search_factor = self.search_factors[best_scale]
                scale_damp = self.track_config['scale_damp']  # damping factor for scale update
                target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
                target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

                # Some bookkeeping
                height = original_target_height * target_scale
                width = original_target_width * target_scale
                cts_copy.bbox = Rectangle(x, y, width, height)
                cts_copy.scale_idx = best_scale
                cts_copy.search_pos = search_center + disp_instance_input

                assert 0 <= cts_copy.search_pos[0] < self.x_image_size, \
                    'target position in feature space should be no larger than input image size'
                assert 0 <= cts_copy.search_pos[1] < self.x_image_size, \
                    'target position in feature space should be no larger than input image size'

                if self.log_level > 0 and to_deep_copy:
                    np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

                    # Select the image of the best-scoring scale and convert it to uint8
                    image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
                    # Note that imwrite in cv2 assumes the image is in BGR format.
                    # However, the cropped image returned by TensorFlow is RGB.
                    # Therefore, we convert the color format using cv2.cvtColor.
                    imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                            cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))
                    np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale])
                    np.save(osp.join(logdir, 'response{}.npy'.format(i)), response)

                    y_search, x_search = cts_copy.search_pos
                    search_scale = search_scale_list[best_scale]
                    target_height_search = height * search_scale
                    target_width_search = width * search_scale
                    bbox_search = Rectangle(x_search, y_search,
                                            target_width_search, target_height_search)
                    bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
                    np.save(osp.join(logdir, 'bbox{}.npy'.format(i)),
                            [bbox_search.x, bbox_search.y,
                             bbox_search.width, bbox_search.height])

                reported_bbox = convert_bbox_format(cts_copy.bbox, 'top-left-based')
                # print(f"reported bbox {reported_bbox}")
                if to_deep_copy:
                    reported_bboxs.append(reported_bbox)
                else:
                    rect_str = '{},{},{},{}\n'.format(reported_bbox.x + 1, reported_bbox.y + 1,
                                                      reported_bbox.width, reported_bbox.height)
                    arr = output_json[filename]
                    arr.append(rect_str)

    with open(osp.join(logdir, 'bboxes.json'), 'w') as f:
        json.dump(output_json, f, indent=4)
    return reported_bboxs
def set_first_frame(self, frame, r):
    # r is an OTB-style (x, y, width, height) rectangle; shift to 0-indexed coordinates.
    bb = [int(v) for v in r[:4]]
    init_bb = Rectangle(bb[0] - 1, bb[1] - 1, bb[2], bb[3])  # 0-index in python
    self.tracker.initialize(self.sess, init_bb, frame, self.video_log_dir)
def track(self, first_bbox, frames, bSaveImage=False, SavePath='/tmp'): #1. init the tracker self.track_init(first_bbox, frames[0]) include_first = self.track_config['include_first'] # Run tracking loop reported_bboxs = [] examplar = np.reshape(self.first_image_examplar, [1, self.z_image_size, self.z_image_size, 3]) cost_time_dict = { 'load_img': 0.0, 'crop_img': 0.0, 'sess_run': 0.0, 'post_process': 0.0 } for i, filename in tqdm(enumerate(frames)): if i > 0 or include_first: # We don't really want to process the first image unless intended to do so. load_img_start = time.time() bgr_img = safe_imread(filename) load_img_end = time.time() cost_time_dict['load_img'] += load_img_end - load_img_start crop_img_start = time.time() current_img = cv2.cvtColor( bgr_img, cv2.COLOR_BGR2RGB) if self.image_use_rgb else bgr_img instance_img, scale_x, _ = get_crops( current_img, self.current_target_state.search_box, self.z_image_size, self.x_image_size, 0.5) instance = np.reshape( instance_img, [1, self.x_image_size, self.x_image_size, 3]) crop_img_end = time.time() cost_time_dict['crop_img'] += crop_img_end - crop_img_start sess_run_start = time.time() if self.model.model_config.get('BinWindow', False): boxes, scores = self.sess.run( [self.model.topk_bboxes, self.model.topk_scores], feed_dict={ self.model.examplar_feed: examplar, self.model.instance_feed: instance, self.model.gt_examplar_boxes: self.gt_examplar_boxes }) else: boxes, scores = self.sess.run( [self.model.topk_bboxes, self.model.topk_scores], feed_dict={ self.model.examplar_feed: examplar, self.model.instance_feed: instance }) sess_run_end = time.time() cost_time_dict['sess_run'] += sess_run_end - sess_run_start post_process_start = time.time() def padded_size(w, h): context = 0.5 * (w + h) return np.sqrt((w + context) * (h + context)) #boxes: 1*NA*4 score: 1*Na boxes = boxes[0] #NA*4 scores = scores[0] #NA*2 scales = padded_size( (boxes[:, 2] - boxes[:, 0]) / scale_x, (boxes[:, 3] - boxes[:, 1]) / scale_x) #Na ratios = (boxes[:, 3] - boxes[:, 1]) / (boxes[:, 2] - boxes[:, 0]) scale_change = scales / self.current_target_state.scale scale_change = np.maximum(scale_change, 1.0 / scale_change) ratio_change = ratios / (self.current_target_state.ratio) ratio_change = np.maximum(ratio_change, 1.0 / ratio_change) scale_penalty = np.exp(-(scale_change * ratio_change - 1) * self.track_config['penalty_k']) pscores = scores * scale_penalty window_influence = self.track_config['window_influence'] wpscores = pscores * ( 1 - window_influence) + self.window * window_influence max_index = np.argmax(wpscores) corrdinates = boxes[max_index] #Top1 #print("Tracking %d/%d with tracking score:%.2f, wpscore: %.2f"%(i+1, len(frames), scores[max_index],wpscores[max_index])) # Position within frame in frame coordinates res_box = Rectangle(*corrdinate_to_bbox(corrdinates)) center_x = (self.x_image_size - 1.0) / 2 center_y = center_x delta_x = (res_box.x - center_x) / scale_x delta_y = (res_box.y - center_y) / scale_x w = res_box.width / scale_x h = res_box.height / scale_x y = self.current_target_state.target_box.y + delta_y x = self.current_target_state.target_box.x + delta_x #update seach bbox alpha = self.track_config[ 'search_scale_smooth_factor'] * pscores[max_index] belta = 0.0 new_search_cx = max( min( self.img_width, self.current_target_state.target_box.x * belta + (1.0 - belta) * x), 0.0) new_search_cy = max( min( self.img_height, self.current_target_state.target_box.y * belta + (1.0 - belta) * y), 0.0) new_search_w = max( 10.0, min( 
self.current_target_state.target_box.width * (1.0 - alpha) + alpha * w, self.img_width)) new_search_h = max( 10.0, min( self.current_target_state.target_box.height * (1.0 - alpha) + alpha * h, self.img_height)) self.current_target_state.target_box = Rectangle( new_search_cx, new_search_cy, new_search_w, new_search_h) self.current_target_state.scale = padded_size( new_search_w, new_search_h) self.current_target_state.ratio = new_search_h * 1.0 / new_search_w #auto increase the search region if max score is lower than the conf_threshold if (scores[max_index] < self.conf_threshold and self.auto_increase): increase_w = min(new_search_w * 1.5, self.img_width) increase_h = min(new_search_h * 1.5, self.img_height) self.current_target_state.search_box = Rectangle( new_search_cx, new_search_cy, increase_w, increase_h) else: self.current_target_state.search_box = self.current_target_state.target_box #save and show tracking process if bSaveImage: cv2.imwrite(SavePath + "/" + os.path.basename(frames[i]), bgr_img) elif self.save_video: x1, y1, x2, y2 = bbox_to_corrdinate( self.current_target_state.search_box) cv2.rectangle(bgr_img, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2) cv2.putText(bgr_img, "%.2f" % (scores[max_index]), (int(x1), int(y1)), 0, 1, (0, 255, 0), 2) self.video.write(bgr_img) elif self.show_video: x1, y1, x2, y2 = bbox_to_corrdinate( self.current_target_state.search_box) cv2.rectangle(bgr_img, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2) cv2.putText(bgr_img, "%.2f" % (scores[max_index]), (int(x1), int(y1)), 0, 1, (0, 255, 0), 2) cv2.imshow("Tracker", bgr_img) cv2.waitKey(10) else: pass post_process_end = time.time() cost_time_dict[ 'post_process'] += post_process_end - post_process_start else: x1, y1, x2, y2 = bbox_to_corrdinate( self.current_target_state.search_box) cv2.rectangle(self.first_frame_image, (int(x1), int(y1)), (int(x2), int(y2)), (255, 255, 255), 2) #cv2.imshow("Tracker",cv2.cvtColor(self.first_frame_image, cv2.COLOR_RGB2BGR)) #cv2.imshow("Target",self.first_frame_image) #cv2.waitKey(100) reported_bbox = convert_bbox_format( self.current_target_state.target_box, 'top-left-based') reported_bboxs.append(reported_bbox) for key in cost_time_dict: cost_time_dict[key] /= len(frames) #print(cost_time_dict) return reported_bboxs
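# --- Illustrative numeric sketch (not part of the tracker): anchor re-scoring.
# The loop above penalizes each candidate box by how much its padded scale and aspect
# ratio deviate from the previous target, then blends the penalized score with a cosine
# window before taking the argmax. The values for penalty_k, window_influence and the
# toy boxes/scores below are assumptions for the demo.
import numpy as np


def padded_size(w, h):
    # Same context padding as above: sqrt((w + p) * (h + p)) with p = (w + h) / 2.
    context = 0.5 * (w + h)
    return np.sqrt((w + context) * (h + context))


penalty_k, window_influence = 0.055, 0.42            # assumed config values
prev_scale, prev_ratio = padded_size(64.0, 120.0), 120.0 / 64.0

widths = np.array([60.0, 70.0, 40.0])                # candidate widths in crop pixels
heights = np.array([118.0, 90.0, 150.0])             # candidate heights in crop pixels
scores = np.array([0.91, 0.88, 0.84])                # raw classification scores
window = np.array([0.9, 0.5, 0.2])                   # cosine-window value per candidate

scale_change = padded_size(widths, heights) / prev_scale
scale_change = np.maximum(scale_change, 1.0 / scale_change)
ratio_change = (heights / widths) / prev_ratio
ratio_change = np.maximum(ratio_change, 1.0 / ratio_change)
pscores = scores * np.exp(-(scale_change * ratio_change - 1) * penalty_k)
wpscores = pscores * (1 - window_influence) + window * window_influence
print(int(np.argmax(wpscores)), wpscores)            # index of the box that wins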
av1 = tf.all_variables()
tracker = Tracker(model, model_config=model_config, track_config=track_config)
for video_dir in video_dirs:
    if not osp.isdir(video_dir):
        logging.warning('{} is not a directory, skipping...'.format(video_dir))
        continue
    video_name = osp.basename(video_dir)
    video_log_dir = "tmp"
    mkdir_p(video_log_dir)
    filenames = sort_nicely(glob(video_dir + '/img/*.jpg'))
    first_line = open(video_dir + '/groundtruth_rect.txt').readline()
    bb = [int(v) for v in first_line.strip().replace(' ', ',').replace('\t', ',').split(',')]
    init_bb = Rectangle(bb[0] - 1, bb[1] - 1, bb[2], bb[3])  # 0-index in python
    print("######{0},{1}".format(video_dir, len(filenames)))
    # =============================================================================
    # for i in range(10):
    #     print("fixed classid: {}".format(i))
    #     trajectory = tracker.track(sess, init_bb, filenames, video_log_dir, str(i))
    #     with open(osp.join(video_log_dir, 'track_rect.txt'), 'w') as f:
    #         for region in trajectory:
    #             rect_str = '{},{},{},{}\n'.format(region.x + 1, region.y + 1,
    #                                               region.width, region.height)
    #             f.write(rect_str)
    #
    #     gt_bboxs = readbbox(osp.join(input_files, 'groundtruth_rect.txt'))
    #     pred_bboxs = readbbox(osp.join(video_log_dir, 'track_rect.txt'))
    #     print("MulSiamFC class --- {0} IOU --- {1}".format(str(i), cal_IOU(pred_bboxs, gt_bboxs)))
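# --- Illustrative helper (assumed equivalent of the parsing done above): OTB
# groundtruth files mix comma, space and tab delimiters and use 1-indexed
# coordinates, so the first line is normalized and shifted before tracking starts.
# The local Rect namedtuple stands in for the repo's Rectangle type.
from collections import namedtuple

Rect = namedtuple('Rect', ['x', 'y', 'width', 'height'])


def parse_first_groundtruth(path):
    """Read the first groundtruth line and return a 0-indexed Rect."""
    with open(path) as f:
        first_line = f.readline()
    fields = first_line.strip().replace(' ', ',').replace('\t', ',').split(',')
    bb = [int(v) for v in fields if v]  # drop empty fields left by doubled delimiters
    return Rect(bb[0] - 1, bb[1] - 1, bb[2], bb[3])  # OTB is 1-indexed


# Example: a file whose first line is "198,214,34,81" yields
# Rect(x=197, y=213, width=34, height=81).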
def track_init(self, first_bbox, first_frame_image_path):
    print(first_frame_image_path)
    first_frame_image = safe_imread(first_frame_image_path)
    self.first_frame_image = cv2.cvtColor(
        first_frame_image,
        cv2.COLOR_BGR2RGB) if self.image_use_rgb else first_frame_image
    self.first_bbox = convert_bbox_format(
        Rectangle(first_bbox[0], first_bbox[1], first_bbox[2], first_bbox[3]),
        'center-based')
    first_image_crop, _, target_size = get_crops(self.first_frame_image,
                                                 self.first_bbox,
                                                 self.z_image_size,
                                                 self.x_image_size, 0.5)
    cx = (self.x_image_size - 1) / 2.0
    cy = (self.x_image_size - 1) / 2.0
    gt_examplar_box = np.array([
        cx - target_size[0] / 2.0, cy - target_size[1] / 2.0,
        cx + target_size[0] / 2.0, cy + target_size[1] / 2.0
    ], np.float32)
    self.img_height, self.img_width, _ = self.first_frame_image.shape

    if self.save_video:
        video_name = first_frame_image_path.split('/')[-3] + '.mp4'
        fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
        result_dir = os.path.join(Project_root, self.track_config['log_dir'])
        if not os.path.exists(result_dir):
            os.makedirs(result_dir)
        video_path = os.path.join(result_dir, video_name)
        print("save video into %s" % (video_path))
        self.video = cv2.VideoWriter(video_path, fourcc, 30,
                                     (self.img_width, self.img_height))

    def center_crop(img, crop_size=127):
        img_shape = np.shape(img)
        center_y = (img_shape[0] - 1) // 2
        center_x = (img_shape[1] - 1) // 2
        h = crop_size
        w = crop_size
        croped_img = img[center_y - h // 2:center_y + h // 2 + 1,
                         center_x - w // 2:center_x + w // 2 + 1]
        assert (croped_img.shape[0] == crop_size)
        return croped_img

    self.first_image_examplar = center_crop(first_image_crop, self.z_image_size)

    shift_y = (self.x_image_size - self.z_image_size) // 2
    shift_x = shift_y
    x1 = gt_examplar_box[0] - shift_x
    y1 = gt_examplar_box[1] - shift_y
    x2 = gt_examplar_box[2] - shift_x
    y2 = gt_examplar_box[3] - shift_y
    self.gt_examplar_boxes = np.reshape(np.array([x1, y1, x2, y2]), [1, 4])
    self.current_target_state = TargetState(bbox=self.first_bbox)
    self.window = np.tile(
        np.outer(np.hanning(self.score_size),
                 np.hanning(self.score_size)).flatten(),
        5)  # 5 is the number of aspect-ratio anchors
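# --- Illustrative sketch (not part of track_init): the motion-prior window.
# track_init() above builds a 2-D Hann window over the score map and tiles it once
# per aspect-ratio anchor, so candidates far from the previous center are
# down-weighted during re-scoring. score_size=17 and num_anchors=5 below are assumed
# example values (the code above hard-codes 5 anchors).
import numpy as np

score_size = 17   # assumed spatial size of the RPN score map
num_anchors = 5   # one window copy per aspect-ratio anchor

hann_2d = np.outer(np.hanning(score_size), np.hanning(score_size))
window = np.tile(hann_2d.flatten(), num_anchors)
print(window.shape)  # (score_size * score_size * num_anchors,) == (1445,)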
def track(self, sess, first_bbox, frames, logdir='/tmp'): """Runs tracking on a single image sequence.""" # Get initial target bounding box and convert to center based bbox = convert_bbox_format(first_bbox, 'center-based') smooth_rate = self.track_config['smooth'] update_interval = self.track_config['update_interval'] feature_balance = self.track_config['feature_balance'] # Feed in the first frame image to set initial state. bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width] input_feed = [frames[0], bbox_feed] frame2crop_scale = self.siamese_model.initialize(sess, input_feed) examplar = self.siamese_model.get_examplar(sess, input_feed) examplar_smooth = examplar st_template = [] for i in range(self.siamese_model.train_config['time_range']): st_template.append(examplar) st_template_np = np.array(st_template) self.siamese_model.update_st_template_step(sess, st_template_np) # Storing target state original_target_height = bbox.height original_target_width = bbox.width search_center = np.array( [get_center(self.x_image_size), get_center(self.x_image_size)]) current_target_state = TargetState(bbox=bbox, search_pos=search_center, scale_idx=int( get_center(self.num_scales))) include_first = get(self.track_config, 'include_first', False) logging.info('Tracking include first -- {}'.format(include_first)) # Set padding for refining search region img = mpimg.imread(frames[0]) context_amount = self.track_config['context_amount'] size_z = self.model_config['z_image_size'] size_x = self.track_config['x_image_size'] padding_h = 10 padding_w = 10 if original_target_height / original_target_width > 2: #2 padding_h = 1.4 #1.4 padding_w = 6 # Run tracking loop reported_bboxs = [] for i, filename in enumerate(frames): if i > 0 or include_first: # We don't really want to process the first image unless intended to do so. 
bbox_feed = [ current_target_state.bbox.y, current_target_state.bbox.x, current_target_state.bbox.height, current_target_state.bbox.width ] input_feed = [filename, bbox_feed] outputs, metadata = self.siamese_model.inference_step( sess, input_feed) search_scale_list = outputs['scale_xs'] response = outputs['response'] response2 = outputs['response2'] response_size = response.shape[1] # Choose the scale whole response map has the highest peak if self.num_scales > 1: response_max = np.max(response2, axis=(1, 2)) penalties = self.track_config['scale_penalty'] * np.ones( (self.num_scales)) current_scale_idx = int(get_center(self.num_scales)) penalties[current_scale_idx] = 1.0 response_penalized = response_max * penalties best_scale = np.argmax(response_penalized) else: best_scale = 0 response = response[best_scale] response2 = response2[best_scale] response = feature_balance * response + ( 1 - feature_balance) * response2 with np.errstate( all='raise'): # Raise error if something goes wrong response = response - np.min(response) response = response / np.sum(response) if self.window is None: window = np.dot( np.expand_dims(np.hanning(response_size), 1), np.expand_dims(np.hanning(response_size), 0)) self.window = window / np.sum(window) # normalize window window_influence = self.track_config['window_influence'] response = (1 - window_influence ) * response + window_influence * self.window # Refine the response base_z_size = np.array([ current_target_state.bbox.height, current_target_state.bbox.width ]) base_z_context_size = base_z_size + context_amount * np.sum( base_z_size) base_s_z = np.sqrt( np.prod(base_z_context_size)) # Canonical size base_scale_z = size_z / base_s_z d_search = (size_x - size_z) / 2.0 base_pad = d_search / base_scale_z base_s_x = base_s_z + 2 * base_pad if base_s_x / current_target_state.bbox.height > padding_h: start_h = np.ceil( response_size * (base_s_x - current_target_state.bbox.height * padding_h) / (2 * base_s_x)) end_h = np.floor(response_size - start_h) start_h = np.int(start_h) end_h = np.int(end_h) response[0:start_h, :] = 0 response[end_h:-1, :] = 0 if base_s_x / current_target_state.bbox.width > padding_w: start_w = np.ceil( response_size * (base_s_x - current_target_state.bbox.width * padding_w) / (2 * base_s_x)) end_w = np.floor(response_size - start_w) start_w = np.int(start_w) end_w = np.int(end_w) response[:, :start_w] = 0 response[:, end_w:] = 0 # Find maximum response r_max, c_max = np.unravel_index(response.argmax(), response.shape) # Convert from crop-relative coordinates to frame coordinates p_coor = np.array([r_max, c_max]) # displacement from the center in instance final representation ... disp_instance_final = p_coor - get_center(response_size) # ... in instance feature space ... upsample_factor = self.track_config['upsample_factor'] disp_instance_feat = disp_instance_final / upsample_factor # ... Avoid empty position ... r_radius = int(response_size / upsample_factor / 2) disp_instance_feat = np.maximum( np.minimum(disp_instance_feat, r_radius), -r_radius) # ... in instance input ... disp_instance_input = disp_instance_feat * self.model_config[ 'embed_config']['stride'] # ... 
in instance original crop (in frame coordinates) disp_instance_frame = disp_instance_input / search_scale_list[ best_scale] # Position within frame in frame coordinates y = current_target_state.bbox.y x = current_target_state.bbox.x y += disp_instance_frame[0] x += disp_instance_frame[1] # Target scale damping and saturation target_scale = current_target_state.bbox.height / original_target_height search_factor = self.search_factors[best_scale] scale_damp = self.track_config[ 'scale_damp'] # damping factor for scale update target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor) target_scale = np.maximum(0.2, np.minimum(5.0, target_scale)) # Some book keeping height = original_target_height * target_scale width = original_target_width * target_scale current_target_state.bbox = Rectangle(x, y, width, height) current_target_state.scale_idx = best_scale current_target_state.search_pos = search_center + disp_instance_input # Update the spatial-temporal template using gcn if i % update_interval == 0: bbox_feed = [ current_target_state.bbox.y, current_target_state.bbox.x, current_target_state.bbox.height, current_target_state.bbox.width ] input_feed = [filename, bbox_feed] current_examplar = self.siamese_model.get_examplar( sess, input_feed) # examplar_smooth[2:4,2:4,:] = current_examplar[2:4,2:4,:] examplar_smooth = current_examplar current_examplar = smooth_rate * examplar_smooth + ( 1 - smooth_rate) * examplar st_template.pop(1) st_template.append(current_examplar) st_template_np = np.array(st_template) self.siamese_model.update_st_template_step( sess, st_template_np) assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \ 'target position in feature space should be no larger than input image size' assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \ 'target position in feature space should be no larger than input image size' if self.log_level > 0: np.save(osp.join(logdir, 'num_frames.npy'), [i + 1]) # Select the image with the highest score scale and convert it to uint8 image_cropped = outputs['image_cropped'][ best_scale].astype(np.uint8) # Note that imwrite in cv2 assumes the image is in BGR format. # However, the cropped image returned by TensorFlow is RGB. # Therefore, we convert color format using cv2.cvtColor imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)), cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR)) np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale]) np.save(osp.join(logdir, 'response{}.npy'.format(i)), response) y_search, x_search = current_target_state.search_pos search_scale = search_scale_list[best_scale] target_height_search = height * search_scale target_width_search = width * search_scale bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search) bbox_search = convert_bbox_format(bbox_search, 'top-left-based') np.save(osp.join(logdir, 'bbox{}.npy'.format(i)), [ bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height ]) reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based') reported_bboxs.append(reported_bbox) return reported_bboxs
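# --- Illustrative numeric sketch (not part of the tracker): peak-to-frame mapping.
# The displacement of the response peak is converted in stages: response map ->
# feature map (divide by upsample_factor, clamp to the crop radius) -> search-crop
# pixels (multiply by the embedding stride) -> frame pixels (divide by the search
# scale). All numeric values below are assumed examples.
import numpy as np

response_size = 272       # assumed upsampled response size
upsample_factor = 16      # assumed track_config['upsample_factor']
stride = 8                # assumed embed_config['stride']
search_scale = 0.45       # assumed scale_xs entry for the chosen scale
r_max, c_max = 150, 120   # hypothetical peak location in the response map

center = (response_size - 1) / 2.0
disp_final = np.array([r_max, c_max], dtype=np.float64) - center   # response units
disp_feat = disp_final / upsample_factor                           # feature-map units
r_radius = int(response_size / upsample_factor / 2)
disp_feat = np.clip(disp_feat, -r_radius, r_radius)                # stay inside the crop
disp_input = disp_feat * stride                                    # search-crop pixels
disp_frame = disp_input / search_scale                             # frame pixels
print(disp_frame)  # added to the previous (y, x) target center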
def track(self, sess, first_bbox, frames, logdir='/tmp'): """Runs tracking on a single image sequence.""" # Get initial target bounding box and convert to center based bbox = convert_bbox_format(first_bbox, 'center-based') # Feed in the first frame image to set initial state. bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width] input_feed = [frames[0], bbox_feed] frame2crop_scale = self.siamese_model.initialize(sess, input_feed) # Storing target state original_target_height = bbox.height original_target_width = bbox.width search_center = np.array( [get_center(self.x_image_size), get_center(self.x_image_size)]) current_target_state = TargetState(bbox=bbox, search_pos=search_center, scale_idx=int( get_center(self.num_scales))) include_first = get(self.track_config, 'include_first', False) logging.info('Tracking include first -- {}'.format(include_first)) # Run tracking loop reported_bboxs = [] for i, filename in enumerate(frames): if i > 0 or include_first: # We don't really want to process the first image unless intended to do so. bbox_feed = [ current_target_state.bbox.y, current_target_state.bbox.x, current_target_state.bbox.height, current_target_state.bbox.width ] input_feed = [filename, bbox_feed] outputs, metadata = self.siamese_model.inference_step( sess, input_feed) search_scale_list = outputs['scale_xs'] response = outputs['response'] response_size = response.shape[1] # Choose the scale whole response map has the highest peak if self.num_scales > 1: response_max = np.max(response, axis=(1, 2)) penalties = self.track_config['scale_penalty'] * np.ones( (self.num_scales)) current_scale_idx = int(get_center(self.num_scales)) penalties[current_scale_idx] = 1.0 response_penalized = response_max * penalties best_scale = np.argmax(response_penalized) if np.max(response_max) < 0: logging.warning('MAX_RESPONSE LESS THAN ZERO!') # best_scale = current_scale_idx else: best_scale = 0 response = response[best_scale] with np.errstate( all='raise'): # Raise error if something goes wrong response = response - np.min(response) response = response / np.sum(response) if self.window is None: window = np.dot( np.expand_dims(np.hanning(response_size), 1), np.expand_dims(np.hanning(response_size), 0)) self.window = window / np.sum(window) # normalize window window_influence = self.track_config['window_influence'] response = (1 - window_influence ) * response + window_influence * self.window # Find maximum response r_max, c_max = np.unravel_index(response.argmax(), response.shape) # Convert from crop-relative coordinates to frame coordinates p_coor = np.array([r_max, c_max]) # displacement from the center in instance final representation ... disp_instance_final = p_coor - get_center(response_size) # ... in instance feature space ... upsample_factor = self.track_config['upsample_factor'] disp_instance_feat = disp_instance_final / upsample_factor # ... Avoid empty position ... r_radius = int(response_size / upsample_factor / 2) disp_instance_feat = np.maximum( np.minimum(disp_instance_feat, r_radius), -r_radius) # ... in instance input ... disp_instance_input = disp_instance_feat * self.model_config[ 'embed_config']['stride'] # ... 
in instance original crop (in frame coordinates) disp_instance_frame = disp_instance_input / search_scale_list[ best_scale] # Position within frame in frame coordinates y = current_target_state.bbox.y x = current_target_state.bbox.x y += disp_instance_frame[0] x += disp_instance_frame[1] # Target scale damping and saturation target_scale = current_target_state.bbox.height / original_target_height search_factor = self.search_factors[best_scale] scale_damp = self.track_config[ 'scale_damp'] # damping factor for scale update target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor) target_scale = np.maximum(0.2, np.minimum(5.0, target_scale)) # Some book keeping height = original_target_height * target_scale width = original_target_width * target_scale current_target_state.bbox = Rectangle(x, y, width, height) current_target_state.scale_idx = best_scale current_target_state.search_pos = search_center + disp_instance_input assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \ 'target position in feature space should be no larger than input image size' assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \ 'target position in feature space should be no larger than input image size' if self.log_level > 0: np.save(osp.join(logdir, 'num_frames.npy'), [i + 1]) # Select the image with the highest score scale and convert it to uint8 image_cropped = outputs['image_cropped'][ best_scale].astype(np.uint8) # Note that imwrite in cv2 assumes the image is in BGR format. # However, the cropped image returned by TensorFlow is RGB. # Therefore, we convert color format using cv2.cvtColor imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)), cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR)) np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale]) np.save(osp.join(logdir, 'response{}.npy'.format(i)), response) y_search, x_search = current_target_state.search_pos search_scale = search_scale_list[best_scale] target_height_search = height * search_scale target_width_search = width * search_scale bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search) bbox_search = convert_bbox_format(bbox_search, 'top-left-based') np.save(osp.join(logdir, 'bbox{}.npy'.format(i)), [ bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height ]) reported_bbox = convert_bbox_format(current_target_state.bbox, 'top-left-based') reported_bboxs.append(reported_bbox) return reported_bboxs
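# --- Illustrative numeric sketch (not part of the tracker): scale selection and damping.
# track() above picks the scale whose response peak is highest after a small penalty on
# non-central scales, then updates the target scale with damping and clamps it to
# [0.2, 5.0]. scale_penalty, scale_damp and the scale step below are assumed examples.
import numpy as np

num_scales = 3
scale_step = 1.0375                        # assumed multiplicative step between scales
search_factors = [scale_step ** s for s in range(-(num_scales // 2),
                                                 num_scales // 2 + 1)]

response_max = np.array([0.28, 0.31, 0.34])      # per-scale response peaks
penalties = 0.97 * np.ones(num_scales)           # assumed track_config['scale_penalty']
penalties[num_scales // 2] = 1.0                 # the current scale is not penalized
best_scale = int(np.argmax(response_max * penalties))

scale_damp = 0.6                                 # assumed track_config['scale_damp']
target_scale = 1.0                               # current height / original height
target_scale *= (1 - scale_damp) + scale_damp * search_factors[best_scale]
target_scale = float(np.clip(target_scale, 0.2, 5.0))
print(best_scale, target_scale)                  # -> 2 and roughly 1.0225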
def main(checkpoint, input_files):
    os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu()
    model_config, _, track_config = load_cfgs(checkpoint)
    track_config['log_level'] = 1

    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        restore_fn = model.build_graph_from_config(model_config, track_config,
                                                   checkpoint)
    g.finalize()

    if not osp.isdir(track_config['log_dir']):
        logging.info('Creating inference directory: %s', track_config['log_dir'])
        mkdir_p(track_config['log_dir'])

    video_dirs = []
    for file_pattern in input_files.split(","):
        video_dirs.extend(glob(file_pattern))
    logging.info("Running tracking on %d videos matching %s", len(video_dirs),
                 input_files)

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)
    with tf.Session(graph=g, config=sess_config) as sess:
        restore_fn(sess)
        tracker = Tracker(model, model_config=model_config, track_config=track_config)

        for video_dir in video_dirs:
            if not osp.isdir(video_dir):
                logging.warning('{} is not a directory, skipping...'.format(video_dir))
                continue
            video_name = osp.basename(video_dir)
            video_log_dir = osp.join(track_config['log_dir'], video_name)
            mkdir_p(video_log_dir)

            filenames = sort_nicely(glob(video_dir + '/img/*.jpg') +
                                    glob(video_dir + '/img/*.png'))
            first_line = open(video_dir + '/groundtruth_rect.txt').readline()
            bb = [int(v) for v in first_line.strip().split(',')]
            init_bb = Rectangle(bb[0] - 1, bb[1] - 1, bb[2], bb[3])  # 0-index in python

            trajectory = tracker.track(sess, init_bb, filenames, video_log_dir)
            with open(osp.join(video_log_dir, 'track_rect.txt'), 'w') as f:
                for region in trajectory:
                    rect_str = '{},{},{},{}\n'.format(region.x + 1, region.y + 1,
                                                      region.width, region.height)
                    f.write(rect_str)

            # Visualize every candidate box dumped by the tracker into bboxes.json.
            with open(osp.join(video_log_dir, 'bboxes.json'), 'r') as f:
                data = json.load(f)
            final_output = {}
            out_folder = osp.join(video_log_dir, "Outputs")
            mkdir_p(out_folder)
            for i, fname in enumerate(data.keys()):
                # PIL loads RGB; convert to BGR so OpenCV drawing and IO see the
                # expected channel order.
                img = np.array(Image.open(fname).convert('RGB'))
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
                bboxes = [[float(v) for v in line.strip().split(',')]
                          for line in data[fname]]
                arr = []
                for x, y, w, h in bboxes:
                    ymin, xmin, ymax, xmax = int(y), int(x), int(y + h), int(x + w)
                    img = cv2.rectangle(img, (xmin, ymin), (xmax, ymax),
                                        (0, 255, 0, 255), 2)
                    arr.append([ymin, xmin, ymax, xmax])
                final_output[fname] = arr
                name = osp.splitext(osp.basename(fname))[0]
                H, W, _ = img.shape  # shape is (height, width, channels)
                cv2.imshow("Pic", cv2.resize(img, (W // 2, H // 2)))
                cv2.waitKey(0)
                cv2.imwrite(osp.join(out_folder, f"{name}_bbox.png"), img)
            with open(osp.join(out_folder, "output.json"), "w") as f:
                json.dump(final_output, f, indent=4)
            cv2.destroyAllWindows()
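# --- Illustrative evaluation sketch: the commented-out branch earlier in this file
# scores track_rect.txt against groundtruth_rect.txt with readbbox/cal_IOU helpers
# that are not shown here, so the functions below re-implement the obvious versions.
# The example paths at the bottom are assumptions.
import numpy as np


def read_bboxes(path):
    """Load x,y,w,h rows (comma, space or tab separated) into an (N, 4) array."""
    rows = []
    with open(path) as f:
        for line in f:
            line = line.strip().replace('\t', ',').replace(' ', ',')
            if line:
                rows.append([float(v) for v in line.split(',') if v])
    return np.array(rows)


def mean_iou(pred, gt):
    """Mean IoU between two (N, 4) arrays of x,y,w,h boxes."""
    x1 = np.maximum(pred[:, 0], gt[:, 0])
    y1 = np.maximum(pred[:, 1], gt[:, 1])
    x2 = np.minimum(pred[:, 0] + pred[:, 2], gt[:, 0] + gt[:, 2])
    y2 = np.minimum(pred[:, 1] + pred[:, 3], gt[:, 1] + gt[:, 3])
    inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
    union = pred[:, 2] * pred[:, 3] + gt[:, 2] * gt[:, 3] - inter
    return float(np.mean(inter / np.maximum(union, 1e-12)))


# Example usage (paths are illustrative):
# print(mean_iou(read_bboxes('Logs/SiamFC/Basketball/track_rect.txt'),
#                read_bboxes('Basketball/groundtruth_rect.txt')))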