Example #1
def run_SiamRPN_OPF(seq, rp, bSaveImage):
    os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu()
    config_name = "SiamRPN_ftall"
    CHECKPOINT = '/home/lab-xiong.jiangfeng/Projects/SiameseRPN/Logs/%s/track_model_checkpoints/%s' % (
        config_name, config_name)
    logging.info('Evaluating {}...'.format(CHECKPOINT))

    # Read configurations from json
    model_config, _, track_config = load_cfgs(CHECKPOINT)
    track_config['log_level'] = 0  # Skip verbose logging for speed

    np.random.seed(1234)
    tf.set_random_seed(1234)
    g = tf.Graph()

    with g.as_default():
        model = get_model(model_config['Model'])(model_config=model_config,
                                                 mode='inference')
        model.build(reuse=tf.AUTO_REUSE)
        model.online_net = OnlineNet(online_config,
                                     is_training=True,
                                     reuse=False)
        model.online_valnet = OnlineNet(online_config,
                                        is_training=False,
                                        reuse=True)
        global_variables_init_op = tf.global_variables_initializer()

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)
    sess_config.gpu_options.per_process_gpu_memory_fraction = 0.2

    with tf.Session(graph=g, config=sess_config) as sess:
        sess.run(global_variables_init_op)
        model.restore_weights_from_checkpoint(sess, 605000)
        tracker = OnlineTracker(sess,
                                model,
                                track_config,
                                online_config,
                                show_video=0)

        tic = time.time()
        frames = seq.s_frames
        init_rect = seq.init_rect
        x, y, width, height = init_rect  # OTB format
        init_bb = Rectangle(x - 1, y - 1, width, height)
        trajectory_py = tracker.track(init_bb, frames, bSaveImage, rp)
        #print(trajectory_py)
        trajectory = [
            Rectangle(val.x + 1, val.y + 1, val.width, val.height)
            for val in trajectory_py
        ]  # x, y add one to match OTB format
        duration = time.time() - tic

        result = dict()
        result['res'] = trajectory
        result['type'] = 'rect'
        result['fps'] = round(seq.len / duration, 3)
    return result
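All of the run_* entry points in this listing follow the same OTB wrapper contract: they take a sequence descriptor exposing s_frames, init_rect and len, plus a result path and a save flag, and return a dict with 'res', 'type' and 'fps'. A minimal sketch of driving such an entry point, assuming the example's module-level setup is in place and using a hypothetical Sequence container (the real benchmark supplies its own object with these attributes):

from collections import namedtuple

# Hypothetical stand-in for the benchmark's sequence object; the real OTB
# harness passes an object exposing the same attributes.
Sequence = namedtuple('Sequence', ['s_frames', 'init_rect', 'len'])

seq = Sequence(s_frames=['video/img/0001.jpg', 'video/img/0002.jpg'],
               init_rect=(10, 20, 50, 80),  # OTB format: x, y, width, height (1-based)
               len=2)
result = run_SiamRPN_OPF(seq, rp='results/', bSaveImage=False)
print(result['type'], result['fps'], len(result['res']))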
Example #2
def run_SA_Siam(seq, rp, bSaveImage, epoch=30):
    iter_ckpt = epoch * 6650 - 1
    checkpoint_appearance_path = CHECKPOINT_APPEARANCE.format(
        iter_ckpt=iter_ckpt)
    logging.info('Evaluating {}...'.format(checkpoint_appearance_path))
    checkpoint_semantic_path = CHECKPOINT_SEMANTIC.format(iter_ckpt=iter_ckpt)
    logging.info('Evaluating {}...'.format(checkpoint_semantic_path))

    # Read configurations from json
    model_config, _, track_config = load_cfgs(CHECKPOINT_SA_SIAM)

    track_config['log_level'] = 0  # Skip verbose logging for speed

    # Build the inference graph.
    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        model.build_model(model_config, track_config)
        saver_loader_semantic = get_saver('',
                                          removes=[':0', '_semantic'],
                                          excepts=['appearance', 'State'])
        saver_loader_appearance = get_saver('',
                                            removes=[':0', '_appearance'],
                                            excepts=['semantic', 'State'])
    g.finalize()

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(graph=g, config=sess_config) as sess:
        # Load the model from checkpoint.
        # restore_fn(sess)
        saver_loader_semantic.restore(sess, checkpoint_semantic_path)
        saver_loader_appearance.restore(sess, checkpoint_appearance_path)

        tracker = Tracker(model, model_config, track_config)

        tic = time.time()
        frames = seq.s_frames
        init_rect = seq.init_rect
        x, y, width, height = init_rect  # OTB format
        init_bb = Rectangle(x - 1, y - 1, width, height)

        trajectory_py = tracker.track(sess, init_bb, frames)
        trajectory = [
            Rectangle(val.x + 1, val.y + 1, val.width, val.height)
            for val in trajectory_py
        ]  # x, y add one to match OTB format
        duration = time.time() - tic

        result = dict()
        result['res'] = trajectory
        result['type'] = 'rect'
        result['fps'] = round(seq.len / duration, 3)
        return result
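Each wrapper shifts the initial box by minus one and the reported boxes by plus one because OTB annotations are 1-based while the trackers work with 0-based pixel coordinates. A small sketch of that round trip, assuming Rectangle is the plain (x, y, width, height) tuple type used throughout these examples:

from collections import namedtuple

# Assumed to mirror the Rectangle type used in these examples.
Rectangle = namedtuple('Rectangle', ['x', 'y', 'width', 'height'])

def otb_to_python(rect):
    # OTB ground truth is 1-based; subtract one for 0-based indexing.
    return Rectangle(rect[0] - 1, rect[1] - 1, rect[2], rect[3])

def python_to_otb(rect):
    # Add one back so the reported trajectory matches the OTB convention.
    return Rectangle(rect.x + 1, rect.y + 1, rect.width, rect.height)

init_bb = otb_to_python((25, 40, 100, 60))
assert python_to_otb(init_bb) == Rectangle(25, 40, 100, 60)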
Example #3
def run_iSiam_otb(seq, rp, bSaveImage):
    checkpoint_path = CHECKPOINT
    logging.info('Evaluating {}...'.format(checkpoint_path))

    # Read configurations from json
    model_config, _, track_config = load_cfgs(checkpoint_path)

    track_config['log_level'] = 1  # Enable per-frame logging output
    track_config['scale_step'] = 1.021  # 1.023*, 1.021*
    track_config['scale_damp'] = 1.
    track_config['window_influence'] = 0.21  # 0.21*
    #track_config['x_image_size'] = 273

    # Build the inference graph.
    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        restore_fn = model.build_graph_from_config(model_config, track_config,
                                                   checkpoint_path)
    g.finalize()

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(graph=g, config=sess_config) as sess:
        # Load the model from checkpoint.
        restore_fn(sess)
        tracker = Tracker(model, model_config, track_config)
        tic = time.time()
        frames = seq.s_frames
        init_rect = seq.init_rect
        x, y, width, height = init_rect  # OTB format
        init_bb = Rectangle(x - 1, y - 1, width, height)
        #init_bb = Rectangle(x, y, width, height)

        first_name = frames[0]
        first_split = first_name.split('/')
        dir_name = os.path.join(
            '/home/william/tracker_benchmark/results/samples', first_split[-3])
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)

        trajectory_py = tracker.track(sess, init_bb, frames, logdir=dir_name)
        trajectory = [
            Rectangle(val.x + 1, val.y + 1, val.width, val.height)
            for val in trajectory_py
        ]  # x, y add one to match OTB format
        duration = time.time() - tic

        result = dict()
        result['res'] = trajectory
        result['type'] = 'rect'
        result['fps'] = round(seq.len / duration, 3)
        return result
Example #4
def main(checkpoint, input_files):
    os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu()

    model_config, _, track_config = load_cfgs(checkpoint)
    track_config['log_level'] = 1
    track_config["is_video"] = False

    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        restore_fn = model.build_graph_from_config(model_config, track_config,
                                                   checkpoint)
    g.finalize()

    if not osp.isdir(track_config['log_dir']):
        logging.info('Creating inference directory: %s',
                     track_config['log_dir'])
        mkdir_p(track_config['log_dir'])

    video_dirs = []
    for file_pattern in input_files.split(","):
        video_dirs.extend(glob(file_pattern))
    logging.info("Running tracking on %d videos matching %s", len(video_dirs),
                 input_files)

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(graph=g, config=sess_config) as sess:
        restore_fn(sess)

        tracker = Tracker(model,
                          model_config=model_config,
                          track_config=track_config)

        for video_dir in video_dirs:
            if not osp.isdir(video_dir):
                logging.warning(
                    '{} is not a directory, skipping...'.format(video_dir))
                continue

            video_name = osp.basename(video_dir)
            video_log_dir = osp.join(track_config['log_dir'], video_name)
            mkdir_p(video_log_dir)

            filenames = sort_nicely(glob(video_dir + '/img/*.jpg'))
            first_line = open(video_dir + '/groundtruth_rect.txt').readline()
            bb = [int(v) for v in first_line.strip().split(',')]
            # Rectangle: [x,y,width,height]
            init_bb = Rectangle(bb[0] - 1, bb[1] - 1, bb[2],
                                bb[3])  # 0-index in python

            trajectory = tracker.track(sess, init_bb, filenames, video_log_dir)
            with open(osp.join(video_log_dir, 'track_rect.txt'), 'w') as f:
                for region in trajectory:
                    rect_str = '{},{},{},{}\n'.format(region.x + 1,
                                                      region.y + 1,
                                                      region.width,
                                                      region.height)
                    f.write(rect_str)
Example #5
def parser_txt_anno(video_dir, video_id, txt_anno, track_save_dir):
    suffix = ".jpg"

    if (len(os.listdir(track_save_dir)) == len(os.listdir(video_dir))): return

    with open(txt_anno, 'r') as f:
        for index, line in enumerate(f):
            img_name = str(index)
            img_file = os.path.join(video_dir, img_name + suffix)
            #assert os.path.exists(img_file),img_file
            if not os.path.exists(img_file):
                continue

            img = None
            img = imread(img_file)

            line_list = line.split(",")

            bbox = [float(x) for x in line_list]

            target_box = convert_bbox_format(Rectangle(*bbox), 'center-based')
            crop, scale, new_sizes = get_crops(img,
                                               target_box,
                                               size_z=127,
                                               size_x=255,
                                               context_amount=0.5)

            savename = osp.join(
                track_save_dir,
                '{}.w.{}.h.{}.jpg'.format(img_name, int(np.rint(new_sizes[0])),
                                          int(np.rint(new_sizes[1]))))
            if osp.exists(savename):
                continue
            imwrite(savename, crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
Example #6
    def __init__(self, model, sess, image, selection, model_config, track_config):

        selection = Rectangle(selection.x, selection.y, selection.width, selection.height)

        self.sess = sess
        self._tracker = Tracker(model, model_config=model_config, track_config=track_config)
        self._tracker.track_init(sess, selection, image)
Example #7
def run_MFST(seq, rp, bSaveImage):
  checkpoint_path = CHECKPOINT
  logging.info('Evaluating {}...'.format(checkpoint_path))

  # Read configurations from json
  model_config, _, track_config = load_cfgs(checkpoint_path)

  track_config['log_level'] = 0  # Skip verbose logging for speed

  # Build the inference graph.
  g = tf.Graph()
  with g.as_default():
    model = inference_wrapper.InferenceWrapper()
    restore_fn = model.build_graph_from_config(model_config, track_config, checkpoint_path)
  #g.finalize()

  gpu_options = tf.GPUOptions(allow_growth=True)
  sess_config = tf.ConfigProto(gpu_options=gpu_options)

  with tf.Session(graph=g, config=sess_config) as sess:
    ## used for initializing AlexNet parameters
    init_global = tf.global_variables_initializer()
    sess.run(init_global)
    
    ## the global initializer must be run before restore
    # Load the model from checkpoint.
    restore_fn(sess)

    tracker = Tracker(model, model_config, track_config)

    tic = time.time()
    frames = seq.s_frames
    init_rect = seq.init_rect
    x, y, width, height = init_rect  # OTB format
    init_bb = Rectangle(x - 1, y - 1, width, height)

    trajectory_py = tracker.track(sess, init_bb, frames)
    trajectory = [Rectangle(val.x + 1, val.y + 1, val.width, val.height) for val in
                  trajectory_py]  # x, y add one to match OTB format
    duration = time.time() - tic

    result = dict()
    result['res'] = trajectory
    result['type'] = 'rect'
    result['fps'] = round(seq.len / duration, 3)
    return result
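run_MFST runs the global initializer before restoring because part of its graph (the AlexNet branch) is not covered by the checkpoint; initializing first and restoring afterwards lets the checkpoint overwrite only the variables it actually contains. A bare-bones sketch of that TF1 ordering, with a placeholder checkpoint path:

import tensorflow as tf  # TF1.x graph-mode API, as used throughout this listing

g = tf.Graph()
with g.as_default():
    v = tf.get_variable('w', shape=[2], initializer=tf.zeros_initializer())
    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver()

with tf.Session(graph=g) as sess:
    sess.run(init_op)                        # initialize every variable first...
    # saver.restore(sess, '/path/to/ckpt')   # ...then restore the checkpointed subset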
Example #8
def parser_txt_anno(video_dir, video_id, txt_anno, track_save_dir):
  if not osp.exists(track_save_dir):
    os.makedirs(track_save_dir)
    have_croped_list = []
  else:
    count = count_out_of_view_frame(txt_anno)
    saved_list = glob(track_save_dir+"/*.jpg")
    crop_imgs_size = len(saved_list)
    origin_img_size = len(glob(video_dir+"/*.jpg"))
    if((crop_imgs_size+count)==origin_img_size):
      print("video already croped, skip this video")
      return
    else:
      print("crop_imgs_size: %d, origin_img_size: %d, out-of-view: %d"%(crop_imgs_size, origin_img_size, count))
    #return
    have_croped_list = [ i.split('.')[0]+'.jpg' for i in saved_list]

  img_files = glob(video_dir+"/*.jpg")
  img_files.sort()

  with open(txt_anno,'r') as f:
    for index, line in enumerate(tqdm(f)):
      if img_files[index] in have_croped_list:
        print("img %s has been croped, skip"%(img_files[index]))
        continue
      img = None
      img = imread(img_files[index])
      if isinstance(img, type(None)):
        continue

      line_list = line.split(",")
      bbox = [int(float(x)) for x in line_list]

      #skip out-of-view frames
      if bbox[2]==0 or bbox[3]==0:
        print("found out-of-view frame, skip this frame")
        continue

      #convert from 1-based to 0-based
      bbox[0] = bbox[0]-1
      bbox[1] = bbox[1]-1

      target_box = convert_bbox_format(Rectangle(*bbox), 'center-based')
      #target_box = Rectangle(*bbox)
      if target_box.width<=0 or target_box.height<=0:
        print("target_box error in",txt_anno, index)
        continue 

      crop, scale,new_sizes = get_crops(img, target_box,
                            size_z=127, size_x=255,
                            context_amount=0.5)
      img_id = img_files[index].split('/')[-1].split('.')[0]
      savename = osp.join(track_save_dir, '{}.w.{}.h.{}.jpg'.format(img_id,int(np.rint(new_sizes[0])),int(np.rint(new_sizes[1]))))
      #print(savename)
      if osp.exists(savename):
        continue
      imwrite(savename, crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
Example #9
def run_SiamRPN(seq, rp, bSaveImage):
    CHECKPOINT = '/home/lab-xiong.jiangfeng/Projects/SiameseRPN/Logs/%s/track_model_checkpoints/%s' % (
        tracker_name, tracker_name)
    logging.info('Evaluating {}...'.format(CHECKPOINT))
    # Read configurations from json
    model_config, _, track_config = load_cfgs(CHECKPOINT)
    track_config['log_level'] = 0  # Skip verbose logging for speed

    g = tf.Graph()
    with g.as_default():
        model = SiamRPN(model_config=model_config, mode='inference')
        model.build(reuse=tf.AUTO_REUSE)
        global_variables_init_op = tf.global_variables_initializer()

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(graph=g, config=sess_config) as sess:
        sess.run(global_variables_init_op)
        model.restore_weights_from_checkpoint(sess)
        tracker = Tracker(sess, model, track_config)

        tic = time.time()
        frames = seq.s_frames
        init_rect = seq.init_rect
        x, y, width, height = init_rect  # OTB format
        init_bb = Rectangle(x - 1, y - 1, width, height)

        trajectory_py = tracker.track(init_bb, frames, bSaveImage, rp)

        trajectory = [
            Rectangle(val.x + 1, val.y + 1, val.width, val.height)
            for val in trajectory_py
        ]  # x, y add one to match OTB format
        duration = time.time() - tic

        result = dict()
        result['res'] = trajectory
        result['type'] = 'rect'
        result['fps'] = round(seq.len / duration, 3)
    return result
Example #10
def process_split(root_dir, save_dir, split, subdir=''):
  data_dir = osp.join(root_dir, 'Data', 'VID', split)
  anno_dir = osp.join(root_dir, 'Annotations', 'VID', split, subdir)
  video_names = os.listdir(anno_dir)

  for idx, video in enumerate(video_names):
    print('{split}-{subdir} ({idx}/{total}): Processing {video}...'.format(split=split, subdir=subdir,
                                                                           idx=idx, total=len(video_names),
                                                                           video=video))
    video_path = osp.join(anno_dir, video)
    xml_files = glob(osp.join(video_path, '*.xml'))

    for xml in xml_files:
      tree = ET.parse(xml)
      root = tree.getroot()

      folder = root.find('folder').text
      filename = root.find('filename').text

      # Read image
      img_file = osp.join(data_dir, folder, filename + '.JPEG')
      img = None

      # Get all object bounding boxes
      bboxs = []
      for object in root.iter('object'):
        bbox = object.find('bndbox')
        xmax = float(bbox.find('xmax').text)
        xmin = float(bbox.find('xmin').text)
        ymax = float(bbox.find('ymax').text)
        ymin = float(bbox.find('ymin').text)
        width = xmax - xmin + 1
        height = ymax - ymin + 1
        bboxs.append([xmin, ymin, width, height])

      for idx, object in enumerate(root.iter('object')):
        id = object.find('trackid').text
        class_name = object.find('name').text

        track_save_dir = get_track_save_directory(save_dir, 'train', subdir, video)
        mkdir_p(track_save_dir)
        savename = osp.join(track_save_dir, '{}.{:02d}.crop.x.jpg'.format(filename, int(id)))
        if osp.isfile(savename): continue  # skip existing images

        if img is None:
          img = imread(img_file)

        # Get crop
        target_box = convert_bbox_format(Rectangle(*bboxs[idx]), 'center-based')
        crop, _ = get_crops(img, target_box, size_z=127, size_x=255, context_amount=0.01)

        imwrite(savename, crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
Example #11
def process_split(root_dir, save_dir, split):
  data_dir = osp.join(root_dir, split)
  video_names = os.listdir(data_dir)
  video_names = [vn for vn in video_names if '.txt' not in vn]

  for idx, video in enumerate(video_names):
    print('{split} ({idx}/{total}): Processing {video}...'.format(split=split,
                                                                  idx=idx, total=len(video_names),
                                                                  video=video))
    video_path = osp.join(data_dir, video)
    jpg_files = glob(osp.join(video_path, '*.jpg'))
    
    with open(osp.join(video_path, 'groundtruth.txt')) as f:
      ann_content = f.readlines()

    for jpg in jpg_files:
      # Read image
      img_file = jpg.split('/')[-1]
      img = None

      # Get all object bounding boxes
      jpgidx = img_file.split('.')[0]
      jpgidx = int(jpgidx) - 1
      ann = ann_content[jpgidx]
      ann = ann.strip()
      bbox = ann.split(',')
      bbox = [int(float(bb)) for bb in bbox]  # [xmin, ymin, w, h]

      track_save_dir = osp.join(save_dir, split, video)
      mkdir_p(track_save_dir)
      savename = osp.join(track_save_dir, '{}.crop.x.jpg'.format(img_file))
      
      if osp.isfile(savename): 
        try:
          im = Image.open(savename)
          continue  # skip existing images
        except IOError:
          os.remove(savename)
          
      if img is None:
        img = imread(jpg)
        
      # Get crop
      target_box = convert_bbox_format(Rectangle(*bbox), 'center-based')
      crop, _ = get_crops(img, target_box,
                          size_z=127, size_x=255,
                          context_amount=0.5)
      imwrite(savename, crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
Example #12
    def init(self, sess, frame, first_bbox, logdir='/tmp'):
        # Get initial target bounding box and convert to center based
        self.i = 0
        first_bbox = Rectangle(first_bbox[0], first_bbox[1], first_bbox[2],
                               first_bbox[3])
        bbox = convert_bbox_format(first_bbox, 'center-based')

        # Feed in the first frame image to set initial state.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        input_feed = [
            frame, bbox_feed, self.x_image_size_init, self.search_factors_init
        ]
        frame2crop_scale, self.image_z = self.siamese_model.initialize(
            sess, input_feed)
        imwrite(osp.join(logdir, 'aimagez.jpg'),
                cv2.cvtColor(self.image_z, cv2.COLOR_RGB2BGR))

        # Storing target state
        self.original_target_height = bbox.height
        self.original_target_width = bbox.width
        self.search_center = np.array([
            get_center(self.x_image_size_init),
            get_center(self.x_image_size_init)
        ])
        self.current_target_state = TargetState(
            bbox=bbox,
            search_pos=self.search_center,
            scale_idx=int(get_center(self.num_scales)))

        self.store_thresh = 0.9
        self.conf_thresh = 0.7
        self.bound_thresh = 0.5
        self.sup_thresh = 0.1

        self.mem_count = 0
        self.update_delay = 0
        self.lost = 0
        self.x_image_size = self.x_image_size_init
        self.image_c = None
        self.moved2border = False
        self.prev_score = self.conf_thresh + 0.01
        return True
Example #13
def parser_xml_anno(img_file, xml_anno, track_save_dir):
    tree = ET.parse(xml_anno)
    root = tree.getroot()

    img = None

    # Get all object bounding boxes
    bboxs = []
    for object in root.iter('object'):
        bbox = object.find('bndbox')
        xmax = float(bbox.find('xmax').text)
        xmin = float(bbox.find('xmin').text)
        ymax = float(bbox.find('ymax').text)
        ymin = float(bbox.find('ymin').text)
        width = xmax - xmin + 1
        height = ymax - ymin + 1
        bboxs.append([xmin, ymin, width, height])

    for idx, object in enumerate(root.iter('object')):
        #id = object.find('trackid').text
        if img is None:
            img = cv2.imread(img_file)
        target_box = convert_bbox_format(Rectangle(*bboxs[idx]),
                                         'center-based')

        crop, scale, new_sizes = get_crops(img,
                                           target_box,
                                           size_z=127,
                                           size_x=255,
                                           context_amount=0.5)

        index_sub = "_" + str(idx) if idx > 0 else ""
        save_dir = track_save_dir + index_sub
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        savename = os.path.join(
            save_dir, '0.w.{}.h.{}.jpg'.format(int(np.rint(new_sizes[0])),
                                               int(np.rint(new_sizes[1]))))
        if osp.exists(savename):
            continue
        imwrite(savename, crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
Example #14
    def track(self, sess, frame):
        bbox_feed = [
            self.current_target_state.bbox.y, self.current_target_state.bbox.x,
            self.current_target_state.bbox.height,
            self.current_target_state.bbox.width
        ]
        input_feed = [frame, bbox_feed]

        outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
        search_scale_list = outputs['scale_xs']
        response = outputs['response']
        response_size = response.shape[1]

        # Choose the scale whose response map has the highest peak
        if self.num_scales > 1:
            response_max = np.max(response, axis=(1, 2))
            penalties = self.track_config['scale_penalty'] * np.ones(
                (self.num_scales))
            current_scale_idx = int(get_center(self.num_scales))
            penalties[current_scale_idx] = 1.0
            response_penalized = response_max * penalties
            best_scale = np.argmax(response_penalized)
        else:
            best_scale = 0

        response = response[best_scale]

        with np.errstate(all='raise'):  # Raise error if something goes wrong
            response = response - np.min(response)
            response = response / np.sum(response)

        if self.window is None:
            window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                            np.expand_dims(np.hanning(response_size), 0))
            self.window = window / np.sum(window)  # normalize window
        window_influence = self.track_config['window_influence']
        response = (
            1 - window_influence) * response + window_influence * self.window

        # Find maximum response
        r_max, c_max = np.unravel_index(response.argmax(), response.shape)

        # Convert from crop-relative coordinates to frame coordinates
        p_coor = np.array([r_max, c_max])
        # displacement from the center in instance final representation ...
        disp_instance_final = p_coor - get_center(response_size)
        # ... in instance feature space ...
        upsample_factor = self.track_config['upsample_factor']
        disp_instance_feat = disp_instance_final / upsample_factor
        # ... Avoid empty position ...
        r_radius = int(response_size / upsample_factor / 2)
        disp_instance_feat = np.maximum(
            np.minimum(disp_instance_feat, r_radius), -r_radius)
        # ... in instance input ...
        disp_instance_input = disp_instance_feat * self.model_config[
            'embed_config']['stride']
        # ... in instance original crop (in frame coordinates)
        disp_instance_frame = disp_instance_input / search_scale_list[
            best_scale]
        # Position within frame in frame coordinates
        y = self.current_target_state.bbox.y
        x = self.current_target_state.bbox.x
        y += disp_instance_frame[0]
        x += disp_instance_frame[1]

        # Target scale damping and saturation
        target_scale = self.current_target_state.bbox.height / self.original_target_height
        search_factor = self.search_factors[best_scale]
        scale_damp = self.track_config[
            'scale_damp']  # damping factor for scale update
        target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
        target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

        # Some book keeping
        height = self.original_target_height * target_scale
        width = self.original_target_width * target_scale
        self.current_target_state.bbox = Rectangle(x, y, width, height)
        self.current_target_state.scale_idx = best_scale
        self.current_target_state.search_pos = self.search_center + disp_instance_input

        assert 0 <= self.current_target_state.search_pos[0] < self.x_image_size, \
            'target position in feature space should be no larger than input image size'
        assert 0 <= self.current_target_state.search_pos[1] < self.x_image_size, \
            'target position in feature space should be no larger than input image size'

        reported_bbox = convert_bbox_format(self.current_target_state.bbox,
                                            'top-left-based')

        self.frame_cnt += 1
        if self.log_level > 0:
            np.save(osp.join(self.logdir, 'num_frames.npy'), [self.frame_cnt])

            # Select the image with the highest score scale and convert it to uint8
            image_cropped = outputs['image_cropped'][best_scale].astype(
                np.uint8)
            # Note that imwrite in cv2 assumes the image is in BGR format.
            # However, the cropped image returned by TensorFlow is RGB.
            # Therefore, we convert color format using cv2.cvtColor
            cv2.imwrite(
                osp.join(self.logdir,
                         'image_cropped{}.jpg'.format(self.frame_cnt)),
                cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))
            cv2.imwrite(
                osp.join(self.logdir,
                         'image_origin{}.jpg'.format(self.frame_cnt)),
                cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

            np.save(
                osp.join(self.logdir,
                         'best_scale{}.npy'.format(self.frame_cnt)),
                [best_scale])
            np.save(
                osp.join(self.logdir, 'response{}.npy'.format(self.frame_cnt)),
                response)

            y_search, x_search = self.current_target_state.search_pos
            search_scale = search_scale_list[best_scale]
            target_height_search = height * search_scale
            target_width_search = width * search_scale
            bbox_search = Rectangle(x_search, y_search, target_width_search,
                                    target_height_search)
            bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
            np.save(osp.join(self.logdir, 'bbox{}.npy'.format(self.frame_cnt)),
                    [
                        bbox_search.x, bbox_search.y, bbox_search.width,
                        bbox_search.height
                    ])
            with open(osp.join(self.logdir, 'track_rect.txt'), 'a') as f:
                rect_str = '{},{},{},{}\n'.format(int(reported_bbox[0]),
                                                  int(reported_bbox[1]),
                                                  int(reported_bbox[2]),
                                                  int(reported_bbox[3]))
                f.write(rect_str)

        return reported_bbox
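The core of track() above is a cosine-window penalty on the normalized score map followed by mapping the peak back into frame coordinates through the upsample factor, the embedding stride, and the search-image scale. A self-contained sketch of that chain; the default values for window_influence, upsample_factor, stride and search_scale are illustrative assumptions, not the configured hyper-parameters:

import numpy as np

def locate_peak(response, window_influence=0.25, upsample_factor=16,
                stride=8, search_scale=0.4):
    """Sketch of the peak-to-frame displacement chain used in track()."""
    size = response.shape[0]
    # Normalize the response map to a distribution, as the tracker does.
    response = response - response.min()
    response = response / response.sum()
    # A Hann (cosine) window penalizes large jumps from the previous center.
    window = np.outer(np.hanning(size), np.hanning(size))
    window = window / window.sum()
    response = (1 - window_influence) * response + window_influence * window
    r, c = np.unravel_index(response.argmax(), response.shape)
    # Upsampled response pixels -> feature cells -> search-image pixels -> frame pixels.
    disp_final = np.array([r, c]) - (size - 1) / 2.0
    disp_feat = np.clip(disp_final / upsample_factor,
                        -size / upsample_factor / 2, size / upsample_factor / 2)
    disp_input = disp_feat * stride
    disp_frame = disp_input / search_scale
    return disp_frame  # (dy, dx) offset to add to the previous target position

print(locate_peak(np.random.rand(272, 272)))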
Example #15
    def track_vot(self, sess, frame):
        bbox_feed = [
            self.vot_current_target_state.bbox.y,
            self.vot_current_target_state.bbox.x,
            self.vot_current_target_state.bbox.height,
            self.vot_current_target_state.bbox.width
        ]
        input_feed = [frame, bbox_feed]

        outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
        search_scale_list = outputs['scale_xs']
        response_s_c5 = outputs['response_s_c5']
        response_s_c4 = outputs['response_s_c4']
        response_s_c3 = outputs['response_s_c3']
        response_a_c5 = outputs['response_a_c5']
        response_a_c4 = outputs['response_a_c4']
        response_a_c3 = outputs['response_a_c3']
        response_size = response_s_c5.shape[1]

        # Choose the scale whose response map has the highest peak
        if self.num_scales > 1:

            response_a_c5_max = np.max(response_a_c5)
            response_a_c4_max = np.max(response_a_c4)
            response_a_c3_max = np.max(response_a_c3)
            response_a_c5 = response_a_c5 / response_a_c5_max
            response_a_c4 = response_a_c4 / response_a_c4_max
            response_a_c3 = response_a_c3 / response_a_c3_max

            response_s_all = 0.7 * response_s_c5 + 0.3 * response_s_c4 + 0.1 * response_s_c3

            response_a_all = 0.3 * response_a_c5 + 0.6 * response_a_c4 + 0.1 * response_a_c3

            response_s_all_max = np.max(response_s_all)
            response_s_all = response_s_all / response_s_all_max

            response_a_all_max = np.max(response_a_all)
            response_a_all = response_a_all / response_a_all_max
            response = 0.3 * response_s_all + 0.7 * response_a_all

            response_max = np.max(response, axis=(1, 2))
            penalties = self.track_config['scale_penalty'] * np.ones(
                (self.num_scales))
            current_scale_idx = int(get_center(self.num_scales))
            penalties[current_scale_idx] = 1.0
            response_penalized = response_max * penalties
            best_scale = np.argmax(response_penalized)
        else:
            ## TODO combine siamfc and alexnet
            best_scale = 0

        response = response[best_scale]

        with np.errstate(all='raise'):  # Raise error if something goes wrong
            response = response - np.min(response)
            response = response / np.sum(response)

        if self.window is None:
            window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                            np.expand_dims(np.hanning(response_size), 0))
            self.window = window / np.sum(window)  # normalize window
        window_influence = self.track_config['window_influence']
        response = (
            1 - window_influence) * response + window_influence * self.window

        # Find maximum response
        r_max, c_max = np.unravel_index(response.argmax(), response.shape)

        # Convert from crop-relative coordinates to frame coordinates
        p_coor = np.array([r_max, c_max])
        # displacement from the center in instance final representation ...
        disp_instance_final = p_coor - get_center(response_size)
        # ... in instance feature space ...
        upsample_factor = self.track_config['upsample_factor']
        disp_instance_feat = disp_instance_final / upsample_factor
        # ... Avoid empty position ...
        r_radius = int(response_size / upsample_factor / 2)
        disp_instance_feat = np.maximum(
            np.minimum(disp_instance_feat, r_radius), -r_radius)
        # ... in instance input ...
        disp_instance_input = disp_instance_feat * 8  #self.model_config['embed_config']['stride']
        # ... in instance original crop (in frame coordinates)
        disp_instance_frame = disp_instance_input / search_scale_list[
            best_scale]
        # Position within frame in frame coordinates
        y = self.vot_current_target_state.bbox.y
        x = self.vot_current_target_state.bbox.x
        y += disp_instance_frame[0]
        x += disp_instance_frame[1]

        # Target scale damping and saturation
        target_scale = self.vot_current_target_state.bbox.height / self.vot_original_target_height
        search_factor = self.search_factors[best_scale]
        scale_damp = self.track_config[
            'scale_damp']  # damping factor for scale update
        target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
        target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

        # Some book keeping
        height = self.vot_original_target_height * target_scale
        width = self.vot_original_target_width * target_scale
        self.vot_current_target_state.bbox = Rectangle(x, y, width, height)
        self.vot_current_target_state.scale_idx = best_scale
        self.vot_current_target_state.search_pos = self.vot_search_center + disp_instance_input

        reported_bbox = convert_bbox_format(self.vot_current_target_state.bbox,
                                            'top-left-based')
        return reported_bbox
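track_vot fuses six response maps: three semantic layers weighted 0.7/0.3/0.1 and three appearance layers weighted 0.3/0.6/0.1, each branch max-normalized and then blended 30/70 in favour of the appearance branch. The same fusion isolated as a sketch for readability (the function name is ours):

import numpy as np

def fuse_responses(s_c5, s_c4, s_c3, a_c5, a_c4, a_c3):
    """Sketch of the layer and branch fusion used in track_vot() above."""
    # Appearance maps are max-normalized per layer before weighting.
    a_c5, a_c4, a_c3 = (m / np.max(m) for m in (a_c5, a_c4, a_c3))
    semantic = 0.7 * s_c5 + 0.3 * s_c4 + 0.1 * s_c3
    appearance = 0.3 * a_c5 + 0.6 * a_c4 + 0.1 * a_c3
    # Each branch is max-normalized again, then blended 30/70.
    semantic = semantic / np.max(semantic)
    appearance = appearance / np.max(appearance)
    return 0.3 * semantic + 0.7 * appearance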
Example #16
def main(_):
    # load model
    model_config, _, track_config = load_cfgs(CHECKPOINT)
    track_config["log_level"] = 0
    track_config["is_video"] = True

    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        restore_fn = model.build_graph_from_config(model_config, track_config,
                                                   CHECKPOINT)
    g.finalize()

    if not os.path.isdir(track_config['log_dir']):
        tf.logging.info('Creating inference directory: %s',
                        track_config['log_dir'])
        mkdir_p(track_config['log_dir'])

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)
    with tf.Session(graph=g, config=sess_config) as sess:
        restore_fn(sess)
        tracker = Tracker(model,
                          model_config=model_config,
                          track_config=track_config)
        video_name = os.path.basename(FLAGS.video_path)
        video_log_dir = os.path.join(track_config["log_dir"], video_name)
        mkdir_p(video_log_dir)

        if str(FLAGS.video_path) in ["0", "1"]:
            # read from camera
            video_path = int(FLAGS.video_path)
            with_camera = True
        else:
            # read from video
            video_path = glob(os.path.join(FLAGS.video_path, "*.mp4"))[0]
            with_camera = False

        video_capture = cv2.VideoCapture(video_path)

        bb = [-1, -1, -1, -1]
        cv2.namedWindow("template")
        cv2.setMouseCallback("template", draw_init_box, bb)

        trajectory = []
        f_count = 0
        f_rate = 0
        start_time = time.time()
        while True:
            # capture frame by frame
            ret_, frame = video_capture.read()
            if ret_ == False:
                continue
            f_width, f_height = [
                int(a) for a in FLAGS.video_resolution.split("*")
            ]
            try:
                o_frame = cv2.resize(frame, (f_width, f_height),
                                     interpolation=cv2.INTER_CUBIC)
            except:
                break
            i_frame = cv2.cvtColor(o_frame, cv2.COLOR_BGR2RGB)

            # cv2.imwrite("test.jpg",o_frame)
            # pdb.set_trace()

            if f_count == 0:  # initialize the tracker
                # wait for drawing init box
                while True:
                    init_frame = o_frame.copy()
                    cv2.imshow("template", init_frame)
                    k = cv2.waitKey(0)
                    if k == 32:  # space
                        cx = int((bb[0] + bb[2]) / 2)
                        cy = int((bb[1] + bb[3]) / 2)
                        w = int(bb[2] - bb[0])
                        h = int(bb[3] - bb[1])
                        # Rectangle: [x,y,width,height]
                        init_bb = Rectangle(cx - 1, cy - 1, w,
                                            h)  # 0-index in python
                        draw_box(init_frame, init_bb, "exemplar")
                        break

                first_box = convert_bbox_format(init_bb, "center-based")
                bbox_feed = [
                    first_box.y, first_box.x, first_box.height, first_box.width
                ]
                input_feed = [i_frame, bbox_feed]
                frame2crop_scale = tracker.siamese_model.initialize(
                    sess, input_feed)
                # Storing target state
                original_target_height = first_box.height
                original_target_width = first_box.width
                search_center = np.array([
                    get_center(tracker.x_image_size),
                    get_center(tracker.x_image_size)
                ])
                current_target_state = TargetState(
                    bbox=first_box,
                    search_pos=search_center,
                    scale_idx=int(get_center(tracker.num_scales)))
                # setup initialized params
                current_param = {
                    "original_target_width": original_target_width,
                    "original_target_height": original_target_height,
                    "search_center": search_center,
                    "current_target_state": current_target_state
                }

            bbox, current_param = tracker.track_frame(sess, i_frame,
                                                      current_param,
                                                      video_log_dir)
            # add overlays
            end_time = time.time()
            f_rate = int(1 / (end_time - start_time))
            start_time = time.time()
            draw_box(o_frame, bbox)
            cv2.putText(o_frame,
                        str(f_rate) + "fps", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        1, (0, 0, 255),
                        thickness=2,
                        lineType=2)

            trajectory.append(bbox)
            f_count += 1

            cv2.imshow("Real-time Ouput", o_frame)
            cv2.imshow("template", init_frame)
            # if f_count > 30:
            #     cv2.imwrite("test.jpg",o_frame)
            #     pdb.set_trace()
            if cv2.waitKey(1) & 0xFF == ord("q"):
                cv2.imwrite("./assets/instance.jpg", o_frame)
                cv2.imwrite("./assets/exemplar.jpg", init_frame)
                break

        video_capture.release()
        cv2.destroyAllWindows()

        # save track results
        # pdb.set_trace()
        with open(os.path.join(video_log_dir, "track_rect.txt"), "w") as f:
            for region in trajectory:
                rect_str = "{},{},{},{}\n".format(region.x + 1, region.y + 1,
                                                  region.width, region.height)
                f.write(rect_str)
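The demo above registers draw_init_box as an OpenCV mouse callback that fills the shared list bb with the corners of the user-drawn box; the callback itself is not shown. One plausible sketch consistent with how bb is consumed above (this is an assumption, not the original implementation):

import cv2

def draw_init_box(event, x, y, flags, bb):
    # Hypothetical callback matching the cv2.setMouseCallback usage above:
    # store one corner on left-button press and the opposite corner on release.
    if event == cv2.EVENT_LBUTTONDOWN:
        bb[0], bb[1] = x, y
    elif event == cv2.EVENT_LBUTTONUP:
        bb[2], bb[3] = x, y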
Example #17
def process_split(root_dir, save_dir, split):
    data_dir = osp.join(root_dir, split)
    video_names = os.listdir(data_dir)
    video_names = [vn for vn in video_names if '.txt' not in vn]

    for idx, video in enumerate(video_names):
        print('{split} ({idx}/{total}): Processing {video}...'.format(
            split=split, idx=idx, total=len(video_names), video=video))
        video_path = osp.join(data_dir, video)
        jpg_files = glob(osp.join(video_path, '*.jpg'))

        with open(osp.join(video_path, 'groundtruth.txt')) as f:
            ann_content = f.readlines()

        track_save_dir = osp.join(save_dir, split, video)
        mkdir_p(track_save_dir)
        fw = open(osp.join(track_save_dir, 'groundtruth.txt'), 'w')

        copyfile(osp.join(video_path, 'absence.label'),
                 osp.join(track_save_dir, 'absence.label'))
        copyfile(osp.join(video_path, 'cover.label'),
                 osp.join(track_save_dir, 'cover.label'))
        copyfile(osp.join(video_path, 'cut_by_image.label'),
                 osp.join(track_save_dir, 'cut_by_image.label'))
        copyfile(osp.join(video_path, 'meta_info.ini'),
                 osp.join(track_save_dir, 'meta_info.ini'))

        for i, jpg in enumerate(jpg_files):
            # Read image
            img_file = jpg.split('/')[-1]
            img = None

            # Get all object bounding boxes
            jpgidx = img_file.split('.')[0]
            jpgidx = int(jpgidx) - 1
            ann = ann_content[jpgidx]
            ann = ann.strip()
            bbox = ann.split(',')
            bbox = [int(float(bb)) for bb in bbox]  # [xmin, ymin, w, h]

            ## bbox ####
            annk = ann_content[i]
            annk = annk.strip()
            bboxk = annk.split(',')
            bboxk = [int(float(bb)) for bb in bboxk]  # [xmin, ymin, w, h]

            w = bboxk[2]
            h = bboxk[3]
            context_amount = 0.5
            size_z = 127
            size_x = 271

            wc_z = w + context_amount * (w + h)
            hc_z = h + context_amount * (w + h)
            s_z = np.sqrt(wc_z * hc_z)
            scale_z = size_z / s_z
            d_search = (size_x - size_z) / 2
            pad = d_search / scale_z
            s_x = s_z + 2 * pad

            wn = int(w * size_x / s_x)
            hn = int(h * size_x / s_x)

            #if wn < 1 or hn < 1:
            #if wn == 0:
            #wn = 1
            #if hn == 0:
            #hn = 1
            #ratio = wn / hn
            #if ratio > 1.:
            #newbb = [int(135-wn/2), int(135-hn/2), 85, int(85. / ratio)]
            #else:
            #newbb = [int(135-wn/2), int(135-hn/2), int(85. * ratio), 85]
            #else:
            #newbb = [int(135-wn/2), int(135-hn/2), wn, hn]

            if wn < 1:
                wn = 1
            if hn < 1:
                hn = 1
            newbb = [int(135 - wn / 2), int(135 - hn / 2), wn, hn]

            fw.write(','.join(str(e) + '.0000' for e in newbb) + '\n')
            ## bbox ####

            savename = osp.join(track_save_dir, '{}.jpg'.format(img_file))

            if osp.isfile(savename):
                try:
                    im = Image.open(savename)
                    continue  # skip existing images
                except IOError:
                    os.remove(savename)

            if img is None:
                img = imread(jpg)

            # Get crop
            target_box = convert_bbox_format(Rectangle(*bbox), 'center-based')
            crop, _ = get_crops(img,
                                target_box,
                                size_z=127,
                                size_x=271,
                                context_amount=0.5)
            imwrite(savename, crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90])

        fw.close()
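The groundtruth rewrite above recomputes the target size inside the resized search crop using SiamFC-style context padding. The same arithmetic pulled out into a standalone helper for clarity (the helper name is ours; the defaults match the values hard-coded above):

import numpy as np

def exemplar_and_search_sizes(w, h, context_amount=0.5, size_z=127, size_x=271):
    """Standalone version of the crop arithmetic in process_split() above."""
    # Context padding around the target, SiamFC style.
    wc_z = w + context_amount * (w + h)
    hc_z = h + context_amount * (w + h)
    s_z = np.sqrt(wc_z * hc_z)      # side of the exemplar region in frame pixels
    scale_z = size_z / s_z          # frame-to-exemplar scale factor
    d_search = (size_x - size_z) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad             # side of the search region in frame pixels
    # Target size inside the resized search image (what is written to groundtruth.txt).
    wn, hn = int(w * size_x / s_x), int(h * size_x / s_x)
    return s_z, s_x, wn, hn

# A 100x60 target ends up roughly 80x48 inside the 271x271 search crop.
print(exemplar_and_search_sizes(100, 60))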
Example #18
    def track(self, sess, frame, logdir='/tmp'):
        """Runs tracking on a single image."""
        i = self.i = self.i + 1
        current_target_state = self.current_target_state
        original_target_height = self.original_target_height
        original_target_width = self.original_target_width
        search_center = self.search_center

        mem_count = self.mem_count
        moved2border = self.moved2border
        update_delay = self.update_delay + 1
        lost = self.lost + 1
        image_c = self.image_c
        x_image_size = self.x_image_size
        search_factors = self.search_factors_init
        conf_thresh = self.conf_thresh
        bound_thresh = self.bound_thresh
        sup_thresh = self.sup_thresh
        prev_score = self.prev_score

        hi, wi, _ = frame.shape
        h_ratio = current_target_state.bbox.height / hi
        w_ratio = current_target_state.bbox.width / wi
        t_i_ratio = max([h_ratio, w_ratio])

        if prev_score < conf_thresh:
            x_image_size += 100
            #x_image_size = min(x_image_size, ((1. - t_i_ratio) * 1.8 + 1.) * self.x_image_size_init)
            if t_i_ratio < 0.05:
                x_image_size = min(x_image_size, 555)
            elif t_i_ratio > 0.6:
                x_image_size = min(x_image_size, 255)
            elif t_i_ratio > 0.4:
                x_image_size = min(x_image_size, 355)
            else:
                x_image_size = min(x_image_size, 455)
        else:
            x_image_size = self.x_image_size_init

        num_scales = len(search_factors)
        bbx = current_target_state.bbox.x
        bby = current_target_state.bbox.y
        bbw = current_target_state.bbox.width
        bbh = current_target_state.bbox.height
        bbox_feed = [bby, bbx, bbh, bbw]

        if i > 1:
            top = (current_target_state.bbox.y -
                   (current_target_state.bbox.height / 2) < 10)
            left = (current_target_state.bbox.x -
                    (current_target_state.bbox.width / 2) < 10)
            bottom = (current_target_state.bbox.y +
                      (current_target_state.bbox.height / 2) > hi - 10)
            right = (current_target_state.bbox.x +
                     (current_target_state.bbox.width / 2) > wi - 10)
            if top or left or bottom or right:
                if not prev_score < bound_thresh:
                    moved2border = True
                if not moved2border:
                    current_target_state.bbox = Rectangle(
                        wi / 2, hi / 2, current_target_state.bbox.width,
                        current_target_state.bbox.height)
                    bbox_feed = [
                        current_target_state.bbox.y,
                        current_target_state.bbox.x,
                        current_target_state.bbox.height,
                        current_target_state.bbox.width
                    ]
            else:
                if not prev_score < bound_thresh:
                    moved2border = False

        if t_i_ratio < 0.3 and lost > 5:
            lost = 0
            diffy = hi * 0.5 - bbox_feed[0]
            diffx = wi * 0.5 - bbox_feed[1]
            bbox_feed = [
                diffy * 0.25 + bbox_feed[0], diffx * 0.25 + bbox_feed[1],
                bbox_feed[2], bbox_feed[3]
            ]

        current_target_state.bbox = Rectangle(bbox_feed[1], bbox_feed[0],
                                              bbox_feed[3], bbox_feed[2])

        input_feed = [frame, bbox_feed, x_image_size, search_factors]
        outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
        search_scale_list = outputs['scale_xs']
        response = outputs['response']
        response_size = response.shape[1]
        re_out = np.around(1 / (1 + np.exp(-response)), 2)

        if np.max(re_out) < conf_thresh:
            x_image_sizeb4 = x_image_size
            x_image_size += 100
            #x_image_size_l = ((1. - t_i_ratio) * 1.8 + 1.) * self.x_image_size_init
            if t_i_ratio < 0.05:
                x_image_size_l = 555
            elif t_i_ratio > 0.6:
                x_image_size_l = 255
            elif t_i_ratio > 0.4:
                x_image_size_l = 355
            else:
                x_image_size_l = 455

            if not x_image_size > x_image_size_l:
                input_feed = [frame, bbox_feed, x_image_size, search_factors]
                outputs, metadata = self.siamese_model.inference_step(
                    sess, input_feed)
                search_scale_list = outputs['scale_xs']
                response = outputs['response']
                response_size = response.shape[1]
                re_out = np.around(1 / (1 + np.exp(-response)), 2)
            else:
                x_image_size = x_image_sizeb4

        # Choose the scale whose response map has the highest peak
        if num_scales > 1:
            response_max = np.max(response * (re_out > sup_thresh),
                                  axis=(1, 2))
            penalties = self.track_config['scale_penalty'] * np.ones(
                (num_scales))
            current_scale_idx = int(get_center(num_scales))
            penalties[current_scale_idx] = 1.0
            response_penalized = response_max * penalties
            if max(response_penalized) == 0.:
                best_scale = 1
            else:
                best_scale = np.argmax(response_penalized)
        else:
            best_scale = 0

        response = response[best_scale]
        re_out = re_out[best_scale]

        with np.errstate(all='raise'):  # Raise error if something goes wrong
            response = response - np.min(response)
            response = response / np.sum(response)
            response = response * (re_out > sup_thresh)

        window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                        np.expand_dims(np.hanning(response_size), 0))
        self.window = window / np.sum(window)  # normalize window
        window_influence = self.track_config['window_influence']
        response = (
            1 - window_influence) * response + window_influence * self.window

        # Find maximum response
        r_max, c_max = np.unravel_index(response.argmax(), response.shape)
        prev_score = re_out[r_max, c_max]

        # Convert from crop-relative coordinates to frame coordinates
        p_coor = np.array([r_max, c_max])
        # displacement from the center in instance final representation ...
        disp_instance_final = p_coor - get_center(response_size)
        # ... in instance feature space ...
        upsample_factor = self.track_config['upsample_factor']
        disp_instance_feat = disp_instance_final / upsample_factor
        # ... Avoid empty position ...
        r_radius = int(response_size / upsample_factor / 2)
        disp_instance_feat = np.maximum(
            np.minimum(disp_instance_feat, r_radius), -r_radius)
        # ... in instance input ...
        disp_instance_input = disp_instance_feat * self.model_config[
            'embed_config']['stride']
        # ... in instance original crop (in frame coordinates)
        disp_instance_frame = disp_instance_input / search_scale_list[
            best_scale]
        # Position within frame in frame coordinates
        y = current_target_state.bbox.y
        x = current_target_state.bbox.x
        y += disp_instance_frame[0]
        x += disp_instance_frame[1]
        y = np.round(y)
        x = np.round(x)

        # Target scale damping and saturation
        target_scale = current_target_state.bbox.height / original_target_height
        search_factor = search_factors[best_scale]
        scale_damp = self.track_config[
            'scale_damp']  # damping factor for scale update
        target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)

        # Some book keeping
        search_center = np.array(
            [get_center(x_image_size),
             get_center(x_image_size)])
        height = original_target_height * target_scale
        width = original_target_width * target_scale
        current_target_state.bbox = Rectangle(x, y, width, height)
        current_target_state.scale_idx = best_scale
        current_target_state.search_pos = search_center + disp_instance_input

        assert 0 <= current_target_state.search_pos[0] < x_image_size, \
          'target position in feature space should be no larger than input image size'
        assert 0 <= current_target_state.search_pos[1] < x_image_size, \
          'target position in feature space should be no larger than input image size'

        if self.log_level > 0:
            # Select the image with the highest score scale and convert it to uint8
            image_cropped = outputs['image_cropped'][best_scale].astype(
                np.uint8)

            y_search, x_search = current_target_state.search_pos
            search_scale = search_scale_list[best_scale]
            target_height_search = height * search_scale
            target_width_search = width * search_scale
            bbox_search = Rectangle(x_search, y_search, target_width_search,
                                    target_height_search)
            bbox_search = convert_bbox_format(bbox_search, 'top-left-based')

            xmin = bbox_search.x.astype(np.int32)
            ymin = bbox_search.y.astype(np.int32)
            xmax = xmin + bbox_search.width.astype(np.int32)
            ymax = ymin + bbox_search.height.astype(np.int32)
            cv2.rectangle(image_cropped, (xmin, ymin), (xmax, ymax),
                          (255, 0, 0), 2)
            text = str(prev_score)
            cv2.putText(image_cropped,
                        text, (xmin, ymin),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        1.0, (255, 0, 0),
                        lineType=cv2.LINE_AA)
            imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                    cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

        if prev_score > self.store_thresh:
            bbox_feed = [
                current_target_state.bbox.y, current_target_state.bbox.x,
                current_target_state.bbox.height,
                current_target_state.bbox.width
            ]
            self.siamese_model.update_mem(sess, [
                frame, bbox_feed, self.x_image_size_init,
                self.search_factors_init, mem_count
            ])
            mem_count += 1

        if mem_count > 4 or (mem_count > 0 and update_delay > 5):
            self.siamese_model.update(sess)
            mem_count = 0
            update_delay = 0

        if prev_score > bound_thresh:
            lost = 0

        self.mem_count = mem_count
        self.update_delay = update_delay
        self.moved2border = moved2border
        self.lost = lost
        self.x_image_size = x_image_size
        self.prev_score = prev_score
        self.current_target_state = current_target_state
        reported_bbox = convert_bbox_format(current_target_state.bbox,
                                            'top-left-based')
        #return prev_score>0.4, reported_bbox, prev_score
        return prev_score > 0.4, reported_bbox
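A distinctive piece of the tracker above is its adaptive search-area schedule: when the previous confidence drops below the threshold, the search image grows by 100 pixels per step, capped according to how large the target is relative to the frame, and it snaps back to the initial size once confidence recovers. A compact sketch of that rule (the function name is ours; the step, caps and default threshold mirror the values hard-coded above):

def next_search_size(prev_score, x_image_size, x_image_size_init, t_i_ratio,
                     conf_thresh=0.7):
    """Sketch of the adaptive search-area schedule used in track() above."""
    if prev_score >= conf_thresh:
        # Confident again: snap back to the normal search-image size.
        return x_image_size_init
    # Low confidence: widen the search area, capped by the target/frame ratio.
    x_image_size += 100
    if t_i_ratio < 0.05:
        cap = 555
    elif t_i_ratio > 0.6:
        cap = 255
    elif t_i_ratio > 0.4:
        cap = 355
    else:
        cap = 455
    return min(x_image_size, cap)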
Example #19
    def track(self, sess, first_bbox, frames, logdir='/tmp'):
        """Runs tracking on a single image sequence."""
        # Get initial target bounding box and convert to center based
        bbox = convert_bbox_format(first_bbox, 'center-based')

        # Feed in the first frame image to set initial state.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        input_feed = [
            frames[0], bbox_feed, self.x_image_size, self.search_factors
        ]
        frame2crop_scale, image_z = self.siamese_model.initialize(
            sess, input_feed)
        imwrite(osp.join(logdir, 'aimagez.jpg'),
                cv2.cvtColor(image_z, cv2.COLOR_RGB2BGR))

        # Storing target state
        original_target_height = bbox.height
        original_target_width = bbox.width
        search_center = np.array(
            [get_center(self.x_image_size),
             get_center(self.x_image_size)])
        current_target_state = TargetState(bbox=bbox,
                                           search_pos=search_center,
                                           scale_idx=int(
                                               get_center(self.num_scales)))

        include_first = get(self.track_config, 'include_first', False)
        logging.info('Tracking include first -- {}'.format(include_first))

        # Run tracking loop
        reported_bboxs = []
        image_c = None
        x_image_size = self.x_image_size
        lost = 0
        moved2border = False

        conf_thresh = 0.2  # 0.2
        bound_thresh = 0.2  # 0.2
        sup_thresh = 0.15  # 0.15
        prev_score = conf_thresh + 0.01
        upsample_factor = self.track_config['upsample_factor']
        search_factors = self.search_factors

        for i, filename in enumerate(frames):
            if i > 0 or include_first:
                bbox_feed = [
                    current_target_state.bbox.y, current_target_state.bbox.x,
                    current_target_state.bbox.height,
                    current_target_state.bbox.width
                ]

                if prev_score > bound_thresh:
                    lost = 0
                else:
                    lost += 1

                if prev_score > 0.9:
                    self.siamese_model.update(sess, [
                        frames[i - 1], bbox_feed, self.x_image_size,
                        search_factors
                    ])

                with open(filename, 'rb') as f:
                    wi, hi = GetWidthAndHeight(f)
                t_i_ratio = max([
                    current_target_state.bbox.height / hi,
                    current_target_state.bbox.width / wi
                ])

                if prev_score < conf_thresh:
                    x_image_size += 100
                    #x_image_size = min(x_image_size, ((1. - t_i_ratio) * 1.6 + 1.) * self.x_image_size_init)
                    if t_i_ratio < 0.05:
                        x_image_size = min(x_image_size, 555)
                    elif t_i_ratio < 0.25:
                        x_image_size = min(x_image_size, 455)
                    elif t_i_ratio > 0.5:
                        x_image_size = min(x_image_size, 255)
                    else:
                        x_image_size = min(x_image_size, 355)
                else:
                    x_image_size = self.x_image_size

                if i > 1:
                    top = (current_target_state.bbox.y -
                           (current_target_state.bbox.height / 2) < 10)
                    left = (current_target_state.bbox.x -
                            (current_target_state.bbox.width / 2) < 10)
                    bottom = (current_target_state.bbox.y +
                              (current_target_state.bbox.height / 2) > hi - 10)
                    right = (current_target_state.bbox.x +
                             (current_target_state.bbox.width / 2) > wi - 10)
                    bound_flag = top or left or bottom or right
                    #if top or left or bottom or right:
                    #if not prev_score < bound_thresh:
                    #moved2border = True
                    #if not moved2border:
                    #current_target_state.bbox = Rectangle(wi / 2, hi / 2,
                    #current_target_state.bbox.width,
                    #current_target_state.bbox.height)
                    #bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                    #current_target_state.bbox.height, current_target_state.bbox.width]
                    #else:
                    #if not prev_score < bound_thresh:
                    #moved2border = False

                if lost > 5 and bound_flag:
                    lost = 0
                    diffy = hi * 0.5 - bbox_feed[0]
                    diffx = wi * 0.5 - bbox_feed[1]
                    bbox_feed = [
                        diffy * 0.25 + bbox_feed[0],
                        diffx * 0.25 + bbox_feed[1], bbox_feed[2], bbox_feed[3]
                    ]

                current_target_state.bbox = Rectangle(bbox_feed[1],
                                                      bbox_feed[0],
                                                      bbox_feed[3],
                                                      bbox_feed[2])

                input_feed = [
                    filename, bbox_feed, x_image_size, search_factors
                ]
                outputs, metadata = self.siamese_model.inference_step(
                    sess, input_feed)
                search_scale_list = outputs['scale_xs']
                response = outputs['response']
                response_size = response.shape[1]
                re_out = np.around(1 / (1 + np.exp(-response)), 2)

                if np.max(re_out) < conf_thresh and not t_i_ratio > 0.5:
                    x_image_sizeb4 = x_image_size
                    x_image_size += 100
                    #x_image_size_l = ((1. - t_i_ratio) * 1.6 + 1.) * self.x_image_size_init
                    if t_i_ratio < 0.05:
                        x_image_size_l = 555
                    elif t_i_ratio < 0.25:
                        x_image_size_l = 455
                    elif t_i_ratio > 0.5:
                        x_image_size_l = 255
                    else:
                        x_image_size_l = 355

                    if not x_image_size > x_image_size_l:
                        input_feed = [
                            filename, bbox_feed, x_image_size, search_factors
                        ]
                        outputs, metadata = self.siamese_model.inference_step(
                            sess, input_feed)
                        search_scale_list = outputs['scale_xs']
                        response = outputs['response']
                        response_size = response.shape[1]
                        re_out = np.around(1 / (1 + np.exp(-response)), 2)
                    else:
                        x_image_size = x_image_sizeb4

                # Choose the scale whose response map has the highest peak
                if self.num_scales > 1:
                    response_max = np.max(response * (re_out > sup_thresh),
                                          axis=(1, 2))
                    penalties = self.track_config['scale_penalty'] * np.ones(
                        (self.num_scales))
                    current_scale_idx = int(get_center(self.num_scales))
                    penalties[current_scale_idx] = 1.0
                    response_penalized = response_max * penalties
                    if max(response_penalized) == 0.:
                        best_scale = 1
                    else:
                        best_scale = np.argmax(response_penalized)
                else:
                    best_scale = 0

                response = response[best_scale]
                re_out = re_out[best_scale]

                with np.errstate(
                        all='raise'):  # Raise error if something goes wrong
                    response = response - np.min(response)
                    response = response / np.sum(response)
                    response = response * (re_out > sup_thresh)

                window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                                np.expand_dims(np.hanning(response_size), 0))
                self.window = window / np.sum(window)  # normalize window
                window_influence = self.track_config['window_influence']
                response = (1 - window_influence
                            ) * response + window_influence * self.window

                if np.max(re_out) < sup_thresh:
                    r_max, c_max = response.shape
                    r_max, c_max = int(r_max / 2), int(c_max / 2)
                    disp_instance_input = [0, 0]
                    disp_instance_frame = [0, 0]
                else:
                    # Find maximum response
                    r_max, c_max = np.unravel_index(response.argmax(),
                                                    response.shape)

                    # Convert from crop-relative coordinates to frame coordinates
                    p_coor = np.array([r_max, c_max])
                    # displacement from the center in instance final representation ...
                    disp_instance_final = p_coor - get_center(response_size)
                    # ... in instance feature space ...
                    disp_instance_feat = disp_instance_final / upsample_factor
                    # ... Avoid empty position ...
                    r_radius = int(response_size / upsample_factor / 2)
                    disp_instance_feat = np.maximum(
                        np.minimum(disp_instance_feat, r_radius), -r_radius)
                    # ... in instance input ...
                    disp_instance_input = disp_instance_feat * self.model_config[
                        'embed_config']['stride']
                    # ... in instance original crop (in frame coordinates)
                    disp_instance_frame = disp_instance_input / search_scale_list[
                        best_scale]

                # Position within frame in frame coordinates
                y = current_target_state.bbox.y
                x = current_target_state.bbox.x
                y += disp_instance_frame[0]
                x += disp_instance_frame[1]
                y = np.round(y)
                x = np.round(x)
                prev_score = re_out[r_max, c_max]

                # Target scale damping and saturation
                target_scale = current_target_state.bbox.height / original_target_height
                search_factor = search_factors[best_scale]
                scale_damp = self.track_config[
                    'scale_damp']  # damping factor for scale update
                target_scale *= ((1 - scale_damp) * 1.0 +
                                 scale_damp * search_factor)

                # Some book keeping
                search_center = np.array(
                    [get_center(x_image_size),
                     get_center(x_image_size)])
                height = original_target_height * target_scale
                width = original_target_width * target_scale
                current_target_state.bbox = Rectangle(x, y, width, height)
                current_target_state.scale_idx = best_scale
                current_target_state.search_pos = search_center + disp_instance_input

                assert 0 <= current_target_state.search_pos[0] < x_image_size, \
                  'target position in feature space should be no larger than input image size'
                assert 0 <= current_target_state.search_pos[1] < x_image_size, \
                  'target position in feature space should be no larger than input image size'

                if self.log_level > 0:
                    # Select the image crop at the best-scoring scale and convert it to uint8
                    image_cropped = outputs['image_cropped'][
                        best_scale].astype(np.uint8)

                    y_search, x_search = current_target_state.search_pos
                    search_scale = search_scale_list[best_scale]
                    target_height_search = height * search_scale
                    target_width_search = width * search_scale
                    bbox_search = Rectangle(x_search, y_search,
                                            target_width_search,
                                            target_height_search)
                    bbox_search = convert_bbox_format(bbox_search,
                                                      'top-left-based')

                    # Add score colormap
                    image_cropped = outputs['image_cropped'][
                        best_scale].astype(np.uint8)
                    #im_shape = image_cropped.shape
                    #re_shape = response_size / upsample_factor * self.model_config['embed_config']['stride']
                    #pad = int((im_shape[0] - re_shape) / 2)
                    #response_crop = imresize(re_out, [im_shape[0]-2*pad, im_shape[1]-2*pad])
                    #response_crop = np.pad(response_crop, ((pad, pad), (pad, pad)), 'constant')
                    #response_crop = response_crop / response_crop.max()
                    #response_crop = np.uint8(response_crop * 255)
                    #cmap = cv2.cvtColor(cv2.applyColorMap(response_crop, cv2.COLORMAP_JET), cv2.COLOR_BGR2RGB)
                    #image_cropped = cv2.addWeighted(cmap, 0.3, image_cropped, 0.5, 0)

                    xmin = bbox_search.x.astype(np.int32)
                    ymin = bbox_search.y.astype(np.int32)
                    xmax = xmin + bbox_search.width.astype(np.int32)
                    ymax = ymin + bbox_search.height.astype(np.int32)
                    cv2.rectangle(image_cropped, (xmin, ymin), (xmax, ymax),
                                  (255, 0, 0), 2)
                    text = str(prev_score)
                    cv2.putText(image_cropped,
                                text, (xmin, ymin),
                                cv2.FONT_HERSHEY_SIMPLEX,
                                1.0, (255, 0, 0),
                                lineType=cv2.LINE_AA)
                    imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                            cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                    #if image_c is not None:
                    #his_dir = logdir + '_his'
                    #if not osp.exists(his_dir):
                    #os.mkdir(his_dir)
                    #image_c_p = np.concatenate([np.expand_dims(image_z, 0)] + image_c, 2)[0]
                    #image_c_p = np.uint8(image_c_p)
                    #imwrite(osp.join(his_dir, 'image{}.jpg'.format(i)),
                    #cv2.cvtColor(image_c_p, cv2.COLOR_RGB2BGR))

            reported_bbox = convert_bbox_format(current_target_state.bbox,
                                                'top-left-based')
            reported_bboxs.append(reported_bbox)
        return reported_bboxs
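
One heuristic in the loop above is worth isolating: when the previous score falls below conf_thresh, the search crop x_image_size grows in 100-pixel steps, but only up to a cap that shrinks as the target occupies a larger fraction of the frame (t_i_ratio). The helper below is a hedged, stand-alone restatement of that rule; enlarged_search_size is an illustrative name, and the caps simply mirror the constants used above.

def enlarged_search_size(x_image_size, base_size, t_i_ratio, prev_score, conf_thresh=0.2):
    """Grow the search-crop size when confidence is low; cap it by the target-to-image ratio."""
    if prev_score >= conf_thresh:
        return base_size                 # confident again: fall back to the default crop size
    grown = x_image_size + 100           # low confidence: widen the search region
    if t_i_ratio < 0.05:
        cap = 555
    elif t_i_ratio < 0.25:
        cap = 455
    elif t_i_ratio > 0.5:
        cap = 255
    else:
        cap = 355
    return min(grown, cap)

# A small target (about 4% of the frame) that was just lost: the crop grows from 255 to 355.
print(enlarged_search_size(255, 255, 0.04, prev_score=0.1))
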
Beispiel #20
0
  def track(self, sess, first_bbox, frames, logdir='/tmp'):
    """Runs tracking on a single image sequence."""
    # Get initial target bounding box and convert to center based
    bbox = convert_bbox_format(first_bbox, 'center-based')
    print(frames)
    # Feed in the first frame image to set initial state.
    bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
    input_feed = [frames[0], bbox_feed]
    frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

    # Storing target state
    original_target_height = bbox.height
    original_target_width = bbox.width
    search_center = np.array([get_center(self.x_image_size),
                              get_center(self.x_image_size)])
    current_target_state = TargetState(bbox=bbox,
                                       search_pos=search_center,
                                       scale_idx=int(get_center(self.num_scales)))

    include_first = get(self.track_config, 'include_first', False)
    logging.info('Tracking include first -- {}'.format(include_first))

    # Run tracking loop
    reported_bboxs = []
    output_json={} #dump all bboxes in this output file

    for i, filename in enumerate(frames):
      if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
        bbox_feed = [current_target_state.bbox.y, current_target_state.bbox.x,
                     current_target_state.bbox.height, current_target_state.bbox.width]
        input_feed = [filename, bbox_feed]

        outputs, metadata = self.siamese_model.inference_step(sess, input_feed)
        search_scale_list = outputs['scale_xs']
        response = outputs['response']
        
        response_size = response.shape[1]

        # Choose the scale whose response map has the highest peak
        if self.num_scales > 1:
          response_max = np.max(response, axis=(1, 2))
          penalties = self.track_config['scale_penalty'] * np.ones((self.num_scales))
          current_scale_idx = int(get_center(self.num_scales))
          penalties[current_scale_idx] = 1.0
          response_penalized = response_max * penalties
          best_scale = np.argmax(response_penalized)
        else:
          best_scale = 0

        response = response[best_scale]
        #print(response)
        

        with np.errstate(all='raise'):  # Raise error if something goes wrong
          response = response - np.min(response)
          response = response / np.sum(response)

        if self.window is None:
          window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                          np.expand_dims(np.hanning(response_size), 0))
          self.window = window / np.sum(window)  # normalize window
        window_influence = self.track_config['window_influence']
        response = (1 - window_influence) * response + window_influence * self.window

        # Find maximum response
        r_max, c_max = np.unravel_index(response.argmax(), response.shape)


        if not osp.exists(osp.join(logdir,"Intermediate")):
          os.mkdir(osp.join(logdir,"Intermediate"))

        to_save = np.interp(response, (response.min(), response.max()), (0, 255)).astype(np.uint8)  # cast to uint8 so cv2.threshold/imwrite accept it
        cv2.imwrite(osp.join(logdir,"Intermediate",f"response_{i}.png"),to_save)
        
        to_save = to_save.reshape(to_save.shape[0],to_save.shape[1],1)
        ret,thresh1 = cv2.threshold(to_save,185,255,cv2.THRESH_BINARY)
        
        
        cv2.imwrite(osp.join(logdir,"Intermediate",f"response_{i}_thresh.png"),thresh1)
        image = np.uint8(thresh1.copy())
        
        cnts = cv2.findContours(image, cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_SIMPLE)
        cnts = imutils.grab_contours(cnts)
        backtorgb = cv2.cvtColor(image,cv2.COLOR_GRAY2RGB)
        image = cv2.drawContours(backtorgb, cnts, -1, (0, 255, 0), 2)
        cv2.imwrite(osp.join(logdir,"Intermediate",f"response_{i}_cntrs.png"),image)
        
        centres=[]
        for c in cnts:
          M = cv2.moments(c)
          if M["m00"] == 0:  # skip degenerate (zero-area) contours to avoid division by zero
            continue
          cX = int(M["m10"] / M["m00"])
          cY = int(M["m01"] / M["m00"])
          centres.append((cY,cX,False))
        centres.append((r_max,c_max,True))
        #print(centres)

        #cts_copy = copy(current_target_state)
        #cts_copy2 = copy(current_target_state)
        output_json[filename]=[]

        for (r_max,c_max,to_deep_copy) in centres:
          if to_deep_copy:
            cts_copy = deepcopy(current_target_state)
          else:
            cts_copy = copy(current_target_state)
          # Convert from crop-relative coordinates to frame coordinates
          p_coor = np.array([r_max, c_max])
          # displacement from the center in instance final representation ...
          disp_instance_final = p_coor - get_center(response_size)
          # ... in instance feature space ...
          upsample_factor = self.track_config['upsample_factor']
          disp_instance_feat = disp_instance_final / upsample_factor
          # ... Avoid empty position ...
          r_radius = int(response_size / upsample_factor / 2)
          disp_instance_feat = np.maximum(np.minimum(disp_instance_feat, r_radius), -r_radius)
          # ... in instance input ...
          disp_instance_input = disp_instance_feat * self.model_config['embed_config']['stride']
          # ... in instance original crop (in frame coordinates)
          disp_instance_frame = disp_instance_input / search_scale_list[best_scale]
          # Position within frame in frame coordinates
          y = cts_copy.bbox.y
          x = cts_copy.bbox.x
          y += disp_instance_frame[0]
          x += disp_instance_frame[1]

          # Target scale damping and saturation
          target_scale = cts_copy.bbox.height / original_target_height
          search_factor = self.search_factors[best_scale]
          scale_damp = self.track_config['scale_damp']  # damping factor for scale update
          target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
          target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

          # Some book keeping
          height = original_target_height * target_scale
          width = original_target_width * target_scale
          
          cts_copy.bbox = Rectangle(x, y, width, height)
          cts_copy.scale_idx = best_scale
          cts_copy.search_pos = search_center + disp_instance_input

          assert 0 <= cts_copy.search_pos[0] < self.x_image_size, \
            'target position in feature space should be no larger than input image size'
          assert 0 <= cts_copy.search_pos[1] < self.x_image_size, \
            'target position in feature space should be no larger than input image size'

          if self.log_level > 0 and to_deep_copy:
            np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

            # Select the image crop at the best-scoring scale and convert it to uint8
            image_cropped = outputs['image_cropped'][best_scale].astype(np.uint8)
            # Note that imwrite in cv2 assumes the image is in BGR format.
            # However, the cropped image returned by TensorFlow is RGB.
            # Therefore, we convert color format using cv2.cvtColor
            imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                    cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

            np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)), [best_scale])
            np.save(osp.join(logdir, 'response{}.npy'.format(i)), response)

            y_search, x_search = cts_copy.search_pos
            search_scale = search_scale_list[best_scale]
            target_height_search = height * search_scale
            target_width_search = width * search_scale
            bbox_search = Rectangle(x_search, y_search, target_width_search, target_height_search)
            bbox_search = convert_bbox_format(bbox_search, 'top-left-based')
            np.save(osp.join(logdir, 'bbox{}.npy'.format(i)),
                    [bbox_search.x, bbox_search.y, bbox_search.width, bbox_search.height])

          reported_bbox = convert_bbox_format(cts_copy.bbox, 'top-left-based')
          #print(f"reported bbox {reported_bbox}")
          if to_deep_copy:
            reported_bboxs.append(reported_bbox)
          else:
            rect_str = '{},{},{},{}\n'.format(reported_bbox.x + 1, reported_bbox.y + 1,
                                              reported_bbox.width, reported_bbox.height)
            arr = output_json[filename]
            arr.append(rect_str)


    
    with open(osp.join(logdir,'bboxes.json'),'w') as f:
      json.dump(output_json,f,indent=4)
    return reported_bboxs
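
The multi-candidate step above (threshold the normalized response map, find external contours, and treat their centroids as extra peak hypotheses alongside the global arg-max) can be tested in isolation. The sketch below is an approximation of that step using plain OpenCV 4 return values instead of imutils.grab_contours; response_candidates is an illustrative name and the 185 threshold mirrors the constant above.

import cv2
import numpy as np

def response_candidates(response, thresh=185):
    """Return (row, col) centroids of blobs in a response map rescaled to 0..255."""
    scaled = np.interp(response, (response.min(), response.max()), (0, 255)).astype(np.uint8)
    _, binary = cv2.threshold(scaled, thresh, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    centres = []
    for c in contours:
        m = cv2.moments(c)
        if m['m00'] == 0:                # degenerate contour: skip to avoid division by zero
            continue
        centres.append((int(m['m01'] / m['m00']), int(m['m10'] / m['m00'])))
    return centres

if __name__ == '__main__':
    r = np.zeros((17, 17), np.float32)
    r[3:6, 3:6] = 1.0                    # two synthetic blobs
    r[11:14, 11:14] = 0.9
    print(response_candidates(r))        # roughly [(12, 12), (4, 4)]
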
Beispiel #21
0
 def set_first_frame(self, frame, r):
     first_line = "{},{},{},{}".format(r[0], r[1], r[2], r[3])
     bb = [int(v) for v in first_line.strip().split(',')]
     init_bb = Rectangle(bb[0] - 1, bb[1] - 1, bb[2],
                         bb[3])  # 0-index in python
     self.tracker.initialize(self.sess, init_bb, frame, self.video_log_dir)
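
set_first_frame above converts an OTB-style, 1-indexed (x, y, w, h) rectangle into the 0-indexed convention the trackers use internally; the boxes written out elsewhere in these examples are shifted back by +1 before reporting. A tiny round-trip sketch of that convention, using a namedtuple in place of the project's Rectangle:

from collections import namedtuple

Rectangle = namedtuple('Rectangle', ['x', 'y', 'width', 'height'])

def otb_to_zero_indexed(r):
    """OTB ground truth is 1-indexed; the tracker works with 0-indexed coordinates."""
    return Rectangle(r[0] - 1, r[1] - 1, r[2], r[3])

def zero_indexed_to_otb(rect):
    return (rect.x + 1, rect.y + 1, rect.width, rect.height)

r = (120, 80, 64, 48)   # x, y, w, h as read from groundtruth_rect.txt
assert zero_indexed_to_otb(otb_to_zero_indexed(r)) == r
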
Beispiel #22
0
    def track(self, first_bbox, frames, bSaveImage=False, SavePath='/tmp'):
        #1. init the tracker
        self.track_init(first_bbox, frames[0])
        include_first = self.track_config['include_first']
        # Run tracking loop
        reported_bboxs = []
        examplar = np.reshape(self.first_image_examplar,
                              [1, self.z_image_size, self.z_image_size, 3])

        cost_time_dict = {
            'load_img': 0.0,
            'crop_img': 0.0,
            'sess_run': 0.0,
            'post_process': 0.0
        }
        for i, filename in tqdm(enumerate(frames)):
            if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
                load_img_start = time.time()
                bgr_img = safe_imread(filename)
                load_img_end = time.time()
                cost_time_dict['load_img'] += load_img_end - load_img_start
                crop_img_start = time.time()

                current_img = cv2.cvtColor(
                    bgr_img,
                    cv2.COLOR_BGR2RGB) if self.image_use_rgb else bgr_img
                instance_img, scale_x, _ = get_crops(
                    current_img, self.current_target_state.search_box,
                    self.z_image_size, self.x_image_size, 0.5)
                instance = np.reshape(
                    instance_img, [1, self.x_image_size, self.x_image_size, 3])
                crop_img_end = time.time()
                cost_time_dict['crop_img'] += crop_img_end - crop_img_start

                sess_run_start = time.time()
                if self.model.model_config.get('BinWindow', False):
                    boxes, scores = self.sess.run(
                        [self.model.topk_bboxes, self.model.topk_scores],
                        feed_dict={
                            self.model.examplar_feed: examplar,
                            self.model.instance_feed: instance,
                            self.model.gt_examplar_boxes:
                            self.gt_examplar_boxes
                        })
                else:
                    boxes, scores = self.sess.run(
                        [self.model.topk_bboxes, self.model.topk_scores],
                        feed_dict={
                            self.model.examplar_feed: examplar,
                            self.model.instance_feed: instance
                        })
                sess_run_end = time.time()
                cost_time_dict['sess_run'] += sess_run_end - sess_run_start

                post_process_start = time.time()

                def padded_size(w, h):
                    context = 0.5 * (w + h)
                    return np.sqrt((w + context) * (h + context))

                # boxes: 1*NA*4, scores: 1*NA
                boxes = boxes[0]  #NA*4
                scores = scores[0]  #NA*2
                scales = padded_size(
                    (boxes[:, 2] - boxes[:, 0]) / scale_x,
                    (boxes[:, 3] - boxes[:, 1]) / scale_x)  #Na
                ratios = (boxes[:, 3] - boxes[:, 1]) / (boxes[:, 2] -
                                                        boxes[:, 0])

                scale_change = scales / self.current_target_state.scale
                scale_change = np.maximum(scale_change, 1.0 / scale_change)
                ratio_change = ratios / (self.current_target_state.ratio)
                ratio_change = np.maximum(ratio_change, 1.0 / ratio_change)
                scale_penalty = np.exp(-(scale_change * ratio_change - 1) *
                                       self.track_config['penalty_k'])
                pscores = scores * scale_penalty

                window_influence = self.track_config['window_influence']
                wpscores = pscores * (
                    1 - window_influence) + self.window * window_influence

                max_index = np.argmax(wpscores)
                corrdinates = boxes[max_index]  #Top1
                #print("Tracking %d/%d with tracking score:%.2f, wpscore: %.2f"%(i+1, len(frames), scores[max_index],wpscores[max_index]))

                # Position within frame in frame coordinates
                res_box = Rectangle(*corrdinate_to_bbox(corrdinates))
                center_x = (self.x_image_size - 1.0) / 2
                center_y = center_x

                delta_x = (res_box.x - center_x) / scale_x
                delta_y = (res_box.y - center_y) / scale_x

                w = res_box.width / scale_x
                h = res_box.height / scale_x
                y = self.current_target_state.target_box.y + delta_y
                x = self.current_target_state.target_box.x + delta_x

                # update search bbox
                alpha = self.track_config[
                    'search_scale_smooth_factor'] * pscores[max_index]
                belta = 0.0
                new_search_cx = max(
                    min(
                        self.img_width,
                        self.current_target_state.target_box.x * belta +
                        (1.0 - belta) * x), 0.0)
                new_search_cy = max(
                    min(
                        self.img_height,
                        self.current_target_state.target_box.y * belta +
                        (1.0 - belta) * y), 0.0)
                new_search_w = max(
                    10.0,
                    min(
                        self.current_target_state.target_box.width *
                        (1.0 - alpha) + alpha * w, self.img_width))
                new_search_h = max(
                    10.0,
                    min(
                        self.current_target_state.target_box.height *
                        (1.0 - alpha) + alpha * h, self.img_height))
                self.current_target_state.target_box = Rectangle(
                    new_search_cx, new_search_cy, new_search_w, new_search_h)
                self.current_target_state.scale = padded_size(
                    new_search_w, new_search_h)
                self.current_target_state.ratio = new_search_h * 1.0 / new_search_w

                # automatically enlarge the search region if the max score falls below conf_threshold
                if (scores[max_index] < self.conf_threshold
                        and self.auto_increase):
                    increase_w = min(new_search_w * 1.5, self.img_width)
                    increase_h = min(new_search_h * 1.5, self.img_height)
                    self.current_target_state.search_box = Rectangle(
                        new_search_cx, new_search_cy, increase_w, increase_h)
                else:
                    self.current_target_state.search_box = self.current_target_state.target_box

                #save and show tracking process
                if bSaveImage:
                    cv2.imwrite(SavePath + "/" + os.path.basename(frames[i]),
                                bgr_img)
                elif self.save_video:
                    x1, y1, x2, y2 = bbox_to_corrdinate(
                        self.current_target_state.search_box)
                    cv2.rectangle(bgr_img, (int(x1), int(y1)),
                                  (int(x2), int(y2)), (0, 255, 0), 2)
                    cv2.putText(bgr_img, "%.2f" % (scores[max_index]),
                                (int(x1), int(y1)), 0, 1, (0, 255, 0), 2)
                    self.video.write(bgr_img)
                elif self.show_video:
                    x1, y1, x2, y2 = bbox_to_corrdinate(
                        self.current_target_state.search_box)
                    cv2.rectangle(bgr_img, (int(x1), int(y1)),
                                  (int(x2), int(y2)), (0, 255, 0), 2)
                    cv2.putText(bgr_img, "%.2f" % (scores[max_index]),
                                (int(x1), int(y1)), 0, 1, (0, 255, 0), 2)
                    cv2.imshow("Tracker", bgr_img)
                    cv2.waitKey(10)
                else:
                    pass
                post_process_end = time.time()
                cost_time_dict[
                    'post_process'] += post_process_end - post_process_start

            else:
                x1, y1, x2, y2 = bbox_to_corrdinate(
                    self.current_target_state.search_box)
                cv2.rectangle(self.first_frame_image, (int(x1), int(y1)),
                              (int(x2), int(y2)), (255, 255, 255), 2)
                #cv2.imshow("Tracker",cv2.cvtColor(self.first_frame_image, cv2.COLOR_RGB2BGR))
                #cv2.imshow("Target",self.first_frame_image)
                #cv2.waitKey(100)

            reported_bbox = convert_bbox_format(
                self.current_target_state.target_box, 'top-left-based')
            reported_bboxs.append(reported_bbox)

        for key in cost_time_dict:
            cost_time_dict[key] /= len(frames)
        #print(cost_time_dict)
        return reported_bboxs
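
The candidate scoring in the loop above follows the usual SiamRPN-style post-processing: each proposal is penalized by how abruptly its padded size and aspect ratio change relative to the current target state, and a cosine window is blended in before the arg-max. Below is a condensed, NumPy-only sketch of that step; penalized_scores is an illustrative name, and the penalty_k / window_influence defaults are placeholders for the values read from track_config.

import numpy as np

def padded_size(w, h):
    context = 0.5 * (w + h)
    return np.sqrt((w + context) * (h + context))

def penalized_scores(scores, widths, heights, prev_scale, prev_ratio,
                     window, penalty_k=0.04, window_influence=0.4):
    """Down-weight boxes with abrupt scale/ratio changes, then blend in a cosine window."""
    scales = padded_size(widths, heights)
    ratios = heights / widths
    scale_change = np.maximum(scales / prev_scale, prev_scale / scales)
    ratio_change = np.maximum(ratios / prev_ratio, prev_ratio / ratios)
    penalty = np.exp(-(scale_change * ratio_change - 1.0) * penalty_k)
    pscores = scores * penalty
    wpscores = pscores * (1 - window_influence) + window * window_influence
    return pscores, wpscores, int(np.argmax(wpscores))

# Three toy candidates scored against the previous target state (equal window weight).
w = np.array([40.0, 32.0, 60.0]); h = np.array([80.0, 64.0, 30.0])
scores = np.array([0.60, 0.58, 0.59])
window = np.ones(3) / 3.0
print(penalized_scores(scores, w, h, prev_scale=padded_size(32.0, 64.0), prev_ratio=2.0, window=window))
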
Beispiel #23
0
    av1 = tf.all_variables()
    tracker = Tracker(model, model_config=model_config, track_config=track_config)

    for video_dir in video_dirs:
      if not osp.isdir(video_dir):
        logging.warning('{} is not a directory, skipping...'.format(video_dir))
        continue

      video_name = osp.basename(video_dir)
      video_log_dir = "tmp"
      mkdir_p(video_log_dir)

      filenames = sort_nicely(glob(video_dir + '/img/*.jpg'))
      first_line = open(video_dir + '/groundtruth_rect.txt').readline()
      bb = [int(v) for v in first_line.strip().replace(' ', ',').replace('\t', ',').split(',')]
      init_bb = Rectangle(bb[0] - 1, bb[1] - 1, bb[2], bb[3])  # 0-index in python
      
      print("######{0},{1}".format(video_dir, len(filenames)))
# =============================================================================
#       for i in range(10):
#           print ("fixed classid: {}".format(i))
#           trajectory = tracker.track(sess, init_bb, filenames, video_log_dir, str(i))
#           with open(osp.join(video_log_dir, 'track_rect.txt'), 'w') as f:
#             for region in trajectory:
#               rect_str = '{},{},{},{}\n'.format(region.x + 1, region.y + 1,
#                                                 region.width, region.height)
#               f.write(rect_str)
#       
#           gt_bboxs = readbbox(osp.join(input_files, 'groundtruth_rect.txt'))
#           pred_bboxs = readbbox(osp.join(video_log_dir, 'track_rect.txt'))
#           print ("MulSiamFC class  --- {0}  IOU --- {1}".format(str(i), cal_IOU(pred_bboxs, gt_bboxs)))
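
The only active logic left in the fragment above is reading the first ground-truth line, which in OTB sequences may be comma-, space-, or tab-delimited. A small hedged sketch of that normalization (parse_groundtruth_line is an illustrative name):

def parse_groundtruth_line(line):
    """Normalize OTB ground-truth delimiters (',', ' ', tab) and return x, y, w, h as ints."""
    cleaned = line.strip().replace(' ', ',').replace('\t', ',')
    return [int(v) for v in cleaned.split(',') if v]

print(parse_groundtruth_line('198\t214\t34\t81\n'))   # [198, 214, 34, 81]
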
Beispiel #24
0
    def track_init(self, first_bbox, first_frame_image_path):
        print(first_frame_image_path)
        first_frame_image = safe_imread(first_frame_image_path)
        self.first_frame_image = cv2.cvtColor(
            first_frame_image,
            cv2.COLOR_BGR2RGB) if self.image_use_rgb else first_frame_image

        self.first_bbox = convert_bbox_format(
            Rectangle(first_bbox[0], first_bbox[1], first_bbox[2],
                      first_bbox[3]), 'center-based')
        first_image_crop, _, target_size = get_crops(self.first_frame_image,
                                                     self.first_bbox,
                                                     self.z_image_size,
                                                     self.x_image_size, 0.5)

        cx = (self.x_image_size - 1) / 2.0
        cy = (self.x_image_size - 1) / 2.0
        gt_examplar_box = np.array([
            cx - target_size[0] / 2.0, cy - target_size[1] / 2.0,
            cx + target_size[0] / 2.0, cy + target_size[1] / 2.0
        ], np.float32)

        self.img_height, self.img_width, _ = self.first_frame_image.shape

        if self.save_video:
            video_name = first_frame_image_path.split('/')[-3] + '.mp4'
            fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
            result_dir = os.path.join(Project_root,
                                      self.track_config['log_dir'])
            if not os.path.exists(result_dir):
                os.makedirs(result_dir)
            video_path = os.path.join(result_dir, video_name)
            print("save video into %s" % (video_path))
            self.video = cv2.VideoWriter(video_path, fourcc, 30,
                                         (self.img_width, self.img_height))

        def center_crop(img, crop_size=127):
            img_shape = np.shape(img)
            center_y = (img_shape[0] - 1) // 2
            center_x = (img_shape[1] - 1) // 2
            h = crop_size
            w = crop_size
            croped_img = img[center_y - h // 2:center_y + h // 2 + 1,
                             center_x - w // 2:center_x + w // 2 + 1]
            assert (croped_img.shape[0] == crop_size)
            return croped_img

        self.first_image_examplar = center_crop(first_image_crop,
                                                self.z_image_size)

        shift_y = (self.x_image_size - self.z_image_size) // 2
        shift_x = shift_y
        x1 = gt_examplar_box[0] - shift_x
        y1 = gt_examplar_box[1] - shift_y
        x2 = gt_examplar_box[2] - shift_x
        y2 = gt_examplar_box[3] - shift_y
        self.gt_examplar_boxes = np.reshape(np.array([x1, y1, x2, y2]), [1, 4])

        self.current_target_state = TargetState(bbox=self.first_bbox)
        self.window = np.tile(
            np.outer(np.hanning(self.score_size),
                     np.hanning(self.score_size)).flatten(),
            5)  #5 is the number of aspect ratio anchors
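
The last statement above builds the penalty window used at inference: the outer product of two Hanning windows, flattened and repeated once per aspect-ratio anchor so that it lines up with the flattened score vector. A stand-alone sketch (the score_size value here is only illustrative):

import numpy as np

def build_anchor_window(score_size, num_anchors=5):
    """2-D Hanning window, flattened and tiled once per anchor."""
    hann_2d = np.outer(np.hanning(score_size), np.hanning(score_size))
    return np.tile(hann_2d.flatten(), num_anchors)

window = build_anchor_window(score_size=17)
print(window.shape)   # (1445,) == 17 * 17 * 5
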
Beispiel #25
0
    def track(self, sess, first_bbox, frames, logdir='/tmp'):
        """Runs tracking on a single image sequence."""
        # Get initial target bounding box and convert to center based
        bbox = convert_bbox_format(first_bbox, 'center-based')

        smooth_rate = self.track_config['smooth']
        update_interval = self.track_config['update_interval']
        feature_balance = self.track_config['feature_balance']

        # Feed in the first frame image to set initial state.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        input_feed = [frames[0], bbox_feed]
        frame2crop_scale = self.siamese_model.initialize(sess, input_feed)
        examplar = self.siamese_model.get_examplar(sess, input_feed)
        examplar_smooth = examplar
        st_template = []
        for i in range(self.siamese_model.train_config['time_range']):
            st_template.append(examplar)
        st_template_np = np.array(st_template)
        self.siamese_model.update_st_template_step(sess, st_template_np)

        # Storing target state
        original_target_height = bbox.height
        original_target_width = bbox.width
        search_center = np.array(
            [get_center(self.x_image_size),
             get_center(self.x_image_size)])
        current_target_state = TargetState(bbox=bbox,
                                           search_pos=search_center,
                                           scale_idx=int(
                                               get_center(self.num_scales)))

        include_first = get(self.track_config, 'include_first', False)
        logging.info('Tracking include first -- {}'.format(include_first))

        # Set padding for refining search region
        img = mpimg.imread(frames[0])
        context_amount = self.track_config['context_amount']
        size_z = self.model_config['z_image_size']
        size_x = self.track_config['x_image_size']
        padding_h = 10
        padding_w = 10

        if original_target_height / original_target_width > 2:  #2
            padding_h = 1.4  #1.4
            padding_w = 6

        # Run tracking loop
        reported_bboxs = []
        for i, filename in enumerate(frames):
            if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
                bbox_feed = [
                    current_target_state.bbox.y, current_target_state.bbox.x,
                    current_target_state.bbox.height,
                    current_target_state.bbox.width
                ]
                input_feed = [filename, bbox_feed]

                outputs, metadata = self.siamese_model.inference_step(
                    sess, input_feed)
                search_scale_list = outputs['scale_xs']
                response = outputs['response']
                response2 = outputs['response2']
                response_size = response.shape[1]

                # Choose the scale whose response map has the highest peak
                if self.num_scales > 1:
                    response_max = np.max(response2, axis=(1, 2))
                    penalties = self.track_config['scale_penalty'] * np.ones(
                        (self.num_scales))
                    current_scale_idx = int(get_center(self.num_scales))
                    penalties[current_scale_idx] = 1.0
                    response_penalized = response_max * penalties
                    best_scale = np.argmax(response_penalized)
                else:
                    best_scale = 0

                response = response[best_scale]
                response2 = response2[best_scale]
                response = feature_balance * response + (
                    1 - feature_balance) * response2
                with np.errstate(
                        all='raise'):  # Raise error if something goes wrong
                    response = response - np.min(response)
                    response = response / np.sum(response)

                if self.window is None:
                    window = np.dot(
                        np.expand_dims(np.hanning(response_size), 1),
                        np.expand_dims(np.hanning(response_size), 0))
                    self.window = window / np.sum(window)  # normalize window
                window_influence = self.track_config['window_influence']
                response = (1 - window_influence
                            ) * response + window_influence * self.window

                # Refine the response
                base_z_size = np.array([
                    current_target_state.bbox.height,
                    current_target_state.bbox.width
                ])
                base_z_context_size = base_z_size + context_amount * np.sum(
                    base_z_size)
                base_s_z = np.sqrt(
                    np.prod(base_z_context_size))  # Canonical size
                base_scale_z = size_z / base_s_z
                d_search = (size_x - size_z) / 2.0
                base_pad = d_search / base_scale_z
                base_s_x = base_s_z + 2 * base_pad

                if base_s_x / current_target_state.bbox.height > padding_h:
                    start_h = np.ceil(
                        response_size *
                        (base_s_x -
                         current_target_state.bbox.height * padding_h) /
                        (2 * base_s_x))
                    end_h = np.floor(response_size - start_h)
                    start_h = int(start_h)
                    end_h = int(end_h)
                    response[0:start_h, :] = 0
                    response[end_h:-1, :] = 0
                if base_s_x / current_target_state.bbox.width > padding_w:
                    start_w = np.ceil(
                        response_size *
                        (base_s_x -
                         current_target_state.bbox.width * padding_w) /
                        (2 * base_s_x))
                    end_w = np.floor(response_size - start_w)
                    start_w = int(start_w)
                    end_w = int(end_w)
                    response[:, :start_w] = 0
                    response[:, end_w:] = 0

                # Find maximum response
                r_max, c_max = np.unravel_index(response.argmax(),
                                                response.shape)

                # Convert from crop-relative coordinates to frame coordinates
                p_coor = np.array([r_max, c_max])
                # displacement from the center in instance final representation ...
                disp_instance_final = p_coor - get_center(response_size)
                # ... in instance feature space ...
                upsample_factor = self.track_config['upsample_factor']
                disp_instance_feat = disp_instance_final / upsample_factor
                # ... Avoid empty position ...
                r_radius = int(response_size / upsample_factor / 2)
                disp_instance_feat = np.maximum(
                    np.minimum(disp_instance_feat, r_radius), -r_radius)
                # ... in instance input ...
                disp_instance_input = disp_instance_feat * self.model_config[
                    'embed_config']['stride']
                # ... in instance original crop (in frame coordinates)
                disp_instance_frame = disp_instance_input / search_scale_list[
                    best_scale]
                # Position within frame in frame coordinates
                y = current_target_state.bbox.y
                x = current_target_state.bbox.x
                y += disp_instance_frame[0]
                x += disp_instance_frame[1]

                # Target scale damping and saturation
                target_scale = current_target_state.bbox.height / original_target_height
                search_factor = self.search_factors[best_scale]
                scale_damp = self.track_config[
                    'scale_damp']  # damping factor for scale update
                target_scale *= ((1 - scale_damp) * 1.0 +
                                 scale_damp * search_factor)
                target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

                # Some book keeping
                height = original_target_height * target_scale
                width = original_target_width * target_scale
                current_target_state.bbox = Rectangle(x, y, width, height)
                current_target_state.scale_idx = best_scale
                current_target_state.search_pos = search_center + disp_instance_input

                # Update the spatial-temporal template using gcn
                if i % update_interval == 0:
                    bbox_feed = [
                        current_target_state.bbox.y,
                        current_target_state.bbox.x,
                        current_target_state.bbox.height,
                        current_target_state.bbox.width
                    ]
                    input_feed = [filename, bbox_feed]
                    current_examplar = self.siamese_model.get_examplar(
                        sess, input_feed)
                    # examplar_smooth[2:4,2:4,:] = current_examplar[2:4,2:4,:]
                    examplar_smooth = current_examplar
                    current_examplar = smooth_rate * examplar_smooth + (
                        1 - smooth_rate) * examplar
                    st_template.pop(1)
                    st_template.append(current_examplar)
                    st_template_np = np.array(st_template)
                    self.siamese_model.update_st_template_step(
                        sess, st_template_np)

                assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
                  'target position in feature space should be no larger than input image size'
                assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
                  'target position in feature space should be no larger than input image size'

                if self.log_level > 0:
                    np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

                    # Select the image crop at the best-scoring scale and convert it to uint8
                    image_cropped = outputs['image_cropped'][
                        best_scale].astype(np.uint8)
                    # Note that imwrite in cv2 assumes the image is in BGR format.
                    # However, the cropped image returned by TensorFlow is RGB.
                    # Therefore, we convert color format using cv2.cvtColor
                    imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                            cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                    np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)),
                            [best_scale])
                    np.save(osp.join(logdir, 'response{}.npy'.format(i)),
                            response)

                    y_search, x_search = current_target_state.search_pos
                    search_scale = search_scale_list[best_scale]
                    target_height_search = height * search_scale
                    target_width_search = width * search_scale
                    bbox_search = Rectangle(x_search, y_search,
                                            target_width_search,
                                            target_height_search)
                    bbox_search = convert_bbox_format(bbox_search,
                                                      'top-left-based')
                    np.save(osp.join(logdir, 'bbox{}.npy'.format(i)), [
                        bbox_search.x, bbox_search.y, bbox_search.width,
                        bbox_search.height
                    ])

            reported_bbox = convert_bbox_format(current_target_state.bbox,
                                                'top-left-based')
            reported_bboxs.append(reported_bbox)
        return reported_bboxs
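
The template maintenance above keeps a short queue of exemplar features (length time_range) and refreshes it every update_interval frames: the newest exemplar is blended with the first-frame exemplar by smooth_rate, the second-oldest entry is popped, and the blended one is appended, so position 0 always holds the original template. Below is a NumPy-only sketch of that bookkeeping with the model call replaced by a placeholder; STTemplate is an illustrative name.

import numpy as np

class STTemplate:
    """Spatial-temporal template queue (sketch of the update above)."""

    def __init__(self, init_examplar, time_range, smooth_rate):
        self.init_examplar = init_examplar
        self.smooth_rate = smooth_rate
        self.templates = [init_examplar for _ in range(time_range)]

    def update(self, current_examplar):
        # Blend the fresh exemplar with the first-frame exemplar, drop the second-oldest
        # entry, append the blend; index 0 keeps the first-frame template.
        blended = (self.smooth_rate * current_examplar
                   + (1 - self.smooth_rate) * self.init_examplar)
        self.templates.pop(1)
        self.templates.append(blended)
        return np.array(self.templates)   # the original feeds this to update_st_template_step

tmpl = STTemplate(np.zeros((6, 6, 256), np.float32), time_range=3, smooth_rate=0.5)
print(tmpl.update(np.ones((6, 6, 256), np.float32)).shape)   # (3, 6, 6, 256)
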
Beispiel #26
0
    def track(self, sess, first_bbox, frames, logdir='/tmp'):
        """Runs tracking on a single image sequence."""
        # Get initial target bounding box and convert to center based
        bbox = convert_bbox_format(first_bbox, 'center-based')

        # Feed in the first frame image to set initial state.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        input_feed = [frames[0], bbox_feed]
        frame2crop_scale = self.siamese_model.initialize(sess, input_feed)

        # Storing target state
        original_target_height = bbox.height
        original_target_width = bbox.width
        search_center = np.array(
            [get_center(self.x_image_size),
             get_center(self.x_image_size)])
        current_target_state = TargetState(bbox=bbox,
                                           search_pos=search_center,
                                           scale_idx=int(
                                               get_center(self.num_scales)))

        include_first = get(self.track_config, 'include_first', False)
        logging.info('Tracking include first -- {}'.format(include_first))

        # Run tracking loop
        reported_bboxs = []
        for i, filename in enumerate(frames):
            if i > 0 or include_first:  # We don't really want to process the first image unless intended to do so.
                bbox_feed = [
                    current_target_state.bbox.y, current_target_state.bbox.x,
                    current_target_state.bbox.height,
                    current_target_state.bbox.width
                ]
                input_feed = [filename, bbox_feed]

                outputs, metadata = self.siamese_model.inference_step(
                    sess, input_feed)
                search_scale_list = outputs['scale_xs']
                response = outputs['response']
                response_size = response.shape[1]

                # Choose the scale whose response map has the highest peak
                if self.num_scales > 1:
                    response_max = np.max(response, axis=(1, 2))
                    penalties = self.track_config['scale_penalty'] * np.ones(
                        (self.num_scales))
                    current_scale_idx = int(get_center(self.num_scales))
                    penalties[current_scale_idx] = 1.0
                    response_penalized = response_max * penalties
                    best_scale = np.argmax(response_penalized)
                    if np.max(response_max) < 0:
                        logging.warning('MAX_RESPONSE LESS THAN ZERO!')
                        # best_scale = current_scale_idx
                else:
                    best_scale = 0

                response = response[best_scale]

                with np.errstate(
                        all='raise'):  # Raise error if something goes wrong
                    response = response - np.min(response)
                    response = response / np.sum(response)

                if self.window is None:
                    window = np.dot(
                        np.expand_dims(np.hanning(response_size), 1),
                        np.expand_dims(np.hanning(response_size), 0))
                    self.window = window / np.sum(window)  # normalize window
                window_influence = self.track_config['window_influence']
                response = (1 - window_influence
                            ) * response + window_influence * self.window

                # Find maximum response
                r_max, c_max = np.unravel_index(response.argmax(),
                                                response.shape)

                # Convert from crop-relative coordinates to frame coordinates
                p_coor = np.array([r_max, c_max])
                # displacement from the center in instance final representation ...
                disp_instance_final = p_coor - get_center(response_size)
                # ... in instance feature space ...
                upsample_factor = self.track_config['upsample_factor']
                disp_instance_feat = disp_instance_final / upsample_factor
                # ... Avoid empty position ...
                r_radius = int(response_size / upsample_factor / 2)
                disp_instance_feat = np.maximum(
                    np.minimum(disp_instance_feat, r_radius), -r_radius)
                # ... in instance input ...
                disp_instance_input = disp_instance_feat * self.model_config[
                    'embed_config']['stride']
                # ... in instance original crop (in frame coordinates)
                disp_instance_frame = disp_instance_input / search_scale_list[
                    best_scale]
                # Position within frame in frame coordinates
                y = current_target_state.bbox.y
                x = current_target_state.bbox.x
                y += disp_instance_frame[0]
                x += disp_instance_frame[1]

                # Target scale damping and saturation
                target_scale = current_target_state.bbox.height / original_target_height
                search_factor = self.search_factors[best_scale]
                scale_damp = self.track_config[
                    'scale_damp']  # damping factor for scale update
                target_scale *= ((1 - scale_damp) * 1.0 +
                                 scale_damp * search_factor)
                target_scale = np.maximum(0.2, np.minimum(5.0, target_scale))

                # Some book keeping
                height = original_target_height * target_scale
                width = original_target_width * target_scale
                current_target_state.bbox = Rectangle(x, y, width, height)
                current_target_state.scale_idx = best_scale
                current_target_state.search_pos = search_center + disp_instance_input

                assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
                  'target position in feature space should be no larger than input image size'
                assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
                  'target position in feature space should be no larger than input image size'

                if self.log_level > 0:
                    np.save(osp.join(logdir, 'num_frames.npy'), [i + 1])

                    # Select the image crop at the best-scoring scale and convert it to uint8
                    image_cropped = outputs['image_cropped'][
                        best_scale].astype(np.uint8)
                    # Note that imwrite in cv2 assumes the image is in BGR format.
                    # However, the cropped image returned by TensorFlow is RGB.
                    # Therefore, we convert color format using cv2.cvtColor
                    imwrite(osp.join(logdir, 'image_cropped{}.jpg'.format(i)),
                            cv2.cvtColor(image_cropped, cv2.COLOR_RGB2BGR))

                    np.save(osp.join(logdir, 'best_scale{}.npy'.format(i)),
                            [best_scale])
                    np.save(osp.join(logdir, 'response{}.npy'.format(i)),
                            response)

                    y_search, x_search = current_target_state.search_pos
                    search_scale = search_scale_list[best_scale]
                    target_height_search = height * search_scale
                    target_width_search = width * search_scale
                    bbox_search = Rectangle(x_search, y_search,
                                            target_width_search,
                                            target_height_search)
                    bbox_search = convert_bbox_format(bbox_search,
                                                      'top-left-based')
                    np.save(osp.join(logdir, 'bbox{}.npy'.format(i)), [
                        bbox_search.x, bbox_search.y, bbox_search.width,
                        bbox_search.height
                    ])

            reported_bbox = convert_bbox_format(current_target_state.bbox,
                                                'top-left-based')
            reported_bboxs.append(reported_bbox)
        return reported_bboxs
Beispiel #27
0
def main(checkpoint, input_files):
    os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu()

    model_config, _, track_config = load_cfgs(checkpoint)
    track_config['log_level'] = 1

    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        restore_fn = model.build_graph_from_config(model_config, track_config,
                                                   checkpoint)
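    # Finalize the graph so no new ops can be added during tracking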
    g.finalize()

    if not osp.isdir(track_config['log_dir']):
        logging.info('Creating inference directory: %s',
                     track_config['log_dir'])
        mkdir_p(track_config['log_dir'])

    video_dirs = []
    for file_pattern in input_files.split(","):
        video_dirs.extend(glob(file_pattern))
    logging.info("Running tracking on %d videos matching %s", len(video_dirs),
                 input_files)

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(graph=g, config=sess_config) as sess:
        restore_fn(sess)

        tracker = Tracker(model,
                          model_config=model_config,
                          track_config=track_config)

        for video_dir in video_dirs:
            if not osp.isdir(video_dir):
                logging.warning(
                    '{} is not a directory, skipping...'.format(video_dir))
                continue

            video_name = osp.basename(video_dir)
            video_log_dir = osp.join(track_config['log_dir'], video_name)
            mkdir_p(video_log_dir)

            filenames = sort_nicely(
                glob(video_dir + '/img/*.jpg') +
                glob(video_dir + '/img/*.png'))
            first_line = open(video_dir + '/groundtruth_rect.txt').readline()
            bb = [int(v) for v in first_line.strip().split(',')]
            init_bb = Rectangle(bb[0] - 1, bb[1] - 1, bb[2],
                                bb[3])  # 0-index in python

            trajectory = tracker.track(sess, init_bb, filenames, video_log_dir)
            with open(osp.join(video_log_dir, 'track_rect.txt'), 'w') as f:
                for region in trajectory:
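                    # Shift back to OTB's 1-indexed coordinates when writing results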
                    rect_str = '{},{},{},{}\n'.format(region.x + 1,
                                                      region.y + 1,
                                                      region.width,
                                                      region.height)
                    f.write(rect_str)

            with open(osp.join(video_log_dir, 'bboxes.json'), 'r') as f:
                data = json.load(f)

            final_output = {}
            for i, fname in enumerate(data.keys()):
                img = np.array(Image.open(fname).convert('RGB'))
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)  # PIL loads RGB; OpenCV drawing and imwrite expect BGR
                #print(img,img.shape)
                bboxes = data[fname]
                bboxes = [[float(v) for v in line.strip().split(',')]
                          for line in bboxes]
                arr = []
                for x, y, w, h in bboxes:
                    ymin, xmin = int(y), int(x)
                    ymax, xmax = int(y + h), int(x + w)
                    img = cv2.rectangle(img, (xmin, ymin), (xmax, ymax),
                                        (0, 255, 0, 255), 2)
                    arr.append([ymin, xmin, ymax, xmax])
                final_output[fname] = arr
                name = osp.basename(fname)
                name = osp.splitext(name)[0]

                H, W, _ = img.shape  # numpy image shape is (height, width, channels)
                cv2.imshow("Pic", cv2.resize(img, (W // 2, H // 2)))  # cv2.resize takes (width, height)
                cv2.waitKey(0)

                out_folder = osp.join(video_log_dir, "Outputs")
                mkdir_p(out_folder)
                cv2.imwrite(osp.join(out_folder, f"{name}_bbox.png"), img)

            with open(osp.join(out_folder, "output.json"), "w") as f:
                json.dump(final_output, f, indent=4)

            cv2.destroyAllWindows()
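A minimal invocation sketch for main() above, not part of the original example: the argparse interface is an assumption, and the checkpoint path and glob pattern in the help text are placeholders.

if __name__ == '__main__':
    # Hypothetical CLI wrapper; the original project may expose main() through a different entry point.
    import argparse

    parser = argparse.ArgumentParser(
        description='Run the tracker over video directories matching a glob pattern.')
    parser.add_argument('--checkpoint', required=True,
                        help='Checkpoint directory to load (placeholder path).')
    parser.add_argument('--input_files', required=True,
                        help='Comma-separated glob patterns of video directories, '
                             'e.g. "data/OTB/*" (placeholder).')
    args = parser.parse_args()
    main(args.checkpoint, args.input_files)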