Example 1
def track_tune(tracker, net, video, config):
    arch = config['arch']
    benchmark_name = config['benchmark']
    resume = config['resume']
    hp = config['hp']  # scale_step, scale_penalty, scale_lr, window_influence

    tracker_path = join('test', (benchmark_name + resume.split('/')[-1].split('.')[0] +
                                 '_step_{:.4f}'.format(hp['scale_step']) +
                                 '_penalty_s_{:.4f}'.format(hp['scale_penalty']) +
                                 '_w_influence_{:.4f}'.format(hp['w_influence']) +
                                 '_scale_lr_{:.4f}'.format(hp['scale_lr'])).replace('.', '_'))  # strip '.' from dir name

    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    # occupy the result file so parallel runs skip this video
    if not os.path.exists(result_path):
        fin = open(result_path, 'w')
        fin.close()
    else:
        return tracker_path

    start_frame, lost_times, toc = 0, 0, 0

    regions = []  # results and state flags (1 init / 2 lost / 0 skip)
    image_files, gt = video['image_files'], video['gt']
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz, net, hp=hp)  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append([float(1)] if 'VOT' in benchmark_name else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append(location)

    with open(result_path, "w") as fin:
        for x in regions:
            p_bbox = x.copy()
            fin.write(
                ','.join([str(i + 1) if idx == 0 or idx == 1 else str(i) for idx, i in enumerate(p_bbox)]) + '\n')

    return tracker_path
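
Note: every snippet on this page leans on the same small geometry helpers from the SiamDW-style codebases (get_axis_aligned_bbox, cxy_wh_2_rect, and friends), which the examples never define. The sketch below is a plausible reconstruction inferred from how the snippets use them, not the projects' exact code:

import numpy as np

def get_axis_aligned_bbox(region):
    # Sketch: collapse a VOT 4-point polygon (x1, y1, ..., x4, y4) or a
    # plain [x, y, w, h] rect into center (cx, cy) and size (w, h).
    region = np.asarray(region, dtype=float)
    if region.size == 8:  # rotated polygon annotation
        xs, ys = region[0::2], region[1::2]
        return xs.mean(), ys.mean(), xs.max() - xs.min(), ys.max() - ys.min()
    x, y, w, h = region
    return x + w / 2, y + h / 2, w, h

def cxy_wh_2_rect(pos, sz):
    # Center + size -> top-left rect [x, y, w, h], as the loops above expect.
    return np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])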
Example 2
    def track(self, image_file):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)  # align with training

        self.state = self.siam_tracker.track(self.state, im)
        location = cxy_wh_2_rect(self.state["target_pos"], self.state["target_sz"])
        return location
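
A wrapper like this is driven one frame path at a time once its init method has populated self.state. A minimal, hypothetical driver loop (wrapper and image_files are assumed names):

# Hypothetical usage: `wrapper` is an instance of the class above,
# already initialized on the first frame of the sequence.
for image_file in image_files[1:]:
    x, y, w, h = wrapper.track(image_file)  # returns [x, y, w, h]
    print('{}: {:.1f},{:.1f},{:.1f},{:.1f}'.format(image_file, x, y, w, h))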
Example 3
def track_video(tracker, model, video_path, init_box=None):

    assert os.path.isfile(video_path), "please provide a valid video file"

    cap = cv2.VideoCapture(video_path)
    display_name = 'Video: {}'.format(video_path.split('/')[-1])
    cv2.namedWindow(display_name, cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
    cv2.resizeWindow(display_name, 960, 720)
    success, frame = cap.read()
    #cv2.imshow(display_name, frame)

    if not success:
        print("Read failed.")
        exit(-1)

    # init
    if init_box is not None:
        lx, ly, w, h = init_box
        target_pos = np.array([lx + w / 2, ly + h / 2])
        target_sz = np.array([w, h])
        state = tracker.init(frame, target_pos, target_sz,
                             model)  # init tracker

    else:
        while True:

            frame_disp = frame.copy()

            cv2.putText(frame_disp, 'Select target ROI and press ENTER',
                        (20, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1,
                        (0, 0, 255), 1)

            lx, ly, w, h = cv2.selectROI(display_name,
                                         frame_disp,
                                         fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = tracker.init(frame_disp, target_pos, target_sz,
                                 model)  # init tracker

            break

    while True:
        ret, frame = cap.read()

        if frame is None:
            break  # end of video: fall through to release the capture

        frame_disp = frame.copy()

        # Draw box
        state = tracker.track(state, frame_disp)  # track
        location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
        x1, y1, x2, y2 = int(location[0]), int(
            location[1]), int(location[0] + location[2]), int(location[1] +
                                                              location[3])

        cv2.rectangle(frame_disp, (x1, y1), (x2, y2), (0, 255, 0), 5)

        font_color = (0, 0, 0)
        cv2.putText(frame_disp, 'Tracking!', (20, 30),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press r to reset', (20, 55),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press q to quit', (20, 80),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)

        # Display the resulting frame
        cv2.imshow(display_name, frame_disp)
        key = cv2.waitKey(1)
        if key == ord('q'):
            break
        elif key == ord('r'):
            ret, frame = cap.read()
            frame_disp = frame.copy()

            cv2.putText(frame_disp, 'Select target ROI and press ENTER',
                        (20, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1.5,
                        (0, 0, 0), 1)

            cv2.imshow(display_name, frame_disp)
            lx, ly, w, h = cv2.selectROI(display_name,
                                         frame_disp,
                                         fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = tracker.init(frame_disp, target_pos, target_sz, model)

    # When everything is done, release the capture
    cap.release()
    cv2.destroyAllWindows()
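
Typical invocations (the paths and the box below are placeholders); passing init_box skips the interactive ROI selection:

# Hypothetical calls: `tracker` and `model` come from the surrounding project.
track_video(tracker, model, 'videos/demo.mp4')            # select ROI by hand
track_video(tracker, model, 'videos/demo.mp4',
            init_box=[120, 80, 60, 90])                   # [x, y, w, h]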
Example 4
def track_images(tracker, model, images_path, init_box=None):

    assert os.path.isdir(images_path), "please provide a valid folder name"

    display_name = 'Video: {}'.format(images_path.split('/')[-1])
    cv2.namedWindow(display_name, cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
    cv2.resizeWindow(display_name, 960, 720)

    im_paths = sorted(
        os.path.join(images_path, f) for f in os.listdir(images_path)
        if f.endswith('.jpg'))  # sort so frames are tracked in order
    if len(im_paths) == 0:
        print("no jpg images found in dir")
        exit(-1)

    frame = cv2.imread(im_paths[0])
    #cv2.imshow(im_paths[0].split('/')[-1], frame)

    # init
    if init_box is not None:
        lx, ly, w, h = init_box
        target_pos = np.array([lx + w / 2, ly + h / 2])
        target_sz = np.array([w, h])
        state = tracker.init(frame, target_pos, target_sz,
                             model)  # init tracker

    else:
        while True:

            frame_disp = frame.copy()

            cv2.putText(frame_disp, 'Select target ROI and press ENTER',
                        (20, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1,
                        (0, 0, 255), 1)

            lx, ly, w, h = cv2.selectROI(display_name,
                                         frame_disp,
                                         fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = tracker.init(frame_disp, target_pos, target_sz,
                                 model)  # init tracker

            break

    path_idx = 0
    while path_idx + 1 < len(im_paths):  # stop before running past the last frame
        time.sleep(2)
        path_idx += 1
        frame = cv2.imread(im_paths[path_idx])

        if frame is None:
            break

        frame_disp = frame.copy()

        # Draw box
        state = tracker.track(state, frame_disp)  # track
        location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
        x1, y1, x2, y2 = int(location[0]), int(
            location[1]), int(location[0] + location[2]), int(location[1] +
                                                              location[3])

        cv2.rectangle(frame_disp, (x1, y1), (x2, y2), (0, 255, 0), 5)

        font_color = (0, 0, 0)
        cv2.putText(frame_disp, 'Tracking!', (20, 30),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press r to reset', (20, 55),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press q to quit', (20, 80),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)

        # Display the resulting frame
        cv2.imshow(display_name, frame_disp)
        key = cv2.waitKey(1)
        if key == ord('q'):
            break
        elif key == ord('r'):
            path_idx += 1
            if path_idx >= len(im_paths):
                break
            frame = cv2.imread(im_paths[path_idx])
            frame_disp = frame.copy()

            cv2.putText(frame_disp, 'Select target ROI and press ENTER',
                        (20, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1.5,
                        (0, 0, 0), 1)

            cv2.imshow(display_name, frame_disp)
            lx, ly, w, h = cv2.selectROI(display_name,
                                         frame_disp,
                                         fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = tracker.init(frame_disp, target_pos, target_sz, model)

    # When everything is done, close the windows (no capture to release here)
    cv2.destroyAllWindows()
Example 5
def track_tune(tracker, net, video, config):
    arch = config['arch']
    benchmark_name = config['benchmark']
    resume = config['resume']
    hp = config['hp']  # scale_step, scale_penalty, scale_lr, window_influence

    tracker_path = join('test',
                        (benchmark_name + resume.split('/')[-1].split('.')[0] +
                         '_step_{:.4f}'.format(hp['scale_step']) +
                         '_penalty_s_{:.4f}'.format(hp['scale_penalty']) +
                         '_w_influence_{:.4f}'.format(hp['w_influence']) +
                         '_scale_lr_{:.4f}'.format(hp['scale_lr'])).replace(
                             '.', '_'))  # strip '.' from dir name

    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    if 'VOT' in benchmark_name:
        baseline_path = join(tracker_path, 'baseline')
        video_path = join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = join(video_path, video['name'] + '_001.txt')
    elif 'GOT10K' in benchmark_name:
        re_video_path = os.path.join(tracker_path, video['name'])
        if not exists(re_video_path):
            os.makedirs(re_video_path)
        result_path = os.path.join(re_video_path,
                                   '{:s}.txt'.format(video['name']))
    else:
        result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    # occupy the result file so parallel runs skip this video
    # (at function level, so it also applies to the VOT/GOT10K branches)
    if not os.path.exists(result_path):
        fin = open(result_path, 'w')
        fin.close()
    else:
        if benchmark_name.startswith('OTB'):
            return tracker_path
        elif benchmark_name.startswith('VOT') or benchmark_name.startswith(
                'GOT10K'):
            return 0
        else:
            print('benchmark not supported now')
            return

    start_frame, lost_times, toc = 0, 0, 0

    regions = []  # results and state flags (1 init / 2 lost / 0 skip)
    image_files, gt = video['image_files'], video['gt']
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz, net,
                                 hp=hp)  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append([float(1)] if 'VOT' in benchmark_name else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f],
                                 location) if 'VOT' in benchmark_name else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append([float(2)])
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append([float(0)])

    # save results (this block belongs after the tracking loop; keeping it
    # inside the loop would write and return after the first frame)
    if 'OTB' in benchmark_name or 'LASOT' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([
                    str(i + 1) if idx == 0 or idx == 1 else str(i)
                    for idx, i in enumerate(p_bbox)
                ]) + '\n')
    elif 'VISDRONE' in benchmark_name or 'GOT10K' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([str(i) for i in p_bbox]) + '\n')
    elif 'VOT' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')

    if 'OTB' in benchmark_name or 'VIS' in benchmark_name or 'VOT' in benchmark_name or 'GOT10K' in benchmark_name:
        return tracker_path
    else:
        print('benchmark not supported now')
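
For VOT, the result file mixes state flags and boxes, one line per frame: 1 marks an init frame, 2 a lost frame, and 0 the skipped frames inside the five-frame restart window. Since this tuner stores the flags as one-element float lists (the evaluation scripts in later examples store plain ints instead), a file written here would look roughly like this (box values are illustrative):

1.0
112.4,85.0,64.2,91.7
113.1,85.6,64.0,91.5
2.0
0.0
0.0
0.0
0.0
1.0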
Example 6
def track(siam_tracker, online_tracker, siam_net, video, args):
    start_frame, toc = 0, 0

    # save result to evaluate
    if args.epoch_test:
        suffix = args.resume.split('/')[-1]
        suffix = suffix.split('.')[0]
        tracker_path = os.path.join('result', args.dataset, args.arch + suffix)
    else:
        tracker_path = os.path.join('result', args.dataset, args.arch)

    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    if 'VOT' in args.dataset:
        baseline_path = os.path.join(tracker_path, 'baseline')
        video_path = os.path.join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = os.path.join(video_path, video['name'] + '_001.txt')
    else:
        result_path = os.path.join(tracker_path,
                                   '{:s}.txt'.format(video['name']))

    if os.path.exists(result_path):
        return  # result already exists: lets multi-GPU runs split the work

    regions = []
    lost = 0

    image_files, gt = video['image_files'], video['gt']

    for f, image_file in enumerate(image_files):

        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)  # align with training
        rgb_im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)  # convert after the gray check

        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])

            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])

            state = siam_tracker.init(im, target_pos, target_sz,
                                      siam_net)  # init tracker

            if args.online:
                online_tracker.init(im,
                                    rgb_im,
                                    siam_net,
                                    target_pos,
                                    target_sz,
                                    True,
                                    dataname=args.dataset,
                                    resume=args.resume)

            # location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        elif f > start_frame:  # tracking
            if args.online:
                state = online_tracker.track(im, rgb_im, siam_tracker, state)
            else:
                state = siam_tracker.track(state, im)

            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f],
                                 location) if 'VOT' in args.dataset else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append(2)
                start_frame = f + 5
                lost += 1
        else:
            regions.append(0)

        toc += cv2.getTickCount() - tic

    with open(result_path, "w") as fin:
        if 'VOT' in args.dataset:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')
        elif 'OTB' in args.dataset or 'LASOT' in args.dataset:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([
                    str(i + 1) if idx == 0 or idx == 1 else str(i)
                    for idx, i in enumerate(p_bbox)
                ]) + '\n')
        elif 'VISDRONE' in args.dataset or 'GOT10K' in args.dataset or 'TN' in args.dataset:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([str(i) for i in p_bbox]) + '\n')

    toc /= cv2.getTickFrequency()
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps  Lost {}'.format(
        video['name'], toc, f / toc, lost))
Example 7
def track(video, args):
    start_frame, toc = 0, 0

    # save result to evaluate
    tracker_path = os.path.join('result', args.dataset)

    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    if 'VOT' in args.dataset:
        baseline_path = os.path.join(tracker_path, 'baseline')
        video_path = os.path.join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = os.path.join(video_path, video['name'] + '_001.txt')
    else:
        result_path = os.path.join(tracker_path,
                                   '{:s}.txt'.format(video['name']))

    if os.path.exists(result_path):
        return  # result already exists: lets multi-GPU runs split the work

    regions = []
    lost = 0
    # for RGB-T (infrared + visible) split test; key names follow the dataset loader
    in_image_files, rgb_image_files, gt = (video['infrared_imgs'],
                                           video['visiable_imgs'], video['gt'])

    for f, in_f in enumerate(in_image_files):

        in_im = cv2.imread(in_f)
        in_im = cv2.cvtColor(in_im, cv2.COLOR_BGR2RGB)  # align with training

        rgb_im = cv2.imread(rgb_image_files[f])
        rgb_im = cv2.cvtColor(rgb_im, cv2.COLOR_BGR2RGB)  # align with training

        tic = cv2.getTickCount()

        if f == start_frame:  # init
            print('===============> init tracker')
            tracker = AASTracker(rgb_im, in_im, gt[f])

            regions.append(1)
        elif f > start_frame:  # tracking
            state = tracker.track(rgb_im, in_im)

            pos = np.array([state[0], state[1]])
            sz = np.array([state[2], state[3]])

            location = cxy_wh_2_rect(pos, sz)
            b_overlap = poly_iou(gt[f],
                                 location) if 'VOT' in args.dataset else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append(2)
                start_frame = f + 5
                lost += 1
        else:
            regions.append(0)

        toc += cv2.getTickCount() - tic

    with open(result_path, "w") as fin:
        if 'VOT' in args.dataset:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')

    toc /= cv2.getTickFrequency()
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps  Lost {}'.format(
        video['name'], toc, f / toc, lost))
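
poly_iou, from the projects' utilities, gates the lost/restart logic in these scripts: any positive overlap between ground truth and prediction keeps the run alive. For axis-aligned [x, y, w, h] boxes it reduces to plain IoU; a simplified stand-in sketch (the real helper also handles rotated 8-point VOT polygons):

def rect_iou(a, b):
    # Axis-aligned IoU over [x, y, w, h] boxes; stand-in sketch for poly_iou.
    ax2, ay2 = a[0] + a[2], a[1] + a[3]
    bx2, by2 = b[0] + b[2], b[1] + b[3]
    iw = max(0.0, min(ax2, bx2) - max(a[0], b[0]))
    ih = max(0.0, min(ay2, by2) - max(a[1], b[1]))
    inter = iw * ih
    union = a[2] * a[3] + b[2] * b[3] - inter
    return inter / union if union > 0 else 0.0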
Example 8
def track_video(tracker, model, video_path, init_box=None):

    cap = cv2.VideoCapture(video_path)
    display_name = 'Video: {}'.format(video_path.split('/')[-1])
    cv2.namedWindow(display_name, cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
    cv2.resizeWindow(display_name, 960, 720)
    success, frame = cap.read()
    cv2.imshow(display_name, frame)

    if success is not True:
        print("Read failed.")
        exit(-1)

    # open the bbox log before the init step so the first box can be recorded
    assert os.path.exists(
        './bbox'
    ), "Please create a directory called bbox to store the values of the bounding boxes"
    f = open('./bbox/SiamRPN.txt', 'a+')

    # init
    if init_box is not None:
        lx, ly, w, h = init_box
        f.write("{}\t{}\t{}\t{}\n".format(lx, ly, w, h))
        target_pos = np.array([lx + w / 2, ly + h / 2])
        target_sz = np.array([w, h])
        state = tracker.init(frame, target_pos, target_sz,
                             model)  # init tracker

    else:
        while True:

            frame_disp = frame.copy()

            cv2.putText(frame_disp, 'Select target ROI and press ENTER',
                        (20, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1,
                        (0, 0, 255), 1)

            lx, ly, w, h = cv2.selectROI(display_name,
                                         frame_disp,
                                         fromCenter=False)
            f.write("{}\t{}\t{}\t{}\n".format(lx, ly, w, h))
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = tracker.init(frame_disp, target_pos, target_sz,
                                 model)  # init tracker

            break
    i = 0
    restarts = 0
    box_color = (0, 255, 0)
    while True:
        ret, frame = cap.read()

        if frame is None:
            return

        frame_disp = frame.copy()

        timer = cv2.getTickCount()
        # Draw box
        state = tracker.track(state, frame_disp)  # track

        fps = cv2.getTickFrequency() / (cv2.getTickCount() - timer)
        location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
        x1, y1, x2, y2 = int(location[0]), int(
            location[1]), int(location[0] + location[2]), int(location[1] +
                                                              location[3])
        font_color = (0, 0, 255)
        cv2.rectangle(frame_disp, (x1, y1), (x2, y2), box_color, 5)
        cv2.putText(frame_disp, 'Restarts: ' + str(restarts), (20, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, font_color, 2)
        cv2.putText(frame_disp, 'Frame Number: ' + '{:04d}'.format(i),
                    (20, 55), cv2.FONT_HERSHEY_SIMPLEX, 1, font_color, 2)
        cv2.putText(frame_disp, 'FPS: ' + str(fps), (20, 80),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, font_color, 2)
        frame_name = '{:04d}'.format(i)
        assert os.path.exists(
            "./output"), "Please create a directory to store the output files"
        cv2.imwrite("./output/" + str(frame_name) + ".png", frame_disp)

        x = x1
        y = y1
        w = x2 - x1
        h = y2 - y1
        bbox = [x, y, w, h]
        box = [str(i) for i in bbox]
        f.write(box[0] + "\t" + box[1] + "\t" + box[2] + "\t" + box[3] + "\n")

        # Display the resulting frame
        cv2.imshow(display_name, frame_disp)
        key = cv2.waitKey(1)
        if key == ord('q'):
            break
        elif key == ord('r'):
            restarts = restarts + 1
            if restarts % 2 == 1:
                box_color = (255, 0, 0)
            else:
                box_color = (0, 255, 0)

            cv2.putText(frame_disp, 'Select target ROI and press ENTER',
                        (20, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1.5,
                        (0, 0, 0), 1)

            cv2.imshow(display_name, frame_disp)
            lx, ly, w, h = cv2.selectROI(display_name,
                                         frame_disp,
                                         fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = tracker.init(frame_disp, target_pos, target_sz, model)
        i = i + 1
    # When everything is done, release the capture
    os.rename('./bbox/SiamRPN.txt', './bbox/SiamRPN_' + str(restarts) + '.txt')
    f.close()
    cap.release()
    cv2.destroyAllWindows()
Example 9
def track_tune(tracker, net, video, config):
    arch = config['arch']
    benchmark_name = config['benchmark']
    resume = config['resume']
    hp = config[
        'hp']  # penalty_k, scale_lr, window_influence, adaptive size (for vot2017 or later)

    tracker_path = join('test',
                        (benchmark_name + resume.split('/')[-1].split('.')[0] +
                         '_small_size_{:.4f}'.format(hp['small_sz']) +
                         '_big_size_{:.4f}'.format(hp['big_sz']) +
                         '_penalty_k_{:.4f}'.format(hp['penalty_k']) +
                         '_w_influence_{:.4f}'.format(hp['window_influence']) +
                         '_scale_lr_{:.4f}'.format(hp['lr'])).replace(
                             '.', '_'))  # strip '.' from dir name

    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    if 'VOT' in benchmark_name:
        baseline_path = join(tracker_path, 'baseline')
        video_path = join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = join(video_path, video['name'] + '_001.txt')
    else:
        raise ValueError('Only VOT is supported')

    # occupy the result file so parallel runs skip this video
    if not os.path.exists(result_path):
        fin = open(result_path, 'w')
        fin.close()
    else:
        if benchmark_name.startswith('VOT'):
            return 0
        else:
            raise ValueError('Only VOT is supported')

    start_frame, lost_times, toc = 0, 0, 0
    regions = []  # results and state flags (1 init / 2 lost / 0 skip)
    image_files, gt = video['image_files'], video['gt']
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz, net,
                                 hp=hp)  # init tracker
            regions.append([float(1)] if 'VOT' in benchmark_name else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f],
                                 location) if 'VOT' in benchmark_name else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append([float(2)])
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append([float(0)])

    # return the raw per-frame regions; this tuner only supports VOT
    if benchmark_name.startswith('VOT'):
        return regions
    else:
        raise ValueError('Only VOT is supported')
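
The config dict this tuner expects can be read straight off the code: top-level keys arch, benchmark, resume, plus an hp dict whose values are baked into the output path. A hypothetical call with placeholder values:

# Hypothetical tuning config; all values below are placeholders.
config = {
    'arch': 'SiamFCRes22',                     # placeholder architecture name
    'benchmark': 'VOT2017',
    'resume': 'snapshot/checkpoint_e30.pth',   # placeholder checkpoint path
    'hp': {
        'small_sz': 255, 'big_sz': 287,        # adaptive search sizes
        'penalty_k': 0.04, 'window_influence': 0.40, 'lr': 0.30,
    },
}
regions = track_tune(tracker, net, video, config)  # per-frame regions (VOT only)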
Example 10
def main():
    # load config
    cfg_from_file(args.config)

    dataset_root = os.path.join('dataset', args.dataset)

    # create model
    net = ModelBuilder()
    checkpoint = torch.load(args.model)
    if 'state_dict' in checkpoint:
        net.load_state_dict(checkpoint['state_dict'])
    else:
        net.load_state_dict(checkpoint)
    net.cuda().eval()
    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)

    model_name = args.save_name
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    target_pos, target_sz = np.array([cx,
                                                      cy]), np.array([w, h])
                    state = CGACD_init(img, target_pos, target_sz, net)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    state = CGACD_track(state, img)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    pred_polygon = [
                        pred_bbox[0], pred_bbox[1],
                        pred_bbox[0] + pred_bbox[2], pred_bbox[1],
                        pred_bbox[0] + pred_bbox[2],
                        pred_bbox[1] + pred_bbox[3], pred_bbox[0],
                        pred_bbox[1] + pred_bbox[3]
                    ]
                    overlap = vot_overlap(gt_bbox, pred_polygon,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    target_pos = state['target_pos']
                    target_sz = state['target_sz']
                    cv2.rectangle(img, (int(target_pos[0] - target_sz[0] / 2),
                                        int(target_pos[1] - target_sz[1] / 2)),
                                  (int(target_pos[0] + target_sz[0] / 2),
                                   int(target_pos[1] + target_sz[1] / 2)),
                                  (0, 255, 0), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2,
                                cv2.LINE_AA)
                    cv2.imshow(video.name, img)
                    cv2.moveWindow(video.name, 100, 100)
                    key = cv2.waitKey(1)
                    if key == 27:
                        break
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('result', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    if 'OTB' in args.dataset:
                        target_pos, target_sz = rect1_2_cxy_wh(gt_bbox)
                    else:
                        cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                        target_pos, target_sz = np.array([cx, cy
                                                          ]), np.array([w, h])
                    state = CGACD_init(img, target_pos, target_sz, net)
                    if 'OTB' in args.dataset:
                        pred_bbox = cxy_wh_2_rect1(state['target_pos'],
                                                   state['target_sz'])
                    else:
                        pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                                  state['target_sz'])
                    pred_bboxes.append(pred_bbox)
                else:
                    state = CGACD_track(state, img)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    pred_bboxes.append(pred_bbox)
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    target_pos = state['target_pos']
                    target_sz = state['target_sz']
                    cv2.rectangle(img, (int(target_pos[0] - target_sz[0] / 2),
                                        int(target_pos[1] - target_sz[1] / 2)),
                                  (int(target_pos[0] + target_sz[0] / 2),
                                   int(target_pos[1] + target_sz[1] / 2)),
                                  (0, 255, 0), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2,
                                cv2.LINE_AA)
                    cv2.imshow(video.name, img)
                    cv2.moveWindow(video.name, 100, 100)
                    key = cv2.waitKey(1)
                    if key == 27:
                        break
            toc /= cv2.getTickFrequency()
            if args.dataset == 'GOT-10k':
                video_path = os.path.join('result', args.dataset, model_name,
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('result', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))
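
The OTB branches above route through 1-indexed rect helpers (rect1_2_cxy_wh, cxy_wh_2_rect1) because OTB ground truth uses 1-based pixel coordinates. A sketch of the assumed conventions, not the project's exact code:

import numpy as np

def rect1_2_cxy_wh(rect):
    # 1-indexed [x, y, w, h] -> 0-indexed center and size (assumed convention).
    return (np.array([rect[0] + rect[2] / 2 - 1, rect[1] + rect[3] / 2 - 1]),
            np.array([rect[2], rect[3]]))

def cxy_wh_2_rect1(pos, sz):
    # 0-indexed center and size -> 1-indexed [x, y, w, h] (assumed convention).
    return np.array([pos[0] - sz[0] / 2 + 1, pos[1] - sz[1] / 2 + 1, sz[0], sz[1]])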
Example 11
def track_tune(tracker, net, video, config):
    arch = config['arch']
    benchmark_name = config['benchmark']
    resume = config['resume']
    hp = config['hp']  # small_sz, big_sz, choose_thr, penalty_k, window_influence, lr

    tracker_path = join('test', (benchmark_name + resume.split('/')[-1].split('.')[0] +
                                     '_small_size_{:.4f}'.format(hp['small_sz']) +
                                     '_big_size_{:.4f}'.format(hp['big_sz']) +
                                     # note: the four tags below all reuse hp['choose_thr'],
                                     # mirroring the original code
                                     '_lambda_u_{:.4f}'.format(hp['choose_thr']) +
                                     '_lambda_s_{:.4f}'.format(hp['choose_thr']) +
                                     '_cyclic_thr_{:.4f}'.format(hp['choose_thr']) +
                                     '_choose_thr_{:.4f}'.format(hp['choose_thr']) +
                                     '_penalty_k_{:.4f}'.format(hp['penalty_k']) +
                                     '_w_influence_{:.4f}'.format(hp['window_influence']) +
                                     '_scale_lr_{:.4f}'.format(hp['lr'])).replace('.', '_'))  # strip '.' from dir name
    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    if 'VOT' in benchmark_name:
        baseline_path = join(tracker_path, 'baseline')
        video_path = join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = join(video_path, video['name'] + '_001.txt')
    elif 'GOT10K' in benchmark_name:
        re_video_path = os.path.join(tracker_path, video['name'])
        if not exists(re_video_path):
            os.makedirs(re_video_path)
        result_path = os.path.join(re_video_path, '{:s}.txt'.format(video['name']))
    else:
        result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    # occupy the result file so parallel runs skip this video
    if not os.path.exists(result_path):
        fin = open(result_path, 'w')
        fin.close()
    else:
        if benchmark_name.startswith('OTB'):
            return tracker_path
        elif benchmark_name.startswith('VOT') or benchmark_name.startswith('GOT10K'):
            return 0
        else:
            print('benchmark not supported now')
            return

    start_frame, lost_times, toc = 0, 0, 0

    regions = []  # results and state flags (1 init / 2 lost / 0 skip)

    image_files, gt = video['image_files'], video['gt']

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            mask_gt = None

            state = tracker.init(im, target_pos, target_sz, net, online=False, mask=mask_gt, debug=False, hp=hp)  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append([float(1)] if 'VOT' in benchmark_name else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f], location) if 'VOT' in benchmark_name else 1

            polygon = state['polygon']
            if polygon is not None:
                polygon = [polygon[0][0], polygon[0][1], polygon[1][0], polygon[1][1], polygon[2][0], polygon[2][1],
                           polygon[3][0], polygon[3][1]]
                polygon = np.array(polygon)
            else:
                x1, y1, w, h = location
                x2, y2 = x1 + w, y1 + h
                polygon = np.array([x1, y1, x2, y1, x2, y2, x1, y2])

            if poly_iou(np.array(location), np.array(polygon)) > state['choose_thr']:
                record = polygon
            else:
                x1, y1, w, h = location
                x2, y2 = x1 + w, y1 + h
                polygon = np.array([x1, y1, x2, y1, x2, y2, x1, y2])
                record = polygon

            if 'VOT' not in benchmark_name:  # change polygon to [x, y, w, h]
                x1, y1, x2, y2 = record[0], record[1], record[4], record[5]
                record = np.array([x1, y1, x2 - x1 + 1, y2 - y1 + 1])

            if b_overlap > 0:
                regions.append(record)
            else:
                regions.append([float(2)])
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append([float(0)])

    # save results
    if 'OTB' in benchmark_name or 'LASOT' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                p_bbox = x.copy()
                fin.write(
                    ','.join([str(i + 1) if idx == 0 or idx == 1 else str(i) for idx, i in enumerate(p_bbox)]) + '\n')
    elif 'VISDRONE' in benchmark_name or 'GOT10K' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([str(i) for i in p_bbox]) + '\n')
    elif 'VOT' in benchmark_name:
        with open(result_path, "w") as fin:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')

    if 'OTB' in benchmark_name or 'VIS' in benchmark_name or 'VOT' in benchmark_name or 'GOT10K' in benchmark_name:
        return tracker_path
    else:
        print('benchmark not supported now')
Example 12
def track_box(siam_tracker, online_tracker, siam_net, video, args):
    """
    track a benchmark with only box annoated
    attention: not for benchmark evaluation, just a demo
    """

    tracker_path = os.path.join('result', args.dataset, args.arch)

    if 'VOT' in args.dataset:
        baseline_path = os.path.join(tracker_path, 'baseline')
        video_path = os.path.join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = os.path.join(video_path, video['name'] + '_001.txt')
    else:
        result_path = os.path.join(tracker_path, '{:s}.txt'.format(video['name']))

    if os.path.exists(result_path):
        return  # result already exists: lets multi-GPU runs split the work

    regions = []
    b_overlaps, b_overlaps2, b_overlaps3 = [], [], []
    lost = 0
    start_frame, toc = 0, 0
    image_files, gt = video['image_files'], video['gt']

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if args.online:
            rgb_im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)  # align with training

        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            mask_gt = None

            state = siam_tracker.init(im, target_pos, target_sz, siam_net, online=args.online, mask=mask_gt, debug=args.debug)  # init siamese tracker

            if args.online:
                online_tracker.init(im, rgb_im, siam_net, target_pos, target_sz, True, dataname=args.dataset, resume=args.resume)

        elif f > start_frame:  # tracking
            if args.online:
                state = online_tracker.track(im, rgb_im, siam_tracker, state)
            else:
                state = siam_tracker.track(state, im, name=image_file)

            mask = state['mask']

            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f], location) if 'VOT' in args.dataset else 1
            polygon = state['polygon']

            if polygon is not None:
                polygon = [polygon[0][0], polygon[0][1], polygon[1][0], polygon[1][1], polygon[2][0], polygon[2][1], polygon[3][0], polygon[3][1]]
                polygon = np.array(polygon)
                # b_overlap2 = poly_iou(gt[f], polygon)
            else:
                x1, y1, w, h = location
                x2, y2 = x1 + w, y1 + h
                polygon = np.array([x1, y1, x2, y1, x2, y2, x1, y2])

            if poly_iou(np.array(location), np.array(polygon)) > state['choose_thr']:
                record = polygon
                # b_overlaps3.append(b_overlap2)
            else:
                x1, y1, w, h = location
                x2, y2 = x1 + w, y1 + h
                polygon = np.array([x1, y1, x2, y1, x2, y2, x1, y2])
                record = polygon
                # b_overlaps3.append(b_overlap)

            # print('b_overlap: {}, b_overlap2: {}'.format(b_overlap, b_overlap2))
            # b_overlaps.append(b_overlap)
            # b_overlaps2.append(b_overlap2)

            if 'VOT' not in args.dataset:  # change polygon to [x, y, w, h]
                x1, y1, x2, y2 = record[0], record[1], record[4], record[5]
                record = np.array([x1, y1, x2 - x1 + 1, y2 - y1 + 1])

            if b_overlap > 0:
                regions.append(record)
            else:
                regions.append(2)
                start_frame = f + 5
                lost += 1

            if args.vis:
                COLORS = np.random.randint(128, 255, size=(1, 3), dtype="uint8")
                COLORS = np.vstack([[0, 0, 0], COLORS]).astype("uint8")
                mask = COLORS[mask]
                output = ((0.4 * im) + (0.6 * mask)).astype("uint8")
                cv2.imshow("mask", output)
                cv2.waitKey(1)

        toc += cv2.getTickCount() - tic

    # print('b_overlap: {}, b_overlap2: {}, b_overlap3: {}'.format(np.array(b_overlaps).mean(), np.array(b_overlaps2).mean(), np.array(b_overlaps3).mean()))

    with open(result_path, "w") as fin:
        if 'VOT' in args.dataset:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')
        elif 'OTB' in args.dataset or 'LASOT' in args.dataset:
            for x in regions:
                p_bbox = x.copy()
                fin.write(
                    ','.join([str(i + 1) if idx == 0 or idx == 1 else str(i) for idx, i in enumerate(p_bbox)]) + '\n')
        elif 'VISDRONE' in args.dataset or 'GOT10K' in args.dataset:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([str(i) for i in p_bbox]) + '\n')

    toc /= cv2.getTickFrequency()
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps'.format(video['name'], toc, f / toc))
Example 13
def track_video(tracker, model, video_path, init_box=None):

    files = os.listdir(video_path)
    files = sorted(files)
    init_image_file = os.path.join(video_path, files[0])
    tracking_image_files = [os.path.join(video_path, f) for f in files[1:]]

    display_name = 'demo'
    frame = cv2.imread(init_image_file)
    cv2.imshow(display_name, frame)

    # init
    if init_box is not None:
        lx, ly, w, h = init_box
        target_pos = np.array([lx + w / 2, ly + h / 2])
        target_sz = np.array([w, h])
        state = tracker.init(frame, target_pos, target_sz,
                             model)  # init tracker

    else:
        while True:

            frame_disp = frame.copy()

            cv2.putText(frame_disp, 'Select target ROI and press ENTER',
                        (20, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1,
                        (0, 0, 255), 1)

            lx, ly, w, h = cv2.selectROI(display_name,
                                         frame_disp,
                                         fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = tracker.init(frame_disp, target_pos, target_sz,
                                 model)  # init tracker

            break

    for file in tracking_image_files:
        frame = cv2.imread(file)

        frame_disp = frame.copy()

        # Draw box
        state = tracker.track(state, frame_disp)  # track
        location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
        x1, y1, x2, y2 = int(location[0]), int(
            location[1]), int(location[0] + location[2]), int(location[1] +
                                                              location[3])

        cv2.rectangle(frame_disp, (x1, y1), (x2, y2), (0, 255, 0), 5)

        font_color = (0, 0, 0)
        cv2.putText(frame_disp, 'Tracking!', (20, 30),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press r to reset', (20, 55),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press q to quit', (20, 80),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)

        # Display the resulting frame
        cv2.imshow(display_name, frame_disp)
        key = cv2.waitKey(5)
        if key == ord('q'):
            break
        elif key == ord('r'):
            # no video capture in this demo; re-select on the current frame
            frame_disp = frame.copy()

            cv2.putText(frame_disp, 'Select target ROI and press ENTER',
                        (20, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1.5,
                        (0, 0, 0), 1)

            cv2.imshow(display_name, frame_disp)
            lx, ly, w, h = cv2.selectROI(display_name,
                                         frame_disp,
                                         fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = tracker.init(frame_disp, target_pos, target_sz, model)

    # When everything is done, close the windows (no capture to release here)
    cv2.destroyAllWindows()
Example 14
def track_webcam(tracker, model, updatenet):
    """Run tracker with webcam."""

    class UIControl:
        def __init__(self):
            self.mode = 'init'  # init, select, track
            self.target_tl = (-1, -1)
            self.target_br = (-1, -1)
            self.mode_switch = False

        def mouse_callback(self, event, x, y, flags, param):
            if event == cv2.EVENT_LBUTTONDOWN and self.mode == 'init':
                self.target_tl = (x, y)
                self.target_br = (x, y)
                self.mode = 'select'
                self.mode_switch = True
            elif event == cv2.EVENT_MOUSEMOVE and self.mode == 'select':
                self.target_br = (x, y)
            elif event == cv2.EVENT_LBUTTONDOWN and self.mode == 'select':
                self.target_br = (x, y)
                self.mode = 'track'
                self.mode_switch = True

        def get_tl(self):
            return self.target_tl if self.target_tl[0] < self.target_br[0] else self.target_br

        def get_br(self):
            return self.target_br if self.target_tl[0] < self.target_br[0] else self.target_tl

        def get_bb(self):
            tl = self.get_tl()
            br = self.get_br()

            bb = [min(tl[0], br[0]), min(tl[1], br[1]), abs(br[0] - tl[0]), abs(br[1] - tl[1])]
            return bb   # [lx, ly, w, h]

    ui_control = UIControl()
    cap = cv2.VideoCapture(0)
    display_name = 'SiamDW on webcam'
    cv2.namedWindow(display_name, cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
    cv2.resizeWindow(display_name, 960, 720)
    cv2.setMouseCallback(display_name, ui_control.mouse_callback)


    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()
        if not ret:
            break  # camera read failed
        frame_disp = frame.copy()

        if ui_control.mode == 'track' and ui_control.mode_switch:
            ui_control.mode_switch = False
            lx, ly, w, h = ui_control.get_bb()
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])
            state = tracker.init(frame_disp, target_pos, target_sz, model)  # init tracker


        # Draw box
        if ui_control.mode == 'select':
            cv2.rectangle(frame_disp, ui_control.get_tl(), ui_control.get_br(), (255, 0, 0), 2)
        elif ui_control.mode == 'track':
            state = tracker.track(state, frame_disp, updatenet)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            x1, y1, x2, y2 = int(location[0]), int(location[1]), int(location[0] + location[2]), int(
                location[1] + location[3])

            cv2.rectangle(frame_disp, (x1, y1), (x2, y2), (0, 255, 0), 5)

        # Put text
        font_color = (0, 0, 0)
        if ui_control.mode == 'init' or ui_control.mode == 'select':
            cv2.putText(frame_disp, 'Select target', (20, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
            cv2.putText(frame_disp, 'Press q to quit', (20, 55), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1,
                       font_color, 1)
        elif ui_control.mode == 'track':
            cv2.putText(frame_disp, 'Tracking!', (20, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1,
                       font_color, 1)
            cv2.putText(frame_disp, 'Press r to reset', (20, 55), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1,
                       font_color, 1)
            cv2.putText(frame_disp, 'Press q to quit', (20, 80), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1,
                       font_color, 1)
        # Display the resulting frame
        cv2.imshow(display_name, frame_disp)
        key = cv2.waitKey(1)
        if key == ord('q'):
            break
        elif key == ord('r'):
            ui_control.mode = 'init'

    # When everything is done, release the capture
    cap.release()
    cv2.destroyAllWindows()
Example 15
def main():
    net = models.__dict__[args.arch](anchors_nums=args.anchor_nums,
                                     cls_type=args.cls_type)
    net = load_pretrain(net, args.resume)
    net.eval()
    net = net.cuda()

    # prepare tracker
    info = edict()
    info.arch = args.arch
    info.cls_type = args.cls_type
    info.dataset = args.dataset
    info.epoch_test = args.epoch_test
    tracker = SiamRPN(info)

    dataset_root = os.path.join("/ssd", args.dataset)
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    model_name = args.resume.split('/')[-1].split('.')[0]
    total_lost = 0
    """
    eao will lower than origin version(0.393->0.390) due to the  
    number of digits after the decimal point
    """
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                # if len(gt_bbox) == 4:
                #     gt_bbox = [gt_bbox[0], gt_bbox[1],
                #        gt_bbox[0], gt_bbox[1]+gt_bbox[3]-1,
                #        gt_bbox[0]+gt_bbox[2]-1, gt_bbox[1]+gt_bbox[3]-1,
                #        gt_bbox[0]+gt_bbox[2]-1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    #gt_bbox_ = [cx-(w-1)/2, cy-(h-1)/2, w, h]

                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = tracker.init(img, target_pos, target_sz,
                                         net)  # init tracker
                    state["arch"] = args.arch
                    #tracker.init(img, gt_bbox_)
                    #pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    state = tracker.track(state, img)  # track
                    location = cxy_wh_2_rect(state['target_pos'],
                                             state['target_sz'])

                    #outputs = tracker.track(img)
                    pred_bbox = location
                    #overlap=poly_iou(gt_bbox,location)
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    # np.int was removed in NumPy >= 1.24; use the builtin int
                    cv2.polylines(
                        img, [np.array(gt_bbox, int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(
                            img,
                            [np.array(pred_bbox, int).reshape(
                                (-1, 1, 2))], True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one specific video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]

                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = tracker.init(img, target_pos, target_sz,
                                         net)  # init tracker
                    state["arch"] = args.arch
                    #tracker.init(img, gt_bbox_)

                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    state = tracker.track(state, img)  # track
                    location = cxy_wh_2_rect(state['target_pos'],
                                             state['target_sz'])

                    pred_bbox = location

                    #outputs = tracker.track(img)
                    #pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(state['score'])
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(
                        img, (gt_bbox[0], gt_bbox[1]),
                        (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                        (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2],
                                   pred_bbox[1] + pred_bbox[3]), (0, 255, 255),
                                  3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(
                    video_path, '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        if x is None:
                            f.write('\n')
                        else:
                            f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('result', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))
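
Note: Example No. 15 relies on two helpers that the excerpt does not define. Below is a
sketch of get_axis_aligned_bbox along the lines of the pysot toolkit (reducing VOT's
rotated 8-point ground-truth polygons to an axis-aligned cx, cy, w, h box), plus a
minimal stand-in for vot_float2str; both are assumptions, not necessarily the exact
code used above:

import numpy as np

def get_axis_aligned_bbox(region):
    # region: 8 polygon coordinates (x1, y1, ..., x4, y4) or a 4-number rect
    region = np.asarray(region, dtype=np.float64)
    if region.size == 8:
        cx, cy = np.mean(region[0::2]), np.mean(region[1::2])
        x1, x2 = np.min(region[0::2]), np.max(region[0::2])
        y1, y2 = np.min(region[1::2]), np.max(region[1::2])
        # shrink the enclosing box so its area matches the polygon's
        A1 = np.linalg.norm(region[0:2] - region[2:4]) * \
             np.linalg.norm(region[2:4] - region[4:6])
        A2 = (x2 - x1) * (y2 - y1)
        s = np.sqrt(A1 / A2)
        w, h = s * (x2 - x1) + 1, s * (y2 - y1) + 1
    else:
        x, y, w, h = region
        cx, cy = x + w / 2, y + h / 2
    return cx, cy, w, h

def vot_float2str(template, value):
    # e.g. vot_float2str("%.4f", 12.345678) -> "12.3457"; this rounding is what
    # the EAO note in main() refers to
    return template % value
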
Example No. 16
def track_video(siam_tracker,
                online_tracker,
                siam_net,
                video_path,
                init_box=None,
                args=None):

    assert os.path.isfile(video_path), "please provide a valid video file"

    video_name = video_path.split('/')[-1]
    video_name = video_name.split('.')[0]
    save_path = os.path.join('vis', video_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    cap = cv2.VideoCapture(video_path)
    display_name = 'Video: {}'.format(video_path.split('/')[-1])
    cv2.namedWindow(display_name, cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
    cv2.resizeWindow(display_name, 960, 720)
    success, frame = cap.read()

    if success is not True:  # check before imshow; frame is None on a failed read
        print("Read failed.")
        exit(-1)

    cv2.imshow(display_name, frame)

    # init
    count = 0

    if init_box is not None:
        lx, ly, w, h = init_box
        target_pos = np.array([lx + w / 2, ly + h / 2])
        target_sz = np.array([w, h])

        state = siam_tracker.init(frame, target_pos, target_sz,
                                  siam_net)  # init tracker
        rgb_im = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        if args.online:
            online_tracker.init(frame,
                                rgb_im,
                                siam_net,
                                target_pos,
                                target_sz,
                                True,
                                dataname='VOT2019',
                                resume=args.resume)

    else:
        while True:

            frame_disp = frame.copy()

            cv2.putText(frame_disp, 'Select target ROI and press ENTER',
                        (20, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1,
                        (0, 0, 255), 1)

            lx, ly, w, h = cv2.selectROI(display_name,
                                         frame_disp,
                                         fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])

            state = siam_tracker.init(frame_disp, target_pos, target_sz,
                                      siam_net)  # init tracker
            rgb_im = cv2.cvtColor(frame_disp, cv2.COLOR_BGR2RGB)

            if args.online:
                online_tracker.init(frame_disp,
                                    rgb_im,
                                    siam_net,
                                    target_pos,
                                    target_sz,
                                    True,
                                    dataname='VOT2019',
                                    resume=args.resume)

            break

    while True:
        ret, frame = cap.read()

        if frame is None:
            break  # end of video: fall through to release the capture

        frame_disp = frame.copy()
        rgb_im = cv2.cvtColor(frame_disp, cv2.COLOR_BGR2RGB)

        # Draw box
        if args.online:
            state = online_tracker.track(frame_disp, rgb_im, siam_tracker,
                                         state)
        else:
            state = siam_tracker.track(state, frame_disp)

        location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
        x1, y1 = int(location[0]), int(location[1])
        x2, y2 = int(location[0] + location[2]), int(location[1] + location[3])

        cv2.rectangle(frame_disp, (x1, y1), (x2, y2), (0, 255, 0), 5)

        font_color = (0, 0, 0)
        cv2.putText(frame_disp, 'Tracking!', (20, 30),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press r to reset', (20, 55),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
        cv2.putText(frame_disp, 'Press q to quit', (20, 80),
                    cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)

        # Display the resulting frame
        cv2.imshow(display_name, frame_disp)

        if args.save:
            save_name = os.path.join(save_path, '{:04d}.jpg'.format(count))
            cv2.imwrite(save_name, frame_disp)
            count += 1

        key = cv2.waitKey(1)
        if key == ord('q'):
            break
        elif key == ord('r'):
            ret, frame = cap.read()
            frame_disp = frame.copy()

            cv2.putText(frame_disp, 'Select target ROI and press ENTER',
                        (20, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1.5,
                        (0, 0, 0), 1)

            cv2.imshow(display_name, frame_disp)
            lx, ly, w, h = cv2.selectROI(display_name,
                                         frame_disp,
                                         fromCenter=False)
            target_pos = np.array([lx + w / 2, ly + h / 2])
            target_sz = np.array([w, h])

            state = siam_tracker.init(frame_disp, target_pos, target_sz,
                                      siam_net)  # init tracker
            rgb_im = cv2.cvtColor(frame_disp, cv2.COLOR_BGR2RGB)

            if args.online:
                online_tracker.init(frame_disp,
                                    rgb_im,
                                    siam_net,
                                    target_pos,
                                    target_sz,
                                    True,
                                    dataname='VOT2019',
                                    resume=args.resume)

    # When everything is done, release the capture
    cap.release()
    cv2.destroyAllWindows()
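
Note: a possible driver for the demo above. Every flag name here is an assumption for
illustration, and the tracker/network constructors are deliberately left as comments
because they are not part of this excerpt:

import argparse

parser = argparse.ArgumentParser(description='interactive video tracking demo')
parser.add_argument('--video_path', required=True, help='path to a video file')
parser.add_argument('--init_box', default=None,
                    help='optional initial box "x,y,w,h"; otherwise select a ROI')
parser.add_argument('--online', action='store_true', help='enable the online tracker')
parser.add_argument('--save', action='store_true', help='save visualized frames under vis/')
parser.add_argument('--resume', default='', help='checkpoint for the online tracker')
args = parser.parse_args()

init_box = None
if args.init_box is not None:
    init_box = tuple(float(v) for v in args.init_box.split(','))  # (x, y, w, h)

# siam_tracker, online_tracker and siam_net would be built from the project's
# model zoo here; their constructors are outside this excerpt.
# track_video(siam_tracker, online_tracker, siam_net, args.video_path,
#             init_box=init_box, args=args)
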
Example No. 17
if mask_vot:
    print('the input is a binary mask')
    selection = handle.region()
    mask = make_full_size(selection, (im.shape[1], im.shape[0]))
    bbox = rect_from_mask(mask)  # [cx,cy,w,h] TODO: use cv.minmaxRect here
    cx, cy, w, h = bbox
else:
    print('the input is a rect box')
    selection = handle.region()  # selection in ncc_mask
    lx, ly, w, h = selection.x, selection.y, selection.width, selection.height
    cx, cy = lx + w / 2, ly + h / 2
    mask = None  # no mask available for a rect init; avoids a NameError below

target_pos = np.array([cx, cy])
target_sz = np.array([w, h])
state = tracker.init(im, target_pos, target_sz, net, mask=mask)

count = 0
while True:
    image_file = handle.frame()
    if not image_file:
        break
    im = cv2.imread(image_file)  # HxWxC
    state = tracker.track(state, im)
    mask = state['mask']
    if mask is None or mask.sum() < 10:
        rect = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
        mask = mask_from_rect(rect, (im.shape[1], im.shape[0]))
    handle.report(mask, state['cls_score'])
    count += 1
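
Note: Example No. 17 depends on three small mask utilities that the excerpt does not
define. The sketches below follow the VOT2020 Python integration helpers; be aware
that the comment in the excerpt claims rect_from_mask returns [cx, cy, w, h], while
the usual toolkit version (as here) returns a top-left rect, so verify the convention
in your codebase:

import numpy as np

def make_full_size(x, output_sz):
    # pad (or crop) a mask patch x to the full image size output_sz = (width, height)
    if x.shape[0] == output_sz[1] and x.shape[1] == output_sz[0]:
        return x
    pad_x = output_sz[0] - x.shape[1]
    if pad_x < 0:
        x = x[:, :x.shape[1] + pad_x]
        pad_x = 0
    pad_y = output_sz[1] - x.shape[0]
    if pad_y < 0:
        x = x[:x.shape[0] + pad_y, :]
        pad_y = 0
    return np.pad(x, ((0, pad_y), (0, pad_x)), 'constant', constant_values=0)

def rect_from_mask(mask):
    # tight axis-aligned box around the nonzero pixels, as [x, y, w, h];
    # assumes the mask is not empty
    xs = np.nonzero(mask.sum(axis=0))[0]
    ys = np.nonzero(mask.sum(axis=1))[0]
    return [xs.min(), ys.min(), xs.max() - xs.min() + 1, ys.max() - ys.min() + 1]

def mask_from_rect(rect, output_sz):
    # rasterize a rect [x, y, w, h] into a binary mask of size (width, height)
    mask = np.zeros((output_sz[1], output_sz[0]), dtype=np.uint8)
    x0, y0 = max(int(round(rect[0])), 0), max(int(round(rect[1])), 0)
    x1, y1 = int(round(rect[0] + rect[2])), int(round(rect[1] + rect[3]))
    mask[y0:y1, x0:x1] = 1
    return mask
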
Example No. 18
def track(tracker, net, video, args):
    start_frame, lost_times, toc = 0, 0, 0

    # save result to evaluate
    if args.epoch_test:
        suffix = args.resume.split('/')[-1]
        suffix = suffix.split('.')[0]
        tracker_path = os.path.join('result', args.dataset, args.arch + suffix)
    else:
        tracker_path = os.path.join('result', args.dataset, args.arch)

    if not os.path.exists(tracker_path):
        os.makedirs(tracker_path)

    if 'VOT' in args.dataset:
        baseline_path = join(tracker_path, 'baseline')
        video_path = join(baseline_path, video['name'])
        if not os.path.exists(video_path):
            os.makedirs(video_path)
        result_path = join(video_path, video['name'] + '_001.txt')
    else:
        result_path = join(tracker_path, '{:s}.txt'.format(video['name']))

    if os.path.exists(result_path):
        return 0  # result already claimed by another worker during multi-GPU testing

    regions = []  # result and states[1 init / 2 lost / 0 skip]
    image_files, gt = video['image_files'], video['gt']
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)

        tic = cv2.getTickCount()

        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = tracker.init(im, target_pos, target_sz,
                                 net)  # init tracker
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        elif f > start_frame:  # tracking
            state = tracker.track(state, im)  # track
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = poly_iou(gt[f],
                                 location) if 'VOT' in args.dataset else 1
            if b_overlap > 0:
                regions.append(location)
            else:
                regions.append(2)
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append(0)
        toc += cv2.getTickCount() - tic
    toc /= cv2.getTickFrequency()

    with open(result_path, "w") as fin:
        if 'VOT' in args.dataset:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    p_bbox = x.copy()
                    fin.write(','.join([str(i) for i in p_bbox]) + '\n')
        else:
            for x in regions:
                p_bbox = x.copy()
                fin.write(','.join([
                    str(i + 1) if idx == 0 or idx == 1 else str(i)
                    for idx, i in enumerate(p_bbox)
                ]) + '\n')

    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
        video['name'], toc, f / toc, lost_times))

    return lost_times
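
Note: poly_iou above handles VOT's rotated polygons (usually via the vot toolkit's
region helpers, which are not reproduced here). For axis-aligned boxes it reduces to
plain rectangle IoU; a minimal sketch of that special case:

def rect_iou(a, b):
    # a, b: axis-aligned rects [x, y, w, h]; returns intersection over union
    iw = min(a[0] + a[2], b[0] + b[2]) - max(a[0], b[0])
    ih = min(a[1] + a[3], b[1] + b[3]) - max(a[1], b[1])
    if iw <= 0 or ih <= 0:
        return 0.0
    inter = iw * ih
    union = a[2] * a[3] + b[2] * b[3] - inter
    return inter / union
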
Example No. 19
def track_video(model, video):
    start_frame, toc = 0, 0

    # vis or save OTB result to evaluate
    if not args.vis:
        tracker_path = os.path.join(
            'test', args.dataset,
            args.arch.split('.')[0] + args.resume.split('/')[-1].split('.')[0])

        if not os.path.exists(tracker_path):
            os.makedirs(tracker_path)

        if 'VOT' in args.dataset:
            baseline_path = os.path.join(tracker_path, 'baseline')
            video_path = os.path.join(baseline_path, video['name'])
            if not os.path.exists(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, video['name'] + '_001.txt')
        else:
            result_path = os.path.join(tracker_path,
                                       '{:s}.txt'.format(video['name']))

        if not os.path.exists(result_path):  # for multi-gpu test
            fin = open(result_path, "w")
            fin.close()
        else:
            return

    regions = []
    image_files, gt = video['image_files'], video['gt']
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if len(im.shape) == 2:
            im = cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)

        tic = cv2.getTickCount()

        if f == start_frame:  # init
            cx, cy, w, h = get_min_max_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = SiamFC_init(im, target_pos, target_sz,
                                model)  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        elif f > start_frame:  # tracking
            state = SiamFC_track(state, im)
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            b_overlap = judge_overlap(gt[f],
                                      location) if 'VOT' in args.dataset else 1
            if b_overlap:
                regions.append(location)
            else:
                regions.append(2)
                start_frame = f + 5
        else:
            regions.append(0)

        toc += cv2.getTickCount() - tic

        if args.vis and f >= start_frame:  # visualization (skip lost frames)
            if f == 0:
                cv2.destroyAllWindows()
                cv2.rectangle(im, (int(gt[f, 0]), int(gt[f, 1])),
                              (int(gt[f, 0] + gt[f, 2]), int(gt[f, 1] + gt[f, 3])),
                              (0, 255, 0), 3)
            else:
                location = [int(l) for l in location]
                cv2.rectangle(
                    im, (location[0], location[1]),
                    (location[0] + location[2], location[1] + location[3]),
                    (0, 255, 255), 3)
            cv2.putText(im, '#' + str(f), (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 1,
                        (0, 255, 255), 2)
            cv2.imshow(video['name'], im)
            cv2.waitKey(1)

    if not args.vis:  # save results to evaluate (result_path is only set in this mode)
        with open(result_path, "w") as fin:
            if 'VOT' in args.dataset:
                for x in regions:
                    if isinstance(x, int):
                        fin.write("{:d}\n".format(x))
                    else:
                        p_bbox = x.copy()
                        if p_bbox[0] < 0: p_bbox[0] = 0
                        if p_bbox[1] < 0: p_bbox[1] = 0
                        fin.write(','.join([str(i) for i in p_bbox]) + '\n')
            else:
                for x in regions:
                    p_bbox = x.copy()
                    if p_bbox[0] < 0: p_bbox[0] = 1
                    if p_bbox[1] < 0: p_bbox[1] = 1
                    fin.write(','.join([
                        str(i + 1) if idx == 0 or idx == 1 else str(i)
                        for idx, i in enumerate(p_bbox)
                    ]) + '\n')
    toc /= cv2.getTickFrequency()
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps'.format(
        video['name'], toc, f / toc))
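
Note: all of the VOT-style writers in these examples encode per-frame status codes in
the result files: a lone 1 marks an initialization frame, 2 a lost frame, 0 a skipped
frame, and a comma-separated quadruple a predicted box. A small reader for that
format (a sketch; the evaluation toolkits ship their own parsers):

def read_vot_result(path):
    # returns a list mixing status ints (1 init / 2 lost / 0 skip) and [x, y, w, h] boxes
    regions = []
    with open(path) as f:
        for line in f:
            vals = [float(v) for v in line.strip().split(',') if v]
            regions.append(int(vals[0]) if len(vals) == 1 else vals)
    return regions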