Example #1
def stoa_track(idx,
               frame_counter,
               img,
               gt_bbox,
               tracker1,
               template_dir=None,
               img_names=None):
    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
    lost_number = 0

    if idx == frame_counter:
        init_gt = gt_bbox_
        if template_dir is not None:
            img = cv2.imread(template_dir)
        tracker1.init(img, gt_bbox_)
        # pred_bboxes.append(1)
        if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
            append = 1
        else:
            append = gt_bbox_

    elif idx > frame_counter:

        if img_names is not None:
            img = cv2.imread(img_names[idx])
        outputs = tracker1.track(img, idx=idx)

        # print('****************** state of the art tracking ******************')
        append = outputs['bbox']

        overlap = vot_overlap(outputs['bbox'], gt_bbox,
                              (img.shape[1], img.shape[0]))
        if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
            if overlap > 0:
                # not lost
                lost = False
            else:
                # lost object
                append = 2
                frame_counter = idx + 5  # skip 5 frames
                lost_number = 1
                lost = True
        else:
            if overlap <= 0:
                lost_number = 1

    else:
        append = 0

    return append, lost_number, frame_counter
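
For context, `stoa_track` is meant to be called once per frame by a driver that feeds `frame_counter` back in and collects the returned `append` flags. A minimal sketch of such a loop (hypothetical, not from the source repo; it assumes the same `args.dataset` global and a pysot-style `video` iterable as above):

# Hypothetical driver for stoa_track. VOT result-list convention:
# 1 = init frame, 2 = tracking failure, 0 = skipped frame, else a bbox.
pred_bboxes = []
frame_counter = 0
total_lost = 0
for idx, (img, gt_bbox) in enumerate(video):
    append, lost_number, frame_counter = stoa_track(
        idx, frame_counter, img, gt_bbox, tracker1)
    pred_bboxes.append(append)
    total_lost += lost_number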
Example #2
def stoa_track(idx, frame_counter, img, gt_bbox, tracker1):
    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
    lost_number = 0

    if idx == frame_counter:
        init_gt = gt_bbox_
        tracker1.init(img, gt_bbox_)
        # pred_bboxes.append(1)
        append = 1
    elif idx > frame_counter:

        outputs = tracker1.track(img, idx=idx)

        # print('****************** state of the art tracking ******************')
        append = outputs['bbox']

        overlap = vot_overlap(outputs['bbox'], gt_bbox,
                              (img.shape[1], img.shape[0]))
        if args.dataset != 'OTB100':
            if overlap > 0:
                # not lost
                lost = False
            else:
                # lost object
                append = 2
                frame_counter = idx + 5  # skip 5 frames
                lost_number = 1
                lost = True
        else:
            if overlap <= 0:
                lost_number = 1

    else:
        append = 0

    return append, lost_number, frame_counter
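
All the examples funnel the ground truth through `get_axis_aligned_bbox`. For reference, a sketch consistent with the pysot utility of that name (illustrative, not guaranteed to match the repo byte for byte): it maps an 8-value VOT polygon, given as a numpy array, to a center-based axis-aligned box whose area matches the polygon, and passes plain rects through.

import numpy as np

def get_axis_aligned_bbox(region):
    """Convert an 8-point polygon or (x, y, w, h) rect to (cx, cy, w, h)."""
    if region.size == 8:
        cx = np.mean(region[0::2])
        cy = np.mean(region[1::2])
        x1, x2 = min(region[0::2]), max(region[0::2])
        y1, y2 = min(region[1::2]), max(region[1::2])
        # shrink the bounding rect so its area matches the polygon's
        A1 = np.linalg.norm(region[0:2] - region[2:4]) * \
             np.linalg.norm(region[2:4] - region[4:6])
        A2 = (x2 - x1) * (y2 - y1)
        s = np.sqrt(A1 / A2)
        w = s * (x2 - x1) + 1
        h = s * (y2 - y1) + 1
    else:
        x, y, w, h = region
        cx, cy = x + w / 2, y + h / 2
    return cx, cy, w, h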
Example #3
def test_update(cfg, dirmanager):
    print(cfg["TEST"]["DATASET"] + ' stage' + str(cfg["TEMPLATE"]["STEP"]))

    # load config

    # if cfg["TEST"]["DATASET"] == 'UAV123':
    #     dataset_root = '/home/lyuyu/dataset/UAV123/data_seq/UAV123/'
    # else:
    dataset_root = '/home/lyuyu/dataset/' + cfg["TEST"]["DATASET"]

    #  tracker
    model_path = cfg["MODEL"]["CHECKPOINT_PATH"]
    torch.cuda.set_device(cfg["TEST"]["GPU_ID"])
    # load tracker and updatenet
    tracker = tracker_builder.build_tracker(cfg)
    # update_path='./updatenet/checkpoint/checkpoint40.pth.tar'
    update_path = cfg["UPDATE"]["CHECKPOINT_PATH"]

    step = cfg["TEST"]["TYPE"]
    gpu_id = cfg["TEST"]["GPU_ID"]
    if cfg["UPDATE"]["MODEL"][:8] == "AAUNetv2":
        tracker = SiamTrackerAAUNetv2(cfg, tracker, update_path, gpu_id, step)
    elif cfg["UPDATE"]["MODEL"] == "UpdateNet":
        tracker = SiamTrackerUpdateNet(
            cfg, tracker, update_path, gpu_id,
            step)  # step: 1 = dasiamrpn, 2 = linear, 3 = updatenet
    else:
        raise NotImplementedError
    # create dataset
    dataset = DatasetFactory.create_dataset(name=cfg["TEST"]["DATASET"],
                                            dataset_root=dataset_root,
                                            load_img=False)

    # model_name = tracker.name
    # brittle: assumes a fixed prefix length in the checkpoint path
    model_name = update_path[63:-7].replace('/', '').replace('.', '')
    if step == 4:
        model_name = 'updatenet2016'
    elif step == 1:
        model_name = 'dasiamrpn'

    if cfg["TEST"]["DATASET"] in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        total_lost = 0
        #for v_idx, video in enumerate(dataset):
        if cfg["TEST"]["CLS_TYPE"] != 0:
            total_success_list = []
            total_iou_list = []
        for video in tqdm(dataset):
            # if args.video != '':
            #     # test one special video
            #     if video.name != args.video:
            #         continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            if cfg["TEST"]["CLS_TYPE"] != 0:
                iou_list = []
                success_list = []
            state = dict()
            for idx, (img, gt_bbox) in enumerate(video):
                # print(idx)
                if len(gt_bbox) == 4:
                    gt_bbox = [
                        gt_bbox[0], gt_bbox[1], gt_bbox[0],
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1,
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]
                    ]
                tic = cv2.getTickCount()
                if idx == frame_counter:

                    state = tracker.init(img, np.array(gt_bbox))
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    pred_bbox = [cx - w / 2, cy - h / 2, w, h]
                    pred_bboxes.append(1)
                    if cfg["TEST"]["CLS_TYPE"] != 0:
                        iou_list.append(1)
                        success_list.append(1)
                elif idx > frame_counter:
                    # state = tracker.update(img, np.array(gt_bbox))
                    state = tracker.update(img)
                    pos = state['target_pos']  # cx, cy
                    sz = state['target_sz']  # w, h
                    pred_bbox = np.array(
                        [pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])
                    #pred_bbox=np.array([pos[0]+1-(sz[0]-1)/2, pos[1]+1-(sz[1]-1)/2, sz[0], sz[1]])

                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    # iou = overlap_ratio(gt_bbox, pred_bbox)
                    if cfg["TEST"]["CLS_TYPE"] != 0:
                        if cfg["TEST"]["CLS_TYPE"] == 1:
                            if overlap > cfg["UPDATE"]["IOU_THRES"]:
                                iou = 1
                            else:
                                iou = 0
                        iou_list.append(iou)
                        success_list.append(state['success'])
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    if cfg["TEST"]["CLS_TYPE"] != 0:
                        iou_list.append(0)
                        success_list.append(0)
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if cfg["TEST"]["VISUALIZATION"] and idx > frame_counter:
                    cv2.polylines(
                        img, [np.array(gt_bbox, int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(
                            img,
                            [np.array(pred_bbox, int).reshape(
                                (-1, 1, 2))], True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            if cfg["TEST"]["CLS_TYPE"] != 0:
                total_success_list = total_success_list + success_list
                total_iou_list = total_iou_list + iou_list
                success_list = np.array(success_list)
                iou_list = np.array(iou_list)

                # total accuracy & detect failure accuracy
                accuracy = np.mean(success_list == iou_list)
                index0 = np.argwhere(iou_list == 0)
                accuracy0 = np.mean(success_list[index0] == iou_list[index0])
                print(video.name, accuracy, accuracy0)
            toc /= cv2.getTickFrequency()
            # save results
            if cfg["SOLVER"]["LR_POLICY"] == 'epochwise_step_group':
                lr_type = cfg["UPDATE"]["CHECKPOINT_PATH"].split('/')[-2]
            elif cfg["SOLVER"]["LR_POLICY"] == 'cosine':
                lr_type = 'cosine'
            else:
                lr_type = 'undefined'
            if cfg["TEST"]["TYPE"] == 1:
                lr_type = 'base_dasiamrpn'
            video_path = os.path.join(dirmanager.updmod_res_dir,
                                      cfg["TEST"]["DATASET"], lr_type,
                                      model_name, 'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            # print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
            #         v_idx+1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        if cfg["TEST"]["CLS_TYPE"] == 1:
            total_success_list = np.array(total_success_list)
            total_iou_list = np.array(total_iou_list)

            # total accuracy & detect failure accuracy
            accuracy = np.mean(total_success_list == total_iou_list)
            index0 = np.argwhere(total_iou_list == 0)
            accuracy0 = np.mean(
                total_success_list[index0] == total_iou_list[index0])
            print('total accuracy', accuracy, accuracy0)
    # print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        #for v_idx, video in enumerate(dataset):
        if cfg["TEST"]["CLS_TYPE"] != 0:
            total_success_list = []
            total_iou_list = []
        for video in tqdm(dataset):
            # if args.video != '':
            #     # test one special video
            #     if video.name != args.video:
            #         continue

            toc = 0
            pred_bboxes = []
            if cfg["TEST"]["CLS_TYPE"] != 0:
                iou_list = []
                success_list = []
            scores = []
            track_times = []
            state = dict()
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:

                    state = tracker.init(
                        img, np.array(gt_bbox))  # note the difference between gt_bbox and gt_bbox_
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    pred_bbox = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]

                    scores.append(None)
                    if 'VOT2018-LT' == cfg["TEST"]["DATASET"]:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                        if cfg["TEST"]["CLS_TYPE"] != 0:
                            iou_list.append(1)
                            success_list.append(1)
                    # if video.name == 'Jogging-1':
                    #     template_vis(state['z_f_cur'], 0, 'template_vis_'+str(idx))
                else:
                    state = tracker.update(img)

                    # if video.name == 'Jogging-1':
                    #     template_vis(state['z_f_cur'], 0, 'template_vis_'+str(idx))

                    pos = state['target_pos']
                    sz = state['target_sz']
                    pred_bbox = np.array(
                        [pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])

                    pred_bboxes.append(pred_bbox)
                    #scores.append(outputs['best_score'])
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if cfg["TEST"]["CLS_TYPE"] != 0:
                        if cfg["TEST"]["CLS_TYPE"] == 1:
                            if overlap > 0.1:
                                iou = 1
                            else:
                                iou = 0
                        if cfg["TEST"]["CLS_TYPE"] == 2:
                            iou = overlap
                        iou_list.append(iou)
                        success_list.append(state['success'].cpu().numpy())

                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if cfg["TEST"]["VISUALIZATION"] and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(
                        img, (gt_bbox[0], gt_bbox[1]),
                        (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                        (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2],
                                   pred_bbox[1] + pred_bbox[3]), (0, 255, 255),
                                  3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            if cfg["TEST"]["CLS_TYPE"] != 0:
                total_success_list = total_success_list + success_list
                total_iou_list = total_iou_list + iou_list
                success_list = np.array(success_list)
                iou_list = np.array(iou_list)

                if cfg["TEST"]["CLS_TYPE"] == 1:
                    # total accuracy & detect failure accuracy
                    accuracy = np.mean(success_list == iou_list)
                    index0 = np.argwhere(iou_list == 0)
                    index1 = np.argwhere(iou_list == 1)
                    accuracy0 = np.mean(
                        success_list[index0] == iou_list[index0])
                    accuracy1 = np.mean(
                        success_list[index1] == iou_list[index1])
                if cfg["TEST"]["CLS_TYPE"] == 2:
                    # total accuracy & detect failure accuracy
                    comp_list = abs(success_list - iou_list) < 0.2
                    accuracy = np.mean(comp_list)
                    index0 = np.argwhere((success_list - iou_list) > 0)
                    index1 = np.argwhere((iou_list - success_list) > 0)
                    accuracy0 = np.mean(comp_list[index0])
                    accuracy1 = np.mean(comp_list[index1])
                print(video.name, accuracy, accuracy0, accuracy1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == cfg["TEST"]["DATASET"]:
                video_path = os.path.join('results', cfg["TEST"]["DATASET"],
                                          model_name, 'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(
                    video_path, '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write(
                            "{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == cfg["TEST"]["DATASET"]:
                video_path = os.path.join('results', cfg["TEST"]["DATASET"],
                                          model_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                lr_type = 'cosine'
                video_path = os.path.join(dirmanager.updmod_res_dir,
                                          cfg["TEST"]["DATASET"], lr_type,
                                          model_name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                class_path = os.path.join(video_path,
                                          '{}_cls.txt'.format(video.name))
                with open(class_path, 'w') as f:
                    for x in success_list:
                        f.write(str(x) + '\n')
        # print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.format(
        #    v_idx+1, video.name, toc, idx / toc))
        # guard: these lists are only populated when CLS_TYPE != 0
        if cfg["TEST"]["CLS_TYPE"] != 0:
            total_success_list = np.array(total_success_list)
            total_iou_list = np.array(total_iou_list)

            # total accuracy & detect failure accuracy
            if cfg["TEST"]["CLS_TYPE"] == 1:
                accuracy = np.mean(total_success_list == total_iou_list)
                index0 = np.argwhere(total_iou_list == 0)
                accuracy0 = np.mean(
                    total_success_list[index0] == total_iou_list[index0])
            if cfg["TEST"]["CLS_TYPE"] == 2:
                comp_list = abs(total_success_list - total_iou_list) < 0.2
                accuracy = np.mean(comp_list)
                index0 = np.argwhere((total_success_list - total_iou_list) > 0)
                index1 = np.argwhere((total_iou_list - total_success_list) > 0)
                accuracy0 = np.mean(comp_list[index0])
                accuracy1 = np.mean(comp_list[index1])
            print('total accuracy', accuracy, accuracy0)
    # evaluation(cfg["TEST"]["DATASET"], model_name, dirmanager.updmod_res_dir)
    evaluation(
        cfg["TEST"]["DATASET"], model_name,
        os.path.join(dirmanager.updmod_res_dir, cfg["TEST"]["DATASET"],
                     lr_type))
    return
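
The `{}_001.txt` files written above use the VOT baseline encoding: one line per frame, holding either a bare integer flag (1 = init frame, 2 = failure, 0 = skipped frame) or a comma-separated bounding box. A hedged sketch of a parser for that format (the function name is illustrative, not from the repo):

def read_vot_baseline(path):
    """Parse a VOT baseline result file into flags and boxes (sketch)."""
    frames = []
    with open(path) as f:
        for line in f:
            vals = line.strip().split(',')
            if len(vals) == 1:
                frames.append(int(float(vals[0])))  # 1=init, 2=lost, 0=skipped
            else:
                frames.append([float(v) for v in vals])
    return frames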
Example #4
def track(video, tracker, visualize=False, data_collector=None):
    num_frames = len(video)
    frame_counter = 0
    frame_reset = 0  # counts frames since the last (re-)initialization
    lost_times = 0
    # filled according to the VOT protocol & used for metric calculation
    pred_bboxes = []
    total_time = 0

    zero_tensor = torch.zeros(cfg.REFINE_TEMPLATE.FEATURE_SIZE,
                              dtype=torch.float32).cpu().data

    for f, (im, gt) in enumerate(video):
        if len(gt) == 4:
            gt = bbox_to_polygon(gt)

        cx, cy, w, h = get_axis_aligned_bbox(np.array(gt))

        start_time = cv2.getTickCount()
        if f == frame_counter:  # Init or reset after lost frame
            gt_bbox = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
            tracker.init(im, gt_bbox)
            pred_bbox = gt_bbox
            pred_bboxes.append(1)
            total_time += cv2.getTickCount() - start_time
            frame_reset = 0
            if data_collector:
                data_collector.add_init(f,
                                        num_frames,
                                        init_feat=tracker.zf_init)

        elif f > frame_counter:  # Tracking
            frame_reset += 1
            outputs = tracker.track(im)
            pred_bbox = outputs['bbox']
            if data_collector:
                # Extract ground-truth template features
                gt_rect = np.array([cx, cy, w, h])
                gt_zf = tracker.extract_template(
                    im, gt_rect) if w * h != 0 else zero_tensor
                data_collector.add_tracking(f,
                                            num_frames,
                                            frame_reset,
                                            cur_feat=outputs['zf'],
                                            pre_feat=tracker.zf,
                                            gt_feat=gt_zf)

            overlap = vot_overlap(pred_bbox, gt, (im.shape[1], im.shape[0]))
            if overlap > 0:
                pred_bboxes.append(pred_bbox)
            else:
                pred_bboxes.append(2)
                # skip 5 frames after object lost (as suggested by VOT)
                frame_counter = f + 5
                lost_times += 1
            total_time += cv2.getTickCount() - start_time

        elif f < frame_counter or w * h == 0:  # Skipping
            pred_bboxes.append(0)
            total_time += cv2.getTickCount() - start_time
            frame_reset = 0
            if data_collector:
                data_collector.add_init(f, num_frames, zero_tensor)

        if visualize:
            cv2.polylines(im, [np.array(gt, int).reshape((-1, 1, 2))], True,
                          (0, 255, 0), 3)

            bbox = list(map(int, pred_bbox))
            cv2.rectangle(im, (bbox[0], bbox[1]),
                          (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                          (0, 255, 255), 3)

            cv2.putText(im, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1,
                        (0, 255, 255), 2)
            cv2.putText(im, str(lost_times), (40, 80),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            window_name = 'test'
            cv2.imshow(window_name, im)
            cv2.moveWindow(window_name, 100, 10)
            cv2.waitKey(1)

    total_time /= cv2.getTickFrequency()
    cv2.destroyAllWindows()
    return {
        'pred_bboxes': pred_bboxes,
        'lost_times': lost_times,
        'total_time': total_time,
        'fps': f / total_time,
    }
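
`bbox_to_polygon`, called at the top of the loop, is not shown in this example; a sketch that matches the inline rect-to-corner conversions in Examples #3 and #5:

def bbox_to_polygon(bbox):
    """(x, y, w, h) -> 8-value corner polygon, mirroring Examples #3 and #5 (sketch)."""
    x, y, w, h = bbox
    return [x, y,
            x, y + h - 1,
            x + w - 1, y + h - 1,
            x + w - 1, y]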
Example #5
def main():
    is_gpu_cuda_available = torch.cuda.is_available()
    if not is_gpu_cuda_available:
        raise RuntimeError(
            'Failed to locate a CUDA GPU. Program cannot continue.')
    num_gpus = torch.cuda.device_count()
    gpu_type = torch.cuda.get_device_name(0)
    print(f"You have {num_gpus} GPU(s) available of type: {gpu_type}")
    print("This might take a few minutes... Grab a cup of coffee\n")

    # load config
    cfg.merge_from_file(args.config)
    dataset_root = os.path.join(args.dataset_directory, args.dataset)
    print(f"dataset root-->{dataset_root}")

    # create model
    model = ModelBuilder()

    # load model
    model = load_pretrain(model, args.snapshot).cuda().eval()

    # build tracker
    tracker = build_tracker(model)

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)

    model_name = args.model_name
    print(f"Model name is {model_name}")

    total_lost = 0
    if args.dataset in vot_like_dataset:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [
                        gt_bbox[0], gt_bbox[1], gt_bbox[0],
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1,
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]
                    ]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0.85:
                        # not lost (note the stricter 0.85 threshold instead
                        # of the usual overlap > 0)
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + args.skip_frames  # skip args.skip_frames frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(
                        img, [np.array(gt_bbox, int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(
                            img,
                            [np.array(pred_bbox, int).reshape(
                                (-1, 1, 2))], True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            save_path = os.path.join(args.results_path, args.dataset,
                                     model_name, args.experiment_name,
                                     video.name)
            if not os.path.isdir(save_path):
                os.makedirs(save_path)
            result_path = os.path.join(save_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            with open(os.path.join(save_path, '..', 'lost.txt'), 'a+') as f:
                f.write(
                    f"{v_idx+1} Class: {video.name} | Time: {toc}s | Speed: {idx/toc}fps | Lost: {lost_number}\n"
                )

            print(
                '({:3d}) Class: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
        with open(os.path.join(save_path, '..', 'lost.txt'), 'a+') as f:
            f.write(
                f"Model architeture used --> {model_name} \ntotal lost: {total_lost} \n"
            )
            f.write(f"SKIP FRAMES USED --> {args.skip_frames}")
    else:
        # OPE tracking
        # will be implemented if needed in future
        pass
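
Every VOT branch above decides lost/not-lost with `vot_overlap`, which measures region overlap (IoU) within the image bounds, so `overlap > 0` means the prediction still touches the ground truth. For intuition only, an axis-aligned IoU sketch (the real `vot_overlap` also handles rotated polygons and clips to the image size it is given):

def iou_xywh(a, b):
    """IoU of two (x, y, w, h) boxes -- a simplified stand-in for vot_overlap."""
    ax2, ay2 = a[0] + a[2], a[1] + a[3]
    bx2, by2 = b[0] + b[2], b[1] + b[3]
    iw = max(0.0, min(ax2, bx2) - max(a[0], b[0]))
    ih = max(0.0, min(ay2, by2) - max(a[1], b[1]))
    inter = iw * ih
    union = a[2] * a[3] + b[2] * b[3] - inter
    return inter / union if union > 0 else 0.0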
Example #6
def main():

    # load config
    dataset_root = '/home/ubuntu/pytorch/pytorch-tracking/DaSiamRPN/datasets/' + args.dataset
    #  tracker
    model_path = './models/SiamRPNBIG.model'

    name = 'DaSiamRPN'

    gpu_id = 0  # setting this to 1 does not run correctly

    tracker = SiamRPNTracker(model_path, gpu_id)

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    # name of the tracker
    model_name = name

    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        total_lost = 0
        #for v_idx, video in enumerate(dataset):
        for video in tqdm(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            state = dict()
            for idx, (img, gt_bbox) in enumerate(video):
                # print(idx)
                if len(gt_bbox) == 4:
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                               gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    # note the difference between gt_bbox and gt_bbox_
                    state = tracker.init(img, np.array(gt_bbox))
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))  # 1-based
                    pred_bbox = [cx - w / 2, cy - h / 2, w, h]  # 1-based
                    pred_bboxes.append(1)

                elif idx > frame_counter:
                    state = tracker.update(img)
                    pos = state['target_pos']
                    sz = state['target_sz']
                    pred_bbox = np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])
                    #pred_bbox=np.array([pos[0]+1-(sz[0]-1)/2, pos[1]+1-(sz[1]-1)/2, sz[0], sz[1]])

                    overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5 # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(img, [np.array(gt_bbox, int).reshape((-1, 1, 2))],
                                  True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(img, [np.array(pred_bbox, int).reshape((-1, 1, 2))],
                                      True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                    'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x])+'\n')
            # print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
            #         v_idx+1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
       # print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking    
        #for v_idx, video in enumerate(dataset):
        for video in tqdm(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            state = dict()
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    # note the difference between gt_bbox and gt_bbox_
                    state = tracker.init(img, np.array(gt_bbox))
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    pred_bbox = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]

                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    state = tracker.update(img)
                    pos = state['target_pos']
                    sz = state['target_sz']
                    pred_bbox = np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])

                    pred_bboxes.append(pred_bbox)
                    #scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]),
                                  (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]), (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]), (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                        'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                        '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x])+'\n')
                result_path = os.path.join(video_path,
                        '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                        '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x])+'\n')
                result_path = os.path.join(video_path,
                        '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x])+'\n')
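
The repeated `[pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]]` conversion above is what the `cxy_wh_2_rect` helper used in Example #8 encapsulates; a sketch in the style of the DaSiamRPN utilities (treat it as illustrative):

import numpy as np

def cxy_wh_2_rect(pos, sz):
    """(center, size) -> 0-based (x, y, w, h)."""
    return np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])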
Example #7
def main():
    # load config
    cfg.merge_from_file(args.config)

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_root = os.path.join(args.dataset_dir, args.dataset)

    epsilon = args.epsilon

    # create model
    track_model1 = ModelBuilder()
    track_model2 = ModelBuilder()
    lr = args.lr

    # load model
    track_model1 = load_pretrain(track_model1, args.snapshot).cuda().eval()
    track_model2 = load_pretrain(track_model2, args.snapshot).cuda().eval()

    # build tracker
    tracker1 = build_tracker(track_model1)
    tracker2 = build_tracker(track_model2)
    attacker = ModelAttacker().cuda().train()
    optimizer = optim.Adam(attacker.parameters(), lr=lr)

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False,
                                            dataset_toolkit='oneshot',
                                            config=cfg)
    #
    # vid.name = {'ants1','ants3',....}
    # img, bbox, cls, delta, delta_weight
    # vid[0][0],vid[0][1],vid[0][2],vid[0][3],vid[0][4]

    model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    n_epochs = args.epochs

    for name, param in tracker1.model.named_parameters():
        param.requires_grad_(False)

    for name, param in tracker2.model.named_parameters():
        param.requires_grad_(False)
    # for name, param in tracker2.model.named_parameters():
    #     if 'backbone' in name or 'neck' in name or 'rpn_head' in name:
    #         param.requires_grad_(False)
    #     elif param.requires_grad:
    #         param.requires_grad_(True)
    #         # print(name, param.data)
    #         print('grad true ', name)
    #     else:
    #         print('grad false ', name)

    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019', 'OTB100']:

        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
                else:
                    if not os.path.exists(
                            os.path.join(args.savedir, video.name)):
                        os.mkdir(os.path.join(args.savedir, video.name))

            # set writing video parameters
            height, width, channels = video[0][0].shape
            out = cv2.VideoWriter(
                os.path.join(args.savedir, video.name + '.avi'),
                cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 15,
                (width, height))
            frame_counter = 0
            frame_counter_adv = 0
            lost_number = 0
            lost_number_adv = 0
            toc = 0
            total_toc = 0
            pred_bboxes = []
            pred_bboxes_adv = []
            lost = False
            lost_adv = False

            for i in range(0, args.epochs):

                for idx, (img, gt_bbox) in enumerate(video):

                    # if len(gt_bbox) == 4:
                    #     gt_bbox = [gt_bbox[0], gt_bbox[1],
                    #                gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                    #                gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                    #                gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]

                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]

                    ##########################################
                    # # #  for state of the art tracking # # #
                    ##########################################

                    if i == 0:
                        if idx == frame_counter:
                            init_gt = gt_bbox_
                            tracker1.init(img, gt_bbox_)
                            pred_bboxes.append(1)
                            zf = tracker1.zf
                            img1 = img.copy()

                        elif idx > frame_counter:

                            outputs = tracker1.track(img, idx=idx)

                            # print('****************** state of the art tracking ******************')
                            pred_bbox = outputs['bbox']
                            if args.dataset != 'OTB100':
                                overlap = vot_overlap(
                                    pred_bbox, gt_bbox,
                                    (img.shape[1], img.shape[0]))
                                if overlap > 0:
                                    # not lost
                                    pred_bboxes.append(pred_bbox)
                                    lost = False
                                else:
                                    # lost object
                                    pred_bboxes.append(2)
                                    frame_counter = idx + 5  # skip 5 frames
                                    lost_number += 1
                                    lost = True
                            else:
                                pred_bboxes.append(pred_bbox)

                        else:
                            pred_bboxes.append(0)

                    tic = cv2.getTickCount()
                    ##########################################
                    # # # # #  adversarial tracking  # # # # #
                    ##########################################

                    if idx == frame_counter_adv:
                        zimg = img.copy()
                        sz, bbox, pad = tracker2.init(img,
                                                      gt_bbox_,
                                                      attacker=attacker,
                                                      epsilon=args.epsilon)
                        pred_bboxes_adv.append(1)
                        zf2 = tracker2.zf

                        # cv2.imwrite(os.path.join(args.savedir, video.name, str(idx).zfill(6) +'.jpg'), img)

                    elif idx > frame_counter_adv:

                        _outputs = tracker2.track(img,
                                                  attacker=attacker,
                                                  epsilon=args.epsilon,
                                                  zf=zf2,
                                                  idx=idx,
                                                  iter=i)
                        # print(_outputs['best_score'], outputs['target_score'])

                        # the adversarial bbox must be read from the tracker
                        # output before it is used by save_2bb and vot_overlap
                        ad_bbox = _outputs['bbox']

                        filename = os.path.join(
                            args.savedir, video.name,
                            'bb' + str(idx).zfill(6) + '.jpg')
                        save_2bb(img, filename, ad_bbox, pred_bbox, gt_bbox)
                        # _zimg = save(zimg, tracker2.z_crop_adv, sz, init_gt, pad,
                        #              os.path.join(args.savedir, video.name, str(idx).zfill(6) + '.jpg'), save=True)

                        # update state
                        tracker2.center_pos = _outputs['center_pos']
                        tracker2.size = _outputs['size']

                        ad_overlap = vot_overlap(ad_bbox, gt_bbox,
                                                 (img.shape[1], img.shape[0]))

                        if args.dataset != 'OTB100':
                            if ad_overlap > 0:
                                # not lost
                                pred_bboxes_adv.append(ad_bbox)
                                lost_adv = False
                            else:
                                # lost object
                                pred_bboxes_adv.append(2)
                                frame_counter_adv = idx + 5  # skip 5 frames
                                lost_number_adv += 1
                                lost_adv = True
                        else:
                            if ad_overlap <= 0:
                                lost_number_adv += 1
                            pred_bboxes_adv.append(ad_bbox)
                    else:
                        pred_bboxes_adv.append(0)

                    toc += cv2.getTickCount() - tic

                    # if idx > frame_counter_adv and not lost_adv:
                    #     ad_bbox = list(map(int, ad_bbox))
                    #     cv2.rectangle(img, (ad_bbox[0], ad_bbox[1]),
                    #                   (ad_bbox[0] + ad_bbox[2], ad_bbox[1] + ad_bbox[3]), (0, 0, 255), 3)
                    #
                    # if idx > frame_counter and not lost:
                    #     bbox = list(map(int, pred_bbox))
                    #     cv2.rectangle(img, (bbox[0], bbox[1]),
                    #               (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3)
                    #
                    # __gt_bbox = list(map(int, gt_bbox_))
                    # cv2.rectangle(img, (__gt_bbox[0], __gt_bbox[1]),
                    #               (__gt_bbox[0]+__gt_bbox[2], __gt_bbox[1]+__gt_bbox[3]), (0, 0, 0), 3)
                    #
                    # cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
                    # cv2.putText(img, str(lost_number), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    # cv2.putText(img, ","+str(lost_number_adv), (80, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    #
                    # out.write(img)

                    # print('frame {}/{}  Lost: {:d}'.format(idx, len(video), lost_number_adv))

                toc /= cv2.getTickFrequency()

                print(
                    '({:3d}) Video: {:12s} train{}/{} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                    .format(v_idx + 1, video.name, i, args.epochs, toc,
                            idx / toc, lost_number_adv))

                total_toc += toc

                l1 = _outputs['l1']
                l2 = _outputs['l2']
                l3 = _outputs['l3']
                # total_loss = 0.8 * l1 + 0.4 * l2 + 1.2 * l3
                total_loss = l1 + 0.4 * l2

                # print(idx, i, total_loss.item(), _outputs['center_pos'], _outputs['size'])

                # if ad_overlap < 0.5:
                if _outputs['best_score'] < outputs['target_score']:
                    total_loss_val = 0
                    # print(idx, i, ad_overlap)
                    # print(ad_bbox)
                    # print(pred_bbox)
                    # print('------------------------')
                    # filename = os.path.join(args.savedir, video.name, 'bb' + str(idx).zfill(6) + '.jpg')
                    # save_2bb(img, filename, ad_bbox, pred_bbox, gt_bbox)
                    # _zimg = save(zimg, tracker2.z_crop_adv, sz, init_gt, pad,
                    #              os.path.join(args.savedir, video.name, str(idx).zfill(6) + '.jpg'), save=True)
                    # pdb.set_trace()
                    break
                else:
                    # print(_outputs['bbox'])
                    optimizer.zero_grad()
                    total_loss.backward(retain_graph=True)
                    optimizer.step()

            total_toc /= (i + 1)

            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time Average: {:4.1f}s Speed Average: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, total_toc, idx / total_toc,
                        lost_number_adv))
            total_lost += lost_number_adv
        print("{:s} total lost: {:d}".format(model_name, total_lost))
Example #8
def objective(trial):
    # different params
    cfg.TRACK.WINDOW_INFLUENCE = trial.suggest_uniform('window_influence',
                                                       0.050, 0.650)
    cfg.TRACK.PENALTY_K = trial.suggest_uniform('penalty_k', 0.000, 0.600)
    cfg.TRACK.LR = trial.suggest_uniform('scale_lr', 0.100, 0.800)
    cfg.TRACK.COEE_CLASS = trial.suggest_uniform('coee_class', 0.01, 0.999)

    # rebuild tracker
    info = edict()
    info.arch = args.arch
    info.cls_type = args.cls_type
    info.dataset = args.dataset
    tracker = SiamRPN(info)

    model_name = args.snapshot.split('/')[-1].split('.')[0]
    tracker_name = os.path.join('tune_results', args.dataset, model_name, model_name + \
                                '_wi-{:.3f}'.format(cfg.TRACK.WINDOW_INFLUENCE) + \
                                '_pk-{:.3f}'.format(cfg.TRACK.PENALTY_K) + \
                                '_lr-{:.3f}'.format(cfg.TRACK.LR)+\
                                '_ce-{:.3f}'.format(cfg.TRACK.COEE_CLASS)
                                )
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                # if len(gt_bbox) == 4:
                #     gt_bbox = [gt_bbox[0], gt_bbox[1],
                #                gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                #                gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                #                gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    #gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = tracker.init(img, target_pos, target_sz, net)
                    state["arch"] = args.arch
                    # pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    state = tracker.track(state, img)  # track
                    location = cxy_wh_2_rect(state['target_pos'],
                                             state['target_sz'])

                    pred_bbox = location
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join(tracker_name, 'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
        eao = eval(dataset=dataset_eval, tracker_name=tracker_name)
        info = "{:s} window_influence: {:1.17f}, penalty_k: {:1.17f}, scale_lr: {:1.17f}, EAO: {:1.3f}".format(
            model_name, cfg.TRACK.WINDOW_INFLUENCE, cfg.TRACK.PENALTY_K,
            cfg.TRACK.LR, eao)
        logging.getLogger().info(info)
        print(info)
        return eao
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]

                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = tracker.init(img, target_pos, target_sz,
                                         net)  # init tracker
                    state["arch"] = args.arch
                    # tracker.init(img, gt_bbox_)

                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    state = tracker.track(state, img)  # track
                    location = cxy_wh_2_rect(state['target_pos'],
                                             state['target_sz'])

                    pred_bbox = location

                    # outputs = tracker.track(img)
                    # pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(state['score'])
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(
                    video_path, '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write(
                            "{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                if not os.path.isdir(tracker_name):
                    os.makedirs(tracker_name)
                result_path = os.path.join(tracker_name,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))
        auc = eval(dataset=dataset_eval, tracker_name=tracker_name)
        info = "{:s} window_influence: {:1.17f}, penalty_k: {:1.17f}, scale_lr: {:1.17f}, AUC: {:1.3f}".format(
            model_name, cfg.TRACK.WINDOW_INFLUENCE, cfg.TRACK.PENALTY_K,
            cfg.TRACK.LR, auc)
        logging.getLogger().info(info)
        print(info)
        return auc
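The VOT restart loops above encode per-frame results as: 1 on a (re)initialization frame, 2 on a failure, 0 for skipped frames, and a comma-separated box otherwise. As a hedged illustration (not part of the original repository), a minimal parser for such a baseline result file could look like this:

def read_vot_result(path):
    """Parse a per-frame VOT baseline result file written by the loop above.

    Returns a list whose entries are the int flags 1/2/0 or a list of
    floats (the predicted box on normally tracked frames).
    """
    entries = []
    with open(path) as f:
        for line in f:
            parts = line.strip().split(',')
            if len(parts) == 1:
                entries.append(int(float(parts[0])))  # 1 / 2 / 0 flag
            else:
                entries.append([float(p) for p in parts])
    return entries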
Beispiel #9
def main():
    # load config
    cfg_from_file(args.config)

    dataset_root = os.path.join('dataset', args.dataset)

    # create model
    net = ModelBuilder()
    checkpoint = torch.load(args.model)
    if 'state_dict' in checkpoint:
        net.load_state_dict(checkpoint['state_dict'])
    else:
        net.load_state_dict(checkpoint)
    net.cuda().eval()
    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)

    model_name = args.save_name
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = CGACD_init(img, target_pos, target_sz, net)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    state = CGACD_track(state, img)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    pred_polygon = [
                        pred_bbox[0], pred_bbox[1],
                        pred_bbox[0] + pred_bbox[2], pred_bbox[1],
                        pred_bbox[0] + pred_bbox[2],
                        pred_bbox[1] + pred_bbox[3], pred_bbox[0],
                        pred_bbox[1] + pred_bbox[3]
                    ]
                    overlap = vot_overlap(gt_bbox, pred_polygon,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    target_pos = state['target_pos']
                    target_sz = state['target_sz']
                    cv2.rectangle(img, (int(target_pos[0] - target_sz[0] / 2),
                                        int(target_pos[1] - target_sz[1] / 2)),
                                  (int(target_pos[0] + target_sz[0] / 2),
                                   int(target_pos[1] + target_sz[1] / 2)),
                                  (0, 255, 0), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2,
                                cv2.LINE_AA)
                    cv2.imshow(video.name, img)
                    cv2.moveWindow(video.name, 100, 100)
                    key = cv2.waitKey(1)
                    if key == 27:
                        break
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('result', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    if 'OTB' in args.dataset:
                        target_pos, target_sz = rect1_2_cxy_wh(gt_bbox)
                    else:
                        cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                        target_pos = np.array([cx, cy])
                        target_sz = np.array([w, h])
                    state = CGACD_init(img, target_pos, target_sz, net)
                    if 'OTB' in args.dataset:
                        pred_bbox = cxy_wh_2_rect1(state['target_pos'],
                                                   state['target_sz'])
                    else:
                        pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                                  state['target_sz'])
                    pred_bboxes.append(pred_bbox)
                else:
                    state = CGACD_track(state, img)
                    pred_bbox = cxy_wh_2_rect(state['target_pos'],
                                              state['target_sz'])
                    pred_bboxes.append(pred_bbox)
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    target_pos = state['target_pos']
                    target_sz = state['target_sz']
                    cv2.rectangle(img, (int(target_pos[0] - target_sz[0] / 2),
                                        int(target_pos[1] - target_sz[1] / 2)),
                                  (int(target_pos[0] + target_sz[0] / 2),
                                   int(target_pos[1] + target_sz[1] / 2)),
                                  (0, 255, 0), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2,
                                cv2.LINE_AA)
                    cv2.imshow(video.name, img)
                    cv2.moveWindow(video.name, 100, 100)
                    key = cv2.waitKey(1)
                    if key == 27:
                        break
            toc /= cv2.getTickFrequency()
            if 'GOT-10k' == args.dataset:
                video_path = os.path.join('result', args.dataset, model_name,
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('result', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))
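Beispiel #9 switches between cxy_wh_2_rect and cxy_wh_2_rect1 (0- vs 1-indexed corners) depending on the dataset. A minimal sketch of the center/corner conversions, assuming the common DaSiamRPN-style convention (the exact off-by-one handling varies per codebase):

import numpy as np

def cxy_wh_2_rect(pos, sz):
    # center (cx, cy) + size (w, h) -> 0-indexed corner box [x, y, w, h]
    return np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])

def rect_2_cxy_wh(rect):
    # 0-indexed corner box [x, y, w, h] -> center (cx, cy), size (w, h)
    return (np.array([rect[0] + rect[2] / 2, rect[1] + rect[3] / 2]),
            np.array([rect[2], rect[3]]))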
Beispiel #10
def main():
    # load config
    cfg.merge_from_file(args.config)

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_root = os.path.join(args.dataset_dir, args.dataset)

    epsilon = args.epsilon

    # create model
    model = Steath([1, 3, 255, 255])
    track_model = ModelBuilder()
    lr = args.lr

    # load model
    model = load_pretrain(model, args.snapshot).cuda()
    track_model = load_pretrain(track_model, args.snapshot).cuda().eval()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    model.train()
    # model.dx.requires_grad_(True)
    # model.backbone.eval()
    # if cfg.ADJUST.ADJUST:
    #     model.neck.eval()
    # model.rpn_head.eval()

    for name, param in model.named_parameters():

        if 'backbone' in name or 'neck' in name or 'rpn_head' in name:
            param.requires_grad_(False)
        elif param.requires_grad:
            param.requires_grad_(True)
            print(name, param.data)
        else:
            print(name)

    clipper = WeightClipper(5)

    # build tracker
    tracker1 = build_tracker(track_model)
    tracker2 = build_tracker(track_model)

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False,
                                            config=cfg)
    # each video yields (img, bbox, cls, delta, delta_weight),
    # accessible as vid[0][0] ... vid[0][4]; vid.name is e.g. 'ants1', 'ants3', ...

    model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    n_epochs = args.epochs

    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:

        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
                else:
                    if not os.path.exists(os.path.join(args.savedir, video.name)):
                        os.mkdir(os.path.join(args.savedir, video.name))

            # set writing video parameters
            height, width, channels = video[0][0].shape
            out = cv2.VideoWriter(os.path.join(args.savedir, video.name + '.avi'),
                                  cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 15, (width, height))
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            data = {'template': None, 'search': None}

            for idx, (img, gt_bbox, z, x, szx, boxx, padx, cls, delta, delta_w, overlap, _bbox, _bbox_p) in enumerate(video):
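                # expand a 4-value [x, y, w, h] box into the 8-value corner
                # polygon (TL, BL, BR, TR) that the VOT overlap computation expects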

                if len(gt_bbox) == 4:
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                       gt_bbox[0], gt_bbox[1]+gt_bbox[3]-1,
                       gt_bbox[0]+gt_bbox[2]-1, gt_bbox[1]+gt_bbox[3]-1,
                       gt_bbox[0]+gt_bbox[2]-1, gt_bbox[1]]

                tic = cv2.getTickCount()

                cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                gt_bbox_ = [cx - w // 2, cy - h // 2, w, h]

                if idx == frame_counter:
                    tracker1.init(img, gt_bbox_)
                    tracker2.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)

                    data['template'] = torch.autograd.Variable(z, requires_grad=True).cuda()

                elif idx > frame_counter:
                    prim_img = np.copy(img)
                    data['search'] = torch.autograd.Variable(x, requires_grad=True).cuda()
                    data['label_cls'] = torch.Tensor(cls).type(torch.LongTensor).cuda()
                    data['label_loc'] = torch.Tensor(delta).type(torch.FloatTensor).cuda()
                    data['label_loc_weight'] = torch.Tensor(delta_w).cuda()

                    diff = data['search']

                    for epoch in range(n_epochs):
                        outputs = model(data, epsilon)
                        cls_loss = outputs['cls_loss']
                        # print(idx, epoch, cls_loss.item())
                        loc_loss = outputs['loc_loss']
                        total_loss = outputs['total_loss']

                        print('{}/{} cls={}, loc={}, total={}'.format(idx, len(video), cls_loss.item(), loc_loss.item(),
                                                                      total_loss.item()))

                        optimizer.zero_grad()
                        # cls_loss.backward()
                        total_loss.backward()
                        # model.apply(clipper)
                        optimizer.step()

                        # print('loss ', loss(diff, outputs['search']).item())
                        # diff = outputs['search']

                    # print(epoch, cls_loss, loc_loss, total_loss)
                    # print('{}/{} cls={}, loc={}, total={}'.format(idx, len(video), cls_loss.item(), loc_loss.item(),
                    #                                               total_loss.item()))
                    perturb_data = outputs['search']

                    # cv2.rectangle(img, (int(cx-w/2+1), int(cy-h/2+1)), (int(cx+w/2+1), int(cy+h/2+1)), (0, 0, 0), 3)
                    # cv2.imwrite(os.path.join(args.savedir, video.name, 'original_' + str(idx).zfill(7) + '.jpg'), img)

                    # _img = perturb_data.data.cpu().numpy().squeeze().transpose([1, 2, 0])
                    # cv2.imwrite(os.path.join(args.savedir, 'perturb_' + str(idx) + '.jpg'), _img)

                    szx = int(szx)

                    # compute the crop-space box first so it is defined even when
                    # no resizing is needed (and use int: np.int is deprecated)
                    __bbox = (np.array(_bbox_p) * szx / cfg.TRACK.INSTANCE_SIZE).astype(int)
                    if not np.array_equal(cfg.TRACK.INSTANCE_SIZE, szx):
                        perturb_data = F.interpolate(perturb_data, size=szx)

                    _img = cv2.UMat(perturb_data.data.cpu().numpy().squeeze().transpose([1, 2, 0])).get()
                    cv2.rectangle(_img, (__bbox[0], __bbox[1]), (__bbox[2], __bbox[3]), (0, 0, 0), 3)
                    cv2.imwrite(os.path.join(args.savedir, video.name, 'crop_full_' + str(idx) + '.jpg'), _img)

                    nh, nw, _ = _img.shape

                    __bbox0 = np.zeros_like(__bbox)
                    __bbox0[:4:2] = __bbox[:4:2] - padx[0]
                    __bbox0[1:4:2] = __bbox[1:4:2] - padx[2]

                    img[boxx[0]:boxx[1] + 1, boxx[2]:boxx[3] + 1, :] = \
                        _img[boxx[0]+padx[0]:boxx[1]+padx[0] + 1, 0 + padx[2]:boxx[3] - boxx[2] + padx[2] + 1, :]
                    # cv2.imwrite(os.path.join(args.savedir, video.name, 'perturb_full_' + str(idx) + '.jpg'), img)

                    # if not np.array_equal(cfg.TRACK.INSTANCE_SIZE, sz):
                    #     perturb_data = F.interpolate(perturb_data, size=sz)
                    #     __bbox = (np.array(_bbox)*sz/cfg.TRACK.INSTANCE_SIZE).astype(np.uint8)
                    #
                    # _img = cv2.UMat(perturb_data.data.cpu().numpy().squeeze().transpose([1, 2, 0])).get()
                    # cv2.rectangle(_img, (__bbox[0], __bbox[1]), (__bbox[2], __bbox[3]), (0, 0, 0), 3)
                    # cv2.imwrite(os.path.join(args.savedir, video.name, 'crop_full_' + str(idx) + '.jpg'), _img)
                    #
                    # nh, nw, _ = _img.shape
                    # img[bT:bB+1, bL:bR+1, :] = _img[pad[0]:nh - pad[1], pad[2]:nw - pad[3], :]
                    # cv2.imwrite(os.path.join(args.savedir, video.name, 'perturb_full_' + str(idx) + '.jpg'), img)

                    # nimg, sz, box, pad = tracker2.crop(img, bbox=gt_bbox_, im_name='search' + str(idx))

                    outputs = tracker1.track(img)
                    prim_outputs = tracker2.track(prim_img)

                    pred_bbox = outputs['bbox']
                    prim_box = prim_outputs['bbox']

                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5 # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)

                # cv2.imwrite(os.path.join(args.savedir, video.name, str(idx).zfill(7) + '.jpg'), img)

                toc += cv2.getTickCount() - tic

                # write ground truth bbox
                cv2.polylines(img, [np.array(gt_bbox, np.int32).reshape((-1, 1, 2))],
                              True, (255, 255, 255), 3)

                if idx != frame_counter:
                    bbox = list(map(int, pred_bbox))
                    prim_bbox = list(map(int, prim_box))

                    cv2.rectangle(img, (bbox[0], bbox[1]),
                                  (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3)

                    cv2.rectangle(img, (prim_bbox[0], prim_bbox[1]),
                                  (prim_bbox[0] + prim_bbox[2], prim_bbox[1] + prim_bbox[3]), (0, 0, 255), 3)


                cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                cv2.putText(img, str(lost_number), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

                out.write(img)
                cv2.imwrite(os.path.join(args.savedir, video.name, str(idx).zfill(7) + '.jpg'), img)

                # import pdb
                # pdb.set_trace()

            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                    'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x])+'\n')
            print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
                    v_idx+1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx-(w-1)/2, cy-(h-1)/2, w, h]
                    tracker1.init(img, gt_bbox_)  # tracker1 built above; the original referenced an undefined 'tracker'
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker1.track(img)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic)/cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]),
                                  (gt_bbox[0]+gt_bbox[2], gt_bbox[1]+gt_bbox[3]), (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0]+pred_bbox[2], pred_bbox[1]+pred_bbox[3]), (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                        'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                        '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x])+'\n')
                result_path = os.path.join(video_path,
                        '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                        '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x])+'\n')
                result_path = os.path.join(video_path,
                        '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x])+'\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.format(
                v_idx+1, video.name, toc, idx / toc))
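The attack loop in Beispiel #10 optimizes a perturbation of the search crop so that the attacked tracker (tracker1) drifts while the clean tracker (tracker2) is run for reference. The Steath module and its losses are specific to that repository; as a hedged sketch, the epsilon-bounded update it gestures at usually looks like:

import torch

def perturbation_step(delta, grad, step, epsilon):
    """One signed-gradient step on an additive perturbation, projected back
    into the L-infinity ball of radius epsilon (a common adversarial
    constraint; the example folds this into the model's forward/backward)."""
    with torch.no_grad():
        delta -= step * grad.sign()
        delta.clamp_(-epsilon, epsilon)
    return delta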
Beispiel #11
def vot_evaluate(dataset, tracker):
    tracker_name = args.tracker
    backbone_name = args.cfg.split('/')[-1].split('_')[0]
    snapshot_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    for v_idx, video in enumerate(dataset):
        if args.video != '':  # if test special video
            if video.name != args.video:
                continue
        frame_count = 0
        lost_number = 0
        pred_bboxes = []
        toc = 0
        for idx, (frame, gt_bbox) in enumerate(video):
            tic = cv2.getTickCount()
            if idx == frame_count:
                tracker.init(frame, gt_bbox)  # cx,cy,w,h
                pred_bboxes.append(1)
            elif idx > frame_count:
                track_result = tracker.track(frame)
                bbox = track_result['bbox']  # cx,cy,w,h
                score = track_result['score']
                bbox_ = [
                    bbox[0] - bbox[2] / 2, bbox[1] - bbox[3] / 2, bbox[2],
                    bbox[3]
                ]  # x,y,w,h
                gt_bbox_ = [
                    gt_bbox[0] - (gt_bbox[2] - 1) / 2,
                    gt_bbox[1] - (gt_bbox[3] - 1) / 2, gt_bbox[2], gt_bbox[3]
                ]
                overlap = vot_overlap(bbox_, gt_bbox_,
                                      (frame.shape[1], frame.shape[0]))
                # print('idx: {}\n pred: {}\n gt: {}\n overlap: {}\n'.format(idx, bbox_, gt_bbox_, overlap))
                if overlap > 0:
                    pred_bboxes.append(bbox_)
                else:
                    # print('lost idx: {}'.format(idx))
                    pred_bboxes.append(2)
                    frame_count = idx + 5
                    lost_number += 1
            else:
                pred_bboxes.append(0)

            toc += cv2.getTickCount() - tic
            if args.vis and idx > frame_count:
                show_double_bbox(frame, bbox, score, gt_bbox, idx, lost_number)
        toc /= cv2.getTickFrequency()
        result_dir = os.path.join(cfg.TRACK.RESULT_DIR, args.dataset,
                                  tracker_name, backbone_name, snapshot_name)
        if not os.path.isdir(result_dir):
            os.makedirs(result_dir)
        result_path = '{}/{}.txt'.format(result_dir, video.name)
        with open(result_path, 'w') as f:
            for x in pred_bboxes:
                if isinstance(x, int):
                    f.write('{:d}\n'.format(x))
                else:
                    f.write(','.join(['{:.4f}'.format(i) for i in x]) + '\n')
        # log
        total_lost += lost_number

        print('[{:d}/{:d}] | video: {:12s} | time: {:4.1f}s | speed: {:3.1f}fps | lost_number: {:d} ' \
              .format(v_idx + 1, len(dataset), video.name, toc, idx / toc, lost_number))
    print('total_lost: {}'.format(total_lost))
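vot_overlap measures region overlap clipped to the frame, and the failure test in these loops is simply overlap > 0. For axis-aligned boxes it reduces to an intersection-over-union; a minimal sketch (the real toolkit version also handles rotated polygons):

def axis_aligned_iou(a, b):
    """IoU of two [x, y, w, h] boxes; 0 when they are disjoint."""
    ax2, ay2 = a[0] + a[2], a[1] + a[3]
    bx2, by2 = b[0] + b[2], b[1] + b[3]
    iw = max(0.0, min(ax2, bx2) - max(a[0], b[0]))
    ih = max(0.0, min(ay2, by2) - max(a[1], b[1]))
    inter = iw * ih
    union = a[2] * a[3] + b[2] * b[3] - inter
    return inter / union if union > 0 else 0.0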
Beispiel #12
def main():
    os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(args.gpuid)
    snap_shot = './checkpoints/model0_e19.pth'

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_root = os.path.join(cur_dir, '../../pysot/testing_dataset', args.dataset)  # e.g. 'LaSOT'

    # create model
    model = ManModelBuilder(out_ch=1024, relu=True).cuda()
    # load model
    model = load_pretrain(model, snap_shot)
    torch.set_grad_enabled(False)

    # build tracker
    tracker = ManTracker(model)

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=True)

    model_name = snap_shot.split('/')[-1].split('.')[0]
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video and args.video != '%d' % v_idx:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                               gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_min_max_bbox(np.array(gt_bbox))  # get_axis_aligned_bbox
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(img, [np.array(gt_bbox, np.int32).reshape((-1, 1, 2))],
                                  True, (0, 255, 0), 3)
                    bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (bbox[0], bbox[1]),
                                  (bbox[0] + bbox[2], bbox[1] + bbox[3]), (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
                v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video and args.video != '%d' % v_idx:
                    continue
            if 'LaSOT' in args.dataset:
                model_path = os.path.join('results', args.dataset, model_name)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                if os.path.exists(result_path):
                    print("pass " + video.name)
                    continue

            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []

            video.load_img()
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - w / 2, cy - h / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    if not np.isnan(gt_bbox).any():
                        gt_bbox = list(map(int, gt_bbox))
                        cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]),
                                      (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]), (0, 255, 0), 3)
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2], pred_bbox[1] + pred_bbox[3]), (0, 0, 255), 3)
                    cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, cv2.resize(img, (480, 360)))
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            video.free_img()

            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('results', args.dataset, model_name)#, '{}'.format(video.name))
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                # result_path = os.path.join(model_path, '{}_001.txt'.format(video.name))
                # model_path2 = os.path.join('results', args.dataset, model_name)
                # shutil.copyfile(result_path, result_path2)
                # if os.path.exists(result_path2):
                #     print("success copy" + video.name)
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.format(
                v_idx + 1, video.name, toc, idx / toc))
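OPE runs store one comma-separated x,y,w,h box per frame in <video>.txt (plus *_time.txt and, for VOT2018-LT, a confidence file). A small reader matching the writers above, added here only as an illustration:

def read_ope_result(path):
    """Load per-frame [x, y, w, h] boxes from an OPE result file."""
    with open(path) as f:
        return [[float(v) for v in line.strip().split(',')]
                for line in f if line.strip()]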
Beispiel #13
def run_tracker(tracker, video, video_name, restart=True):
    # 'video' is the (frame, gt_bbox) sequence iterated below; the original
    # signature took an unused 'gt' and read 'video' from enclosing scope
    frame_count = 0
    lost_number = 0
    pred_bboxes = []
    toc = 0
    if restart:
        for idx, (frame, gt_bbox) in enumerate(video):
            tic = cv2.getTickCount()
            if idx == frame_count:
                tracker.init(frame, gt_bbox)  # cx,cy,w,h
                pred_bboxes.append(1)
            elif idx > frame_count:
                track_result = tracker.track(frame)
                bbox = track_result['bbox']  # cx,cy,w,h
                score = track_result['score']
                bbox_ = [bbox[0] - bbox[2] / 2, bbox[1] - bbox[3] / 2, bbox[2], bbox[3]]  # x,y,w,h
                gt_bbox_ = [gt_bbox[0] - gt_bbox[2] / 2, gt_bbox[1] - gt_bbox[3] / 2, gt_bbox[2], gt_bbox[3]]
                if vot_overlap(bbox_, gt_bbox_, (frame.shape[1], frame.shape[0])) > 0:
                    pred_bboxes.append(bbox_)
                else:
                    pred_bboxes.append(2)
                    frame_count = idx + 5
                    lost_number += 1
            else:
                pred_bboxes.append(0)

            toc += cv2.getTickCount() - tic
            if args.vis and idx > frame_count:
                show_double_bbox(frame, bbox, score, gt_bbox, idx, lost_number)
        toc /= cv2.getTickFrequency()
        # log
        print('video: {}, time: {:.1f}s, speed: {:.1f}fps, lost_number: {:d} '.format(video_name,
                                                                                      toc, idx / toc,
                                                                                      lost_number))
        return pred_bboxes
    else:
        # toc = 0
        # pred_bboxes = []
        # scores = []
        # track_times = []
        # for idx, (img, gt_bbox) in enumerate(video):
        #     tic = cv2.getTickCount()
        #     if idx == 0:
        #         cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
        #         gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
        #         tracker.init(img, gt_bbox_)
        #         pred_bbox = gt_bbox_
        #         scores.append(None)
        #         pred_bboxes.append(pred_bbox)
        #     else:
        #         outputs = tracker.track(img)
        #         pred_bbox = outputs['bbox']
        #         pred_bboxes.append(pred_bbox)
        #         scores.append(outputs['best_score'])
        #     toc += cv2.getTickCount() - tic
        #     track_times.append((cv2.getTickCount() - tic) / cv2.getTickFrequency())
        # toc /= cv2.getTickFrequency()
        # print('Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.format(
        #     video_name, toc, idx / toc))
        # return pred_bboxes, scores, track_times
        pass
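A hedged usage sketch for run_tracker as repaired above (dataset and tracker are built exactly as in the surrounding examples; nothing here is from the original file):

# hypothetical driver mirroring the other examples
all_results = {}
for video in dataset:  # dataset from DatasetFactory.create_dataset(...)
    results = run_tracker(tracker, video, video.name, restart=True)
    all_results[video.name] = results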
Beispiel #14
def main():
    refine_method = args.refine_method
    model_name = 'siamrpn_' + refine_method
    model_path = '/'
    snapshot_path = os.path.join(
        project_path_, 'experiments/%s/model.pth' % args.tracker_name)
    config_path = os.path.join(
        project_path_, 'experiments/%s/config.yaml' % args.tracker_name)

    cfg.merge_from_file(config_path)
    dataset_root = dataset_root_

    # create model
    '''a model is a neural network (a torch.nn.Module)'''
    model = ModelBuilder()

    # load model
    model = load_pretrain(model, snapshot_path).cuda().eval()

    # build tracker
    '''a tracker is an object that wraps the network with tracking post-processing'''
    tracker = build_tracker(model)

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    '''##### build a refinement module #####'''
    if 'RF' in refine_method:
        RF_module = RefineModule(refine_path,
                                 selector_path,
                                 branches=branches,
                                 search_factor=sr,
                                 input_sz=input_sz)

    elif refine_method == 'iou_net':
        RF_info = Tracker('iou_net', 'iou_net_dimp', None)
        RF_params = RF_info.get_parameters()
        RF_params.visualization = False
        RF_params.debug = False
        RF_params.visdom_info = {
            'use_visdom': False,
            'server': '127.0.0.1',
            'port': 8097
        }
        RF_module = RF_info.tracker_class(RF_params)

    elif refine_method == 'mask':
        RF_module = siammask()
    else:
        raise ValueError("refine_method should be 'RF' or 'iou' or 'mask' ")
    # model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [
                        gt_bbox[0], gt_bbox[1], gt_bbox[0],
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1,
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]
                    ]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    H, W, _ = img.shape
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    '''##### initialize refinement module for specific video'''
                    if 'RF' in refine_method:
                        RF_module.initialize(
                            cv2.cvtColor(img, cv2.COLOR_BGR2RGB),
                            np.array(gt_bbox_))
                    elif refine_method == 'iou_net':
                        gt_bbox_np = np.array(gt_bbox_)
                        gt_bbox_torch = torch.from_numpy(
                            gt_bbox_np.astype(np.float32))
                        init_info = {}
                        init_info['init_bbox'] = gt_bbox_torch
                        RF_module.initialize(
                            cv2.cvtColor(img, cv2.COLOR_BGR2RGB), init_info)
                    elif refine_method == 'mask':
                        RF_module.initialize(img, np.array(gt_bbox_))
                    else:
                        raise ValueError(
                            "refine_method should be 'RF*', 'iou_net' or 'mask'")
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    '''##### refine tracking results #####'''
                    if 'RF' in refine_method or refine_method == 'iou_net':
                        pred_bbox = RF_module.refine(
                            cv2.cvtColor(img, cv2.COLOR_BGR2RGB),
                            np.array(pred_bbox))
                        x1, y1, w, h = pred_bbox.tolist()
                        '''add boundary and min size limit'''
                        x1, y1, x2, y2 = bbox_clip(x1, y1, x1 + w, y1 + h,
                                                   (H, W))
                        w = x2 - x1
                        h = y2 - y1
                        pred_bbox = np.array([x1, y1, w, h])
                        '''pass new state back to base tracker'''
                        tracker.center_pos = np.array([x1 + w / 2, y1 + h / 2])
                        tracker.size = np.array([w, h])
                    elif refine_method == 'mask':
                        pred_bbox, center_pos, size = RF_module.refine(
                            img, np.array(pred_bbox), VOT=True)
                        # boundary and min size limit have been included in "refine"
                        '''pass new state back to base tracker'''
                        '''pred_bbox is a list with 8 elements'''
                        tracker.center_pos = center_pos
                        tracker.size = size
                    else:
                        raise ValueError(
                            "refine_method should be 'RF*', 'iou_net' or 'mask'")
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(
                        img, [np.array(gt_bbox, np.int32).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                    if refine_method == 'mask':
                        cv2.polylines(
                            img,
                            [np.array(pred_bbox, np.int32).reshape(
                                (-1, 1, 2))], True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join(save_dir, args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if video.name + '.txt' in os.listdir(model_path):
                continue
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    H, W, _ = img.shape
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    '''##### initialize refinement module for specific video'''
                    if 'RF' in refine_method:
                        RF_module.initialize(
                            cv2.cvtColor(img, cv2.COLOR_BGR2RGB),
                            np.array(gt_bbox_))
                    elif refine_method == 'iou_net':
                        gt_bbox_np = np.array(gt_bbox_)
                        gt_bbox_torch = torch.from_numpy(
                            gt_bbox_np.astype(np.float32))
                        init_info = {}
                        init_info['init_bbox'] = gt_bbox_torch
                        RF_module.initialize(
                            cv2.cvtColor(img, cv2.COLOR_BGR2RGB), init_info)
                    elif refine_method == 'mask':
                        RF_module.initialize(img, np.array(gt_bbox_))
                    else:
                        raise ValueError(
                            "refine_method should be 'RF*', 'iou_net' or 'mask'")
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    '''##### refine tracking results #####'''
                    if 'RF' in refine_method or refine_method == 'iou_net':
                        pred_bbox = RF_module.refine(
                            cv2.cvtColor(img, cv2.COLOR_BGR2RGB),
                            np.array(pred_bbox))
                    elif refine_method == 'mask':
                        pred_bbox = RF_module.refine(img,
                                                     np.array(pred_bbox),
                                                     VOT=False)
                    else:
                        raise ValueError(
                            "refine_method should be 'RF*', 'iou_net' or 'mask'")
                    x1, y1, w, h = pred_bbox.tolist()
                    '''add boundary and min size limit'''
                    x1, y1, x2, y2 = bbox_clip(x1, y1, x1 + w, y1 + h, (H, W))
                    w = x2 - x1
                    h = y2 - y1
                    pred_bbox = np.array([x1, y1, w, h])
                    tracker.center_pos = np.array([x1 + w / 2, y1 + h / 2])
                    tracker.size = np.array([w, h])

                    pred_bboxes.append(pred_bbox)
                    scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(
                        img, (gt_bbox[0], gt_bbox[1]),
                        (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                        (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2],
                                   pred_bbox[1] + pred_bbox[3]), (0, 255, 255),
                                  3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join(save_dir, args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(
                    video_path, '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        if x is None:
                            f.write('\n')
                        else:
                            f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join(save_dir, args.dataset, model_name,
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join(
                    save_dir, args.dataset,
                    model_name + '_' + str(selector_path))
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))
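
Note: bbox_clip is not defined in this example. A minimal sketch consistent
with its call site bbox_clip(x1, y1, x2, y2, (H, W)), which clamps corner
coordinates to the image bounds, is given below; the min_sz margin is an
assumption.

def bbox_clip(x1, y1, x2, y2, boundary, min_sz=10):
    # clamp a corner-format box to the image bounds; boundary = (H, W)
    H, W = boundary
    x1_new = max(0, min(x1, W - min_sz))
    y1_new = max(0, min(y1, H - min_sz))
    x2_new = max(min_sz, min(x2, W))
    y2_new = max(min_sz, min(y2, H))
    return x1_new, y1_new, x2_new, y2_new
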
Example #15
0
def main():
    # load config
    cfg.merge_from_file(args.config)
    
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_root = os.path.join(cur_dir, '../testing_dataset', args.dataset)
    
    # create model
    model = Model2021()
    
    # load model
    model = load_pretrain(model, args.snapshot).cuda().eval()
    
    # build tracker
    tracker = build_tracker(model)
    
    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    
    model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        overlaps2 = []  # per-video mean IoU, consumed by the summary code below
        for v_idx, video in enumerate(dataset):
            overlaps1 = []
            vars1 = []
            vars0 = []
            occl1 = []
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
    
            frame_width = 960   # img.shape[1]
            frame_height = 540  # img.shape[0]
            video_loc = os.path.join('../results', model_name, video.name)
    
            out = cv2.VideoWriter(video_loc + '.avi',
                                  cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                                  10, (frame_width, frame_height), True)
            if video.tags['occlusion'] == [] or (np.array(video.tags['occlusion']) == 1).sum() == 0:
                print("\t\tdiscard occlusion")
                continue
                # unreachable, kept from an earlier variant:
                # video.tags['occlusion'] = video.tags['all']
    
            for idx, (img, gt_bbox) in enumerate(video):
                   
                if len(gt_bbox) == 4:
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                               gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()

                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx-(w-1)/2, cy-(h-1)/2, w, h]
                    box1 = gt_bbox_
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                    if idx == 0:
                        print(img.shape)
                elif idx > frame_counter:
                    outputs = tracker.track(img, mode)
                    pred_bbox = outputs['bbox']
                    
                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
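                    # vot_overlap computes the IoU between two (possibly
                    # rotated) regions, clipped to the frame; 0 means no
                    # overlap, which triggers the VOT restart below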
                    overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0]))
                    # ---- per-frame attribute computation ----
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    box2 = gt_bbox_
                    w1, h1 = box1[2], box1[3]
                    w2, h2 = box2[2], box2[3]
                    cx1, cy1 = (img.shape[1] // 2, img.shape[0] // 2)
                    cx2, cy2 = (box2[2] / 2 + box2[0], box2[3] / 2 + box2[1])
                    # box1 = box2
                    # scale variation
                    s1 = np.sqrt(w1 * h1)
                    s2 = np.sqrt(w2 * h2)
                    sv = max(s1 / s2, s2 / s1)

                    # aspect ratio variation
                    r1, r2 = h1 / w1, h2 / w2
                    arv = max(r1 / r2, r2 / r1)

                    # fast motion (displacement from the image center, size-normalized)
                    fm = np.sqrt((cx2 - cx1)**2 + (cy2 - cy1)**2) / np.sqrt(s1 * s2)
                    vars0.append(np.array([sv, arv, fm, outputs['cls2']]))
                    # occlusion
                    # ---- end attribute computation ----
                    # print(idx, outputs['var'], np.array([sv, arv, fm]))
                    overlaps1.append(overlap)
                    vars1.append(outputs['cls2'])
                    if idx < len(video.tags['occlusion']):
                        occl1.append(video.tags['occlusion'][idx])
                    else:
                        occl1.append(0)
                    if overlap > 0.0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        # print("------- lost ---------")
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                        for _ in range(5):
                            vars1.append(-0.2)
                            occl1.append(-0.2)
                else:
                    pred_bboxes.append(0)
                    
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    
                    cv2.polylines(img, [np.array(gt_bbox, int).reshape((-1, 1, 2))],
                                  True, (255, 0, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(img, [np.array(pred_bbox, int).reshape((-1, 1, 2))],
                                      True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0]+bbox[2], bbox[1]+bbox[3]), (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    print(idx)
                    cv2.putText(img, 'occl_gt:'+str(video.tags['occlusion'][idx-1]), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
                    cv2.putText(img, 'proposed_TL:'+str(lost_number), (40, 160), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    
                    cv2.putText(img, 'occl_pred:'+str(vars1[idx-1]), (40, 120), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    out.write(img)
                    cv2.imwrite(video_loc+str(idx)+'.png',img)
                    cv2.waitKey(1)
    
            toc /= cv2.getTickFrequency()
            # save results
            out.release()
            video_path = os.path.join(args.results, args.dataset, model_name,
                    'baseline', video.name)
    
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x])+'\n')
            print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}, mIOU: {:0.4f}'.format(
                    v_idx+1, video.name, toc, idx / toc, lost_number, np.array(overlaps1).mean()))
            # plt.plot(overlaps1)
            # plt.plot(np.array(vars0)[:, 3])
            # plt.plot(np.array(occl1))
            # plt.plot(np.array(vars1))
            # print(np.correlate(overlaps1, np.array(vars1)[:, 2]))
            overlaps2.append(np.array(overlaps1).mean())
            # occl2.append(np.array(occl1))
            # vars2.append(np.array(vars1))
            # if args.video != '':
            #     v_idx = 0
            # print(100 * (confusion_matrix(occl2[v_idx], vars2[v_idx]).ravel()))

            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    
    # cv2.destroyAllWindows()
    # print("Total Mean IOU is   %0.4f"%np.array(overlaps2).mean())

    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx-(w-1)/2, cy-(h-1)/2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic)/cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]),
                                  (gt_bbox[0]+gt_bbox[2], gt_bbox[1]+gt_bbox[3]), (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0]+pred_bbox[2], pred_bbox[1]+pred_bbox[3]), (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('../results', args.dataset, model_name,
                        'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                        '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x])+'\n')
                result_path = os.path.join(video_path,
                        '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        if x is None:
                            f.write('\n')
                        else:
                            f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                        '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('../results', args.dataset, model_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x])+'\n')
                result_path = os.path.join(video_path,
                        '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('../results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x])+'\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.format(
                v_idx+1, video.name, toc, idx / toc ))
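
Note: Example #15 computes three per-frame attributes inline (scale variation
sv, aspect-ratio variation arv, fast motion fm). As a refactoring sketch, not
part of the original code, the computation can be pulled into a helper;
ref_center plays the role of the image center used above.

import numpy as np

def frame_attributes(box1, box2, ref_center):
    # box1/box2 are [x, y, w, h]; ref_center is (cx, cy) of the reference point
    w1, h1 = box1[2], box1[3]
    w2, h2 = box2[2], box2[3]
    s1, s2 = np.sqrt(w1 * h1), np.sqrt(w2 * h2)
    sv = max(s1 / s2, s2 / s1)                    # scale variation
    r1, r2 = h1 / w1, h2 / w2
    arv = max(r1 / r2, r2 / r1)                   # aspect-ratio variation
    cx2, cy2 = box2[0] + w2 / 2, box2[1] + h2 / 2
    # size-normalized displacement, i.e. fast motion
    fm = np.hypot(cx2 - ref_center[0], cy2 - ref_center[1]) / np.sqrt(s1 * s2)
    return sv, arv, fm
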
Example #16
0
def main(frame_interval, interpolation_rate):
    # load config
    cfg.merge_from_file(args.config)

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_root = os.path.join(cur_dir, '../testing_dataset', args.dataset)

    # create model
    model = ModelBuilder()

    # load model
    model = load_pretrain(model, args.snapshot).cuda().eval()

    # build tracker
    tracker = build_tracker(model)

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)

    model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [
                        gt_bbox[0], gt_bbox[1], gt_bbox[0],
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1,
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]
                    ]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(
                        img, [np.array(gt_bbox, int).reshape(
                            (-1, 1, 2))], True, (0, 0, 255), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(
                            img,
                            [np.array(pred_bbox, int).reshape(
                                (-1, 1, 2))], True, (255, 0, 0), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    window_name = "Result"
                    cv2.moveWindow(window_name, 100, 100)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # # save results
            # video_path = os.path.join('results', args.dataset, model_name,
            #         'baseline', video.name)
            # if not os.path.isdir(video_path):
            #     os.makedirs(video_path)
            # result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            # with open(result_path, 'w') as f:
            #     for x in pred_bboxes:
            #         if isinstance(x, int):
            #             f.write("{:d}\n".format(x))
            #         else:
            #             f.write(','.join([vot_float2str("%.4f", i) for i in x])+'\n')
            # print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
            #         v_idx+1, video.name, toc, idx / toc, lost_number))
            # total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # FPS List
        fps_list = []

        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []

            # PARAMETERS
            # frame_interval = 2
            # interpolation_rate = 0.005

            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w,
                                h]  # (left-top width height)
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']  # (left-top width height)
                    pred_bboxes.append(pred_bbox)
                    scores.append(outputs['best_score'])

                    ######################################
                    # Adaptive Template(exemplar update) #
                    ######################################
                    if idx % frame_interval == 0:
                        tracker.update_z(img,
                                         pred_bbox,
                                         interpolation_rate=interpolation_rate)

                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(
                        img, (gt_bbox[0], gt_bbox[1]),
                        (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                        (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2],
                                   pred_bbox[1] + pred_bbox[3]), (255, 0, 0),
                                  3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    window_name = "Result"
                    cv2.moveWindow(window_name, 20, 20)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()

            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(
                    video_path, '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        if x is None:
                            f.write('\n')
                        else:
                            f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            # OTB-100 and other OPE datasets land here
            else:
                result_folder_name = "results_{0:d}frame_exemplar_update_rate_{1:s}".format(
                    frame_interval, str(interpolation_rate))
                model_path = os.path.join(result_save_base_path,
                                          result_folder_name, args.dataset,
                                          model_name)
                # model_path = os.path.join(result_save_base_path, 'results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')

            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))

            # FPS Result
            fps = idx / toc
            fps_list.append(fps)

        # Make FPS Result Path
        fps_array = np.asarray(fps_list).reshape(-1, 1)
        fps_file_name = "model_fps__[{:3.1f}].txt".format(
            np.average(fps_array))
        model_fps_file = os.path.join(os.path.dirname(model_path), "../",
                                      fps_file_name)
        np.savetxt(model_fps_file, fps_array)
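
Note: tracker.update_z, called in Example #16 every frame_interval frames, is
not shown in this listing. The sketch below assumes a linear-interpolation
exemplar update consistent with the interpolation_rate argument; get_z_crop
and the model.zf attribute are assumptions about the tracker's internals.

import torch

def update_z(self, img, bbox, interpolation_rate=0.005):
    # blend the stored exemplar feature with one cropped at the current
    # prediction: z <- (1 - r) * z + r * z_new
    z_crop = self.get_z_crop(img, bbox)      # assumed crop helper
    with torch.no_grad():
        z_new = self.model.backbone(z_crop)
    self.model.zf = (1 - interpolation_rate) * self.model.zf \
                    + interpolation_rate * z_new
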
Example #17
0
def main():
    # load config
    cfg.merge_from_file(args.config)

    # !!! set your_dataset_path to the directory containing your datasets
    dataset_root = os.path.join(your_dataset_path, args.dataset)

    # create model
    model = ModelBuilder()

    # load model
    model = load_pretrain(model, args.snapshot).cuda().eval()

    # build tracker
    tracker = build_tracker(model)

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)

    model_name = args.snapshot.split('/')[-1].split('.')[0]
    model_name = model_name + '_pk-{:.3f}'.format(
        cfg.TRACK.PENALTY_K) + '_wi-{:.3f}'.format(
            cfg.TRACK.WINDOW_INFLUENCE) + '_lr-{:.3f}'.format(cfg.TRACK.LR)
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue

            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [
                        gt_bbox[0], gt_bbox[1], gt_bbox[0],
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1,
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]
                    ]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic

            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())

            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(
                    video_path, '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        if x is None:
                            f.write('\n')
                        else:
                            f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))
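
Note: several of these examples expand a 4-value [x, y, w, h] ground truth
into the 8-value corner polygon that vot_overlap expects. The inline list used
above can be expressed as a small helper (a refactoring sketch, not an
original function):

def rect_to_corners(bbox):
    # [x, y, w, h] -> [x1, y1, x1, y2, x2, y2, x2, y1]
    x, y, w, h = bbox
    x2, y2 = x + w - 1, y + h - 1
    return [x, y, x, y2, x2, y2, x2, y]
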
Example #18
0
def main():
    '''change save_path to yours'''
    save_path = '/home/masterbin-iiau/Desktop/AdvTrack-project/supplementary/%s' % args.video
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    # load config
    cfg.merge_from_file(args.config)

    dataset_root = os.path.join(dataset_root_, args.dataset)
    # create model
    '''a model is a neural network (a torch.nn.Module)'''
    model = ModelBuilder()

    # load model
    model = load_pretrain(model, args.snapshot).cuda().eval()

    # build tracker
    '''a tracker is an object that combines the network with some post-processing'''
    tracker = build_tracker(model)
    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)

    # model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [
                        gt_bbox[0], gt_bbox[1], gt_bbox[0],
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1,
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]
                    ]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    '''GAN'''
                    outputs = tracker.track_supp(img, GAN, save_path, idx)
                    pred_bbox = outputs['bbox']
                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(
                        img, [np.array(gt_bbox, int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(
                            img,
                            [np.array(pred_bbox, int).reshape(
                                (-1, 1, 2))], True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()

    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker.track_supp(img, GAN, save_path, idx)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(
                        img, (gt_bbox[0], gt_bbox[1]),
                        (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                        (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2],
                                   pred_bbox[1] + pred_bbox[3]), (0, 255, 255),
                                  3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
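
Note: get_axis_aligned_bbox, used throughout these examples, converts a
(possibly rotated) 8-value polygon into a center-format axis-aligned box. The
sketch below paraphrases the pysot implementation and should be treated as an
approximation:

import numpy as np

def get_axis_aligned_bbox(region):
    # 8-value polygon (or 4-value rect) -> (cx, cy, w, h)
    region = np.asarray(region, dtype=np.float64)
    if region.size == 8:
        xs, ys = region[0::2], region[1::2]
        cx, cy = xs.mean(), ys.mean()
        x1, x2, y1, y2 = xs.min(), xs.max(), ys.min(), ys.max()
        A1 = np.linalg.norm(region[0:2] - region[2:4]) * \
             np.linalg.norm(region[2:4] - region[4:6])
        A2 = (x2 - x1) * (y2 - y1)
        s = np.sqrt(A1 / A2)          # shrink toward the rotated area
        w, h = s * (x2 - x1) + 1, s * (y2 - y1) + 1
    else:
        x, y, w, h = region
        cx, cy = x + w / 2, y + h / 2
    return cx, cy, w, h
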
Example #19
0
def main():
    net = models.__dict__[args.arch](anchors_nums=args.anchor_nums,
                                     cls_type=args.cls_type)
    net = load_pretrain(net, args.resume)
    net.eval()
    net = net.cuda()

    # prepare tracker
    info = edict()
    info.arch = args.arch
    info.cls_type = args.cls_type
    info.dataset = args.dataset
    info.epoch_test = args.epoch_test
    tracker = SiamRPN(info)

    dataset_root = os.path.join("/ssd", args.dataset)
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    model_name = args.resume.split('/')[-1].split('.')[0]
    total_lost = 0
    """
    eao will lower than origin version(0.393->0.390) due to the  
    number of digits after the decimal point
    """
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                # if len(gt_bbox) == 4:
                #     gt_bbox = [gt_bbox[0], gt_bbox[1],
                #        gt_bbox[0], gt_bbox[1]+gt_bbox[3]-1,
                #        gt_bbox[0]+gt_bbox[2]-1, gt_bbox[1]+gt_bbox[3]-1,
                #        gt_bbox[0]+gt_bbox[2]-1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    #gt_bbox_ = [cx-(w-1)/2, cy-(h-1)/2, w, h]

                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = tracker.init(img, target_pos, target_sz,
                                         net)  # init tracker
                    state["arch"] = args.arch
                    #tracker.init(img, gt_bbox_)
                    #pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    state = tracker.track(state, img)  # track
                    location = cxy_wh_2_rect(state['target_pos'],
                                             state['target_sz'])

                    #outputs = tracker.track(img)
                    pred_bbox = location
                    #overlap=poly_iou(gt_bbox,location)
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(
                        img, [np.array(gt_bbox, int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(
                            img,
                            [np.array(pred_bbox, int).reshape(
                                (-1, 1, 2))], True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]

                    target_pos = np.array([cx, cy])
                    target_sz = np.array([w, h])
                    state = tracker.init(img, target_pos, target_sz,
                                         net)  # init tracker
                    state["arch"] = args.arch
                    #tracker.init(img, gt_bbox_)

                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    state = tracker.track(state, img)  # track
                    location = cxy_wh_2_rect(state['target_pos'],
                                             state['target_sz'])

                    pred_bbox = location

                    #outputs = tracker.track(img)
                    #pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(state['score'])
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(
                        img, (gt_bbox[0], gt_bbox[1]),
                        (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                        (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2],
                                   pred_bbox[1] + pred_bbox[3]), (0, 255, 255),
                                  3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(
                    video_path, '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        if x is None:
                            f.write('\n')
                        else:
                            f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))
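
Note: cxy_wh_2_rect, used in Example #19 to turn the tracker state back into a
rectangle, is the inverse of the center-format conversion. A minimal sketch;
whether to subtract sz/2 or (sz - 1)/2 depends on the 0- versus 1-based
convention of the surrounding code.

import numpy as np

def cxy_wh_2_rect(pos, sz):
    # (cx, cy), (w, h) -> [x, y, w, h] with a top-left origin
    return np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])
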
Example #20
0
def main():
    # load config
    cfg.merge_from_file(args.config)

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_root = os.path.join(args.dataset_dir, args.dataset)

    epsilon = args.epsilon

    # create model
    model = ModelBuilder()

    # load model
    model = load_pretrain(model, args.snapshot).cuda().train()

    # build tracker
    tracker = build_tracker(model)

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False,
                                            config=cfg)
    # vid.name in {'ants1', 'ants3', ...}
    # each item: img, bbox, cls, delta, delta_weight
    # i.e. vid[0][0], vid[0][1], vid[0][2], vid[0][3], vid[0][4]

    model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0

    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:

        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue

            # set writing video parameters
            height, width, channels = video[0][0].shape
            out = cv2.VideoWriter(
                os.path.join(args.savedir, video.name + '.avi'),
                cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 15,
                (width, height))
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            data = {'template': None, 'search': None}
            for idx, (img, gt_bbox, cls, delta_cls, delta_w, _bbox, cls_s, delta_cls_s, delta_w_s, _bbox_s) \
                    in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [
                        gt_bbox[0], gt_bbox[1], gt_bbox[0],
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1,
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]
                    ]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))

                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)

                    nimg, sz, box, _ = tracker.crop(img,
                                                    bbox=gt_bbox_,
                                                    im_name='exemplar')
                    data['template'] = torch.autograd.Variable(
                        nimg, requires_grad=True).cuda()
                elif idx > frame_counter:

                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    nimg, sz, box, pad = tracker.crop(img,
                                                      bbox=gt_bbox_,
                                                      is_template=False,
                                                      im_name='search' +
                                                      str(idx))
                    [bT, bB, bL, bR] = box
                    sz = int(sz)
                    data['search'] = torch.autograd.Variable(
                        nimg, requires_grad=True).cuda()
                    data['label_cls'] = torch.Tensor(cls_s).type(
                        torch.LongTensor).cuda()
                    data['label_loc'] = torch.Tensor(delta_cls_s).type(
                        torch.FloatTensor).cuda()
                    data['label_loc_weight'] = torch.Tensor(delta_w_s).cuda()

                    outputs = model(data)

                    cls_loss = outputs['cls_loss']
                    loc_loss = outputs['loc_loss']
                    total_loss = outputs['total_loss']
                    total_loss.backward()

                    data_grad = data['search'].grad

                    # torch.Tensor(img.transpose([2, 0, 1])).unsqueeze(dim=0)

                    perturb_data = fgsm_attack(data['search'], epsilon,
                                               data_grad)
                    # cv2.imwrite(os.path.join(args.savedir, 'original_' + str(idx) + '.jpg'), img)

                    # _img = perturb_data.data.cpu().numpy().squeeze().transpose([1, 2, 0])
                    # cv2.imwrite(os.path.join(args.savedir, 'perturb_' + str(idx) + '.jpg'), _img)

                    if not np.array_equal(cfg.TRACK.INSTANCE_SIZE, sz):
                        perturb_data = F.interpolate(perturb_data, size=sz)

                    _img = perturb_data.data.cpu().numpy().squeeze().transpose(
                        [1, 2, 0])
                    # cv2.imwrite(os.path.join(args.savedir, 'crop_full_' + str(idx) + '.jpg'), _img)
                    nh, nw, _ = _img.shape
                    img[bT:bB + 1, bL:bR + 1, :] = _img[pad[0]:nh - pad[1],
                                                        pad[2]:nw - pad[3], :]
                    # cv2.imwrite(os.path.join(args.savedir, 'perturb_full_' + str(idx) + '.jpg'), img)

                    outputs = tracker.track(img)

                    pred_bbox = outputs['bbox']
                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        print('*************** lost ***************')
                        # import pdb
                        # pdb.set_trace()
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1

                    print(idx, torch.sum(data_grad, (2, 3)))
                    print(
                        idx,
                        torch.sum(torch.abs(torch.sum(data_grad, (2, 3))),
                                  (0, 1)))

                else:
                    pred_bboxes.append(0)

                toc += cv2.getTickCount() - tic

                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(
                        img, [np.array(gt_bbox, int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(
                            img,
                            [np.array(pred_bbox, int).reshape(
                                (-1, 1, 2))], True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)

                # save tracking image
                bbox = list(map(int, pred_bbox))
                cv2.rectangle(img, (bbox[0], bbox[1]),
                              (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                              (0, 255, 255), 3)
                cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX,
                            1, (0, 255, 255), 2)
                cv2.putText(img, str(lost_number), (40, 80),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                # cv2.imwrite(os.path.join(args.savedir, 'track_' + str(idx) + '.jpg'), img)
                out.write(img)

            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(
                        img, (gt_bbox[0], gt_bbox[1]),
                        (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                        (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2],
                                   pred_bbox[1] + pred_bbox[3]), (0, 255, 255),
                                  3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(
                    video_path, '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write(
                            "{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))
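The adversarial step above calls a helper fgsm_attack that is not shown in this snippet. A minimal sketch, assuming the standard fast gradient sign method (the 0-255 clamping range is an assumption, not taken from this code):

import torch

def fgsm_attack(image, epsilon, data_grad):
    # hedged sketch: step the input along the sign of its loss gradient,
    # the direction that maximally increases the loss under an L-inf budget
    perturbed = image + epsilon * data_grad.sign()
    # keep pixel values in a valid range (assumed 0-255 image input)
    return torch.clamp(perturbed, 0, 255)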
Beispiel #21
0
def main():
    # load config
    # save_siamese_rpn()
    cfg.merge_from_file(args.config)

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    # dataset_root = os.path.join(cur_dir, '../testing_dataset', args.dataset)
    dataset_root = datasets_root + args.dataset

    # create model
    model = ModelBuilder()

    # load model
    model = load_pretrain(model, args.snapshot).cuda().eval()

    # save_backbone(model)

    # build tracker
    tracker = build_tracker(model)

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)

    model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0

    # multi-pass tracking: the VOT protocol re-initializes the tracker after each failure
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0

            # pred_bboxes holds two kinds of entries. Integers 1, 2, 0 mark
            # tracking start, a tracking loss, and the placeholder frames
            # skipped after a loss; float lists are the predicted bboxes.
            pred_bboxes = []

            gru_seq_len = tracker.model.grus.seq_in_len
            video_len = len(video)

            for idx, (img, gt_bbox) in enumerate(video):

                # if gt is given as [x, y, w, h], convert it to 8 corner
                # coordinates (x1, y1, x2, y2, x3, y3, x4, y4)
                if len(gt_bbox) == 4:
                    gt_bbox = [
                        gt_bbox[0], gt_bbox[1], gt_bbox[0],
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1,
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]
                    ]
                tic = cv2.getTickCount()

                # tracker initialization
                if idx == frame_counter:  # initialize on the first frame
                    # indices of the gru_seq_len frames starting at idx,
                    # clamped so they do not run past the last frame
                    idxs = [min(idx + k, video_len - 1)
                            for k in range(gru_seq_len)]

                    tracker.template_idx = 0  # first frame of the template initialization
                    for k in idxs:
                        init_img, init_gt_bbox = video[k]  # initialize from gru_seq_len consecutive frames
                        # init_img, init_gt_bbox = video[idxs[0]]  # use only one frame for initialization

                        # convert the 4 corner points of the rotated box into
                        # an axis-aligned box in center form (cx, cy, w, h)
                        cx, cy, w, h = get_axis_aligned_bbox(
                            np.array(init_gt_bbox))
                        # convert from center form to top-left [x, y, w, h]
                        init_gt_bbox = [
                            cx - (w - 1) / 2, cy - (h - 1) / 2, w, h
                        ]

                        tracker.init_gru(init_img, init_gt_bbox, k)

                    if k == 0:
                        pred_bbox = init_gt_bbox
                        pred_bboxes.append(1)

                # keep tracking the subsequent frames
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']

                    # update the template only when the output score is high enough
                    if outputs['best_score'] > 0.95:
                        tracker.init_gru(img, pred_bbox, idx)

                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))

                    # inspect the relation between IoU and score on the first
                    # frame after (re-)initialization
                    # if tracker.template_idx == 4:
                    #     print("{:3.2f}\t{:3.2f}".format(overlap, outputs['best_score']))

                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)

                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()

                # draw the outputs: gt and mask are drawn as polygons, the
                # tracked bbox as a rectangle
                if args.vis and idx > frame_counter:
                    # draw the polygonal gt
                    cv2.polylines(
                        img, [np.array(gt_bbox, int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                    # draw the polygon output by SiamMask
                    if cfg.MASK.MASK:
                        cv2.polylines(
                            img,
                            [np.array(pred_bbox, int).reshape(
                                (-1, 1, 2))], True, (0, 255, 255), 3)
                    # otherwise draw the output rectangle
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)

                    # annotate the image with the frame index and lost count
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            # result path layout: ./results/VOT2018/model/baseline/ants1/ants1_001.txt
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))

            # as above: integer entries (1, 2, 0) are protocol flags, float
            # lists are the predicted bboxes
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):  # integers mark a start or a loss
                        f.write("{:d}\n".format(x))
                    else:  # only floats are actual bboxes
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))

    # OPE tracking: no re-initialization after a loss
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:

                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(
                        img, (gt_bbox[0], gt_bbox[1]),
                        (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                        (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2],
                                   pred_bbox[1] + pred_bbox[3]), (0, 255, 255),
                                  3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(
                    video_path, '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write(
                            "{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))
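Every example here relies on get_axis_aligned_bbox to turn a (possibly rotated) ground-truth region into a center-form box (cx, cy, w, h). A sketch along the lines of the pysot toolkit helper; treat the exact area-based rescaling as an assumption:

import numpy as np

def get_axis_aligned_bbox(region):
    region = np.asarray(region, dtype=float)
    if len(region) == 8:
        # 8 values = 4 corner points of a rotated box
        xs, ys = region[0::2], region[1::2]
        cx, cy = np.mean(xs), np.mean(ys)
        x1, x2, y1, y2 = min(xs), max(xs), min(ys), max(ys)
        # rescale the enclosing box so its area matches the rotated box
        A1 = np.linalg.norm(region[0:2] - region[2:4]) * \
             np.linalg.norm(region[2:4] - region[4:6])
        A2 = (x2 - x1) * (y2 - y1)
        s = np.sqrt(A1 / A2)
        w, h = s * (x2 - x1) + 1, s * (y2 - y1) + 1
    else:
        x, y, w, h = region
        cx, cy = x + w / 2, y + h / 2
    return cx, cy, w, h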
Beispiel #22
0
def main():
    # load config
    cfg.merge_from_file(args.config)

    # -------------------------------------hp_search---------------------------------------#
    params = [0.0, 0.0, 0.0]

    # Interpolation learning rate
    params[0] = cfg.TRACK.LR
    # Scale penalty
    params[1] = cfg.TRACK.PENALTY_K
    # Window influence
    params[2] = cfg.TRACK.WINDOW_INFLUENCE

    params_name = args.snapshot.split('/')[-1] + ' ' + args.dataset + \
        '  lr-' + str(params[0]) + '  pk-' + str(params[1]) + \
        '  win-' + str(params[2])

    # -------------------------------------hp_search---------------------------------------#
    # cur_dir = os.path.dirname(os.path.realpath(__file__))

    dataset_root = os.path.join('./datasets', args.dataset)

    model = ModelBuilder()

    # load model
    model = load_pretrain(model, args.snapshot).cuda().eval()

    # build tracker  siamos
    tracker = SiamCARTracker(model, cfg.TRACK)

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)

    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        total_lost = 0
        avg_speed = 0  # accumulates per-video fps for the final average
        for v_idx, video in tqdm(enumerate(dataset)):
            #for v_idx, video in tqdm(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [
                        gt_bbox[0], gt_bbox[1], gt_bbox[0],
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1,
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]
                    ]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w,
                                h]  #[topx,topy,w,h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(
                        img, [np.array(gt_bbox, int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(
                            img,
                            [np.array(pred_bbox, int).reshape(
                                (-1, 1, 2))], True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join(args.save_path, args.dataset,
                                      args.tracker_name, 'baseline',
                                      video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            # print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
            #         v_idx+1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
            avg_speed += idx / toc

        print('Speed: {:3.1f}fps'.format(avg_speed / 60))  # 60 sequences in VOT
        print(params_name)

        #print(" stage:{:d} model:{:s} epoch:{:s} update_lr:{:f}".format(args.update_stage,args.update_path, args.update_path.split('/')[-1],update_lr[args.update_lr]))

    else:
        # OPE tracking
        for v_idx, video in tqdm(enumerate(dataset)):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w,
                                h]  #[topx,topy,w,h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    #scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(
                        img, (gt_bbox[0], gt_bbox[1]),
                        (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                        (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2],
                                   pred_bbox[1] + pred_bbox[3]), (0, 255, 255),
                                  3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()

            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join(args.save_path, args.dataset,
                                          args.tracker_name, 'longterm',
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(
                    video_path, '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write(
                            "{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join(args.save_path, args.dataset,
                                          args.tracker_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join(args.save_path, args.dataset,
                                          args.tracker_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            # print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.format(
            #     v_idx+1, video.name, toc, idx / toc))
        print(params_name)
    # os.chdir(model_path)
    # save_file = '../%s' % dataset
    # shutil.make_archive(save_file, 'zip')
    #print('Records saved at', save_file + '.zip')
    evaluate(args)
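The lost/not-lost decision in all restart loops comes from vot_overlap, which measures region overlap (IoU) clipped to the image bounds. For axis-aligned [x, y, w, h] boxes it reduces to plain IoU; a rough stand-in (the real toolkit helper also handles polygons):

def bbox_iou(a, b):
    # a, b: [x, y, w, h]; returns intersection-over-union in [0, 1]
    iw = max(0.0, min(a[0] + a[2], b[0] + b[2]) - max(a[0], b[0]))
    ih = max(0.0, min(a[1] + a[3], b[1] + b[3]) - max(a[1], b[1]))
    inter = iw * ih
    union = a[2] * a[3] + b[2] * b[3] - inter
    return inter / union if union > 0 else 0.0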
Beispiel #23
0
def main(args, tracker):

    # create dataset
    if not args.dataset_path:
        args.dataset_path = os.path.dirname(os.path.realpath(__file__))
    dataset_root = os.path.join(args.dataset_path, 'dataset', args.dataset)

    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False,
                                            single_video=args.video)

    model_name = args.model_name

    if args.debug_vis:
        args.vis = True

    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019', 'VOT2020']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue

            video_total_lost = 0
            for cnt in range(args.repetition):
                frame_counter = 0
                lost_number = 0
                toc = 0
                init_toc = 0
                valid_frames = 0
                pred_bboxes = []

                template_image = None
                search_image = None
                raw_heatmap = None
                post_heatmap = None

                for idx, (img, gt_bbox) in enumerate(video):
                    if len(gt_bbox) == 4:
                        gt_bbox = [gt_bbox[0], gt_bbox[1],
                                   gt_bbox[0], gt_bbox[1]+gt_bbox[3],
                                   gt_bbox[0]+gt_bbox[2], gt_bbox[1]+gt_bbox[3],
                                   gt_bbox[0]+gt_bbox[2], gt_bbox[1]]
                    tic = cv2.getTickCount()
                    if idx == frame_counter:
                        cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                        gt_bbox_ = [cx - w/2, cy - h/2, w, h]
                        tracker.init(img, gt_bbox_)
                        init_toc += cv2.getTickCount() - tic
                        pred_bbox = gt_bbox_
                        pred_bboxes.append(1)

                    elif idx > frame_counter:
                        outputs = tracker.track(img)
                        pred_bbox = outputs['bbox']
                        pred_bbox = [pred_bbox[0], pred_bbox[1],
                                     pred_bbox[2] - pred_bbox[0],
                                     pred_bbox[3] - pred_bbox[1]]

                        valid_frames += 1
                        toc += cv2.getTickCount() - tic

                        overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0]))

                        if overlap > 0:
                            # not lost
                            pred_bboxes.append(pred_bbox)
                        else:
                            # lost object
                            pred_bboxes.append(2)
                            frame_counter = idx + 5 # skip 5 frames
                            lost_number += 1

                            if args.vis and args.debug_vis:

                                cv2.polylines(img, [np.array(gt_bbox, int).reshape((-1, 2))], True, (0, 255, 0), 3)

                                bbox = list(map(int, pred_bbox))
                                cv2.rectangle(img, (bbox[0], bbox[1]),
                                              (bbox[0]+bbox[2], bbox[1]+bbox[3]), (0, 255, 255), 3)
                                cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                                cv2.putText(img, 'lost', (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                                cv2.imshow(video.name, img)

                                for key, value in outputs.items():
                                    if isinstance(value, np.ndarray):
                                        if len(value.shape) == 3 or len(value.shape) == 2:
                                            cv2.imshow(key, value)

                                k = cv2.waitKey(0)
                                if k == 27:         # wait for ESC key to exit
                                    sys.exit()

                    else:
                        pred_bboxes.append(0)
                    if idx == 0:
                        if args.vis:
                            cv2.destroyAllWindows()
                    if args.vis and idx > frame_counter:
                        cv2.polylines(img, [np.array(gt_bbox, int).reshape((-1, 2))], True, (0, 255, 0), 3)

                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0]+bbox[2], bbox[1]+bbox[3]), (0, 255, 255), 3)
                        cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                        cv2.putText(img, str(lost_number), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                        cv2.imshow(video.name, img)

                        if args.debug_vis:

                            for key, value in outputs.items():
                                if isinstance(value, np.ndarray):
                                    if len(value.shape) == 3 or len(value.shape) == 2:
                                        cv2.imshow(key, value)

                            k = cv2.waitKey(0)
                            if k == 27:         # wait for ESC key to exit
                                break
                        else:
                            k = cv2.waitKey(1)
                            if k == 27:         # wait for ESC key to exit
                                break

                    sys.stderr.write("inference on {}:  {} / {}\r".format(video.name, idx+1, len(video)))

                toc /= cv2.getTickFrequency()
                init_toc /= cv2.getTickFrequency()
                # save results
                video_path = os.path.join(args.result_path, args.dataset, model_name,
                        'baseline', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_{:03d}.txt'.format(video.name, cnt+1))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        if isinstance(x, int):
                            f.write("{:d}\n".format(x))
                        else:
                            f.write(','.join([vot_float2str("%.4f", i) for i in x])+'\n')
                log = '({:3d}) Video ({:2d}): {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
                        v_idx+1, cnt+1, video.name, init_toc + toc, valid_frames / toc, lost_number)
                print(log)
                with open(os.path.join(args.result_path, args.dataset, model_name, 'log.txt'), 'a') as f:
                    f.write(log + '\n')
                video_total_lost += lost_number
            total_lost += video_total_lost
            if args.repetition > 1:
                log = '({:3d}) Video: {:12s} Avg Lost: {:.3f}'.format(v_idx+1, video.name, video_total_lost/args.repetition)
                print(log)
                with open(os.path.join(args.result_path, args.dataset, model_name, 'log.txt'), 'a') as f:
                    f.write(log + '\n')

        log = "{:s} total (avg) lost: {:.3f}".format(model_name, total_lost/args.repetition)
        print(log)
        with open(os.path.join(args.result_path, args.dataset, model_name, 'log.txt'), 'a') as f:
            f.write(log + '\n')
    else:
        # OPE tracking

        find_best = True

        if not dataset.has_ground_truth:
            find_best = False

        # if repeat 3 times for GOT-10k, use the official benchmark mode (no find best)
        if args.dataset == 'GOT-10k':
            if args.repetition == 3:
                find_best = False

        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue

            best_pred_bboxes = []
            min_lost_number = 1e6
            for cnt in range(args.repetition):
                toc = 0
                init_toc = 0
                pred_bboxes = []
                track_times = []
                template_image = None
                search_image = None
                raw_heatmap = None
                post_heatmap = None
                lost_number = 0

                if find_best and min_lost_number < args.min_lost_rate_for_repeat * len(video):
                    print("Skipping the remaining trials ({}~) because the min lost number is already small enough: {} / {}".format(cnt + 1, min_lost_number, args.min_lost_rate_for_repeat * len(video)))
                    break

                save_image_offset = 0
                if args.save_image_num_per_video > 1:
                    save_image_offset = len(video) // (args.save_image_num_per_video - 1)
                if args.save_image_num_per_video == 0:
                    save_image_offset = 1

                for idx, (img, gt_bbox) in enumerate(video):
                    tic = cv2.getTickCount()
                    if idx == 0:
                        outputs = tracker.init(img, gt_bbox)
                        init_toc += cv2.getTickCount() - tic
                        pred_bbox = gt_bbox
                        pred_bboxes.append(pred_bbox)
                    else:
                        outputs = tracker.track(img)
                        toc += cv2.getTickCount() - tic
                        pred_bbox_ = outputs['bbox']
                        pred_bbox = [pred_bbox_[0], pred_bbox_[1],
                                     pred_bbox_[2] - pred_bbox_[0],
                                     pred_bbox_[3] - pred_bbox_[1]]
                        pred_bboxes.append(pred_bbox)

                    track_times.append((cv2.getTickCount() - tic)/cv2.getTickFrequency())

                    gt_bbox_int = list(map(lambda x: int(x) if not np.isnan(x) else 0, gt_bbox))
                    pred_bbox_int = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox_int[0], gt_bbox_int[1]),
                                  (gt_bbox_int[0]+gt_bbox_int[2], gt_bbox_int[1]+gt_bbox_int[3]), (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox_int[0], pred_bbox_int[1]),
                                  (pred_bbox_int[0]+pred_bbox_int[2], pred_bbox_int[1]+pred_bbox_int[3]), (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    if save_image_offset > 0:

                        image_path = os.path.join(args.result_path, args.dataset, model_name, 'images', video.name)
                        if not os.path.isdir(image_path):
                            os.makedirs(image_path)

                        if idx % save_image_offset == 0:
                            imagename = os.path.join(image_path,  'image{:03d}.jpg'.format(idx))
                            cv2.imwrite(imagename,img)


                    if idx == 0:
                        if args.vis:
                            cv2.destroyAllWindows()
                            if args.debug_vis and isinstance(outputs, dict):
                                for key, value in outputs.items():
                                    if isinstance(value, np.ndarray):
                                        if len(value.shape) == 3 or len(value.shape) == 2:
                                            cv2.imshow(key, value)
                    else:
                        if not gt_bbox == [0,0,0,0] and not np.isnan(np.array(gt_bbox)).any():
                            if pred_bbox[0] + pred_bbox[2] < gt_bbox[0] or pred_bbox[0] > gt_bbox[0] + gt_bbox[2] or pred_bbox[1] + pred_bbox[3] < gt_bbox[1] or pred_bbox[1] > gt_bbox[1] + gt_bbox[3]:
                                lost_number += 1

                        if find_best and lost_number > min_lost_number:
                            break


                        if args.vis or args.debug_vis:
                            cv2.imshow(video.name, img)

                            if args.debug_vis:

                                for key, value in outputs.items():
                                    if isinstance(value, np.ndarray):
                                        if len(value.shape) == 3 or len(value.shape) == 2:
                                            cv2.imshow(key, value)

                                k = cv2.waitKey(0)
                                if k == 27:         # wait for ESC key to exit
                                    min_lost_number = 1e6  # allow retrying up to args.repetition times while debugging
                                    lost_number = 1e6  # allow retrying up to args.repetition times while debugging
                                    break
                            else:
                                k = cv2.waitKey(1)
                                if k == 27:         # wait for ESC key to exit
                                    min_lost_number = 1e6  # allow retrying up to args.repetition times while debugging
                                    lost_number = 1e6  # allow retrying up to args.repetition times while debugging
                                    break

                    sys.stderr.write("inference on {}:  {} / {}\r".format(video.name, idx+1, len(video)))

                if find_best and lost_number > min_lost_number:
                    print('Stopping trial No.{} because the lost number already exceeds the current minimum: {} > {}'.format(cnt + 1, lost_number, min_lost_number))
                    continue

                if lost_number == 1e6:
                    continue

                if lost_number < min_lost_number:
                    min_lost_number = lost_number

                toc /= cv2.getTickFrequency()
                init_toc /= cv2.getTickFrequency()
                # save results
                if 'GOT-10k' == args.dataset:
                    video_path = os.path.join(args.result_path, args.dataset, model_name, video.name)
                    if not os.path.isdir(video_path):
                        os.makedirs(video_path)
                    run_id = cnt + 1  # avoid shadowing the builtin id()
                    if find_best:
                        run_id = 1
                    result_path = os.path.join(video_path, '{}_{:03d}.txt'.format(video.name, run_id))
                    with open(result_path, 'w') as f:
                        for x in pred_bboxes:
                            f.write(','.join([vot_float2str("%.4f", i) for i in x ])+'\n')
                    result_path = os.path.join(video_path,
                            '{}_time.txt'.format(video.name))
                    with open(result_path, 'w') as f:
                        for x in track_times:
                            f.write("{:.6f}\n".format(x))
                else:
                    model_path = os.path.join(args.result_path, args.dataset, model_name)
                    if not os.path.isdir(model_path):
                        os.makedirs(model_path)
                    result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                    with open(result_path, 'w') as f:
                        for x in pred_bboxes:
                            f.write(','.join([vot_float2str("%.4f", i) for i in x])+'\n')

                log = '({:3d}) Video: {:12s} Trial: {:2d}  Time: {:5.1f}s Speed: {:3.1f}fps Lost: {:d}/{:d}'.format(
                    v_idx+1, video.name, cnt+1, init_toc + toc, idx / toc, lost_number, len(video))
                print(log)
                with open(os.path.join(args.result_path, args.dataset, model_name, 'log.txt'), 'a') as f:
                    f.write(log + '\n')
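Each restart-mode loop inlines the same conversion from a 4-value [x, y, w, h] ground truth to an 8-value corner polygon. Factored out, it could read as below (rect_to_corners is a hypothetical name; the sketch follows the inclusive "-1" coordinates used by most examples here, while Beispiel #23 omits the -1):

def rect_to_corners(b):
    # [x, y, w, h] -> (x1, y1, ..., x4, y4): top-left, bottom-left,
    # bottom-right, top-right, with inclusive pixel coordinates
    x, y, w, h = b
    return [x, y,
            x, y + h - 1,
            x + w - 1, y + h - 1,
            x + w - 1, y]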
Beispiel #24
0
def run_tracker(tracker, img, gt, video_name, video, restart=True):
    frame_counter = 0
    lost_number = 0
    toc = 0
    pred_bboxes = []
    if restart:  # VOT2016 and VOT 2018
        for idx, (img, gt_bbox) in enumerate(video):
            if len(gt_bbox) == 4:
                gt_bbox = [
                    gt_bbox[0], gt_bbox[1], gt_bbox[0],
                    gt_bbox[1] + gt_bbox[3] - 1, gt_bbox[0] + gt_bbox[2] - 1,
                    gt_bbox[1] + gt_bbox[3] - 1, gt_bbox[0] + gt_bbox[2] - 1,
                    gt_bbox[1]
                ]
            tic = cv2.getTickCount()
            if idx == frame_counter:
                cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                tracker.init(img, gt_bbox_)
                pred_bbox = gt_bbox_
                pred_bboxes.append([1])
            elif idx > frame_counter:
                outputs = tracker.track(img)
                pred_bbox = outputs['bbox']
                overlap = vot_overlap(pred_bbox, gt_bbox,
                                      (img.shape[1], img.shape[0]))
                if overlap > 0:
                    # not lost
                    pred_bboxes.append(pred_bbox)
                else:
                    # lost object
                    pred_bboxes.append([2])
                    frame_counter = idx + 5  # skip 5 frames
                    lost_number += 1
            else:
                pred_bboxes.append([0])
            toc += cv2.getTickCount() - tic
        toc /= cv2.getTickFrequency()
        # print('Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(video_name, toc, idx / toc, lost_number))
        return pred_bboxes
    else:
        toc = 0
        pred_bboxes = []
        scores = []
        track_times = []
        for idx, (img, gt_bbox) in enumerate(video):
            tic = cv2.getTickCount()
            if idx == 0:
                cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                tracker.init(img, gt_bbox_)
                pred_bbox = gt_bbox_
                scores.append(None)
                pred_bboxes.append(pred_bbox)
            else:
                outputs = tracker.track(img)
                pred_bbox = outputs['bbox']
                pred_bboxes.append(pred_bbox)
                scores.append(outputs['best_score'])
            toc += cv2.getTickCount() - tic
            track_times.append(
                (cv2.getTickCount() - tic) / cv2.getTickFrequency())
        toc /= cv2.getTickFrequency()
        # print('Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.format(video_name, toc, idx / toc))
        return pred_bboxes, scores, track_times
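The result files written by these examples follow the VOT convention: an integer line flags initialization (1), a loss (2), or a skipped frame (0), and bbox lines are comma-separated floats. The recurring save pattern, isolated as a helper (write_vot_result is a hypothetical name; vot_float2str is the toolkit formatter used throughout):

def write_vot_result(result_path, pred_bboxes):
    # integer entries are protocol flags, lists are bbox coordinates
    with open(result_path, 'w') as f:
        for x in pred_bboxes:
            if isinstance(x, int):
                f.write("{:d}\n".format(x))
            else:
                f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')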
Beispiel #25
0
def main():
    # load config
    model_name = 'RT_MDNet_refine'
    MASK = False

    dataset_root = os.path.join(dataset_root_, args.dataset)

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)
    '''##### build a scale-estimator #####'''
    SE_module = Scale_Estimator_bcm(refine_checkpoint_dir_)

    # model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [
                        gt_bbox[0], gt_bbox[1], gt_bbox[0],
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1,
                        gt_bbox[1] + gt_bbox[3] - 1,
                        gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]
                    ]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    H, W, _ = img.shape
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.init(img, gt_bbox_)
                    '''##### initialize the scale-estimator for this video #####'''
                    # SE_module.initialize(cv2.cvtColor(img,cv2.COLOR_BGR2RGB),
                    #                      np.array(gt_bbox_))

                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    '''##### refine tracking results #####'''
                    # output_dict = SE_module.refine_all(cv2.cvtColor(img,cv2.COLOR_BGR2RGB),
                    #                                 np.array(pred_bbox))
                    # pred_bbox = 0.5 * (output_dict['bbox'] + output_dict['corner'])
                    lr = outputs['lr']
                    pred_bbox = tracker.smooth_bbox(pred_bbox, lr, H, W)
                    x1, y1, w, h = pred_bbox
                    tracker.center_pos = np.array([x1 + w / 2, y1 + h / 2])
                    tracker.size = np.array([w, h])

                    # if cfg.MASK.MASK:
                    #     pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox,
                                          (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5  # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(
                        img, [np.array(gt_bbox, int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                    if MASK:
                        cv2.polylines(
                            img,
                            [np.array(pred_bbox, int).reshape(
                                (-1, 1, 2))], True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                                      (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                                      'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path,
                                       '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i)
                                          for i in x]) + '\n')
            print(
                '({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'
                .format(v_idx + 1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if v_idx < 60:
                # skip the first 60 videos (e.g. to resume a partial run)
                continue
            tracker = RT_MDNet()
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                img_RGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # RGB format

                tic = cv2.getTickCount()
                if idx == 0:
                    H, W, _ = img.shape
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx - (w - 1) / 2, cy - (h - 1) / 2, w, h]
                    tracker.initialize_seq(img_RGB, np.array(gt_bbox_))
                    '''##### initialize scale-estimator for specific video'''
                    SE_module.initialize(img_RGB, np.array(gt_bbox_))
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    ori_bbox = tracker.track(img_RGB)
                    '''##### refine tracking results #####'''
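                    # SE_module predicts both a regressed box ('bbox') and a
                    # corner-based box ('corner'); average the two and clip to
                    # the image bounds before updating the tracker state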
                    output_dict = SE_module.refine_all(img_RGB,
                                                       np.array(ori_bbox))
                    pred_bbox = 0.5 * (output_dict['bbox'] +
                                       output_dict['corner'])
                    pred_bbox = bbox_clip(pred_bbox, (H, W))
                    tracker.target_bbox = pred_bbox.copy()

                    pred_bboxes.append(pred_bbox)
                    # scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append(
                    (cv2.getTickCount() - tic) / cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    ori_bbox = list(map(int, ori_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(
                        img, (gt_bbox[0], gt_bbox[1]),
                        (gt_bbox[0] + gt_bbox[2], gt_bbox[1] + gt_bbox[3]),
                        (0, 0, 255), 3)
                    cv2.rectangle(
                        img, (ori_bbox[0], ori_bbox[1]),
                        (ori_bbox[0] + ori_bbox[2], ori_bbox[1] + ori_bbox[3]),
                        (255, 0, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0] + pred_bbox[2],
                                   pred_bbox[1] + pred_bbox[3]), (0, 255, 0),
                                  3)
                    cv2.putText(img, str(idx), (40, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(
                    video_path, '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write(
                            "{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                                          video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                                           '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
                result_path = os.path.join(video_path,
                                           '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path,
                                           '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x]) + '\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.
                  format(v_idx + 1, video.name, toc, idx / toc))
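The VOT branches above first expand a 4-value ground-truth rectangle into the 8-value corner polygon expected by the toolkit. A minimal, self-contained sketch of that conversion (the helper name is illustrative, not taken from the snippets):

def rect_to_polygon(bbox):
    """Convert an axis-aligned [x, y, w, h] box into the 8-value corner
    polygon [x1, y1, ..., x4, y4] used in the VOT branches above."""
    x, y, w, h = bbox
    return [x, y,                  # top-left
            x, y + h - 1,          # bottom-left
            x + w - 1, y + h - 1,  # bottom-right
            x + w - 1, y]          # top-right

assert rect_to_polygon([0, 0, 10, 5]) == [0, 0, 0, 4, 9, 4, 9, 0]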
Example #26
def main():
    # load config
    cfg.merge_from_file(args.config)

    cur_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_root = os.path.join(cur_dir, '../testing_dataset', args.dataset)

    # create model
    model = ModelBuilder(cfg)

    # load model
    model = load_pretrain(model, args.snapshot).cuda().eval()

    # build tracker
    tracker = build_tracker(model)

    # create dataset
    dataset = DatasetFactory.create_dataset(name=args.dataset,
                                            dataset_root=dataset_root,
                                            load_img=False)

    model_name = args.snapshot.split('/')[-1].split('.')[0]
    total_lost = 0
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        # restart tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0
            pred_bboxes = []
            for idx, (img, gt_bbox) in enumerate(video):
                if len(gt_bbox) == 4:
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                               gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx-(w-1)/2, cy-(h-1)/2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        # not lost
                        pred_bboxes.append(pred_bbox)
                    else:
                        # lost object
                        pred_bboxes.append(2)
                        frame_counter = idx + 5 # skip 5 frames
                        lost_number += 1
                else:
                    pred_bboxes.append(0)
                toc += cv2.getTickCount() - tic
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(img, [np.array(gt_bbox, int).reshape((-1, 1, 2))],
                            True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(img, [np.array(pred_bbox, int).reshape((-1, 1, 2))],
                                True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                      (bbox[0]+bbox[2], bbox[1]+bbox[3]), (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            video_path = os.path.join('results', args.dataset, model_name,
                    'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x])+'\n')
            print('({:3d}) Video: {:12s} Time: {:4.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(
                    v_idx+1, video.name, toc, idx / toc, lost_number))
            total_lost += lost_number
        print("{:s} total lost: {:d}".format(model_name, total_lost))
    else:
        # OPE tracking
        for v_idx, video in enumerate(dataset):
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx-(w-1)/2, cy-(h-1)/2, w, h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset:
                        pred_bboxes.append([1])
                    else:
                        pred_bboxes.append(pred_bbox)
                else:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    scores.append(outputs['best_score'])
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic)/cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0:
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]),
                                  (gt_bbox[0]+gt_bbox[2], gt_bbox[1]+gt_bbox[3]), (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                  (pred_bbox[0]+pred_bbox[2], pred_bbox[1]+pred_bbox[3]), (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()
            # save results
            if 'VOT2018-LT' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name,
                        'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                        '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x])+'\n')
                result_path = os.path.join(video_path,
                        '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                        '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join('results', args.dataset, model_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x])+'\n')
                result_path = os.path.join(video_path,
                        '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join('results', args.dataset, model_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x])+'\n')
            print('({:3d}) Video: {:12s} Time: {:5.1f}s Speed: {:3.1f}fps'.format(
                v_idx+1, video.name, toc, idx / toc))
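A hedged sketch (not part of the original scripts) of reading back the per-video '{name}_001.txt' baseline files written above: a line holding a single integer is a protocol marker (1 = initialization frame, 2 = tracking failure, 0 = skipped frame), while every other line is a comma-separated box or polygon.

def read_baseline_result(result_path):  # illustrative helper name
    frames = []
    with open(result_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            values = line.split(',')
            if len(values) == 1:
                frames.append(int(values[0]))              # protocol marker
            else:
                frames.append([float(v) for v in values])  # bbox / polygon
    return frames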
Example #27
def main():
    # load config
    cfg.merge_from_file(args.config)

    dataset_root = os.path.join('./datasets', args.dataset)

    # tracking hyper-parameters under test: [lr, penalty_k, window_influence]
    params = [cfg.TRACK.LR, cfg.TRACK.PENALTY_K, cfg.TRACK.WINDOW_INFLUENCE]

    params_name = '{} {}  lr-{}  pk-{}  win-{}'.format(
        args.snapshot.split('/')[-1], args.dataset,
        params[0], params[1], params[2])
    
    # create model 
    model = ModelBuilder() 

    # load model 
    model = load_pretrain(model, args.snapshot).cuda().eval()
    
    # build tracker 
    tracker = build_tracker(model)
    
    # create dataset 
    dataset = DatasetFactory.create_dataset(name=args.dataset,  
                                            dataset_root=dataset_root,
                                            load_img=False)  
    
    if args.dataset in ['VOT2016', 'VOT2018', 'VOT2019']:
        total_lost = 0
        avg_speed = 0
        for v_idx, video in tqdm(enumerate(dataset)):
            if args.video != '':
                if video.name != args.video:
                    continue
            frame_counter = 0
            lost_number = 0
            toc = 0 
            pred_bboxes = [] 
            for idx, (img, gt_bbox) in enumerate(video): 
                if len(gt_bbox) == 4:
                    gt_bbox = [gt_bbox[0], gt_bbox[1],
                               gt_bbox[0], gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1] + gt_bbox[3] - 1,
                               gt_bbox[0] + gt_bbox[2] - 1, gt_bbox[1]]
                tic = cv2.getTickCount()
                if idx == frame_counter: 
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx-(w-1)/2, cy-(h-1)/2, w, h] #[topx,topy,w,h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    pred_bboxes.append(1)
                elif idx > frame_counter:
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    if cfg.MASK.MASK:
                        pred_bbox = outputs['polygon']
                    overlap = vot_overlap(pred_bbox, gt_bbox, (img.shape[1], img.shape[0]))
                    if overlap > 0:
                        pred_bboxes.append(pred_bbox) 
                    else: 
                        pred_bboxes.append(2)
                        frame_counter = idx + 5 
                        lost_number += 1 
                else:
                    pred_bboxes.append(0) 
                toc += cv2.getTickCount() - tic 
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > frame_counter:
                    cv2.polylines(img, [np.array(gt_bbox, int).reshape((-1, 1, 2))],
                            True, (0, 255, 0), 3)
                    if cfg.MASK.MASK:
                        cv2.polylines(img, [np.array(pred_bbox, int).reshape((-1, 1, 2))],
                                True, (0, 255, 255), 3)
                    else:
                        bbox = list(map(int, pred_bbox))
                        cv2.rectangle(img, (bbox[0], bbox[1]),
                                    (bbox[0]+bbox[2], bbox[1]+bbox[3]), (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.putText(img, str(lost_number), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency() 
            # save results
            video_path = os.path.join(args.save_path, args.dataset, args.tracker_name,
                    'baseline', video.name)
            if not os.path.isdir(video_path):
                os.makedirs(video_path)
            result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
            with open(result_path, 'w') as f:
                for x in pred_bboxes:
                    if isinstance(x, int):
                        f.write("{:d}\n".format(x))
                    else:
                        f.write(','.join([vot_float2str("%.4f", i) for i in x])+'\n')

            total_lost += lost_number 
            avg_speed += idx / toc

        # VOT2016/2018/2019 each contain 60 sequences
        print('Speed: {:3.1f}fps'.format(avg_speed / 60))
        print('{}  total lost: {:d}'.format(params_name, total_lost))
        
    else:
        # OPE tracking
        for v_idx, video in tqdm(enumerate(dataset)): 
            if args.video != '':
                # test one special video
                if video.name != args.video:
                    continue
            toc = 0
            pred_bboxes = []
            scores = []
            track_times = []
            for idx, (img, gt_bbox) in enumerate(video):
                tic = cv2.getTickCount()
                if idx == 0:
                    cx, cy, w, h = get_axis_aligned_bbox(np.array(gt_bbox))
                    gt_bbox_ = [cx-(w-1)/2, cy-(h-1)/2, w, h] #[topx,topy,w,h]
                    tracker.init(img, gt_bbox_)
                    pred_bbox = gt_bbox_
                    scores.append(None)
                    if 'VOT2018-LT' == args.dataset: 
                        pred_bboxes.append([1]) 
                    else: 
                        pred_bboxes.append(pred_bbox)
                else: 
                    outputs = tracker.track(img)
                    pred_bbox = outputs['bbox']
                    pred_bboxes.append(pred_bbox)
                    #scores.append(outputs['best_score'])  
                toc += cv2.getTickCount() - tic
                track_times.append((cv2.getTickCount() - tic)/cv2.getTickFrequency())
                if idx == 0:
                    cv2.destroyAllWindows()
                if args.vis and idx > 0: 
                    gt_bbox = list(map(int, gt_bbox))
                    pred_bbox = list(map(int, pred_bbox))
                    cv2.rectangle(img, (gt_bbox[0], gt_bbox[1]),
                                (gt_bbox[0]+gt_bbox[2], gt_bbox[1]+gt_bbox[3]), (0, 255, 0), 3)
                    cv2.rectangle(img, (pred_bbox[0], pred_bbox[1]),
                                (pred_bbox[0]+pred_bbox[2], pred_bbox[1]+pred_bbox[3]), (0, 255, 255), 3)
                    cv2.putText(img, str(idx), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
                    cv2.imshow(video.name, img)
                    cv2.waitKey(1)
            toc /= cv2.getTickFrequency()

            # save results 
            if 'VOT2018-LT' == args.dataset: 
                video_path = os.path.join(args.save_path, args.dataset, args.tracker_name,
                        'longterm', video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path,
                        '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x])+'\n')
                result_path = os.path.join(video_path,
                        '{}_001_confidence.value'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in scores:
                        f.write('\n') if x is None else f.write("{:.6f}\n".format(x))
                result_path = os.path.join(video_path,
                        '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            elif 'GOT-10k' == args.dataset:
                video_path = os.path.join(args.save_path, args.dataset, args.tracker_name, video.name)
                if not os.path.isdir(video_path):
                    os.makedirs(video_path)
                result_path = os.path.join(video_path, '{}_001.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x])+'\n')
                result_path = os.path.join(video_path,
                        '{}_time.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in track_times:
                        f.write("{:.6f}\n".format(x))
            else:
                model_path = os.path.join(args.save_path, args.dataset, args.tracker_name)
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                result_path = os.path.join(model_path, '{}.txt'.format(video.name))
                with open(result_path, 'w') as f:
                    for x in pred_bboxes:
                        f.write(','.join([str(i) for i in x])+'\n')   
    eval(args)  # presumably a benchmark-evaluation helper imported elsewhere (it shadows Python's builtin eval)
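Example #27 reads its three tracking hyper-parameters from cfg and bakes them into params_name, which suggests the script is driven by a grid search. A minimal sketch of such a sweep with illustrative value grids; note the overrides would have to take effect after main() merges args.config (for example by moving the merge_from_file call out of main()), since the merge would otherwise reset them:

import itertools

LRS = [0.30, 0.35, 0.40]                # illustrative grid for cfg.TRACK.LR
PENALTY_KS = [0.04, 0.10, 0.16]         # illustrative grid for cfg.TRACK.PENALTY_K
WINDOW_INFLUENCES = [0.40, 0.44, 0.48]  # illustrative grid for cfg.TRACK.WINDOW_INFLUENCE

for lr, pk, win in itertools.product(LRS, PENALTY_KS, WINDOW_INFLUENCES):
    cfg.merge_from_list(['TRACK.LR', lr,
                         'TRACK.PENALTY_K', pk,
                         'TRACK.WINDOW_INFLUENCE', win])  # yacs-style override
    main()  # one full benchmark run per combination; prints params_name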