Example #1
def track_vot(model,
              video,
              hp=None,
              mask_enable=False,
              refine_enable=False,
              device='cpu'):
    regions = []  # per-frame results and states: 1 = init, 2 = lost, 0 = skip
    image_files, gt = video['image_files'], video['gt']

    start_frame, end_frame, lost_times, toc = 0, len(image_files), 0, 0

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = siamese_init(im, target_pos, target_sz, model, hp,
                                 device)  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        elif f > start_frame:  # tracking
            state = siamese_track(state, im, mask_enable, refine_enable,
                                  device, args.debug)  # track
            if mask_enable:
                location = state['ploygon'].flatten()  # 'ploygon' (sic) is the key used upstream
                mask = state['mask']
            else:
                location = cxy_wh_2_rect(state['target_pos'],
                                         state['target_sz'])
                mask = []

            if 'VOT' in args.dataset:
                gt_polygon = ((gt[f][0], gt[f][1]), (gt[f][2], gt[f][3]),
                              (gt[f][4], gt[f][5]), (gt[f][6], gt[f][7]))
                if mask_enable:
                    pred_polygon = ((location[0], location[1]),
                                    (location[2], location[3]),
                                    (location[4], location[5]),
                                    (location[6], location[7]))
                else:
                    pred_polygon = ((location[0], location[1]),
                                    (location[0] + location[2], location[1]),
                                    (location[0] + location[2], location[1] + location[3]),
                                    (location[0], location[1] + location[3]))
                b_overlap = vot_overlap(gt_polygon, pred_polygon,
                                        (im.shape[1], im.shape[0]))
            else:
                b_overlap = 1

            if b_overlap:
                regions.append(location)
            else:  # lost
                regions.append(2)
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append(0)
        toc += cv2.getTickCount() - tic

        if args.visualization and f >= start_frame:  # visualization (skip lost frame)
            im_show = im.copy()
            if f == 0: cv2.destroyAllWindows()
            if gt.shape[0] > f:
                if len(gt[f]) == 8:
                    cv2.polylines(
                        im_show, [np.array(gt[f], int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                else:
                    cv2.rectangle(im_show, (gt[f, 0], gt[f, 1]),
                                  (gt[f, 0] + gt[f, 2], gt[f, 1] + gt[f, 3]),
                                  (0, 255, 0), 3)
            if len(location) == 8:
                if mask_enable:
                    mask = mask > state['p'].seg_thr
                    im_show[:, :,
                            2] = mask * 255 + (1 - mask) * im_show[:, :, 2]
                location_int = np.asarray(location).astype(int)  # np.int0 was removed in NumPy 2.0
                cv2.polylines(im_show, [location_int.reshape((-1, 1, 2))],
                              True, (0, 255, 255), 3)
            else:
                location = [int(l) for l in location]
                cv2.rectangle(
                    im_show, (location[0], location[1]),
                    (location[0] + location[2], location[1] + location[3]),
                    (0, 255, 255), 3)
            cv2.putText(im_show, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1,
                        (0, 255, 255), 2)
            cv2.putText(im_show, str(lost_times), (40, 80),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.putText(im_show,
                        str(state['score']) if 'score' in state else '',
                        (40, 120), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            cv2.imshow(video['name'], im_show)
            cv2.waitKey(1)
    toc /= cv2.getTickFrequency()

    # save result
    name = (args.arch.split('.')[0] + '_' +
            ('mask_' if mask_enable else '') +
            ('refine_' if refine_enable else '') +
            args.resume.split('/')[-1].split('.')[0])

    if 'VOT' in args.dataset:
        video_path = join('test', args.dataset, name, 'baseline',
                          video['name'])
        if not isdir(video_path): makedirs(video_path)
        result_path = join(video_path, '{:s}_001.txt'.format(video['name']))
        with open(result_path, "w") as fin:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    fin.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
    else:  # OTB
        video_path = join('test', args.dataset, name)
        if not isdir(video_path): makedirs(video_path)
        result_path = join(video_path, '{:s}.txt'.format(video['name']))
        with open(result_path, "w") as fin:
            for x in regions:
                fin.write(','.join([str(i) for i in x]) + '\n')

    logger.info(
        '({:d}) Video: {:12s} Time: {:02.1f}s Speed: {:3.1f}fps Lost: {:d}'.
        format(v_id, video['name'], toc, f / toc, lost_times))

    return lost_times, f / toc
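
For reference, the speed figure these examples report comes from the tic/toc bookkeeping around the tracker calls. A minimal, self-contained sketch of that timing pattern (the loop body is a placeholder, not part of the original code):

import cv2

# Accumulate raw tick counts around the timed work only (image I/O excluded),
# then convert to seconds once at the end: fps = frames / seconds.
n_frames, toc = 100, 0
for f in range(n_frames):
    tic = cv2.getTickCount()
    pass  # per-frame tracking work would go here
    toc += cv2.getTickCount() - tic
toc /= cv2.getTickFrequency()  # ticks -> seconds
print('Time: {:02.1f}s Speed: {:3.1f}fps'.format(toc, n_frames / toc))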
Example #2
def track_vot(model,
              video,
              hp=None,
              mask_enable=False,
              refine_enable=False,
              device='cpu'):
    """
    对目标进行追踪
    :param model: 训练好的模型
    :param video: 视频数据
    :param hp: 超参数
    :param mask_enable: 是否生成掩膜,默认为False
    :param refine_enable: 是否使用融合后的模型
    :param device:硬件信息
    :return:目标跟丢次数,fps
    """
    # record the target boxes and their states
    regions = []  # per-frame results and states: 1 = init, 2 = lost, 0 = skip
    # get the images to process and the groundtruth
    image_files, gt = video['image_files'], video['gt']
    # set up the parameters: start frame, end frame, lost count, toc
    start_frame, end_frame, lost_times, toc = 0, len(image_files), 0, 0
    # iterate over the images to process
    for f, image_file in enumerate(image_files):
        # read the image
        im = cv2.imread(image_file)
        tic = cv2.getTickCount()
        # start frame
        if f == start_frame:  # init
            # get the target region: center coordinates, width, height
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            # target position
            target_pos = np.array([cx, cy])
            # target size
            target_sz = np.array([w, h])
            # initialize the tracker
            state = siamese_init(im, target_pos, target_sz, model, hp,
                                 device)  # init tracker
            # convert the target box to (top-left x, top-left y, width, height)
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            # for VOT datasets append 1; otherwise append gt[f], the target's true position in the first frame
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        # frames after the start frame
        elif f > start_frame:  # tracking
            # run the tracker
            state = siamese_track(state, im, mask_enable, refine_enable,
                                  device, args.debug)  # track
            # mask prediction enabled
            if mask_enable:
                # flatten the predicted polygon ('ploygon' (sic) is the key used upstream)
                location = state['ploygon'].flatten()
                # get the mask
                mask = state['mask']
            # mask prediction disabled
            else:
                # convert the box to (top-left x, top-left y, width, height)
                location = cxy_wh_2_rect(state['target_pos'],
                                         state['target_sz'])
                # empty mask
                mask = []
            # for VOT data compute the overlap; for other data the overlap defaults to 1
            if 'VOT' in args.dataset:
                # the target's true position
                gt_polygon = ((gt[f][0], gt[f][1]), (gt[f][2], gt[f][3]),
                              (gt[f][4], gt[f][5]), (gt[f][6], gt[f][7]))
                # mask prediction enabled
                if mask_enable:
                    # predicted polygon
                    pred_polygon = ((location[0], location[1]),
                                    (location[2], location[3]),
                                    (location[4], location[5]),
                                    (location[6], location[7]))
                # mask prediction disabled
                else:
                    # predicted axis-aligned box as a polygon
                    pred_polygon = ((location[0], location[1]),
                                    (location[0] + location[2], location[1]),
                                    (location[0] + location[2], location[1] + location[3]),
                                    (location[0], location[1] + location[3]))
                # compute the overlap between the two polygons
                b_overlap = vot_overlap(gt_polygon, pred_polygon,
                                        (im.shape[1], im.shape[0]))
            else:
                b_overlap = 1
            # if the predicted box overlaps the groundtruth, record the result
            if b_overlap:
                regions.append(location)
            # otherwise the target is lost: count the loss and re-initialize five frames later
            else:  # lost
                regions.append(2)
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        # skip all other frames (e.g. frames before the start frame)
        else:  # skip
            regions.append(0)
        # accumulate the tracking time
        toc += cv2.getTickCount() - tic
        # visualize, skipping frames while the tracker is lost
        if args.visualization and f >= start_frame:  # visualization (skip lost frame)
            # work on a copy of the image
            im_show = im.copy()
            # destroy any open windows on the first frame
            if f == 0: cv2.destroyAllWindows()
            # when the annotations contain frame f:
            if gt.shape[0] > f:
                # draw the groundtruth on the image
                if len(gt[f]) == 8:
                    cv2.polylines(
                        im_show, [np.array(gt[f], int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                else:
                    cv2.rectangle(im_show, (gt[f, 0], gt[f, 1]),
                                  (gt[f, 0] + gt[f, 2], gt[f, 1] + gt[f, 3]),
                                  (0, 255, 0), 3)
            # draw the tracking result on the image
            if len(location) == 8:
                # with mask prediction enabled, blend the mask into the image
                if mask_enable:
                    mask = mask > state['p'].seg_thr
                    im_show[:, :,
                            2] = mask * 255 + (1 - mask) * im_show[:, :, 2]
                location_int = np.asarray(location).astype(int)  # np.int0 was removed in NumPy 2.0
                cv2.polylines(im_show, [location_int.reshape((-1, 1, 2))],
                              True, (0, 255, 255), 3)
            else:
                location = [int(l) for l in location]
                cv2.rectangle(
                    im_show, (location[0], location[1]),
                    (location[0] + location[2], location[1] + location[3]),
                    (0, 255, 255), 3)
            cv2.putText(im_show, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1,
                        (0, 255, 255), 2)
            cv2.putText(im_show, str(lost_times), (40, 80),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.putText(im_show,
                        str(state['score']) if 'score' in state else '',
                        (40, 120), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            cv2.imshow(video['name'], im_show)
            cv2.waitKey(1)
    toc /= cv2.getTickFrequency()

    # save the results to a text file
    # folder name encodes the architecture, mask, refine, and resume checkpoint
    name = (args.arch.split('.')[0] + '_' +
            ('mask_' if mask_enable else '') +
            ('refine_' if refine_enable else '') +
            args.resume.split('/')[-1].split('.')[0])
    # VOT datasets
    if 'VOT' in args.dataset:
        # build the result directory
        video_path = join('test', args.dataset, name, 'baseline',
                          video['name'])
        # create the path if it does not exist
        if not isdir(video_path): makedirs(video_path)
        # path of the result text file
        result_path = join(video_path, '{:s}_001.txt'.format(video['name']))
        # write the tracking results to the text file
        with open(result_path, "w") as fin:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    fin.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
    # OTB data
    else:  # OTB
        # build the result directory
        video_path = join('test', args.dataset, name)
        # create the path if it does not exist
        if not isdir(video_path): makedirs(video_path)
        # path of the result text file
        result_path = join(video_path, '{:s}.txt'.format(video['name']))
        # write the tracking results to the text file
        with open(result_path, "w") as fin:
            for x in regions:
                fin.write(','.join([str(i) for i in x]) + '\n')
    # log the summary
    logger.info(
        '({:d}) Video: {:12s} Time: {:02.1f}s Speed: {:3.1f}fps Lost: {:d}'.
        format(v_id, video['name'], toc, f / toc, lost_times))
    # return the results
    return lost_times, f / toc
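
All of these examples rely on get_axis_aligned_bbox to turn the arbitrarily oriented 8-value VOT groundtruth into a (cx, cy, w, h) box. A sketch of what that conversion typically looks like in VOT-toolkit-style utilities, written here as a hypothetical re-implementation (the sqrt(A1/A2) scaling keeps the axis-aligned box's area close to the rotated rectangle's):

import numpy as np

def get_axis_aligned_bbox_sketch(region):
    """Approximate a 4-corner region (x1,y1,...,x4,y4) with an
    axis-aligned (cx, cy, w, h) box of comparable area."""
    region = np.asarray(region, dtype=float)
    if region.size == 8:
        xs, ys = region[0::2], region[1::2]
        cx, cy = xs.mean(), ys.mean()
        x1, x2, y1, y2 = xs.min(), xs.max(), ys.min(), ys.max()
        A1 = (np.linalg.norm(region[0:2] - region[2:4]) *
              np.linalg.norm(region[2:4] - region[4:6]))  # rotated-rect area
        A2 = (x2 - x1) * (y2 - y1)                        # bounding-box area
        s = np.sqrt(A1 / A2)
        w, h = s * (x2 - x1) + 1, s * (y2 - y1) + 1
    else:  # already an axis-aligned x, y, w, h box
        x, y, w, h = region
        cx, cy = x + w / 2, y + h / 2
    return cx, cy, w, h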
Example #3
def tune(param):
    regions = []  # per-frame results and states: 1 = init, 2 = lost, 0 = skip
    # save result
    benchmark_result_path = join('result', param['dataset'])
    tracker_path = join(benchmark_result_path, (param['network_name'] +
                                                '_r{}'.format(param['hp']['instance_size']) +
                                                '_penalty_k_{:.3f}'.format(param['hp']['penalty_k']) +
                                                '_window_influence_{:.3f}'.format(param['hp']['window_influence']) +
                                                '_lr_{:.3f}'.format(param['hp']['lr'])).replace('.', '_'))  # no '.' in directory names
    if param['dataset'].startswith('VOT'):
        baseline_path = join(tracker_path, 'baseline')
        video_path = join(baseline_path, param['video'])
        result_path = join(video_path, param['video'] + '_001.txt')
    elif param['dataset'].startswith('OTB') or param['dataset'].startswith('DAVIS'):
        video_path = tracker_path
        result_path = join(video_path, param['video']+'.txt')

    if isfile(result_path):
        return

    try:
        if not isdir(video_path):
            makedirs(video_path)
    except OSError as err:
        print(err)

    with open(result_path, 'w') as f:  # occupation marker: claim the file so parallel workers skip this video
        f.write('Occ')
    
    global ims, gt, image_files
    if ims is None:
        print(param['video'] + '  loading images once, only when needed')
        ims = [cv2.imread(x) for x in image_files]
    start_frame, lost_times, toc = 0, 0, 0
    for f, im in enumerate(ims):
        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = siamese_init(im, target_pos, target_sz, param['network'], param['hp'], device=device)  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            if param['dataset'].startswith('VOT'):
                regions.append(1)
            elif param['dataset'].startswith('OTB') or param['dataset'].startswith('DAVIS'):
                regions.append(gt[f])
        elif f > start_frame:  # tracking
            state = siamese_track(state, im, args.mask, args.refine, device=device)
            if args.mask:
                location = state['ploygon'].flatten()  # 'ploygon' (sic) is the key used upstream
            else:
                location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            if param['dataset'].startswith('VOT'):
                if 'VOT' in args.dataset:
                    gt_polygon = ((gt[f][0], gt[f][1]), 
                                  (gt[f][2], gt[f][3]),
                                  (gt[f][4], gt[f][5]), 
                                  (gt[f][6], gt[f][7]))
                    if args.mask:
                        pred_polygon = ((location[0], location[1]), (location[2], location[3]),
                                        (location[4], location[5]), (location[6], location[7]))
                    else:
                        pred_polygon = ((location[0], location[1]),
                                        (location[0] + location[2], location[1]),
                                        (location[0] + location[2], location[1] + location[3]),
                                        (location[0], location[1] + location[3]))
                    b_overlap = vot_overlap(gt_polygon, pred_polygon, (im.shape[1], im.shape[0]))
                else:
                    b_overlap = 1

                if b_overlap:  # continue to track
                    regions.append(location)
                else:  # lost
                    regions.append(2)
                    lost_times += 1
                    start_frame = f + 5  # skip 5 frames
            else:
                regions.append(location)
        else:  # skip
            regions.append(0)
        toc += cv2.getTickCount() - tic

        if args.visualization and f >= start_frame:  # visualization (skip lost frame)
            if f == 0: cv2.destroyAllWindows()
            if len(gt[f]) == 8:
                cv2.polylines(im, [np.array(gt[f], int).reshape((-1, 1, 2))], True, (0, 255, 0), 3)
            else:
                cv2.rectangle(im, (gt[f, 0], gt[f, 1]), (gt[f, 0] + gt[f, 2], gt[f, 1] + gt[f, 3]), (0, 255, 0), 3)
            if len(location) == 8:
                location = np.asarray(location).astype(int)  # np.int0 was removed in NumPy 2.0
                cv2.polylines(im, [location.reshape((-1, 1, 2))], True, (0, 255, 255), 3)
            else:
                location = [int(l) for l in location]  # OpenCV drawing functions expect integer coordinates
                cv2.rectangle(im, (location[0], location[1]),
                              (location[0] + location[2], location[1] + location[3]), (0, 255, 255), 3)
            cv2.putText(im, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)  # frame id
            cv2.putText(im, str(lost_times), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)  # lost time

            cv2.imshow(param['video'], im)
            cv2.waitKey(1)
    toc /= cv2.getTickFrequency()
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(param['video'], toc, f / toc, lost_times))

    with open(result_path, 'w') as f:
        for x in regions:
            if isinstance(x, int):
                f.write('{:d}\n'.format(x))
            else:
                f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
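
The counterpart helper cxy_wh_2_rect, used after every init and track step, only shifts the representation from center-based to corner-based. A minimal sketch under that assumption (the _sketch suffix marks it as hypothetical, not the project's own definition):

import numpy as np

def cxy_wh_2_rect_sketch(pos, sz):
    """(center x, center y), (w, h) -> (top-left x, top-left y, w, h)."""
    return np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])

# a 40x20 box centered at (100, 50) has its top-left corner at (80, 40)
print(cxy_wh_2_rect_sketch(np.array([100., 50.]), np.array([40., 20.])))  # [80. 40. 40. 20.]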
Example #4
def track_vot(model,
              video,
              hp=None,
              mask_enable=False,
              refine_enable=False,
              device='cpu'):
    # regions records the target boxes and their states
    regions = []  # per-frame results and states: 1 = init, 2 = lost, 0 = skip
    image_files, gt = video['image_files'], video[
        'gt']  # gt is the groundtruth array, e.g. shape (325, 8); image_files lists the paths of all frames in one video
    start_frame, end_frame, lost_times, toc = 0, len(
        image_files), 0, 0  # len(image_files) is the frame count, different for each video

    # iterate over all images in the current video: f is the index, image_file a single frame's
    # path; tracking is initialized from the frame where the target appears
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)  # (h, w, 3)
        tic = cv2.getTickCount()  # record the current time
        if f == start_frame:  # init: the first frame
            cx, cy, w, h = get_axis_aligned_bbox(
                gt[f])  # convert the arbitrarily oriented gt rectangle into an axis-aligned one
            target_pos = np.array([cx, cy])  # center of the axis-aligned rectangle
            target_sz = np.array([w, h])
            state = siamese_init(im, target_pos, target_sz, model, hp,
                                 device)  # init tracker
            # inputs: one frame, the gt (cx, cy) and (w, h), model, hp, device
            location = cxy_wh_2_rect(
                state['target_pos'], state['target_sz']
            )  # location is the axis-aligned box (x, y, w, h), top-left corner, as ndarray<(4,), float64>
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        elif f > start_frame:  # tracking: for all subsequent frames, get the target box from state
            state = siamese_track(state, im, mask_enable, refine_enable,
                                  device, args.debug)  # track
            if mask_enable:
                location = state['ploygon'].flatten()  # 'ploygon' (sic) is the key used upstream
                mask = state['mask']  # mask has the same shape as the input image
            else:
                location = cxy_wh_2_rect(
                    state['target_pos'],
                    state['target_sz'])  # decoded predicted box: top-left (x, y), (w, h)
                mask = []

            if 'VOT' in args.dataset:
                gt_polygon = ((gt[f][0], gt[f][1]), (gt[f][2], gt[f][3]),
                              (gt[f][4], gt[f][5]), (gt[f][6], gt[f][7]))
                if mask_enable:
                    pred_polygon = ((location[0], location[1]),
                                    (location[2], location[3]),
                                    (location[4], location[5]),
                                    (location[6], location[7]))
                else:
                    pred_polygon = ((location[0], location[1]),
                                    (location[0] + location[2], location[1]),
                                    (location[0] + location[2], location[1] + location[3]),
                                    (location[0], location[1] + location[3]))
                # either way location is a prediction; vot_overlap computes the overlap
                # between the predicted and groundtruth polygons
                b_overlap = vot_overlap(gt_polygon, pred_polygon,
                                        (im.shape[1], im.shape[0]))
            else:
                b_overlap = 1

            if b_overlap:  # truthy: the boxes overlap
                regions.append(location)
            else:  # lost
                regions.append(2)
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append(0)
        toc += cv2.getTickCount() - tic

        # display the tracking result for this frame
        if args.visualization and f >= start_frame:  # visualization (skip lost frame)
            im_show = im.copy()
            if f == 0: cv2.destroyAllWindows()
            if gt.shape[0] > f:
                if len(gt[f]) == 8:
                    cv2.polylines(
                        im_show, [np.array(gt[f], int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                else:
                    cv2.rectangle(im_show, (gt[f, 0], gt[f, 1]),
                                  (gt[f, 0] + gt[f, 2], gt[f, 1] + gt[f, 3]),
                                  (0, 255, 0), 3)
            if len(location) == 8:
                if mask_enable:
                    mask = mask > state['p'].seg_thr
                    im_show[:, :,
                            2] = mask * 255 + (1 - mask) * im_show[:, :, 2]
                location_int = np.asarray(location).astype(int)  # np.int0 was removed in NumPy 2.0
                cv2.polylines(im_show, [location_int.reshape((-1, 1, 2))],
                              True, (0, 255, 255), 3)
            else:
                location = [int(l) for l in location]
                cv2.rectangle(
                    im_show, (location[0], location[1]),
                    (location[0] + location[2], location[1] + location[3]),
                    (0, 255, 255), 3)
            cv2.putText(im_show, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1,
                        (0, 255, 255), 2)
            cv2.putText(im_show, str(lost_times), (40, 80),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.putText(im_show,
                        str(state['score']) if 'score' in state else '',
                        (40, 120), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            cv2.imshow(video['name'], im_show)
            cv2.waitKey(1)
    toc /= cv2.getTickFrequency()

    # post-processing: tracking is done, save the results to a text file
    name = (args.arch.split('.')[0] + '_' +
            ('mask_' if mask_enable else '') +
            ('refine_' if refine_enable else '') +
            args.resume.split('/')[-1].split('.')[0])

    if 'VOT' in args.dataset:
        video_path = join('test', args.dataset, name, 'baseline',
                          video['name'])
        if not isdir(video_path): makedirs(video_path)
        result_path = join(video_path, '{:s}_001.txt'.format(video['name']))
        with open(result_path, "w") as fin:
            for x in regions:
                if isinstance(x, int):
                    fin.write("{:d}\n".format(x))
                else:
                    fin.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
    else:  # OTB
        video_path = join('test', args.dataset, name)
        if not isdir(video_path): makedirs(video_path)
        result_path = join(video_path, '{:s}.txt'.format(video['name']))
        with open(result_path, "w") as fin:
            for x in regions:
                fin.write(','.join([str(i) for i in x]) + '\n')

    logger.info(
        '({:d}) Video: {:12s} Time: {:02.1f}s Speed: {:3.1f}fps Lost: {:d}'.
        format(v_id, video['name'], toc, f / toc, lost_times))

    return lost_times, f / toc
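
Finally, vot_overlap is the gate that decides between "continue" and "lost". The VOT toolkit computes this overlap on a rasterized grid bounded by the image, so the sketch below, built on shapely (an assumption; it is not what the toolkit uses), is only an approximation for illustration:

from shapely.geometry import Polygon, box

def polygon_overlap_sketch(poly1, poly2, bounds):
    """IoU of two polygons, each clipped to a (width, height) image frame."""
    frame = box(0, 0, bounds[0], bounds[1])
    p1 = Polygon(poly1).intersection(frame)
    p2 = Polygon(poly2).intersection(frame)
    union = p1.union(p2).area
    return p1.intersection(p2).area / union if union > 0 else 0.0

gt_polygon = ((10, 10), (60, 10), (60, 40), (10, 40))
pred_polygon = ((20, 15), (70, 15), (70, 45), (20, 45))
print('{:.3f}'.format(polygon_overlap_sketch(gt_polygon, pred_polygon, (640, 480))))  # 0.500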