Example #1
def track_vot(model,
              video,
              hp=None,
              mask_enable=False,
              refine_enable=False,
              device='cpu'):
    regions = []  # result and states[1 init / 2 lost / 0 skip]
    image_files, gt = video['image_files'], video['gt']

    start_frame, end_frame, lost_times, toc = 0, len(image_files), 0, 0

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = siamese_init(im, target_pos, target_sz, model, hp,
                                 device)  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        elif f > start_frame:  # tracking
            state = siamese_track(state, im, mask_enable, refine_enable,
                                  device, args.debug)  # track
            if mask_enable:
                location = state['ploygon'].flatten()
                mask = state['mask']
            else:
                location = cxy_wh_2_rect(state['target_pos'],
                                         state['target_sz'])
                mask = []

            if 'VOT' in args.dataset:
                gt_polygon = ((gt[f][0], gt[f][1]), (gt[f][2], gt[f][3]),
                              (gt[f][4], gt[f][5]), (gt[f][6], gt[f][7]))
                if mask_enable:
                    pred_polygon = ((location[0], location[1]), (location[2],
                                                                 location[3]),
                                    (location[4], location[5]), (location[6],
                                                                 location[7]))
                else:
                    pred_polygon = ((location[0], location[1]),
                                    (location[0] + location[2],
                                     location[1]), (location[0] + location[2],
                                                    location[1] + location[3]),
                                    (location[0], location[1] + location[3]))
                b_overlap = vot_overlap(gt_polygon, pred_polygon,
                                        (im.shape[1], im.shape[0]))
            else:
                b_overlap = 1

            if b_overlap:
                regions.append(location)
            else:  # lost
                regions.append(2)
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append(0)
        toc += cv2.getTickCount() - tic

        if args.visualization and f >= start_frame:  # visualization (skip lost frame)
            im_show = im.copy()
            if f == 0: cv2.destroyAllWindows()
            if gt.shape[0] > f:
                if len(gt[f]) == 8:
                    cv2.polylines(
                        im_show, [np.array(gt[f], int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                else:
                    cv2.rectangle(im_show, (gt[f, 0], gt[f, 1]),
                                  (gt[f, 0] + gt[f, 2], gt[f, 1] + gt[f, 3]),
                                  (0, 255, 0), 3)
            if len(location) == 8:
                if mask_enable:
                    mask = mask > state['p'].seg_thr
                    im_show[:, :,
                            2] = mask * 255 + (1 - mask) * im_show[:, :, 2]
                location_int = np.int0(location)
                cv2.polylines(im_show, [location_int.reshape((-1, 1, 2))],
                              True, (0, 255, 255), 3)
            else:
                location = [int(l) for l in location]
                cv2.rectangle(
                    im_show, (location[0], location[1]),
                    (location[0] + location[2], location[1] + location[3]),
                    (0, 255, 255), 3)
            cv2.putText(im_show, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1,
                        (0, 255, 255), 2)
            cv2.putText(im_show, str(lost_times), (40, 80),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.putText(im_show,
                        str(state['score']) if 'score' in state else '',
                        (40, 120), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            cv2.imshow(video['name'], im_show)
            cv2.waitKey(1)
    toc /= cv2.getTickFrequency()

    # save result
    name = args.arch.split('.')[0] + '_' + ('mask_' if mask_enable else '') + ('refine_' if refine_enable else '') +\
           args.resume.split('/')[-1].split('.')[0]

    if 'VOT' in args.dataset:
        video_path = join('test', args.dataset, name, 'baseline',
                          video['name'])
        if not isdir(video_path): makedirs(video_path)
        result_path = join(video_path, '{:s}_001.txt'.format(video['name']))
        with open(result_path, "w") as fin:
            for x in regions:
                fin.write("{:d}\n".format(x)) if isinstance(x, int) else \
                        fin.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
    else:  # OTB
        video_path = join('test', args.dataset, name)
        if not isdir(video_path): makedirs(video_path)
        result_path = join(video_path, '{:s}.txt'.format(video['name']))
        with open(result_path, "w") as fin:
            for x in regions:
                fin.write(','.join([str(i) for i in x]) + '\n')

    logger.info(
        '({:d}) Video: {:12s} Time: {:02.1f}s Speed: {:3.1f}fps Lost: {:d}'.
        format(v_id, video['name'], toc, f / toc, lost_times))

    return lost_times, f / toc
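
A small driver sketch for the function above. It assumes the surrounding script has already parsed `args`, set up `logger` and the global `v_id`, and loaded the model, and that `dataset` is a dict of videos whose entries carry the 'image_files', 'gt' and 'name' keys read inside track_vot; the helper name run_all_videos is hypothetical.

def run_all_videos(model, dataset, hp=None, device='cpu'):
    # hypothetical driver; track_vot itself also reads module-level globals
    # (args, logger, v_id) that must be defined elsewhere in the script
    total_lost, speeds = 0, []
    for name, video in dataset.items():
        lost, fps = track_vot(model, video, hp=hp,
                              mask_enable=True, refine_enable=True,
                              device=device)
        total_lost += lost
        speeds.append(fps)
    return total_lost, sum(speeds) / max(len(speeds), 1)
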
Example #2
def tune(param):
    regions = []  # result and states[1 init / 2 lost / 0 skip]
    # save result
    benchmark_result_path = join('result', param['dataset'])
    tracker_path = join(benchmark_result_path, (param['network_name'] +
                                                '_r{}'.format(param['hp']['instance_size']) +
                                                '_penalty_k_{:.3f}'.format(param['hp']['penalty_k']) +
                                                '_window_influence_{:.3f}'.format(param['hp']['window_influence']) +
                                                '_lr_{:.3f}'.format(param['hp']['lr'])).replace('.', '_'))  # no .
    if param['dataset'].startswith('VOT'):
        baseline_path = join(tracker_path, 'baseline')
        video_path = join(baseline_path, param['video'])
        result_path = join(video_path, param['video'] + '_001.txt')
    elif param['dataset'].startswith('OTB') or param['dataset'].startswith('DAVIS'):
        video_path = tracker_path
        result_path = join(video_path, param['video']+'.txt')

    if isfile(result_path):
        return

    try:
        if not isdir(video_path):
            makedirs(video_path)
    except OSError as err:
        print(err)

    with open(result_path, 'w') as f:  # placeholder to mark this result file as taken
        f.write('Occ')
    
    global ims, gt, image_files
    if ims is None:
        print(param['video'] + ': loading images once, only when needed')
        ims = [cv2.imread(x) for x in image_files]
    start_frame, lost_times, toc = 0, 0, 0
    for f, im in enumerate(ims):
        tic = cv2.getTickCount()
        if f == start_frame:  # init
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            target_pos = np.array([cx, cy])
            target_sz = np.array([w, h])
            state = siamese_init(im, target_pos, target_sz, param['network'], param['hp'], device=device)  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            if param['dataset'].startswith('VOT'):
                regions.append(1)
            elif param['dataset'].startswith('OTB') or param['dataset'].startswith('DAVIS'):
                regions.append(gt[f])
        elif f > start_frame:  # tracking
            state = siamese_track(state, im, args.mask, args.refine, device=device)
            if args.mask:
                location = state['ploygon'].flatten()
            else:
                location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            if param['dataset'].startswith('VOT'):
                if 'VOT' in args.dataset:
                    gt_polygon = ((gt[f][0], gt[f][1]), 
                                  (gt[f][2], gt[f][3]),
                                  (gt[f][4], gt[f][5]), 
                                  (gt[f][6], gt[f][7]))
                    if args.mask:
                        pred_polygon = ((location[0], location[1]), (location[2], location[3]),
                                        (location[4], location[5]), (location[6], location[7]))
                    else:
                        pred_polygon = ((location[0], location[1]),
                                        (location[0] + location[2], location[1]),
                                        (location[0] + location[2], location[1] + location[3]),
                                        (location[0], location[1] + location[3]))
                    b_overlap = vot_overlap(gt_polygon, pred_polygon, (im.shape[1], im.shape[0]))
                else:
                    b_overlap = 1

                if b_overlap:  # continue to track
                    regions.append(location)
                else:  # lost
                    regions.append(2)
                    lost_times += 1
                    start_frame = f + 5  # skip 5 frames
            else:
                regions.append(location)
        else:  # skip
            regions.append(0)
        toc += cv2.getTickCount() - tic

        if args.visualization and f >= start_frame:  # visualization (skip lost frame)
            if f == 0: cv2.destroyAllWindows()
            if len(gt[f]) == 8:
                cv2.polylines(im, [np.array(gt[f], int).reshape((-1, 1, 2))], True, (0, 255, 0), 3)
            else:
                cv2.rectangle(im, (gt[f, 0], gt[f, 1]), (gt[f, 0] + gt[f, 2], gt[f, 1] + gt[f, 3]), (0, 255, 0), 3)
            if len(location) == 8:
                location = np.int0(location)
                cv2.polylines(im, [location.reshape((-1, 1, 2))], True, (0, 255, 255), 3)
            else:
                location = [int(l) for l in location]  # OpenCV drawing functions need int coordinates
                cv2.rectangle(im, (location[0], location[1]),
                              (location[0] + location[2], location[1] + location[3]), (0, 255, 255), 3)
            cv2.putText(im, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)  # frame id
            cv2.putText(im, str(lost_times), (40, 80), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)  # lost time

            cv2.imshow(param['video'], im)
            cv2.waitKey(1)
    toc /= cv2.getTickFrequency()
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps Lost: {:d}'.format(param['video'], toc, f / toc, lost_times))

    with open(result_path, 'w') as f:
        for x in regions:
            if isinstance(x, int):
                f.write('{:d}\n'.format(x))
            else:
                f.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
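
A minimal sketch of how tune() might be driven over a hyper-parameter grid, assuming the globals it reads (ims, gt, image_files, args, device) are set up elsewhere; the grid values, the run_grid name and the model/videos arguments are illustrative only, while the dict keys mirror what the function above actually reads.

from itertools import product

def run_grid(model, videos, dataset='VOT2018', network_name='SiamMask'):
    # hypothetical grid; only the param/hp keys are taken from tune() above
    for pk, wi, lr in product([0.04, 0.09], [0.39, 0.42], [0.25, 0.30]):
        for video in videos:
            tune({'dataset': dataset,
                  'network': model,
                  'network_name': network_name,
                  'video': video,
                  'hp': {'instance_size': 255,
                         'penalty_k': pk,
                         'window_influence': wi,
                         'lr': lr}})
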
Example #3
def siamese_track(state,
                  im,
                  mask_enable=False,
                  refine_enable=False,
                  device='cpu',
                  debug=False):
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']
    target_sz = state['target_sz']

    wc_x = target_sz[1] + p.context_amount * sum(target_sz)
    hc_x = target_sz[0] + p.context_amount * sum(target_sz)
    s_x = np.sqrt(wc_x * hc_x)
    scale_x = p.exemplar_size / s_x
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_x
    s_x = s_x + 2 * pad
    crop_box = [
        target_pos[0] - round(s_x) / 2, target_pos[1] - round(s_x) / 2,
        round(s_x),
        round(s_x)
    ]

    if debug:
        im_debug = im.copy()
        crop_box_int = np.int0(crop_box)
        cv2.rectangle(im_debug, (crop_box_int[0], crop_box_int[1]),
                      (crop_box_int[0] + crop_box_int[2],
                       crop_box_int[1] + crop_box_int[3]), (255, 0, 0), 2)
        cv2.imshow('search area', im_debug)
        cv2.waitKey(0)

    # extract scaled crops for search region x at previous target position
    x_crop = Variable(
        get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                               avg_chans).unsqueeze(0))

    if mask_enable:
        score, delta, mask = net.track_mask(x_crop.to(device))
    else:
        score, delta = net.track(x_crop.to(device))

    delta = delta.permute(1, 2, 3, 0).contiguous().view(4,
                                                        -1).data.cpu().numpy()
    score = F.softmax(score.permute(1, 2, 3,
                                    0).contiguous().view(2, -1).permute(1, 0),
                      dim=1).data[:, 1].cpu().numpy()

    delta[0, :] = delta[0, :] * p.anchor[:, 2] + p.anchor[:, 0]
    delta[1, :] = delta[1, :] * p.anchor[:, 3] + p.anchor[:, 1]
    delta[2, :] = np.exp(delta[2, :]) * p.anchor[:, 2]
    delta[3, :] = np.exp(delta[3, :]) * p.anchor[:, 3]

    def change(r):
        return np.maximum(r, 1. / r)

    def sz(w, h):
        pad = (w + h) * 0.5
        sz2 = (w + pad) * (h + pad)
        return np.sqrt(sz2)

    def sz_wh(wh):
        pad = (wh[0] + wh[1]) * 0.5
        sz2 = (wh[0] + pad) * (wh[1] + pad)
        return np.sqrt(sz2)

    # size penalty
    target_sz_in_crop = target_sz * scale_x
    s_c = change(sz(delta[2, :], delta[3, :]) /
                 (sz_wh(target_sz_in_crop)))  # scale penalty
    r_c = change((target_sz_in_crop[0] / target_sz_in_crop[1]) /
                 (delta[2, :] / delta[3, :]))  # ratio penalty

    penalty = np.exp(-(r_c * s_c - 1) * p.penalty_k)
    pscore = penalty * score

    # cos window (motion model)
    pscore = pscore * (1 - p.window_influence) + window * p.window_influence
    best_pscore_id = np.argmax(pscore)

    pred_in_crop = delta[:, best_pscore_id] / scale_x
    lr = penalty[best_pscore_id] * score[best_pscore_id] * p.lr  # lr for OTB

    res_x = pred_in_crop[0] + target_pos[0]
    res_y = pred_in_crop[1] + target_pos[1]

    res_w = target_sz[0] * (1 - lr) + pred_in_crop[2] * lr
    res_h = target_sz[1] * (1 - lr) + pred_in_crop[3] * lr

    target_pos = np.array([res_x, res_y])
    target_sz = np.array([res_w, res_h])

    # for Mask Branch
    if mask_enable:
        best_pscore_id_mask = np.unravel_index(best_pscore_id,
                                               (5, p.score_size, p.score_size))
        delta_x, delta_y = best_pscore_id_mask[2], best_pscore_id_mask[1]

        if refine_enable:
            mask = net.track_refine(
                (delta_y, delta_x)).to(device).sigmoid().squeeze().view(
                    p.out_size, p.out_size).cpu().data.numpy()
        else:
            mask = mask[0, :, delta_y, delta_x].sigmoid(). \
                squeeze().view(p.out_size, p.out_size).cpu().data.numpy()

        def crop_back(image, bbox, out_sz, padding=-1):
            a = (out_sz[0] - 1) / bbox[2]
            b = (out_sz[1] - 1) / bbox[3]
            c = -a * bbox[0]
            d = -b * bbox[1]
            mapping = np.array([[a, 0, c], [0, b, d]]).astype(float)
            crop = cv2.warpAffine(image,
                                  mapping, (out_sz[0], out_sz[1]),
                                  flags=cv2.INTER_LINEAR,
                                  borderMode=cv2.BORDER_CONSTANT,
                                  borderValue=padding)
            return crop

        s = crop_box[2] / p.instance_size
        sub_box = [
            crop_box[0] + (delta_x - p.base_size / 2) * p.total_stride * s,
            crop_box[1] + (delta_y - p.base_size / 2) * p.total_stride * s,
            s * p.exemplar_size, s * p.exemplar_size
        ]
        s = p.out_size / sub_box[2]
        back_box = [
            -sub_box[0] * s, -sub_box[1] * s, state['im_w'] * s,
            state['im_h'] * s
        ]
        mask_in_img = crop_back(mask, back_box, (state['im_w'], state['im_h']))

        target_mask = (mask_in_img > p.seg_thr).astype(np.uint8)
        if cv2.__version__[-5] == '4':
            contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_NONE)
        else:
            _, contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL,
                                              cv2.CHAIN_APPROX_NONE)
        cnt_area = [cv2.contourArea(cnt) for cnt in contours]
        if len(contours) != 0 and np.max(cnt_area) > 100:
            contour = contours[np.argmax(cnt_area)]  # use max area polygon
            polygon = contour.reshape(-1, 2)
            # pbox = cv2.boundingRect(polygon)  # Min Max Rectangle
            prbox = cv2.boxPoints(
                cv2.minAreaRect(polygon))  # Rotated Rectangle

            # box_in_img = pbox
            rbox_in_img = prbox
        else:  # empty mask
            location = cxy_wh_2_rect(target_pos, target_sz)
            rbox_in_img = np.array(
                [[location[0], location[1]],
                 [location[0] + location[2], location[1]],
                 [location[0] + location[2], location[1] + location[3]],
                 [location[0], location[1] + location[3]]])

    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))

    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = score[best_pscore_id]
    state['mask'] = mask_in_img if mask_enable else []
    state['ploygon'] = rbox_in_img if mask_enable else []
    return state
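
The anchor decoding and size/ratio penalty above can be checked in isolation. Below is a standalone numpy sketch of the same formulas on a toy anchor set; all numbers are made up for illustration and p.penalty_k is replaced by a literal.

import numpy as np

anchor = np.array([[0., 0., 64., 64.],      # (cx, cy, w, h) per anchor
                   [0., 0., 96., 48.]])
raw = np.array([[0.1, -0.2],                # network regression output, shape (4, K)
                [0.0, 0.1],
                [0.2, -0.1],
                [-0.1, 0.3]])

delta = raw.copy()
delta[0, :] = raw[0, :] * anchor[:, 2] + anchor[:, 0]   # decoded center x
delta[1, :] = raw[1, :] * anchor[:, 3] + anchor[:, 1]   # decoded center y
delta[2, :] = np.exp(raw[2, :]) * anchor[:, 2]          # decoded width
delta[3, :] = np.exp(raw[3, :]) * anchor[:, 3]          # decoded height

def change(r):
    return np.maximum(r, 1. / r)

def sz(w, h):
    pad = (w + h) * 0.5
    return np.sqrt((w + pad) * (h + pad))

target_sz_in_crop = np.array([60., 60.])
s_c = change(sz(delta[2, :], delta[3, :]) / sz(*target_sz_in_crop))   # scale penalty
r_c = change((target_sz_in_crop[0] / target_sz_in_crop[1]) /
             (delta[2, :] / delta[3, :]))                             # ratio penalty
penalty = np.exp(-(r_c * s_c - 1) * 0.04)                             # 0.04 stands in for p.penalty_k
print(delta, penalty)
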
Example #4
def tune(param):
    regions = []  # result and states[1 init / 2 lost / 0 skip]
    # save result
    benchmark_result_path = join('result', param['dataset'])
    tracker_path = join(
        benchmark_result_path,
        (param['network_name'] + ('_refine' if args.refine else '') +
         '_r{}'.format(param['hp']['instance_size']) +
         '_penalty_k_{:.3f}'.format(param['hp']['penalty_k']) +
         '_window_influence_{:.3f}'.format(param['hp']['window_influence']) +
         '_lr_{:.3f}'.format(param['hp']['lr'])).replace('.', '_'))  # no .
    video_path = tracker_path
    result_path = join(video_path, param['video'] + '.txt')

    if isfile(result_path):
        return

    try:
        if not isdir(video_path):
            makedirs(video_path)
    except OSError as err:
        print(err)

    with open(result_path, 'w') as f:  # placeholder to mark this result file as taken
        f.write('Occ')

    global ims, gt, annos, image_files, anno_files
    if ims is None:
        print(param['video'] + ': loading images once, only when needed')
        ims = [cv2.imread(x) for x in image_files]
        annos = [np.array(Image.open(x)) for x in anno_files]

    iou = IouMeter(thrs, len(ims) - 2)
    start_frame, end_frame, toc = 0, len(ims) - 1, 0
    for f, (im, anno) in enumerate(zip(ims, annos)):
        tic = cv2.getTickCount()
        if f == start_frame:  # init
            target_pos = np.array(
                [gt[f, 0] + gt[f, 2] / 2, gt[f, 1] + gt[f, 3] / 2])
            target_sz = np.array([gt[f, 2], gt[f, 3]])
            state = siamese_init(im, target_pos, target_sz, param['network'],
                                 param['hp'])  # init tracker
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            regions.append(gt[f])
        elif f > start_frame:  # tracking
            state = siamese_track(state, im, args.mask, args.refine)  # track
            location = state['ploygon'].flatten()
            mask = state['mask']

            regions.append(location)
        if start_frame < f < end_frame: iou.add(mask, anno)

        toc += cv2.getTickCount() - tic

        if args.visualization and f >= start_frame:  # visualization (skip lost frame)
            im_show = im.copy()
            if f == 0: cv2.destroyAllWindows()
            if len(gt[f]) == 8:
                cv2.polylines(im_show,
                              [np.array(gt[f], int).reshape(
                                  (-1, 1, 2))], True, (0, 255, 0), 3)
            else:
                cv2.rectangle(im_show, (gt[f, 0], gt[f, 1]),
                              (gt[f, 0] + gt[f, 2], gt[f, 1] + gt[f, 3]),
                              (0, 255, 0), 3)
            if len(location) == 8:
                im_show[:, :, 2] = mask * 255 + (1 - mask) * im_show[:, :, 2]
                cv2.polylines(im_show, [np.int0(location).reshape((-1, 1, 2))],
                              True, (0, 255, 255), 3)
            else:
                location = [int(l) for l in location]  # OpenCV drawing functions need int coordinates
                cv2.rectangle(
                    im_show, (location[0], location[1]),
                    (location[0] + location[2], location[1] + location[3]),
                    (0, 255, 255), 3)
            cv2.putText(im_show, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1,
                        (0, 255, 255), 2)  # frame id

            cv2.imshow(param['video'], im_show)
            cv2.waitKey(1)
    toc /= cv2.getTickFrequency()
    iou_list = iou.value('mean')
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps IOU: {:.3f}'.format(
        param['video'], toc, f / toc, iou_list.max()))

    with open(result_path, 'w') as f:
        f.write(','.join(["%.5f" % i for i in iou_list]) + '\n')

    return iou_list
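
IouMeter's internals are not shown in this example; below is a minimal numpy sketch of the per-frame quantity it presumably accumulates, the mask IoU against the annotation at a binarization threshold. The function name and default threshold are assumptions, not the repository's implementation.

import numpy as np

def mask_iou(pred_mask, anno, thr=0.35):
    # pred_mask: soft mask in image coordinates; anno: label image (nonzero = foreground)
    pred = pred_mask > thr
    gt = anno > 0
    union = np.logical_or(pred, gt).sum()
    return np.logical_and(pred, gt).sum() / union if union > 0 else 0.0
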
Example #5
def siamese_track(state, im):

    refine_enable = True
    mask_enable = True
    device = 'cpu'
    debug = True
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    detector = state["detector"]
    custom_objects = detector.CustomObjects(car=True, person=True)
    targets = state["targets"]

    zf_lists = []

    # s_z = [ round(np.sqrt(target["target_sz"][1] + 0.123 * sum(target["target_sz"])*target["target_sz"][0] + 0.123 * sum(target["target_sz"]) ))  for target in targets ]
    # s_z = np.array(s_z)
    # scale_x = p.exemplar_size / s_z
    # d_search = (p.instance_size - p.exemplar_size) / 2

    BLUE = [255, 255, 255]  # note: this value is white in BGR, despite the name
    for i, target in enumerate(targets):

        wc_x = target["target_sz"][1] + p.context_amount * sum(
            target["target_sz"])
        hc_x = target["target_sz"][0] + p.context_amount * sum(
            target["target_sz"])
        target["s_z"] = np.sqrt(wc_x * hc_x)

        target["scale_x"] = p.exemplar_size / target["s_z"]
        d_search = (p.instance_size - p.exemplar_size) / 2
        pad = d_search / target["scale_x"]
        target["s_z"] = target["s_z"] + 2 * pad
        target["crop_box"] = [
            target["target_pos"][0] - round(target["s_z"]) / 2,
            target["target_pos"][1] - round(target["s_z"]) / 2,
            round(target["s_z"]),
            round(target["s_z"])
        ]
        zf_lists.append(target["zf"])
        crop_box = target["crop_box"]

    #    if debug:
    #     im_debug = im.copy()
    #     crop_box_int = np.int0(crop_box)
    #     cv2.rectangle(im_debug, (crop_box_int[0], crop_box_int[1]),
    #                     (crop_box_int[0] + crop_box_int[2], crop_box_int[1] + crop_box_int[3]), (255, 0, 0), 2)
    #     cv2.imshow('search area', im_debug)
    #     cv2.waitKey(1)

    # extract scaled crops for search region x at previous target position

    targets = get_subwindow_tracking(im,
                                     p.instance_size,
                                     avg_chans,
                                     targets=targets)

    # x_crop = Variable(get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x), avg_chans).unsqueeze(0))
    tracking_data_list = []
    tracking_data = dict()

    for target, zf in zip(targets, zf_lists):

        target["x_crop"] = Variable(target["im_to_torch"].unsqueeze(0))
        target["x_crop"] = target["x_crop"].to(device)
        tracking_data_list.append({"x_crop": target["x_crop"], "zf": zf})

    if mask_enable:
        results = net.track_mask(search=targets[0]["x_crop"],
                                 lists=tracking_data_list)
        # score, delta, mask = net.track_mask(search=targets[0]["x_crop"],lists=tracking_data_list)

    else:
        # note: x_crop is not defined in this multi-target variant, so this branch
        # would fail; it is effectively unreachable because mask_enable is True above
        score, delta = net.track(x_crop.to(device))

    for result in results:
        delta = result["rpn_pred_loc"]
        score = result["rpn_pred_cls"]
        delta = delta.permute(1, 2, 3,
                              0).contiguous().view(4, -1).data.cpu().numpy()
        score = F.softmax(score.permute(1, 2, 3,
                                        0).contiguous().view(2,
                                                             -1).permute(1, 0),
                          dim=1).data[:, 1].cpu().numpy()

        delta[0, :] = delta[0, :] * p.anchor[:, 2] + p.anchor[:, 0]
        delta[1, :] = delta[1, :] * p.anchor[:, 3] + p.anchor[:, 1]
        delta[2, :] = np.exp(delta[2, :]) * p.anchor[:, 2]
        delta[3, :] = np.exp(delta[3, :]) * p.anchor[:, 3]
        result["rpn_pred_loc"] = delta
        result["rpn_pred_cls"] = score

    def change(r):
        return np.maximum(r, 1. / r)

    def sz(w, h):
        pad = (w + h) * 0.5
        sz2 = (w + pad) * (h + pad)
        return np.sqrt(sz2)

    def sz_wh(wh):
        pad = (wh[0] + wh[1]) * 0.5
        sz2 = (wh[0] + pad) * (wh[1] + pad)
        return np.sqrt(sz2)

    # size penalty
    count = 0
    for target, result in zip(targets, results):
        delta = result["rpn_pred_loc"]
        score = result["rpn_pred_cls"]
        crop_box = target["crop_box"]

        target_sz_in_crop = target["target_sz"] * target["scale_x"]
        s_c = change(
            sz(delta[2, :], delta[3, :]) /
            (sz_wh(target_sz_in_crop)))  # scale penalty
        r_c = change((target_sz_in_crop[0] / target_sz_in_crop[1]) /
                     (delta[2, :] / delta[3, :]))  # ratio penalty
        penalty = np.exp(-(r_c * s_c - 1) * p.penalty_k)
        pscore = penalty * score

        pscore = pscore * (1 -
                           p.window_influence) + window * p.window_influence
        best_pscore_id = np.argmax(pscore)

        pred_in_crop = delta[:, best_pscore_id] / target["scale_x"]
        lr = penalty[best_pscore_id] * score[
            best_pscore_id] * p.lr  # lr for OTB

        res_x = pred_in_crop[0] + target["target_pos"][0]
        res_y = pred_in_crop[1] + target["target_pos"][1]

        res_w = target["target_sz"][0] * (1 - lr) + pred_in_crop[2] * lr
        res_h = target["target_sz"][1] * (1 - lr) + pred_in_crop[3] * lr

        target["target_pos"] = np.array([res_x, res_y])
        target["target_sz"] = np.array([res_w, res_h])

        if mask_enable:
            best_pscore_id_mask = np.unravel_index(
                best_pscore_id, (5, p.score_size, p.score_size))
            delta_x, delta_y = best_pscore_id_mask[2], best_pscore_id_mask[1]

            if refine_enable:
                mask = net.track_refine(
                    (delta_y, delta_x),
                    index=count).to(device).sigmoid().squeeze().view(
                        p.out_size, p.out_size).cpu().data.numpy()
            else:
                mask = mask[0, :, delta_y, delta_x].sigmoid(). \
                    squeeze().view(p.out_size, p.out_size).cpu().data.numpy()

            count += 1

            def crop_back(image, bbox, out_sz, padding=-1):
                a = (out_sz[0] - 1) / bbox[2]
                b = (out_sz[1] - 1) / bbox[3]
                c = -a * bbox[0]
                d = -b * bbox[1]
                mapping = np.array([[a, 0, c], [0, b, d]]).astype(float)
                crop = cv2.warpAffine(image,
                                      mapping, (out_sz[0], out_sz[1]),
                                      flags=cv2.INTER_LINEAR,
                                      borderMode=cv2.BORDER_CONSTANT,
                                      borderValue=padding)
                return crop

            s = crop_box[2] / p.instance_size
            sub_box = [
                crop_box[0] + (delta_x - p.base_size / 2) * p.total_stride * s,
                crop_box[1] + (delta_y - p.base_size / 2) * p.total_stride * s,
                s * p.exemplar_size, s * p.exemplar_size
            ]
            s = p.out_size / sub_box[2]
            back_box = [
                -sub_box[0] * s, -sub_box[1] * s, state['im_w'] * s,
                state['im_h'] * s
            ]
            mask_in_img = crop_back(mask, back_box,
                                    (state['im_w'], state['im_h']))

            target_mask = (mask_in_img > p.seg_thr).astype(np.uint8)
            if cv2.__version__[-5] == '4':
                contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL,
                                               cv2.CHAIN_APPROX_NONE)
            else:
                _, contours, _ = cv2.findContours(target_mask,
                                                  cv2.RETR_EXTERNAL,
                                                  cv2.CHAIN_APPROX_NONE)
            cnt_area = [cv2.contourArea(cnt) for cnt in contours]

            if len(contours) != 0 and np.max(cnt_area) > 100:
                contour = contours[np.argmax(cnt_area)]  # use max area polygon
                polygon = contour.reshape(-1, 2)
                # pbox = cv2.boundingRect(polygon)  # Min Max Rectangle
                prbox = cv2.boxPoints(
                    cv2.minAreaRect(polygon))  # Rotated Rectangle

                # box_in_img = pbox
                rbox_in_img = prbox
            else:  # empty mask
                location = cxy_wh_2_rect(target["target_pos"],
                                         target["target_sz"])
                rbox_in_img = np.array(
                    [[location[0], location[1]],
                     [location[0] + location[2], location[1]],
                     [location[0] + location[2], location[1] + location[3]],
                     [location[0], location[1] + location[3]]])

        target["target_pos"][0] = max(
            0, min(state['im_w'], target["target_pos"][0]))
        target["target_pos"][1] = max(
            0, min(state['im_h'], target["target_pos"][1]))
        target["target_sz"][0] = max(
            10, min(state['im_w'], target["target_sz"][0]))
        target["target_sz"][1] = max(
            10, min(state['im_h'], target["target_sz"][1]))
        # print("new targetPos {} and targetsize {} \n".format(target["target_pos"],target["target_sz"]))

        target["mask"] = mask_in_img if mask_enable else []
        target['ploygon'] = rbox_in_img if mask_enable else []
        target["score"] = score[best_pscore_id]

    state["targets"] = targets
    # state['target_pos'] = target_pos
    # state['target_sz'] = target_sz
    # state['score'] = score[best_pscore_id]
    # state['mask'] = mask_in_img if mask_enable else []
    # state['ploygon'] = rbox_in_img if mask_enable else []
    return state
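
This multi-target variant expects an extended state; the init code that builds it is not shown here. Below is a sketch of the structure implied by the keys read above, with placeholder values; how the per-target exemplar features ("zf") and the detector are produced is an open assumption.

import numpy as np

initial_state_example = {
    'p': None,            # tracker config object (placeholder)
    'net': None,          # SiamMask-style network (placeholder)
    'avg_chans': None,    # per-channel image mean used for padding (placeholder)
    'window': None,       # cosine window (placeholder)
    'detector': None,     # detector exposing CustomObjects(car=True, person=True)
    'im_w': 1280,
    'im_h': 720,
    'targets': [
        {'target_pos': np.array([320.0, 240.0]),   # center of one target
         'target_sz': np.array([80.0, 60.0]),      # width, height of that target
         'zf': None},                              # exemplar feature for this target (placeholder)
    ],
}
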
Example #6
def track_vot(model,
              video,
              hp=None,
              mask_enable=False,
              refine_enable=False,
              device='cpu'):
    """
    对目标进行追踪
    :param model: 训练好的模型
    :param video: 视频数据
    :param hp: 超参数
    :param mask_enable: 是否生成掩膜,默认为False
    :param refine_enable: 是否使用融合后的模型
    :param device:硬件信息
    :return:目标跟丢次数,fps
    """
    # 记录目标框及其状态
    regions = []  # result and states[1 init / 2 lost / 0 skip]
    # 获取要处理的图像,和真实值groundtruth
    image_files, gt = video['image_files'], video['gt']
    # 设置相关参数:初始帧,终止帧,目标丢失次数,toc
    start_frame, end_frame, lost_times, toc = 0, len(image_files), 0, 0
    # 遍历要处理的图像
    for f, image_file in enumerate(image_files):
        # 读取图像
        im = cv2.imread(image_file)
        tic = cv2.getTickCount()
        # 若为初始帧图像
        if f == start_frame:  # init
            # 获取目标区域的位置:中心点坐标,宽,高
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            # 目标位置
            target_pos = np.array([cx, cy])
            # 目标大小
            target_sz = np.array([w, h])
            # 初始化跟踪器
            state = siamese_init(im, target_pos, target_sz, model, hp,
                                 device)  # init tracker
            # 将目标框转换为:左上角坐标,宽,高的形式
            location = cxy_wh_2_rect(state['target_pos'], state['target_sz'])
            # 若数据集是VOT,在regions中添加1,否则添加gt[f],第一帧目标的真实位置
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        # frames after the init frame
        elif f > start_frame:  # tracking
            # run the tracker on this frame
            state = siamese_track(state, im, mask_enable, refine_enable,
                                  device, args.debug)  # track
            # with the mask branch
            if mask_enable:
                # flatten the predicted polygon
                location = state['ploygon'].flatten()
                # get the mask
                mask = state['mask']
            # without the mask branch
            else:
                # convert the target box to top-left corner, width, height form
                location = cxy_wh_2_rect(state['target_pos'],
                                         state['target_sz'])
                # empty mask
                mask = []
            # for VOT data compute the overlap; other datasets default to 1
            if 'VOT' in args.dataset:
                # ground-truth polygon of the target
                gt_polygon = ((gt[f][0], gt[f][1]), (gt[f][2], gt[f][3]),
                              (gt[f][4], gt[f][5]), (gt[f][6], gt[f][7]))
                # with the mask branch
                if mask_enable:
                    # predicted polygon
                    pred_polygon = ((location[0], location[1]), (location[2],
                                                                 location[3]),
                                    (location[4], location[5]), (location[6],
                                                                 location[7]))
                # without the mask branch
                else:
                    # predicted polygon
                    pred_polygon = ((location[0], location[1]),
                                    (location[0] + location[2],
                                     location[1]), (location[0] + location[2],
                                                    location[1] + location[3]),
                                    (location[0], location[1] + location[3]))
                # overlap between the predicted and ground-truth boxes
                b_overlap = vot_overlap(gt_polygon, pred_polygon,
                                        (im.shape[1], im.shape[0]))
            else:
                b_overlap = 1
            # if prediction and ground truth overlap, keep the result
            if b_overlap:
                regions.append(location)
            # otherwise the target is lost: count it and re-initialize five frames later
            else:  # lost
                regions.append(2)
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        # skip the remaining frames (e.g. frames before the re-init frame)
        else:  # skip
            regions.append(0)
        # accumulate tracking time
        toc += cv2.getTickCount() - tic
        # visualization (skipping lost frames)
        if args.visualization and f >= start_frame:  # visualization (skip lost frame)
            # work on a copy of the image
            im_show = im.copy()
            # destroy windows on the first frame
            if f == 0: cv2.destroyAllWindows()
            # if there is an annotation for frame f:
            if gt.shape[0] > f:
                # draw the ground truth on the image
                if len(gt[f]) == 8:
                    cv2.polylines(
                        im_show, [np.array(gt[f], int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                else:
                    cv2.rectangle(im_show, (gt[f, 0], gt[f, 1]),
                                  (gt[f, 0] + gt[f, 2], gt[f, 1] + gt[f, 3]),
                                  (0, 255, 0), 3)
            # draw the tracking result on the image
            if len(location) == 8:
                # with the mask branch, overlay the mask on the image
                if mask_enable:
                    mask = mask > state['p'].seg_thr
                    im_show[:, :,
                            2] = mask * 255 + (1 - mask) * im_show[:, :, 2]
                location_int = np.int0(location)
                cv2.polylines(im_show, [location_int.reshape((-1, 1, 2))],
                              True, (0, 255, 255), 3)
            else:
                location = [int(l) for l in location]
                cv2.rectangle(
                    im_show, (location[0], location[1]),
                    (location[0] + location[2], location[1] + location[3]),
                    (0, 255, 255), 3)
            cv2.putText(im_show, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1,
                        (0, 255, 255), 2)
            cv2.putText(im_show, str(lost_times), (40, 80),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.putText(im_show,
                        str(state['score']) if 'score' in state else '',
                        (40, 120), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            cv2.imshow(video['name'], im_show)
            cv2.waitKey(1)
    toc /= cv2.getTickFrequency()

    # save the results to a text file
    # directory name: architecture plus mask / refine flags and the resume checkpoint
    name = args.arch.split('.')[0] + '_' + ('mask_' if mask_enable else '') + ('refine_' if refine_enable else '') +\
           args.resume.split('/')[-1].split('.')[0]
    # VOT dataset
    if 'VOT' in args.dataset:
        # build the directory for the tracking results
        video_path = join('test', args.dataset, name, 'baseline',
                          video['name'])
        # create it if it does not exist
        if not isdir(video_path): makedirs(video_path)
        # path of the result file
        result_path = join(video_path, '{:s}_001.txt'.format(video['name']))
        # write the tracking results to the file (left commented out in this version)
        # with open(result_path, "w") as fin:
        #     for x in regions:
        #         fin.write("{:d}\n".format(x)) if isinstance(x, int) else \
        #                 fin.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
    # OTB dataset
    else:  # OTB
        # build the directory for the tracking results
        video_path = join('test', args.dataset, name)
        # create it if it does not exist
        if not isdir(video_path): makedirs(video_path)
        # path of the result file
        result_path = join(video_path, '{:s}.txt'.format(video['name']))
        # write the tracking results to the file
        with open(result_path, "w") as fin:
            for x in regions:
                fin.write(','.join([str(i) for i in x]) + '\n')
    # log the summary
    logger.info(
        '({:d}) Video: {:12s} Time: {:02.1f}s Speed: {:3.1f}fps Lost: {:d}'.
        format(v_id, video['name'], toc, f / toc, lost_times))
    # return the results
    return lost_times, f / toc
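
The comments above describe cxy_wh_2_rect as converting a center (cx, cy) plus (w, h) box into a top-left (x, y, w, h) rect. A small sketch of that conversion follows, under the assumption that the repository helper does essentially this (indexing details may differ); the name center_to_rect is hypothetical.

import numpy as np

def center_to_rect(pos, sz):
    # (cx, cy), (w, h) -> (x_topleft, y_topleft, w, h)
    return np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])

# e.g. a 40x20 box centered at (100, 50) -> [80., 40., 40., 20.]
print(center_to_rect(np.array([100., 50.]), np.array([40., 20.])))
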
Example #7
def siamese_track(state,
                  im,
                  mask_enable=False,
                  refine_enable=False,
                  device='cpu',
                  debug=False):
    global arrendatario
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']
    target_sz = state['target_sz']
    print(im.shape)
    wc_x = target_sz[1] + p.context_amount * sum(target_sz)
    hc_x = target_sz[0] + p.context_amount * sum(target_sz)
    s_x = np.sqrt(wc_x * hc_x)
    scale_x = p.exemplar_size / s_x  # p.exemplar_size = 127, always the same
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_x
    s_x = s_x + 2 * pad
    crop_box = [
        target_pos[0] - round(s_x) / 2, target_pos[1] - round(s_x) / 2,
        round(s_x),
        round(s_x)
    ]

    debug = True
    if debug:
        im_debug = im.copy()
        crop_box_int = np.int0(crop_box)
        cv2.rectangle(im_debug, (crop_box_int[0], crop_box_int[1]),
                      (crop_box_int[0] + crop_box_int[2],
                       crop_box_int[1] + crop_box_int[3]), (255, 255, 0), 2)
        # cv2.imwrite('/data/Ponc/tracking/results/windows-seagulls-debug/'+'search_'+str(arrendatario)+'.jpeg', im_debug)
        cv2.waitKey(0)

    # extract scaled crops for search region x at previous target position
    x_crop = Variable(
        get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                               avg_chans).unsqueeze(0))
    # In DAVIS we have 5 anchors
    if mask_enable:
        score, delta, mask = net.track_mask(
            x_crop.to(device)
        )  # score: (1,10,25,25), delta: (1, 20 (5boxes*4coords), 25, 25), mask: (1, 63*63, 25, 25)
    else:
        score, delta = net.track(x_crop.to(device))

    delta = delta.permute(1, 2, 3, 0).contiguous().view(4,
                                                        -1).data.cpu().numpy()

    # softmax over (3125, 2); the two columns are BG and FG scores
    score = F.softmax(score.permute(1, 2, 3,
                                    0).contiguous().view(2, -1).permute(1, 0),
                      dim=1).data[:, 1].cpu().numpy()

    delta[0, :] = delta[0, :] * p.anchor[:, 2] + p.anchor[:, 0]
    delta[1, :] = delta[1, :] * p.anchor[:, 3] + p.anchor[:, 1]
    delta[2, :] = np.exp(delta[2, :]) * p.anchor[:, 2]
    delta[3, :] = np.exp(delta[3, :]) * p.anchor[:, 3]

    def change(r):
        return np.maximum(r, 1. / r)

    def sz(w, h):
        pad = (w + h) * 0.5
        sz2 = (w + pad) * (h + pad)
        return np.sqrt(sz2)

    def sz_wh(wh):
        pad = (wh[0] + wh[1]) * 0.5
        sz2 = (wh[0] + pad) * (wh[1] + pad)
        return np.sqrt(sz2)

    # size penalty
    target_sz_in_crop = target_sz * scale_x
    s_c = change(sz(delta[2, :], delta[3, :]) /
                 (sz_wh(target_sz_in_crop)))  # scale penalty
    r_c = change((target_sz_in_crop[0] / target_sz_in_crop[1]) /
                 (delta[2, :] / delta[3, :]))  # ratio penalty

    penalty = np.exp(-(r_c * s_c - 1) * p.penalty_k)
    pscore = penalty * score

    # cos window (motion model)
    N = 39
    bboxes = np.zeros((6, N), dtype=np.float64)
    # bboxes has the shape (6 , Npoints) ; 0=res_x, 1=res_y, 2=res_w, 3=res_h, 4=score, 5=best_pscore_id_tmp
    pscore = pscore * (1 - p.window_influence) + window * p.window_influence
    attmap = score.reshape(5, 25, 25)
    attmap = np.amax(attmap, axis=0)
    # np.save('/data/Ponc/tracking/results/mevasa/'+str(arrendatario)+'.npy', attmap)
    best_score_threshold = 1.1
    for idx in range(0, N):
        if (idx == 0):
            best_pscore_id = np.argmax(pscore)

        best_pscore_id_tmp = np.argmax(pscore)
        pred_in_crop = delta[:, best_pscore_id_tmp] / scale_x

        lr = penalty[best_pscore_id_tmp] * score[
            best_pscore_id_tmp] * p.lr  # lr for OTB

        res_x = pred_in_crop[0] + target_pos[0]
        res_y = pred_in_crop[1] + target_pos[1]
        res_w = target_sz[0] * (1 - lr) + pred_in_crop[2] * lr
        res_h = target_sz[1] * (1 - lr) + pred_in_crop[3] * lr

        target_pos = np.array([res_x, res_y])
        target_sz = np.array([res_w, res_h])

        bboxes[0, idx] = target_pos[0]
        bboxes[1, idx] = target_pos[1]
        bboxes[2, idx] = target_sz[0]
        bboxes[3, idx] = target_sz[1]
        bboxes[4, idx] = pscore[
            best_pscore_id_tmp]  #BUG: This should be pscore[best_...]?
        bboxes[5, idx] = best_pscore_id_tmp
        if (pscore[best_pscore_id] > best_score_threshold):
            break
        pscore[best_pscore_id_tmp] = 0.0

    target_pos = np.array([bboxes[0, 0], bboxes[1, 0]])
    target_sz = np.array([bboxes[2, 0], bboxes[3, 0]])

    # for Mask Branch
    rboxes = []
    deltas = []

    for idx in range(0, N):
        if mask_enable:
            best_pscore_id_mask = np.unravel_index(int(
                bboxes[5, idx]), (5, p.score_size, p.score_size))
            delta_x, delta_y = best_pscore_id_mask[2], best_pscore_id_mask[
                1]  # delta_x and delta_y are the selected coordinates in the volume

            if ((delta_x, delta_y) not in deltas):
                print("delta: (", delta_x, ", ", delta_y, ")")
                deltas.append((delta_x, delta_y))
                if refine_enable:
                    mask = net.track_refine(
                        (delta_y,
                         delta_x)).to(device).sigmoid().squeeze().view(
                             p.out_size, p.out_size).cpu().data.numpy()
                else:
                    mask = mask[0, :, delta_y, delta_x].sigmoid(). \
                        squeeze().view(p.out_size, p.out_size).cpu().data.numpy()

                def crop_back(image, bbox, out_sz, padding=-1):
                    a = (out_sz[0] - 1) / bbox[2]
                    b = (out_sz[1] - 1) / bbox[3]
                    c = -a * bbox[0]
                    d = -b * bbox[1]
                    mapping = np.array([[a, 0, c], [0, b, d]]).astype(float)
                    crop = cv2.warpAffine(image,
                                          mapping, (out_sz[0], out_sz[1]),
                                          flags=cv2.INTER_LINEAR,
                                          borderMode=cv2.BORDER_CONSTANT,
                                          borderValue=padding)
                    return crop

                s = crop_box[2] / p.instance_size
                sub_box = [
                    crop_box[0] +
                    (delta_x - p.base_size / 2) * p.total_stride * s,
                    crop_box[1] +
                    (delta_y - p.base_size / 2) * p.total_stride * s,
                    s * p.exemplar_size, s * p.exemplar_size
                ]
                s = p.out_size / sub_box[2]
                back_box = [
                    -sub_box[0] * s, -sub_box[1] * s, state['im_w'] * s,
                    state['im_h'] * s
                ]
                mask_in_img = crop_back(mask, back_box,
                                        (state['im_w'], state['im_h']))

                target_mask = (mask_in_img > p.seg_thr).astype(np.uint8)
                if cv2.__version__[-5] == '4':
                    contours, _ = cv2.findContours(target_mask,
                                                   cv2.RETR_EXTERNAL,
                                                   cv2.CHAIN_APPROX_NONE)
                else:
                    _, contours, _ = cv2.findContours(target_mask,
                                                      cv2.RETR_EXTERNAL,
                                                      cv2.CHAIN_APPROX_NONE)
                cnt_area = [cv2.contourArea(cnt) for cnt in contours]
                if len(contours) != 0 and np.max(cnt_area) > 100:
                    contour = contours[np.argmax(
                        cnt_area)]  # use max area polygon
                    polygon = contour.reshape(-1, 2)
                    # pbox = cv2.boundingRect(polygon)  # Min Max Rectangle
                    prbox = cv2.boxPoints(
                        cv2.minAreaRect(polygon))  # Rotated Rectangle

                    # box_in_img = pbox
                    rbox_in_img = prbox
                    box_score = bboxes[4, idx]
                    rboxes.append([rbox_in_img, box_score])
                    # if(len(deltas) == 1):
                    #     attmap[delta_x, delta_y] = 3.5
                    # else:
                    #     attmap[delta_x, delta_y] = 1.5

                    if (debug):

                        im_debug_overlay = im_debug.copy()
                        im_debug_overlay[:, :, :] = np.array([0.0, 0.0, 0.0])
                        torch_data = np.float64(im_debug_overlay[:, :,
                                                                 0].copy())
                        patch_size = crop_box_int[0]
                        num_deltas = 25  # this does not vary
                        patch_ratio = int(patch_size /
                                          num_deltas)  # this varies
                        resized_img_h, resized_img_w = im_debug.shape[
                            0] / 5, im_debug.shape[1] / 5

                        torch_data_delta_size = np.zeros(
                            (int(resized_img_h), int(resized_img_w)))
                        offset_x_deltas = int(crop_box_int[0] / 5)
                        offset_y_deltas = int(crop_box_int[1] / 5)

                        length_x = int((crop_box_int[2]) / 5)
                        length_y = int((crop_box_int[3]) / 5)

                        for i in range(25):
                            for j in range(25):
                                # step_x = crop_box_int[0] + i*length_x
                                # step_y = crop_box_int[1] + j*length_y
                                # im_debug_overlay[step_y: step_y + length_y, step_x: step_x+length_x, :] = np.array([0.0,0.0,0.0])
                                # im_debug_overlay[step_y: step_y + length_y, step_x: step_x+length_x, :] = np.uint8(attmap[j,i] * np.array([0,165,255]))
                                # torch_data[step_y: step_y + length_y, step_x: step_x+length_x] = attmap[j,i]*1.0
                                # Now for the reshaped
                                torch_data_delta_size[offset_y_deltas + j,
                                                      offset_x_deltas +
                                                      i] = attmap[j, i] * 1.0
                                if (pscore[best_pscore_id] >
                                        best_score_threshold):
                                    torch_data_delta_size[offset_y_deltas +
                                                          delta_y,
                                                          offset_x_deltas +
                                                          delta_x] = 3.0

                        if (pscore[best_pscore_id] > best_score_threshold):
                            sma = torch.nn.Softmax()
                            torch_data_delta_size = sma(
                                torch.from_numpy(
                                    np.exp(torch_data_delta_size))).numpy()

                            # im_debug_overlay[step_x: step_x+length_x, step_y: step_y + length_y, :] = attmap[j,i]

                        overlay_result = cv2.addWeighted(
                            im_debug, 0.70, im_debug_overlay, 0.3, 0.0)
                        # cv2.imwrite('/data/Ponc/tracking/results/windows-seagulls-debug/'+'search_'+str(arrendatario)+'.jpeg', overlay_result)
                        # np.save('/data/Ponc/tracking/torch_data/resized/'+"{:05d}".format(arrendatario)+'.npy', torch_data_delta_size)

                    # np.save('/data/Ponc/tracking/results/mevasa/'+"{:05d}".format(arrendatario)+'.npy', attmap)
                else:  # empty mask
                    location = cxy_wh_2_rect(target_pos, target_sz)
                    rbox_in_img = np.array([
                        [location[0], location[1]],
                        [location[0] + location[2], location[1]],
                        [location[0] + location[2], location[1] + location[3]],
                        [location[0], location[1] + location[3]]
                    ])
        if (pscore[best_pscore_id] > best_score_threshold):
            break

    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    new_score = bboxes[4, 0]
    # state['score'] = score[best_pscore_id]
    state['score'] = new_score
    state['mask'] = mask_in_img if mask_enable else []
    state['ploygon'] = rbox_in_img if mask_enable else []
    return state, bboxes, rboxes
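
Unlike the other variants, this one returns (state, bboxes, rboxes), where the bboxes rows follow the comment above (0=res_x, 1=res_y, 2=res_w, 3=res_h, 4=score, 5=best_pscore_id) and each rboxes entry is [rotated_box_corners, score]. A small sketch of consuming those extras; the selection logic and helper names are illustrative only.

import numpy as np

def pick_best_rbox(rboxes):
    # return the 4x2 corner array of the highest-scoring rotated box, or None
    if not rboxes:
        return None
    scores = [score for _, score in rboxes]
    return rboxes[int(np.argmax(scores))][0]

def candidate_rects(bboxes):
    # convert each candidate column (cx, cy, w, h) into a top-left (x, y, w, h) rect
    return np.stack([bboxes[0] - bboxes[2] / 2,
                     bboxes[1] - bboxes[3] / 2,
                     bboxes[2], bboxes[3]], axis=0)
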
Example #8
def siamese_track(state,
                  im,
                  mask_enable=False,
                  refine_enable=False,
                  device='cpu',
                  debug=False):
    """
    对目标进行跟踪
    :param state:目标状态
    :param im:跟踪的图像帧
    :param mask_enable:是否进行掩膜
    :param refine_enable:是否进行特征的融合
    :param device:硬件信息
    :param debug: 是否进行debug
    :return:跟踪目标的状态 state字典
    """
    # 获取目标状态
    p = state['p']
    net = state['net']
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']
    target_sz = state['target_sz']
    # 包含周边信息的跟踪框的宽度,高度,尺寸
    wc_x = target_sz[1] + p.context_amount * sum(target_sz)
    hc_x = target_sz[0] + p.context_amount * sum(target_sz)
    s_x = np.sqrt(wc_x * hc_x)
    # 模板模型输入框尺寸与跟踪框的比例
    scale_x = p.exemplar_size / s_x
    # 使用与模板分支相同的比例得到检测区域
    d_search = (p.instance_size - p.exemplar_size) / 2
    pad = d_search / scale_x
    s_x = s_x + 2 * pad
    # 对检测框进行扩展,包含周边信息
    crop_box = [
        target_pos[0] - round(s_x) / 2, target_pos[1] - round(s_x) / 2,
        round(s_x),
        round(s_x)
    ]
    # 若进行debug
    if debug:
        # 复制图片
        im_debug = im.copy()
        # 产生crop_box
        crop_box_int = np.int0(crop_box)
        # 将其绘制在图片上
        cv2.rectangle(im_debug, (crop_box_int[0], crop_box_int[1]),
                      (crop_box_int[0] + crop_box_int[2],
                       crop_box_int[1] + crop_box_int[3]), (255, 0, 0), 2)
        # 图片展示
        cv2.imshow('search area', im_debug)
        cv2.waitKey(0)

    # extract scaled crops for search region x at previous target position
    # 将目标位置按比例转换为要跟踪的目标
    x_crop = Variable(
        get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                               avg_chans).unsqueeze(0))
    # 调用网络进行目标跟踪
    if mask_enable:
        # 进行目标分割
        score, delta, mask = net.track_mask(x_crop.to(device))
    else:
        # 只进行目标追踪,不进行分割
        score, delta = net.track(x_crop.to(device))
    # 目标框回归结果(将其转成4*...的样式)
    delta = delta.permute(1, 2, 3, 0).contiguous().view(4,
                                                        -1).data.cpu().numpy()
    # 目标分类结果(将其转成2*...的样式)
    score = F.softmax(score.permute(1, 2, 3,
                                    0).contiguous().view(2, -1).permute(1, 0),
                      dim=1).data[:, 1].cpu().numpy()
    # 计算目标框的中心点坐标,delta[0],delta[1],以及宽delta[2]和高delta[3],这里变量不是很明确。
    delta[0, :] = delta[0, :] * p.anchor[:, 2] + p.anchor[:, 0]
    delta[1, :] = delta[1, :] * p.anchor[:, 3] + p.anchor[:, 1]
    delta[2, :] = np.exp(delta[2, :]) * p.anchor[:, 2]
    delta[3, :] = np.exp(delta[3, :]) * p.anchor[:, 3]

    def change(r):
        """
        Element-wise maximum of r and 1/r.
        :param r: ratio
        :return: max(r, 1/r)
        """
        return np.maximum(r, 1. / r)

    def sz(w, h):
        """
        Equivalent side length of a box padded with context.
        :param w: width
        :param h: height
        :return: equivalent side length
        """
        pad = (w + h) * 0.5
        sz2 = (w + pad) * (h + pad)
        return np.sqrt(sz2)

    def sz_wh(wh):
        """
        Equivalent side length of a box padded with context.
        :param wh: (width, height) array
        :return: equivalent side length
        """
        pad = (wh[0] + wh[1]) * 0.5
        sz2 = (wh[0] + pad) * (wh[1] + pad)
        return np.sqrt(sz2)

    # size penalty
    target_sz_in_crop = target_sz * scale_x
    s_c = change(sz(delta[2, :], delta[3, :]) /
                 (sz_wh(target_sz_in_crop)))  # scale penalty
    r_c = change((target_sz_in_crop[0] / target_sz_in_crop[1]) /
                 (delta[2, :] / delta[3, :]))  # ratio penalty
    # p.penalty_k is a hyper-parameter
    penalty = np.exp(-(r_c * s_c - 1) * p.penalty_k)
    # penalize the classification scores
    pscore = penalty * score

    # cos window (motion model)
    # window penalty: blend in the cosine window with a fixed weight
    pscore = pscore * (1 - p.window_influence) + window * p.window_influence
    # index of the best penalized score
    best_pscore_id = np.argmax(pscore)
    # map the best prediction back to the original image scale
    pred_in_crop = delta[:, best_pscore_id] / scale_x
    # size-update learning rate
    lr = penalty[best_pscore_id] * score[best_pscore_id] * p.lr  # lr for OTB
    # new target position and size from the predicted offsets
    res_x = pred_in_crop[0] + target_pos[0]
    res_y = pred_in_crop[1] + target_pos[1]

    res_w = target_sz[0] * (1 - lr) + pred_in_crop[2] * lr
    res_h = target_sz[1] * (1 - lr) + pred_in_crop[3] * lr
    # updated target position and size
    target_pos = np.array([res_x, res_y])
    target_sz = np.array([res_w, res_h])

    # for Mask Branch
    # 若进行分割
    if mask_enable:
        # 获取最优预测结果的位置索引:np.unravel_index:将平面索引或平面索引数组转换为坐标数组的元组
        best_pscore_id_mask = np.unravel_index(best_pscore_id,
                                               (5, p.score_size, p.score_size))
        delta_x, delta_y = best_pscore_id_mask[2], best_pscore_id_mask[1]
        # 是否进行特征融合
        if refine_enable:
            # 调用track_refine,运行 Refine 模块,由相关特征图上 1×1×256 的特征向量与检测下采样前的特征图得到目标掩膜
            mask = net.track_refine(
                (delta_y, delta_x)).to(device).sigmoid().squeeze().view(
                    p.out_size, p.out_size).cpu().data.numpy()
        else:
            # without refinement, take the mask predicted at the best location directly
            mask = mask[0, :, delta_y, delta_x].sigmoid(). \
                squeeze().view(p.out_size, p.out_size).cpu().data.numpy()

        def crop_back(image, bbox, out_sz, padding=-1):
            """
            对图像进行仿射变换
            :param image: 图像
            :param bbox:
            :param out_sz: 输出尺寸
            :param padding: 是否进行扩展
            :return: 仿射变换后的结果
            """
            # build the affine matrix
            # scale factors
            a = (out_sz[0] - 1) / bbox[2]
            b = (out_sz[1] - 1) / bbox[3]
            # translation terms
            c = -a * bbox[0]
            d = -b * bbox[1]
            mapping = np.array([[a, 0, c], [0, b, d]]).astype(float)
            # apply the affine warp
            crop = cv2.warpAffine(image,
                                  mapping, (out_sz[0], out_sz[1]),
                                  flags=cv2.INTER_LINEAR,
                                  borderMode=cv2.BORDER_CONSTANT,
                                  borderValue=padding)
            return crop

        # scale factor: ratio of the crop-box size to the network input size
        s = crop_box[2] / p.instance_size
        # predicted exemplar-sized sub-box inside the crop
        sub_box = [
            crop_box[0] + (delta_x - p.base_size / 2) * p.total_stride * s,
            crop_box[1] + (delta_y - p.base_size / 2) * p.total_stride * s,
            s * p.exemplar_size, s * p.exemplar_size
        ]
        # scale factor from the sub-box to the mask output size
        s = p.out_size / sub_box[2]
        # the full image expressed in mask-output coordinates
        back_box = [
            -sub_box[0] * s, -sub_box[1] * s, state['im_w'] * s,
            state['im_h'] * s
        ]
        # warp the mask back to image coordinates
        mask_in_img = crop_back(mask, back_box, (state['im_w'], state['im_h']))
        # binarize the mask with the segmentation threshold
        target_mask = (mask_in_img > p.seg_thr).astype(np.uint8)
        # find contours; the return signature differs across OpenCV versions
        if cv2.__version__[-5] == '4':
            # OpenCV 4 returns two values (contours, hierarchy); OpenCV 3 returns three
            contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_NONE)
        else:
            _, contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL,
                                              cv2.CHAIN_APPROX_NONE)
        # area of each contour
        cnt_area = [cv2.contourArea(cnt) for cnt in contours]
        if len(contours) != 0 and np.max(cnt_area) > 100:
            # take the contour with the largest area
            contour = contours[np.argmax(cnt_area)]  # use max area polygon
            # reshape to an N x 2 point array
            polygon = contour.reshape(-1, 2)
            # pbox = cv2.boundingRect(polygon)  # Min Max Rectangle
            # minimum-area rotated rectangle and its four corner points
            prbox = cv2.boxPoints(
                cv2.minAreaRect(polygon))  # Rotated Rectangle

            # box_in_img = pbox
            # the rotated tracking box
            rbox_in_img = prbox
        else:  # empty mask
            # fall back to the box given by the predicted position and size
            location = cxy_wh_2_rect(target_pos, target_sz)
            # four corners of the axis-aligned fallback box
            rbox_in_img = np.array(
                [[location[0], location[1]],
                 [location[0] + location[2], location[1]],
                 [location[0] + location[2], location[1] + location[3]],
                 [location[0], location[1] + location[3]]])
    # clip the target position and size to the image bounds
    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))
    # update the state
    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = score[best_pscore_id]
    state['mask'] = mask_in_img if mask_enable else []
    state['ploygon'] = rbox_in_img if mask_enable else []
    return state
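The four delta[...] assignments above decode the network's regression output against the anchors: centre offsets are scaled by the anchor width and height, and the box size is predicted on a log scale. The following minimal sketch (made-up anchors and deltas, not part of the repo) reproduces that arithmetic on its own:

import numpy as np

# hypothetical anchors as (cx, cy, w, h), one row per anchor
anchor = np.array([[32., 32., 64., 32.],
                   [32., 32., 32., 64.],
                   [32., 32., 48., 48.]])
# hypothetical raw regression outputs, one column per anchor
delta = np.array([[0.10, -0.20, 0.05],   # dx
                  [0.00,  0.15, -0.10],  # dy
                  [0.05, -0.05, 0.20],   # dw (log scale)
                  [-0.10, 0.10, 0.00]])  # dh (log scale)

# same decoding as in siamese_track
cx = delta[0] * anchor[:, 2] + anchor[:, 0]
cy = delta[1] * anchor[:, 3] + anchor[:, 1]
w = np.exp(delta[2]) * anchor[:, 2]
h = np.exp(delta[3]) * anchor[:, 3]
print(np.stack([cx, cy, w, h]))  # one decoded (cx, cy, w, h) per anchor column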
Example #9
0
                locations_dict[key].append(locations)
                target_sz_dict[key].append(target_sz)
                target_pos_dict[key].append(target_pos)

                # TODO: DRAW
                location = state['ploygon'].flatten()
                centroids1 = compute_centroid(location)
                mask = state['mask'] > state['p'].seg_thr
                im[:, :, 2] = (mask > 0) * 255 + (mask == 0) * im[:, :, 2]
                cv2.polylines(im, [np.int0(location).reshape((-1, 1, 2))],
                              True, col, 3)
                cv2.circle(im, (int(centroids1[0]), int(centroids1[1])), 3,
                           col, 2)

                location2 = cxy_wh_2_rect(target_pos, target_sz)
                rbox_in_img = np.array([
                    [location2[0], location2[1]],
                    [location2[0] + location2[2], location2[1]],
                    [location2[0] + location2[2], location2[1] + location2[3]],
                    [location2[0], location2[1] + location2[3]]
                ])
                location2 = rbox_in_img.flatten()

                cv2.polylines(im, [np.int0(location2).reshape((-1, 1, 2))],
                              True, col, 1)
                cv2.circle(im, (int(target_pos[0]), int(target_pos[1])), 3,
                           col, 1)

                # TODO: Decide winner with Dynamics AND UPDATE TRACKER IF REQUIRED
                tracker = dynamics[key]
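Example #9 calls a compute_centroid helper that is not included in the snippet. A plausible, hypothetical sketch, assuming the same flattened-polygon layout ([x0, y0, x1, y1, ...]) used by state['ploygon'], is:

import numpy as np

def compute_centroid(location):
    # hypothetical helper matching the usage above: average the polygon vertices
    pts = np.asarray(location, dtype=np.float64).reshape(-1, 2)
    return pts.mean(axis=0)  # (cx, cy)

print(compute_centroid([0, 0, 4, 0, 4, 2, 0, 2]))  # -> [2. 1.]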
Example #10
0
def TrackingDoing(model, state, im, mask_enable=False, device='cpu'):
    avg_chans = state['avg_chans']
    # type(state['avg_chans']) -- <class 'numpy.ndarray'>
    # (Pdb) state['avg_chans'].shape -- (3,)

    window = state['window']
    # (Pdb) state['window'] -- array([0., 0., 0., ..., 0., 0., 0.])
    # (Pdb) state['window'].shape -- (3125,)

    target_pos = state['target_pos']
    # (Pdb) state['target_pos'] -- array([390., 240.])
    # (Pdb) state['target_pos'].shape -- (2,)

    target_size = state['target_size']
    # (Pdb) state['target_size'] -- array([180, 280])
    # (Pdb) state['target_size'].shape -- (2,)

    # mask_enable = True
    s_x = get_scale_size(target_size[0], target_size[1])

    scale_x = model.template_size / s_x
    # s_x -- 457.27, scale_x -- 0.2777325006938416

    # p.instance_size -- 255, p.exemplar_size -- 127
    d_search = (model.instance_size - model.template_size) / 2
    pad = d_search / scale_x
    s_x = s_x + 2 * pad
    crop_box = [target_pos[0] - round(s_x) / 2, target_pos[1] - round(s_x) / 2, round(s_x), round(s_x)]
    # (Pdb) crop_box -- [-69.0, -219.0, 918, 918]

    # def get_subwindow_tracking(im, pos, model_sz, original_sz, avg_chans):
    x_crop = get_subwindow_tracking(im, target_pos, model.instance_size, round(s_x), avg_chans).unsqueeze(0)
    # (Pdb) pp x_crop.shape -- torch.Size([1, 3, 255, 255])

    score, delta, mask = model.track_mask(x_crop.to(device))
    # (Pdb) pp score.size()-- (torch.Size([1, 10, 25, 25]),
    # delta.size() -- torch.Size([1, 20, 25, 25])
    # mask.size() --  torch.Size([1, 3969, 25, 25]))

    delta = delta.permute(1, 2, 3, 0).contiguous().view(4, -1).data.cpu().numpy()
    score = F.softmax(score.permute(1, 2, 3, 0).contiguous().view(2, -1).permute(1, 0),
                      dim=1).data[:, 1].cpu().numpy()
    # delta.shape -- (4, 3125)
    # score.shape -- (3125,)

    delta[0, :] = delta[0, :] * model.anchor[:, 2] + model.anchor[:, 0]
    delta[1, :] = delta[1, :] * model.anchor[:, 3] + model.anchor[:, 1]
    delta[2, :] = np.exp(delta[2, :]) * model.anchor[:, 2]
    delta[3, :] = np.exp(delta[3, :]) * model.anchor[:, 3]

    def change(r):
        return np.maximum(r, 1. / r)

    def sz(w, h):
        pad = (w + h) * 0.5
        sz2 = (w + pad) * (h + pad)
        return np.sqrt(sz2)

    def sz_wh(wh):
        pad = (wh[0] + wh[1]) * 0.5
        sz2 = (wh[0] + pad) * (wh[1] + pad)
        return np.sqrt(sz2)

    # size penalty
    target_sz_in_crop = target_size*scale_x
    s_c = change(sz(delta[2, :], delta[3, :]) / (sz_wh(target_sz_in_crop)))  # scale penalty
    r_c = change((target_sz_in_crop[0] / target_sz_in_crop[1]) / (delta[2, :] / delta[3, :]))  # ratio penalty

    # p.penalty_k -- 0.04
    penalty = np.exp(-(r_c * s_c - 1) * model.penalty_k)
    pscore = penalty * score

    # cos window (motion model)
    # pp p.window_influence -- 0.4
    window_influence = 0.4
    pscore = pscore * (1 - window_influence) + window * window_influence

    best_pscore_id = np.argmax(pscore)

    pred_in_crop = delta[:, best_pscore_id] / scale_x
    lr = penalty[best_pscore_id] * score[best_pscore_id]  # lr for OTB

    res_x = pred_in_crop[0] + target_pos[0]
    res_y = pred_in_crop[1] + target_pos[1]

    res_w = target_size[0] * (1 - lr) + pred_in_crop[2] * lr
    res_h = target_size[1] * (1 - lr) + pred_in_crop[3] * lr

    target_pos = np.array([res_x, res_y])
    target_size = np.array([res_w, res_h])

    # for Mask Branch
    # pp mask_enable -- True
    if mask_enable:
        best_pscore_id_mask = np.unravel_index(best_pscore_id, (5, model.score_size, model.score_size))
        delta_x, delta_y = best_pscore_id_mask[2], best_pscore_id_mask[1]

        # pp model.template_size -- 127
        mask = model.track_refine((delta_y, delta_x)).to(device).sigmoid().squeeze().view(
            model.template_size, model.template_size).cpu().data.numpy()

        def crop_back(image, bbox, out_sz, padding=-1):
            a = (out_sz[0] - 1) / bbox[2]
            b = (out_sz[1] - 1) / bbox[3]
            c = -a * bbox[0]
            d = -b * bbox[1]
            mapping = np.array([[a, 0, c], [0, b, d]]).astype(float)
            crop = cv2.warpAffine(image, mapping, (out_sz[0], out_sz[1]),
                                  flags=cv2.INTER_LINEAR,
                                  borderMode=cv2.BORDER_CONSTANT,
                                  borderValue=padding)
            return crop
        # pp p.instance_size -- 255
        s = crop_box[2] / model.instance_size
        # pp p.base_size -- 8
        # (Pdb) pp p.total_stride -- 8
        # pp p.exemplar_size -- 127

        sub_box = [crop_box[0] + (delta_x - model.anchors["base_size"] / 2) * model.anchors["stride"] * s,
                   crop_box[1] + (delta_y - model.anchors["base_size"] / 2) * model.anchors["stride"] * s,
                   s * model.template_size, s * model.template_size]
        s = model.template_size / sub_box[2]
        back_box = [-sub_box[0] * s, -sub_box[1] * s, state['image_width'] * s, state['image_height'] * s]
        mask_in_img = crop_back(mask, back_box, (state['image_width'], state['image_height']))
        # mask.shape -- (127, 127)
        # (Pdb) back_box -- [-44.833333333333336, -3.1666666666666683, 237.22222222222223, 133.33333333333334]
        # (Pdb) mask_in_img.shape -- (480, 854), i.e. (state['image_height'], state['image_width'])

        # pp p.segment_threshold -- 0.35
        target_mask = (mask_in_img > model.segment_threshold).astype(np.uint8)
        # cv2.__version__ -- '4.4.0' ==> cv2.__version__[-5] == '4'
        if cv2.__version__[-5] == '4':
            contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        else:
            _, contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        cnt_area = [cv2.contourArea(cnt) for cnt in contours]
        if len(contours) != 0 and np.max(cnt_area) > 100:
            contour = contours[np.argmax(cnt_area)]  # use max area polygon
            polygon = contour.reshape(-1, 2)
            # pbox = cv2.boundingRect(polygon)  # Min Max Rectangle
            prbox = cv2.boxPoints(cv2.minAreaRect(polygon))  # Rotated Rectangle

            rbox_in_img = prbox
        else:  # empty mask
            location = cxy_wh_2_rect(target_pos, target_size)
            rbox_in_img = np.array([[location[0], location[1]],
                                    [location[0] + location[2], location[1]],
                                    [location[0] + location[2], location[1] + location[3]],
                                    [location[0], location[1] + location[3]]])

    # type(state['image_width']) -- <class 'int'>
    target_pos[0] = max(0, min(state['image_width'], target_pos[0]))
    target_pos[1] = max(0, min(state['image_height'], target_pos[1]))
    target_size[0] = max(10, min(state['image_width'], target_size[0]))
    target_size[1] = max(10, min(state['image_height'], target_size[1]))

    state['target_pos'] = target_pos
    state['target_size'] = target_size
    state['score'] = score[best_pscore_id]
    state['mask'] = mask_in_img if mask_enable else []

    state['ploygon'] = rbox_in_img if mask_enable else []
    return state
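The re-ranking used in TrackingDoing (and in the other siamese_track variants) first multiplies the raw scores by the size/ratio penalty and then blends in the cosine window, so a high-scoring but implausible or far-away candidate can lose to a steadier one. A toy sketch with assumed values (not taken from the repo) shows the effect:

import numpy as np

score = np.array([0.55, 0.90, 0.60])    # raw foreground scores per candidate
penalty = np.array([1.00, 0.40, 0.95])  # size/ratio penalty (1 = plausible)
window = np.array([0.2, 0.5, 0.9])      # cosine-window value (1 = near the previous position)
window_influence = 0.4

pscore = penalty * score                                               # penalize implausible boxes
pscore = pscore * (1 - window_influence) + window * window_influence  # favour small motion
print(np.argmax(score), np.argmax(pscore))  # candidate 1 wins without the priors, candidate 2 with them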
Example #11
0
def track_vot(model,
              video,
              hp=None,
              mask_enable=False,
              refine_enable=False,
              device='cpu'):
    # regions records the predicted boxes and tracker states
    regions = []  # result and states[1 init / 2 lost / 0 skip]
    image_files, gt = video['image_files'], video[
        'gt']  # gt is the ground-truth array, e.g. (325, 8); image_files lists every frame path of the video
    start_frame, end_frame, lost_times, toc = 0, len(
        image_files), 0, 0  # len(image_files) is the number of frames, which differs per video

    # iterate over all frames; f is the frame index, image_file the frame path; the tracker is initialized at the frame where the target appears
    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)  #(h*w*3)
        tic = cv2.getTickCount()  # record the current tick count
        if f == start_frame:  # init: the start frame
            cx, cy, w, h = get_axis_aligned_bbox(
                gt[f])  # convert the (possibly rotated) ground-truth rectangle into an axis-aligned one
            target_pos = np.array([cx, cy])  # centre of the axis-aligned box
            target_sz = np.array([w, h])
            state = siamese_init(im, target_pos, target_sz, model, hp,
                                 device)  # init tracker
            # inputs: one frame, the gt (cx, cy) and (w, h), the model, hyper-parameters and device
            location = cxy_wh_2_rect(
                state['target_pos'], state['target_sz']
            )  # location is the axis-aligned box as (x, y, w, h), ndarray<(4,), float64>
            regions.append(1 if 'VOT' in args.dataset else gt[f])
        elif f > start_frame:  # tracking: all subsequent frames; the target box is read from state
            state = siamese_track(state, im, mask_enable, refine_enable,
                                  device, args.debug)  # track
            if mask_enable:
                location = state['ploygon'].flatten()
                mask = state['mask']  # mask has the same shape as the frame
            else:
                location = cxy_wh_2_rect(
                    state['target_pos'],
                    state['target_sz'])  # decoded prediction as top-left (x, y) and (w, h)
                mask = []

            if 'VOT' in args.dataset:
                gt_polygon = ((gt[f][0], gt[f][1]), (gt[f][2], gt[f][3]),
                              (gt[f][4], gt[f][5]), (gt[f][6], gt[f][7]))
                if mask_enable:
                    pred_polygon = ((location[0], location[1]), (location[2],
                                                                 location[3]),
                                    (location[4], location[5]), (location[6],
                                                                 location[7]))
                else:
                    pred_polygon = ((location[0], location[1]),
                                    (location[0] + location[2],
                                     location[1]), (location[0] + location[2],
                                                    location[1] + location[3]),
                                    (location[0], location[1] + location[3]))

                # either way, location comes from the prediction; compute the overlap between the predicted and ground-truth polygons
                b_overlap = vot_overlap(gt_polygon, pred_polygon,
                                        (im.shape[1], im.shape[0]))
            else:  # non-VOT datasets: skip the overlap check
                b_overlap = 1

            if b_overlap:  # non-zero overlap: tracking continues
                regions.append(location)
            else:  # lost
                regions.append(2)
                lost_times += 1
                start_frame = f + 5  # skip 5 frames
        else:  # skip
            regions.append(0)
        toc += cv2.getTickCount() - tic

        # per-frame visualization
        if args.visualization and f >= start_frame:  # visualization (skip lost frame)
            im_show = im.copy()
            if f == 0: cv2.destroyAllWindows()
            if gt.shape[0] > f:
                if len(gt[f]) == 8:
                    cv2.polylines(
                        im_show, [np.array(gt[f], int).reshape(
                            (-1, 1, 2))], True, (0, 255, 0), 3)
                else:
                    cv2.rectangle(im_show, (gt[f, 0], gt[f, 1]),
                                  (gt[f, 0] + gt[f, 2], gt[f, 1] + gt[f, 3]),
                                  (0, 255, 0), 3)
            if len(location) == 8:
                if mask_enable:
                    mask = mask > state['p'].seg_thr
                    im_show[:, :,
                            2] = mask * 255 + (1 - mask) * im_show[:, :, 2]
                location_int = np.int0(location)
                cv2.polylines(im_show, [location_int.reshape((-1, 1, 2))],
                              True, (0, 255, 255), 3)
            else:
                location = [int(l) for l in location]
                cv2.rectangle(
                    im_show, (location[0], location[1]),
                    (location[0] + location[2], location[1] + location[3]),
                    (0, 255, 255), 3)
            cv2.putText(im_show, str(f), (40, 40), cv2.FONT_HERSHEY_SIMPLEX, 1,
                        (0, 255, 255), 2)
            cv2.putText(im_show, str(lost_times), (40, 80),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.putText(im_show,
                        str(state['score']) if 'score' in state else '',
                        (40, 120), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            cv2.imshow(video['name'], im_show)
            cv2.waitKey(1)
    toc /= cv2.getTickFrequency()

    # post-processing
    # save result: tracking is done; write the results to text files
    name = args.arch.split('.')[0] + '_' + ('mask_' if mask_enable else '') + ('refine_' if refine_enable else '') +\
           args.resume.split('/')[-1].split('.')[0]

    if 'VOT' in args.dataset:
        video_path = join('test', args.dataset, name, 'baseline',
                          video['name'])
        if not isdir(video_path): makedirs(video_path)
        result_path = join(video_path, '{:s}_001.txt'.format(video['name']))
        with open(result_path, "w") as fin:
            for x in regions:
                fin.write("{:d}\n".format(x)) if isinstance(x, int) else \
                        fin.write(','.join([vot_float2str("%.4f", i) for i in x]) + '\n')
    else:  # OTB
        video_path = join('test', args.dataset, name)
        if not isdir(video_path): makedirs(video_path)
        result_path = join(video_path, '{:s}.txt'.format(video['name']))
        with open(result_path, "w") as fin:
            for x in regions:
                fin.write(','.join([str(i) for i in x]) + '\n')

    logger.info(
        '({:d}) Video: {:12s} Time: {:02.1f}s Speed: {:3.1f}fps Lost: {:d}'.
        format(v_id, video['name'], toc, f / toc, lost_times))

    return lost_times, f / toc
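track_vot converts between the centre/size representation used by the tracker and the corner-based polygons used for the VOT overlap. The sketch below spells out both conversions on a toy box; cxy_wh_2_rect is written out here under the assumption that it maps (cx, cy), (w, h) to a top-left (x, y, w, h) rectangle, which may differ in detail from the repo's own helper:

import numpy as np

def cxy_wh_2_rect(pos, sz):
    # assumed behaviour: centre/size -> top-left rectangle
    return np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])

location = cxy_wh_2_rect(np.array([50., 40.]), np.array([20., 10.]))
# same corner ordering as the pred_polygon built for the non-mask case above
pred_polygon = ((location[0], location[1]),
                (location[0] + location[2], location[1]),
                (location[0] + location[2], location[1] + location[3]),
                (location[0], location[1] + location[3]))
print(location)      # [40. 35. 20. 10.]
print(pred_polygon)  # ((40.0, 35.0), (60.0, 35.0), (60.0, 45.0), (40.0, 45.0))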
Example #12
0
def siamese_track(state,
                  im,
                  mask_enable=False,
                  refine_enable=False,
                  device='cpu',
                  debug=False):
    p = state['p']  # state['p'] is the TrackerConfig instance
    net = state['net']  # the model passed to siamese_init, i.e. the network
    avg_chans = state['avg_chans']
    window = state['window']
    target_pos = state['target_pos']  # centre coordinates
    target_sz = state['target_sz']  # (w, h)

    # compute the equivalent size from the context-padded width and height; the search region
    # uses the same scale factor as the template branch
    wc_x = target_sz[1] + p.context_amount * sum(
        target_sz)  # h + p.context_amount * (w + h)
    hc_x = target_sz[0] + p.context_amount * sum(
        target_sz)  # w + p.context_amount * (w + h)
    s_x = np.sqrt(
        wc_x * hc_x
    )  # for the template, a square of side s_x (roughly 2x the target box) centred on the object is cropped and resized to (127, 127)
    scale_x = p.exemplar_size / s_x  # scale_x is the resize factor
    d_search = (p.instance_size - p.exemplar_size) / 2  # 64
    pad = d_search / scale_x  # pad = 64 * s_x / 127
    s_x = s_x + 2 * pad  # for the search region, a square of side s_x (roughly 4x the target box) is cropped and resized to (255, 255)
    crop_box = [
        target_pos[0] - round(s_x) / 2, target_pos[1] - round(s_x) / 2,
        round(s_x),
        round(s_x)
    ]  # (x_top_left, y_top_left, w, h)
    # crop_box is the un-resized search region in the original image

    if debug:
        im_debug = im.copy()
        crop_box_int = np.int0(crop_box)  # cast to integer (truncation)
        cv2.rectangle(im_debug, (crop_box_int[0], crop_box_int[1]),
                      (crop_box_int[0] + crop_box_int[2],
                       crop_box_int[1] + crop_box_int[3]), (255, 0, 0), 2)
        cv2.imshow('search area', im_debug)
        cv2.waitKey(0)
    # extract scaled crops for search region x at previous target position; the search window is centred
    # on the previous frame's target_pos, since the object moves little between frames
    x_crop = Variable(
        get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x),
                               avg_chans).unsqueeze(0))
    # x_crop: (255, 255, 3) image -> tensor (1, 3, 255, 255), float32, cpu

    # run the network
    if mask_enable:  # run the mask branch as well
        score, delta, mask = net.track_mask(x_crop.to(device))  # outputs of the three branches
#        New var:....... score = tensor<(1, 10, 25, 25), float32, cuda:0, grad>   2*k = 10
#        New var:....... delta = tensor<(1, 20, 25, 25), float32, cuda:0, grad>   4*k = 20  k = 5
#        New var:....... mask = tensor<(1, 3969, 25, 25), float32, cuda:0, grad>
    else:
        score, delta = net.track(x_crop.to(device))

    # Decode the predicted boxes, penalize their scores by scale, aspect ratio and displacement, and pick the best one.
    # torch.permute(dims) reorders tensor dimensions; after transpose/permute, contiguous() must be called before view().
    # .data.cpu().numpy(): GPU tensor -> CPU tensor -> numpy
    # .data[:, 1]: keep the second column (foreground probability) of every row
    delta = delta.permute(1, 2, 3, 0).contiguous().view(
        4, -1).data.cpu().numpy()  #ndarray<(4, 3125), float32>
    score = F.softmax(
        score.permute(1, 2, 3, 0).contiguous().view(2, -1).permute(1, 0),
        dim=1
    ).data[:, 1].cpu().numpy(
    )  # score = ndarray<(3125,), float32>; torch.nn.functional.softmax normalizes the class scores

    delta[0, :] = delta[0, :] * p.anchor[:, 2] + p.anchor[:, 0]  # cx = dx * anchor_w + anchor_cx
    delta[1, :] = delta[1, :] * p.anchor[:, 3] + p.anchor[:, 1]  # cy = dy * anchor_h + anchor_cy
    delta[2, :] = np.exp(delta[2, :]) * p.anchor[:, 2]  # w = exp(dw) * anchor_w
    delta[3, :] = np.exp(delta[3, :]) * p.anchor[:, 3]  # h = exp(dh) * anchor_h

    # np.maximum(X, Y): element-wise maximum of X and Y
    def change(r):
        return np.maximum(
            r, 1. / r)  # e.g. [0.33, 0.5, 1, 2, 3] vs [3.03, 2, 1, 0.5, 0.33] -> [3.03, 2, 1, 2, 3]

    def sz(w, h):
        pad = (w + h) * 0.5
        sz2 = (w + pad) * (h + pad)
        return np.sqrt(sz2)

    def sz_wh(wh):
        pad = (wh[0] + wh[1]) * 0.5
        sz2 = (wh[0] + pad) * (wh[1] + pad)
        return np.sqrt(sz2)

    # size penalty
    target_sz_in_crop = target_sz * scale_x  #target_sz_in_crop = ndarray<(2,), float64>
    s_c = change(sz(delta[2, :], delta[3, :]) / (
        sz_wh(target_sz_in_crop)))  # scale penalty   ndarray<(3125,), float32>
    r_c = change((target_sz_in_crop[0] / target_sz_in_crop[1]) /
                 (delta[2, :] /
                  delta[3, :]))  # ratio penalty   ndarray<(3125,), float32>

    penalty = np.exp(-(r_c * s_c - 1) *
                     p.penalty_k)  #ndarray<(3125,), float32>
    pscore = penalty * score  #ndarray<(3125,), float32>

    # cos window (motion model)
    pscore = pscore * (
        1 - p.window_influence
    ) + window * p.window_influence  #ndarray<(3125,), float64>
    best_pscore_id = np.argmax(
        pscore)  # highest penalized score; it selects both the column used to generate the mask and the corresponding best box

    pred_in_crop = delta[:,
                         best_pscore_id] / scale_x  # offsets of the best prediction inside the search crop; pred_in_crop = ndarray<(4,), float32>

    lr = penalty[best_pscore_id] * score[best_pscore_id] * p.lr  # lr for OTB

    res_x = pred_in_crop[0] + target_pos[0]  # x offset plus the previous x
    res_y = pred_in_crop[1] + target_pos[1]  # y offset plus the previous y

    res_w = target_sz[0] * (1 - lr) + pred_in_crop[2] * lr
    res_h = target_sz[1] * (1 - lr) + pred_in_crop[3] * lr

    target_pos = np.array([res_x, res_y])  # from here on, target_pos and target_sz hold the new estimate
    target_sz = np.array([res_w, res_h])  # the single decoded box

    # for Mask Branch
    # numpy.unravel_index converts a flat index (or array of flat indices) into a tuple of coordinate arrays.
    # best_pscore_id gives the position on the feature map. track_refine runs the Refine module: the 1x1x256
    # feature vector at that position on the correlation feature map is fused with the pre-downsampling
    # features to produce the target mask.

    # the mask above covers all positions; now pick the single column (RoW) for the best one
    if mask_enable:  # mask branch enabled

        best_pscore_id_mask = np.unravel_index(
            best_pscore_id,
            (5, p.score_size,
             p.score_size))  # (anchor, delta_y, delta_x): position in the mask map corresponding to the best pscore
        delta_x, delta_y = best_pscore_id_mask[2], best_pscore_id_mask[1]
        # the best-scoring column used to generate the mask is now located at (delta_y, delta_x)
        if refine_enable:  # use the refine module
            mask = net.track_refine((delta_y, delta_x)).to(device).sigmoid(
            ).squeeze().view(p.out_size, p.out_size).cpu().data.numpy(
            )  # mask: tensor (1, 16129) -> ndarray (127, 127), float32
        else:  # without the refine module, take the highest-scoring mask column directly
            mask = mask[0, :, delta_y, delta_x].sigmoid(). \
                squeeze().view(p.out_size, p.out_size).cpu().data.numpy()  # ndarray<(127, 127), float32>

        # the mask above lives in crop coordinates; map it back to the original image with warpAffine().
        # the affine matrix mapping is built by hand: a and b are scale factors, c and d are translations.
        def crop_back(image, bbox, out_sz, padding=-1):
            a = (out_sz[0] - 1) / bbox[2]
            b = (out_sz[1] - 1) / bbox[3]
            c = -a * bbox[0]
            d = -b * bbox[1]
            mapping = np.array([[a, 0, c], [0, b, d]]).astype(float)
            crop = cv2.warpAffine(image,
                                  mapping, (out_sz[0], out_sz[1]),
                                  flags=cv2.INTER_LINEAR,
                                  borderMode=cv2.BORDER_CONSTANT,
                                  borderValue=padding)
            return crop

        # crop_box is the search crop as [x, y, width, height]; s is a scale factor;
        # sub_box is the predicted exemplar-sized region inside the crop.
        s = crop_box[2] / p.instance_size  # scalar: round(s_x) / 255
        sub_box = [
            crop_box[0] + (delta_x - p.base_size / 2) * p.total_stride *
            s,  # x_top + (delta_x - 4) * 8 * s
            crop_box[1] + (delta_y - p.base_size / 2) * p.total_stride *
            s,  # y_top + (delta_y - 4) * 8 * s
            s * p.exemplar_size,
            s * p.exemplar_size
        ]  # four elements: x, y, s * 127, s * 127
        s = p.out_size / sub_box[2]
        back_box = [
            -sub_box[0] * s, -sub_box[1] * s, state['im_w'] * s,
            state['im_h'] * s
        ]  # four elements: the full image expressed in mask-output coordinates

        # the input mask is ndarray<(127, 127), float32>
        mask_in_img = crop_back(
            mask, back_box,
            (state['im_w'], state['im_h']))  # same size as the original frame, float32
        target_mask = (mask_in_img > p.seg_thr).astype(
            np.uint8)  # same size as the original frame, uint8 binary (0/1)
        if cv2.__version__[-5] == '4':
            contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_NONE)
        else:  # this branch applies to OpenCV 3.x
            _, contours, _ = cv2.findContours(target_mask, cv2.RETR_EXTERNAL,
                                              cv2.CHAIN_APPROX_NONE)  # each contour is an (n, 1, 2) array

        cnt_area = [cv2.contourArea(cnt) for cnt in contours]
        if len(contours) != 0 and np.max(
                cnt_area) > 100:  #有轮廓并且最大的轮廓面积大于100,说明轮廓不小
            contour = contours[np.argmax(
                cnt_area
            )]  # use max area polygon,,(n , 1 , 2)维的numpy.ndarray,n是坐标的个数
            polygon = contour.reshape(-1, 2)  #二维
            # pbox = cv2.boundingRect(polygon)  # Min Max Rectangle
            prbox = cv2.boxPoints(cv2.minAreaRect(
                polygon))  # Rotated RectangleboxPoints 查找旋转矩形的四个顶点。用于绘制旋转的矩形。
            #cv2.minAreaRect(polygon),生成最小外接矩形,输入时多边形点集必须是array数组的形式,输出是(中心(x,y), (宽,高), 旋转角度)
            #cv2.boxPoints(rect)获取最小外接矩形的4个顶点坐标,返回形式[ [x0,y0], [x1,y1], [x2,y2], [x3,y3] ]。
            #prbox = ndarray<(4, 2),就是得到的目标框的四个顶点,俗称旋转框,因为有角度

            # box_in_img = pbox
            rbox_in_img = prbox  # rbox_in_img = ndarray<(4, 2), float32>
        else:  # empty mask, or the contour is too small
            location = cxy_wh_2_rect(target_pos,
                                     target_sz)  # top-left (x, y) and (w, h)
            rbox_in_img = np.array(
                [[location[0], location[1]],
                 [location[0] + location[2], location[1]],
                 [location[0] + location[2], location[1] + location[3]],
                 [location[0], location[1] + location[3]]])

    # at this point rbox_in_img holds the four corners of the target box, obtained either from the contours
    # of the full-size mask or directly from the predicted (target_pos, target_sz)

    # update the state with the results, clipping position and size to the image
    target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
    target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
    target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
    target_sz[1] = max(10, min(state['im_h'], target_sz[1]))

    state['target_pos'] = target_pos
    state['target_sz'] = target_sz
    state['score'] = score[best_pscore_id]
    state['mask'] = mask_in_img if mask_enable else []
    state['ploygon'] = rbox_in_img if mask_enable else []
    return state
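The hand-built 2x3 matrix in crop_back sends the bbox's top-left corner to (0, 0) and its bottom-right corner to (out_sz - 1, out_sz - 1), which is what lets the 127x127 mask be pasted back at the correct place and scale in the frame. A small numeric check with toy values (plain NumPy, no cv2 required):

import numpy as np

bbox = [10.0, 20.0, 50.0, 40.0]  # x, y, w, h in source coordinates
out_sz = (127, 127)

# same matrix construction as in crop_back
a = (out_sz[0] - 1) / bbox[2]
b = (out_sz[1] - 1) / bbox[3]
c = -a * bbox[0]
d = -b * bbox[1]
mapping = np.array([[a, 0, c], [0, b, d]])

def apply_affine(pt):
    x, y = pt
    return mapping @ np.array([x, y, 1.0])

print(apply_affine((bbox[0], bbox[1])))                      # -> [0. 0.]
print(apply_affine((bbox[0] + bbox[2], bbox[1] + bbox[3])))  # -> [126. 126.]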