Example #1
    def forward(self, in_x, rois, in_ref_x=None, ref_rois=None):
        # x: [N_0, C]      ref_x: [N_1, C]
        # rois: [N_0, 4]   ref_rois: [N_1, 4]
        if in_ref_x is None:
            in_ref_x = in_x
            ref_rois = rois
        N_0, C = in_x.shape
        N_1, C_1 = in_ref_x.shape
        assert C == C_1
        x = in_x.view(N_0, C, 1, 1)
        ref_x = in_ref_x.view(N_1, C, 1, 1)

        for i, embed_conv in enumerate(self.embed_convs):
            x = embed_conv(x)
            if not self.share_embed_convs:
                ref_x = self.ref_embed_convs[i](ref_x)
            else:
                ref_x = embed_conv(ref_x)

        # [N, G, C // G]
        x = x.view(N_0, self.groups, -1)
        ref_x = ref_x.view(N_1, self.groups, -1)
        # [G, N_0, C // G]
        x = x.permute(1, 0, 2)
        # [G, C // G, N_1]
        ref_x = ref_x.permute(1, 2, 0)
        # [G, N_0, N_1]
        matrix = torch.matmul(x, ref_x)
        matrix /= x.shape[-1]**0.5
        # [N_0, G, N_1]
        matrix = matrix.permute(1, 0, 2)

        if self.with_loc:
            # [N_0, N_1]
            ious = bbox_overlaps(rois[:, 1:], ref_rois[:, 1:])
            ious = ious.view(N_0, 1, N_1).expand(N_0, self.groups, N_1)
            matrix += torch.log(ious + 1e-6)

        # [N_0, G, N_1]
        matrix = matrix.softmax(dim=2)
        # [N_0 * G, N_1]
        matrix = matrix.view(-1, N_1)
        # [N_0 * G, C] = [N_0 * G, N_1] * [N_1, C]
        y = torch.matmul(matrix, in_ref_x)
        # [N_0, C * G, 1, 1]
        y = y.view(N_0, -1, 1, 1)
        # [N_0, C]
        y = self.conv_out(y).view(N_0, -1)
        return y
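
A standalone sketch replaying the grouped dot-product attention above with hypothetical sizes (the embed convs are skipped for brevity; every name below is illustrative, not part of the module):

import torch

N_0, N_1, C, G = 4, 6, 16, 2                    # hypothetical sizes; G must divide C
x = torch.randn(N_0, C)                         # query features
ref_x = torch.randn(N_1, C)                     # reference features

q = x.view(N_0, G, -1).permute(1, 0, 2)         # [G, N_0, C // G]
k = ref_x.view(N_1, G, -1).permute(1, 2, 0)     # [G, C // G, N_1]
attn = torch.matmul(q, k) / q.shape[-1] ** 0.5  # [G, N_0, N_1], scaled
attn = attn.permute(1, 0, 2).softmax(dim=2)     # [N_0, G, N_1]
y = torch.matmul(attn.reshape(-1, N_1), ref_x)  # [N_0 * G, C]
assert y.shape == (N_0 * G, C)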
Example #2
def iou_loss(pred, target, eps=1e-6):
    """IoU loss.

    Computes the IoU loss between a set of predicted bboxes and target
    bboxes. The loss is calculated as the negative log of the IoU.

    Args:
        pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
            shape (n, 4).
        target (Tensor): Corresponding gt bboxes, shape (n, 4).
        eps (float): Eps to avoid log(0).

    Returns:
        Tensor: Loss tensor.
    """
    ious = bbox_overlaps(pred, target, is_aligned=True).clamp(min=eps)
    loss = -ious.log()
    return loss
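
A quick numeric check of the -log(IoU) formulation, with the aligned IoU inlined for one toy box pair (bbox_overlaps itself is assumed to come from the surrounding codebase):

import torch

pred = torch.tensor([[0., 0., 10., 10.]])       # area 100
target = torch.tensor([[0., 0., 10., 20.]])     # area 200

# Aligned IoU, written out by hand.
lt = torch.max(pred[:, :2], target[:, :2])
rb = torch.min(pred[:, 2:], target[:, 2:])
wh = (rb - lt).clamp(min=0)
inter = wh[:, 0] * wh[:, 1]                     # 100
area_p = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1])
area_t = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1])
ious = inter / (area_p + area_t - inter)        # 100 / 200 = 0.5
loss = -ious.clamp(min=1e-6).log()              # -log(0.5) ≈ 0.6931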
Example #3
    def match(self,
              bboxes: torch.Tensor,
              labels: torch.Tensor,
              boxes_3d: torch.Tensor,
              depth_uncertainty: torch.Tensor,
              position: torch.Tensor,
              rotation: torch.Tensor,
              embeds: torch.Tensor,
              cur_frame: int,
              pure_det: bool = False):
        """Match incoming detection results with embedding and 3D infos

        Args:
            bboxes (torch.Tensor): (N, 5), [x1, y1, x2, y2, conf].
            labels (torch.Tensor): (N,).
            boxes_3d (torch.Tensor): (N, 7), 3D information stored in world
                coordinates with the format [X, Y, Z, theta, h, w, l].
            depth_uncertainty (torch.Tensor): (N,), confidence in depth
                estimation.
            position (torch.Tensor): (3,), camera position.
            rotation (torch.Tensor): (3, 3), camera rotation.
            embeds (torch.Tensor): (N, C), extracted box features.
            cur_frame (int): Index of the current frame.
            pure_det (bool): If True, return pure detection results without
                matching. Defaults to False.

        Raises:
            NotImplementedError: Raised if self.match_metric is not supported.

        Returns:
            tuple: Matched bboxes, labels, boxes_3d, ids, inds and valids.
        """
        if depth_uncertainty is None or not self.with_depth_uncertainty:
            depth_uncertainty = boxes_3d.new_ones((boxes_3d.shape[0], 1))

        _, inds = (bboxes[:, -1] *
                   depth_uncertainty.flatten()).sort(descending=True)
        bboxes = bboxes[inds, :]
        labels = labels[inds]
        embeds = embeds[inds, :]
        boxes_3d = boxes_3d[inds]
        depth_uncertainty = depth_uncertainty[inds]

        if pure_det:
            valids = bboxes.new_ones((bboxes.size(0)), dtype=torch.bool)
            ids = torch.arange(self.num_tracklets,
                               self.num_tracklets + bboxes.size(0),
                               dtype=torch.long)
            self.num_tracklets += bboxes.size(0)
            return bboxes, labels, boxes_3d, ids, inds, valids

        # duplicate removal for potential backdrops and cross classes
        valids = bboxes.new_ones((bboxes.size(0)))
        ious = bbox_overlaps(bboxes[:, :-1], bboxes[:, :-1])
        for i in range(1, bboxes.size(0)):
            thr = self.nms_backdrop_iou_thr if bboxes[
                i, -1] < self.obj_score_thr else self.nms_class_iou_thr
            if (ious[i, :i] > thr).any():
                valids[i] = 0
        valids = valids == 1
        bboxes = bboxes[valids, :]
        labels = labels[valids]
        embeds = embeds[valids, :]
        boxes_3d = boxes_3d[valids]
        depth_uncertainty = depth_uncertainty[valids]

        # init ids container
        ids = torch.full((bboxes.size(0), ), -1, dtype=torch.long)

        # match if buffer is not empty
        if bboxes.size(0) > 0 and not self.empty:
            memo_bboxes, memo_labels, memo_boxes_3d, \
                memo_trackers, memo_embeds, memo_ids, memo_vs = self.memo

            mmcv.check_accum_time('predict', counting=True)
            memo_boxes_3d_predict = memo_boxes_3d.detach().clone()
            for ind, memo_tracker in enumerate(memo_trackers):
                memo_velo = memo_tracker.predict(
                    update_state=memo_tracker.age != 0)
                memo_boxes_3d_predict[ind, :3] += memo_boxes_3d.new_tensor(
                    memo_velo[7:])
            mmcv.check_accum_time('predict', counting=False)

            if self.with_bbox_iou:

                def get_xy_box(boxes_3d_world):
                    box_x_cen = boxes_3d_world[:, 0]
                    box_y_cen = boxes_3d_world[:, 1]
                    box_width = boxes_3d_world[:, 5]
                    box_length = boxes_3d_world[:, 6]

                    dets_xy_box = torch.stack([
                        box_x_cen - box_width / 2.0,
                        box_y_cen - box_length / 2.0,
                        box_x_cen + box_width / 2.0,
                        box_y_cen + box_length / 2.0
                    ], dim=1)
                    return dets_xy_box

                if self.track_bbox_iou == 'box2d':
                    scores_iou = bbox_overlaps(bboxes[:, :-1],
                                               memo_bboxes[:, :-1])
                elif self.track_bbox_iou == 'bev':
                    dets_xy_box = get_xy_box(boxes_3d)
                    memo_dets_xy_box = get_xy_box(memo_boxes_3d_predict)
                    scores_iou = bbox_overlaps(dets_xy_box, memo_dets_xy_box)
                elif self.track_bbox_iou == 'box3d':
                    depth_weight = F.pairwise_distance(
                        boxes_3d[..., None],
                        memo_boxes_3d_predict[..., None].transpose(2, 0))
                    scores_iou = torch.exp(-depth_weight / 10.0)
                elif self.track_bbox_iou == 'box2d_depth_aware':
                    depth_weight = F.pairwise_distance(
                        boxes_3d[..., None],
                        memo_boxes_3d_predict[..., None].transpose(2, 0))
                    scores_iou = torch.exp(-depth_weight / 10.0)
                    scores_iou *= bbox_overlaps(bboxes[:, :-1],
                                                memo_bboxes[:, :-1])
                else:
                    raise NotImplementedError
            else:
                scores_iou = bboxes.new_ones(
                    [bboxes.size(0), memo_bboxes.size(0)])

            if self.with_deep_feat:

                def compute_quasi_dense_feat_match(embeds, memo_embeds):
                    if self.match_metric == 'cycle_softmax':
                        feats = torch.mm(embeds, memo_embeds.t())
                        d2t_scores = feats.softmax(dim=1)
                        t2d_scores = feats.softmax(dim=0)
                        scores_feat = (d2t_scores + t2d_scores) / 2
                    elif self.match_metric == 'softmax':
                        feats = torch.mm(embeds, memo_embeds.t())
                        scores_feat = feats.softmax(dim=1)
                    elif self.match_metric == 'cosine':
                        scores_feat = torch.mm(
                            F.normalize(embeds, p=2, dim=1),
                            F.normalize(memo_embeds, p=2, dim=1).t())
                    else:
                        raise NotImplementedError
                    return scores_feat

                scores_feat = compute_quasi_dense_feat_match(
                    embeds, memo_embeds)
            else:
                scores_feat = scores_iou.new_ones(scores_iou.shape)

            # Match with depth ordering
            if self.with_depth_ordering:

                def compute_boxoverlap_with_depth(obsv_boxes_3d, memo_boxes_3d,
                                                  memo_vs):
                    # Score each observation-tracker pair with the chosen depth metric
                    if self.depth_match_metric == 'centroid':
                        depth_weight = F.pairwise_distance(
                            obsv_boxes_3d[..., :3, None],
                            memo_boxes_3d[..., :3, None].transpose(2, 0))
                        depth_weight = torch.exp(-depth_weight / 10.0)
                    elif self.depth_match_metric == 'cosine':
                        match_corners_observe = tu.worldtocamera_torch(
                            obsv_boxes_3d[:, :3], position, rotation)
                        match_corners_predict = tu.worldtocamera_torch(
                            memo_boxes_3d[:, :3], position, rotation)
                        depth_weight = F.cosine_similarity(
                            match_corners_observe[..., None],
                            match_corners_predict[..., None].transpose(2, 0))
                        depth_weight += 1.0
                        depth_weight /= 2.0
                    elif self.depth_match_metric == 'pure_motion':
                        # Moving distance should be aligned
                        # V_observed-tracked vs. V_velocity
                        depth_weight = F.pairwise_distance(
                            obsv_boxes_3d[..., :3, None] -
                            memo_boxes_3d[..., :3, None].transpose(2, 0),
                            memo_vs[..., :3, None].transpose(2, 0))
                        depth_weight = torch.exp(-depth_weight / 5.0)
                        # Moving direction should be aligned
                        # Drops below 0.5 when the two vectors differ by more than +-90 degrees
                        cos_sim = F.cosine_similarity(
                            obsv_boxes_3d[..., :2, None] -
                            memo_boxes_3d[..., :2, None].transpose(2, 0),
                            memo_vs[..., :2, None].transpose(2, 0))
                        cos_sim += 1.0
                        cos_sim /= 2.0
                        depth_weight *= cos_sim
                    elif self.depth_match_metric == 'motion':
                        centroid_weight = F.pairwise_distance(
                            obsv_boxes_3d[..., :3, None],
                            memo_boxes_3d_predict[..., :3,
                                                  None].transpose(2, 0))
                        centroid_weight = torch.exp(-centroid_weight / 10.0)
                        # Moving distance should be aligned
                        # V_observed-tracked vs. V_velocity
                        motion_weight = F.pairwise_distance(
                            obsv_boxes_3d[..., :3, None] -
                            memo_boxes_3d[..., :3, None].transpose(2, 0),
                            memo_vs[..., :3, None].transpose(2, 0))
                        motion_weight = torch.exp(-motion_weight / 5.0)
                        # Moving direction should be aligned
                        # Drops below 0.5 when the two vectors differ by more than +-90 degrees
                        cos_sim = F.cosine_similarity(
                            obsv_boxes_3d[..., :2, None] -
                            memo_boxes_3d[..., :2, None].transpose(2, 0),
                            memo_vs[..., :2, None].transpose(2, 0))
                        cos_sim += 1.0
                        cos_sim /= 2.0
                        depth_weight = cos_sim * centroid_weight + (
                            1.0 - cos_sim) * motion_weight
                    else:
                        raise NotImplementedError

                    return depth_weight

                if self.depth_match_metric == 'motion':
                    scores_depth = compute_boxoverlap_with_depth(
                        boxes_3d, memo_boxes_3d, memo_vs)
                else:
                    scores_depth = compute_boxoverlap_with_depth(
                        boxes_3d, memo_boxes_3d_predict, memo_vs)
            else:
                scores_depth = scores_iou.new_ones(scores_iou.shape)

            if self.with_cats:
                cat_same = labels.view(-1, 1) == memo_labels.view(1, -1)
                scores_cats = cat_same.float()
            else:
                scores_cats = scores_iou.new_ones(scores_iou.shape)

            scores = self.bbox_affinity_weight * scores_iou * scores_depth + \
                self.feat_affinity_weight * scores_feat
            scores /= (self.bbox_affinity_weight + self.feat_affinity_weight)
            scores *= (scores_iou > 0.0).float()
            scores *= (scores_depth > 0.0).float()
            scores *= scores_cats

            # Assign matching
            mmcv.check_accum_time('matching', counting=True)
            if self.match_algo == 'greedy':
                for i in range(bboxes.size(0)):
                    conf, memo_ind = torch.max(scores[i, :], dim=0)
                    tid = memo_ids[memo_ind]
                    # Matching confidence
                    if conf > self.match_score_thr:
                        # Update existing tracklet
                        if tid > -1:
                            # Keep object with high 3D objectness
                            if bboxes[i, -1] * depth_uncertainty[
                                    i] > self.obj_score_thr:
                                ids[i] = tid
                                scores[:i, memo_ind] = 0
                                scores[i + 1:, memo_ind] = 0
                            else:
                                # Reduce FP w/ low objectness but high match conf
                                if conf > self.nms_conf_thr:
                                    ids[i] = -2
            elif self.match_algo == 'hungarian':
                # Hungarian
                matched_indices = linear_assignment(-scores.cpu().numpy())
                for idx in range(len(matched_indices[0])):
                    i = matched_indices[0][idx]
                    memo_ind = matched_indices[1][idx]
                    conf = scores[i, memo_ind]
                    tid = memo_ids[memo_ind]
                    if conf > self.match_score_thr and tid > -1:
                        # Keep object with high 3D objectness
                        if bboxes[i, -1] * depth_uncertainty[
                                i] > self.obj_score_thr:
                            ids[i] = tid
                            scores[:i, memo_ind] = 0
                            scores[i + 1:, memo_ind] = 0
                        else:
                            # Reduce FP w/ low objectness but high match conf
                            if conf > self.nms_conf_thr:
                                ids[i] = -2
                del matched_indices
            mmcv.check_accum_time('matching', counting=False)

            if self._debug:
                bbox_inds = scores.max(1).indices
                print("\nTotal:\n"
                      f"Scores: {scores.max(1).values.cpu().numpy()}\n"
                      f"Indices: {scores.max(1).indices.cpu().numpy()}\n"
                      "IoU:\n"
                      f"Scores: {scores_iou.max(1).values.cpu().numpy()}\n"
                      f"Indices: {scores_iou.max(1).indices.cpu().numpy()}\n"
                      f"Feat:\n"
                      f"Scores: {scores_feat.max(1).values.cpu().numpy()}\n"
                      f"Indices: {scores_feat.max(1).indices.cpu().numpy()}\n"
                      f"Depth:\n"
                      f"Scores: {scores_depth.max(1).values.cpu().numpy()}\n"
                      f"Indices: {scores_depth.max(1).indices.cpu().numpy()}")
                print("Uncertainty:\n",
                      depth_uncertainty.flatten().cpu().numpy())
                print(f"Memo: {memo_boxes_3d.shape}\tMemo Ids: {memo_ids}\n"
                      f"{memo_boxes_3d[bbox_inds].cpu().numpy()}")
                print(f"Velo:\n{memo_vs[bbox_inds].cpu().numpy()}")
                print(
                    f"Pred:\n{memo_boxes_3d_predict[bbox_inds].cpu().numpy()}")
                print(f"Obsv: {boxes_3d.shape}\tObsv Ids: {ids}\n"
                      f"{boxes_3d.cpu().numpy()}")

        new_inds = (ids == -1) & (bboxes[:, 4] > self.init_score_thr).cpu()
        num_news = new_inds.sum()
        ids[new_inds] = torch.arange(self.num_tracklets,
                                     self.num_tracklets + num_news,
                                     dtype=torch.long)
        self.num_tracklets += num_news

        self.update_memo(ids, bboxes, boxes_3d, depth_uncertainty, embeds,
                         labels, cur_frame)

        update_bboxes = bboxes.detach().clone()
        update_labels = labels.detach().clone()
        update_boxes_3d = boxes_3d.detach().clone()
        for tid in ids[ids > -1]:
            update_boxes_3d[ids == tid] = self.tracklets[int(tid)]['box_3d']
        update_ids = ids.detach().clone()

        if self._debug:
            print(
                f"Updt: {update_boxes_3d.shape}\tUpdt ID: {update_ids.cpu().numpy()}\n"
                f"{update_boxes_3d.cpu().numpy()}")

        return update_bboxes, update_labels, update_boxes_3d, update_ids, inds, valids
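
The scoring step at the heart of match() is a weighted fusion of spatial and appearance affinities followed by hard gating; a minimal sketch with hypothetical weights and toy matrices:

import torch

# Hypothetical values standing in for bbox_affinity_weight / feat_affinity_weight.
w_bbox, w_feat = 0.5, 0.5
scores_iou = torch.tensor([[0.8, 0.1], [0.0, 0.6]])    # detections x tracklets
scores_depth = torch.tensor([[0.9, 0.5], [0.4, 0.7]])
scores_feat = torch.tensor([[0.7, 0.2], [0.3, 0.9]])
cat_same = torch.tensor([[1., 1.], [1., 0.]])

scores = (w_bbox * scores_iou * scores_depth +
          w_feat * scores_feat) / (w_bbox + w_feat)
# Hard gates: zero out pairs with no spatial overlap, no depth support,
# or mismatched classes, mirroring the method above.
scores *= (scores_iou > 0.0).float()
scores *= (scores_depth > 0.0).float()
scores *= cat_same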
Example #4
    def update_memo(self, ids, bboxes, boxes_3d, depth_uncertainty, embeds,
                    labels, cur_frame):
        tracklet_inds = ids > -1

        # update memo
        for tid, bbox, box_3d, d_uncertainty, embed, label in zip(
                ids[tracklet_inds], bboxes[tracklet_inds],
                boxes_3d[tracklet_inds], depth_uncertainty[tracklet_inds],
                embeds[tracklet_inds], labels[tracklet_inds]):
            tid = int(tid)
            if tid in self.tracklets.keys():
                self.tracklets[tid]['bbox'] = bbox

                mmcv.check_accum_time('update', counting=True)
                self.tracklets[tid]['tracker'].update(
                    box_3d.cpu().numpy(),
                    d_uncertainty.cpu().numpy())
                mmcv.check_accum_time('update', counting=False)

                tracker_box = self.tracklets[tid]['tracker'].get_state()[:7]
                pd_box_3d = box_3d.new_tensor(tracker_box)

                velocity = (pd_box_3d - self.tracklets[tid]['box_3d']) / (
                    cur_frame - self.tracklets[tid]['last_frame'])

                self.tracklets[tid]['box_3d'] = pd_box_3d
                self.tracklets[tid]['embed'] += self.memo_momentum * (
                    embed - self.tracklets[tid]['embed'])
                self.tracklets[tid]['label'] = label
                self.tracklets[tid]['velocity'] = (
                    self.tracklets[tid]['velocity'] *
                    self.tracklets[tid]['acc_frame'] +
                    velocity) / (self.tracklets[tid]['acc_frame'] + 1)
                self.tracklets[tid]['last_frame'] = cur_frame
                self.tracklets[tid]['acc_frame'] += 1
            else:
                if self.tracker_model_name == 'LSTM3DTracker':
                    built_tracker = self.tracker_model(
                        self.device, self.lstm,
                        box_3d.cpu().numpy(),
                        d_uncertainty.cpu().numpy())
                else:
                    built_tracker = self.tracker_model(
                        box_3d.cpu().numpy(),
                        d_uncertainty.cpu().numpy())
                self.tracklets[tid] = dict(bbox=bbox,
                                           box_3d=box_3d,
                                           tracker=built_tracker,
                                           embed=embed,
                                           label=label,
                                           last_frame=cur_frame,
                                           velocity=torch.zeros_like(box_3d),
                                           acc_frame=0)

        # Handle vanished tracklets
        for tid in self.tracklets:
            if cur_frame > self.tracklets[tid]['last_frame'] and tid > -1:
                self.tracklets[tid]['box_3d'][:self.loc_dim] = self.tracklets[
                    tid]['box_3d'].new_tensor(
                        self.tracklets[tid]['tracker'].predict()
                        [:self.loc_dim])

        # Add backdrops
        backdrop_inds = torch.nonzero(ids == -1).squeeze(1)
        ious = bbox_overlaps(bboxes[backdrop_inds, :-1], bboxes[:, :-1])
        for i, ind in enumerate(backdrop_inds):
            if (ious[i, :ind] > self.nms_backdrop_iou_thr).any():
                backdrop_inds[i] = -1
        backdrop_inds = backdrop_inds[backdrop_inds > -1]

        backdrop_tracker = [
            self.tracker_model(self.device, self.lstm,
                               boxes_3d[bd_ind].cpu().numpy(),
                               depth_uncertainty[bd_ind].cpu().numpy())
            if self.tracker_model_name == 'LSTM3DTracker'
            else self.tracker_model(boxes_3d[bd_ind].cpu().numpy(),
                                    depth_uncertainty[bd_ind].cpu().numpy())
            for bd_ind in backdrop_inds
        ]

        self.backdrops.insert(
            0,
            dict(bboxes=bboxes[backdrop_inds],
                 boxes_3d=boxes_3d[backdrop_inds],
                 tracker=backdrop_tracker,
                 embeds=embeds[backdrop_inds],
                 labels=labels[backdrop_inds]))

        # pop memo
        invalid_ids = []
        for k, v in self.tracklets.items():
            if cur_frame - v['last_frame'] >= self.memo_tracklet_frames:
                invalid_ids.append(k)
        for invalid_id in invalid_ids:
            self.tracklets.pop(invalid_id)

        if len(self.backdrops) > self.memo_backdrop_frames:
            self.backdrops.pop()
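
The appearance memory in update_memo is refreshed with an exponential moving average; a toy sketch with a hypothetical momentum value:

import torch

memo_momentum = 0.8                       # hypothetical; set on the tracker
embed_old = torch.tensor([1.0, 0.0])      # stored tracklet embedding
embed_new = torch.tensor([0.0, 1.0])      # embedding of the matched detection
embed_old += memo_momentum * (embed_new - embed_old)
# embed_old is now [0.2, 0.8]: 80% of the way toward the new embedding.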
Example #5
    def cal_loss_embed(self, asso_probs, cos_probs, ids, id_weights,
                       ref_sampling_results, cfg):
        losses = dict()
        batch_size = len(ids)
        loss_asso = 0.
        loss_iou = 0.
        nelements = 0.
        # calculate per image loss
        for prob, cos_prob, cur_ids, cur_weights, res in zip(
                asso_probs, cos_probs, ids, id_weights, ref_sampling_results):
            valid_idx = torch.nonzero(cur_weights).squeeze()
            if len(valid_idx.size()) == 0:
                continue

            num_gt = 0
            num_pos = 0
            num_neg = 0
            pids = []
            for _res in res:
                ious = bbox_overlaps(_res.pos_bboxes, _res.gt_bboxes)
                pids.append(ious.max(dim=1)[1] + num_gt)
                num_gt += _res.gt_bboxes.size(0)
                num_pos += _res.pos_bboxes.size(0)
                if cfg.with_ref_neg:
                    num_neg += _res.neg_bboxes.size(0)
            assert num_gt + num_pos + num_neg == prob.size(1)

            pids.insert(0, torch.arange(num_gt).long().to(prob.device))
            if cfg.with_ref_neg:
                pids.append((torch.ones(num_neg).long() * -2).to(prob.device))
            pids = torch.cat(pids, dim=0)

            pos_inds = (cur_ids.view(-1, 1) == pids.view(1, -1)).float()
            neg_inds = (cur_ids.view(-1, 1) != pids.view(1, -1)).float()
            exp_pos = (torch.exp(-1 * prob) * pos_inds).sum(dim=1)
            exp_neg = (torch.exp(prob.clamp(max=80)) * neg_inds).sum(dim=1)
            loss = torch.log(1 + exp_pos * exp_neg)
            loss_asso += (loss * cur_weights).sum() / cur_weights.sum()

            if self.loss_iou is not None:
                dists = torch.abs(cos_prob - pos_inds)**2
                pos_points = torch.nonzero(pos_inds == 1)
                pos_dists = dists[pos_points[:, 0], pos_points[:, 1]]
                nelements += pos_dists.nelement()
                loss_iou += pos_dists.sum()
                # neg
                neg_inds = torch.nonzero(pos_inds == 0)
                if self.loss_iou['sample_ratio'] > -1:
                    num_negs = (pos_dists.nelement() *
                                self.loss_iou['sample_ratio'])
                    if len(neg_inds) < num_negs:
                        num_negs = len(neg_inds)
                else:
                    num_negs = len(neg_inds)
                nelements += num_negs
                if self.loss_iou['hard_mining']:
                    _loss_neg = dists[neg_inds[:, 0],
                                      neg_inds[:, 1]].topk(num_negs)[0]
                else:
                    neg_inds = self.random_choice(neg_inds, num_negs)
                    _loss_neg = dists[neg_inds[:, 0], neg_inds[:, 1]]
                if self.loss_iou['margin'] > 0:
                    _loss_neg *= (_loss_neg > self.loss_iou['margin']).float()
                loss_iou += _loss_neg.sum()

        # average
        losses['loss_asso'] = (loss_asso / batch_size *
                               self.loss_asso['loss_weight'])
        if self.loss_iou is not None:
            losses['loss_iou'] = (
                loss_iou / (nelements + 1e-6)) * self.loss_iou['loss_weight']
        return losses
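
The per-sample association term above is log(1 + sum_pos exp(-p) * sum_neg exp(p)), which jointly pushes positive logits up and negative logits down; a toy sketch over one row of similarity logits:

import torch

prob = torch.tensor([[2.0, -1.0, 0.5]])   # logits against three candidates
pos_inds = torch.tensor([[1., 0., 0.]])   # the first candidate is the positive
neg_inds = 1.0 - pos_inds

exp_pos = (torch.exp(-prob) * pos_inds).sum(dim=1)
exp_neg = (torch.exp(prob.clamp(max=80)) * neg_inds).sum(dim=1)
loss = torch.log(1 + exp_pos * exp_neg)   # ≈ 0.24 here
# Raising the positive logit or lowering the negatives drives the loss to 0.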
Example #6
    def analyze(self,
                img_meta,
                bboxes,
                labels,
                ids,
                depths=None,
                dims=None,
                alphas=None,
                cen_2ds=None,
                show=False,
                save=False,
                gt_cats=None):
        gt_bboxes, gt_labels, gt_ids, gt_ignores, \
            gt_alphas, gt_rotys, gt_dims, gt_trans, gt_2dcs = self.loadGts(
                img_meta, gt_cats)
        track_inds = ids > -1
        track_bboxes = bboxes[track_inds]
        track_labels = labels[track_inds]
        if depths is not None:
            track_depths = depths[track_inds]
        else:
            track_depths = None
        if dims is not None:
            track_dims = dims[track_inds]
        else:
            track_dims = None
        if alphas is not None:
            track_alphas = alphas[track_inds]
        else:
            track_alphas = None
        if cen_2ds is not None:
            track_2dcs = cen_2ds[track_inds]
        else:
            track_2dcs = None
        track_ids = ids[track_inds]
        if len(gt_ignores) > 0:
            ignore_inds = (bbox_overlaps(bboxes[:, :4], gt_ignores, mode='iof')
                           > 0.5).any(dim=1)
        if track_bboxes.size(0) == 0:
            self.counter.num_fn += gt_bboxes.size(0)
            return
        if gt_bboxes.size(0) == 0:
            self.counter.num_fp += track_bboxes.size(0)
            if gt_ignores.size(0) > 0:
                self.counter.num_fp -= ignore_inds[track_inds].sum()
            return
        # init
        # [N, 6]: [x1, y1, x2, y2, class, id]
        self.counter.num_gt += gt_bboxes.size(0)
        fps = torch.ones(bboxes.size(0), dtype=torch.long)
        fns = torch.ones(gt_bboxes.size(0), dtype=torch.long)
        # false negatives after tracking filter
        track_fns = torch.ones(gt_bboxes.size(0), dtype=torch.long)
        idsw = torch.zeros(track_ids.size(0), dtype=torch.long)

        # fp & fn for raw detection results
        ious = bbox_overlaps(bboxes[:, :4], gt_bboxes[:, :4])
        same_cat = labels.view(-1, 1) == gt_labels.view(1, -1)
        ious *= same_cat.float()
        max_ious, gt_inds = ious.max(dim=1)
        _, dt_inds = bboxes[:, -1].sort(descending=True)
        for dt_ind in dt_inds:
            iou, gt_ind = max_ious[dt_ind], gt_inds[dt_ind]
            if iou > 0.5 and fns[gt_ind] == 1:
                fns[gt_ind] = 0
                if ids[dt_ind] > -1:
                    track_fns[gt_ind] = 0
                gt_bboxes[gt_ind, 4] = bboxes[dt_ind, -1]
                fps[dt_ind] = 0
            else:
                if len(gt_ignores) > 0 and ignore_inds[dt_ind]:
                    fps[dt_ind] = 0
                    gt_inds[dt_ind] = -2
                else:
                    gt_inds[dt_ind] = -1

        track_gt_inds = gt_inds[track_inds]
        track_fps = fps[track_inds]

        for i, tid in enumerate(track_ids):
            tid = int(tid)
            gt_ind = track_gt_inds[i]
            if gt_ind == -1 or gt_ind == -2:
                continue
            gt_id = int(gt_ids[gt_ind])
            if gt_id in self.id_maps and self.id_maps[gt_id] != tid:
                idsw[i] = 1
            if (gt_id not in self.id_maps
                    and tid in self.id_maps.values()):
                idsw[i] = 1
            self.id_maps[gt_id] = tid

        fp_inds = track_fps == 1
        fn_inds = track_fns == 1
        idsw_inds = idsw == 1
        self.counter.num_fp += fp_inds.sum()
        self.counter.num_fn += fn_inds.sum()
        self.counter.num_idsw += idsw_inds.sum()

        if show or save:
            vid_name = os.path.dirname(
                img_meta[0]['img_info']['file_name']).split('/')[-1]
            img_name = os.path.basename(img_meta[0]['img_info']['file_name'])
            # img = os.path.join(
            #     self.data.img_prefix[img_meta[0]['img_info']['type']],
            #     vid_name, img_name)
            img = img_meta[0]['img_info']['file_name']
            save_path = os.path.join(self.out, 'analysis', vid_name)
            os.makedirs(save_path, exist_ok=True)
            save_file = os.path.join(save_path, img_name) if save else None
            img = imshow_3d_tracklets(img,
                                      track_bboxes[fp_inds].numpy(),
                                      track_labels[fp_inds].numpy(),
                                      depths=track_depths[fp_inds].numpy()
                                      if depths is not None else None,
                                      cen_2d=track_2dcs[fp_inds].numpy()
                                      if cen_2ds is not None else None,
                                      ids=track_ids[fp_inds].numpy(),
                                      color='red',
                                      show=False)
            img = imshow_3d_tracklets(img,
                                      gt_bboxes[fn_inds, :].numpy(),
                                      gt_labels[fn_inds].numpy(),
                                      depths=gt_trans[fn_inds, -1].numpy(),
                                      cen_2d=gt_2dcs[fn_inds, -1].numpy(),
                                      ids=gt_ids[fn_inds].numpy(),
                                      color='yellow',
                                      show=False)
            img = imshow_3d_tracklets(img,
                                      track_bboxes[idsw_inds].numpy(),
                                      track_labels[idsw_inds].numpy(),
                                      depths=track_depths[idsw_inds].numpy()
                                      if depths is not None else None,
                                      cen_2d=track_2dcs[idsw_inds].numpy()
                                      if cen_2ds is not None else None,
                                      ids=track_ids[idsw_inds].numpy(),
                                      color='cyan',
                                      show=show,
                                      out_file=save_file)
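
A toy replay of the greedy FP/FN assignment used in analyze(): detections are visited in descending score order and each ground truth can be claimed at most once (all numbers are illustrative):

import torch

ious = torch.tensor([[0.7], [0.6]])            # 2 detections x 1 ground truth
det_scores = torch.tensor([0.9, 0.8])
fns = torch.ones(1, dtype=torch.long)          # unmatched ground truths
fps = torch.ones(2, dtype=torch.long)          # unmatched detections
max_ious, gt_inds = ious.max(dim=1)
for dt_ind in det_scores.sort(descending=True)[1]:
    if max_ious[dt_ind] > 0.5 and fns[gt_inds[dt_ind]] == 1:
        fns[gt_inds[dt_ind]] = 0
        fps[dt_ind] = 0
# The higher-scoring detection claims the GT; the second remains a FP.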