コード例 #1
0
def compute_bbox_regression_targets(rois, overlaps, labels, cfg):
    """
    given rois, overlaps, gt labels, compute bounding box regression targets

    Rows whose overlap equals 1 are treated as ground-truth boxes. Rows whose
    overlap is at least cfg.TRAIN.BBOX_REGRESSION_THRESH are the examples that
    receive a target; every other row keeps an all-zero target.

    :param rois: roidb[i]['boxes'] k * 4
    :param overlaps: roidb[i]['max_overlaps'] k * 1
    :param labels: roidb[i]['max_classes'] k * 1
    :param cfg: config object; only cfg.TRAIN.BBOX_REGRESSION_THRESH is read
    :return: targets[i][class, dx, dy, dw, dh] k * 5
    """
    # Ensure ROIs are floats (builtin float is what the removed np.float
    # alias used to point at, so the resulting dtype is unchanged)
    rois = rois.astype(float, copy=False)

    # Sanity check
    if len(rois) != len(overlaps):
        print('bbox regression: this should not happen')

    targets = np.zeros((rois.shape[0], 5), dtype=np.float32)

    # Indices of ground-truth ROIs
    gt_inds = np.where(overlaps == 1)[0]
    if len(gt_inds) == 0:
        print('something wrong : zero ground truth rois')
        # Without any ground truth there is nothing to regress to; taking
        # argmax over an empty axis below would raise, so bail out early.
        return targets

    # Indices of examples for which we try to make predictions
    ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_REGRESSION_THRESH)[0]

    # Get IoU overlap between each ex ROI and gt ROI
    ex_gt_overlaps = bbox_overlaps(rois[ex_inds, :], rois[gt_inds, :])

    # Find which gt ROI each ex ROI has max overlap with:
    # this will be the ex ROI's gt target
    gt_assignment = ex_gt_overlaps.argmax(axis=1)
    gt_rois = rois[gt_inds[gt_assignment], :]
    ex_rois = rois[ex_inds, :]

    targets[ex_inds, 0] = labels[ex_inds]
    targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
    return targets
コード例 #2
0
def repgt(pred_boxes, gt_rois, rois_inside_ws):
    """
    Compute one repulsion-from-ground-truth loss value per batch entry.

    For each entry i, the coordinates of pred_boxes[i] and gt_rois[i] whose
    rois_inside_ws[i] entry is non-zero are regrouped into rows of 4. Each
    kept box is matched to the highest-overlap ground truth that differs from
    its own target row, and a smooth-ln penalty on IoG is averaged over the
    boxes that found such a match.

    :param pred_boxes: predicted box coordinates, indexed [i, mask]
    :param gt_rois: target box coordinates, laid out like pred_boxes
    :param rois_inside_ws: weights; non-zero entries select coordinates
    :return: CUDA tensor of length pred_boxes.shape[0] with the per-entry loss
    """
    sigma_repgt = 0.9
    loss_repgt = torch.zeros(pred_boxes.shape[0]).cuda()
    for i in range(pred_boxes.shape[0]):
        keep = rois_inside_ws[i] != 0
        kept_pred = pred_boxes[i, keep]
        kept_gt = gt_rois[i, keep]
        # Floor division keeps the row count an int; `int(n) / 4` is a float
        # under Python 3 and view() refuses float sizes.
        boxes = Variable(kept_pred.view(kept_pred.shape[0] // 4, 4))
        gt = Variable(kept_gt.view(kept_gt.shape[0] // 4, 4))
        num_repgt = 0
        repgt_smoothln = 0
        if boxes.shape[0] > 0:
            overlaps = bbox_overlaps(boxes, gt)
            # Zero out pairs whose ground truth is identical to the box's own
            # target so that only *other* ground truths can be selected.
            for j in range(overlaps.shape[0]):
                for z in range(overlaps.shape[1]):
                    if int(torch.sum(gt[j] == gt[z])) == 4:
                        overlaps[j, z] = 0
            max_overlaps, argmax_overlaps = torch.max(overlaps, 1)
            for j in range(max_overlaps.shape[0]):
                if max_overlaps[j] > 0:
                    num_repgt += 1
                    iog = IoG(boxes[j], gt[argmax_overlaps[j]])
                    if iog > sigma_repgt:
                        # Linear continuation of -log(1 - x) above sigma
                        repgt_smoothln += ((iog - sigma_repgt) /
                                           (1 - sigma_repgt) -
                                           math.log(1 - sigma_repgt))
                    elif iog <= sigma_repgt:
                        # NOTE(review): math.log yields a plain Python float,
                        # so if IoG returns a tensor this branch carries no
                        # gradient - confirm that is intended.
                        repgt_smoothln += -math.log(1 - iog)
        if num_repgt > 0:
            loss_repgt[i] = repgt_smoothln / num_repgt

    return loss_repgt
コード例 #3
0
def compute_bbox_regression_targets(rois, overlaps, labels, cfg):
    """
    given rois, overlaps, gt labels, compute bounding box regression targets

    Ground-truth rows are those with overlap exactly 1; example rows are those
    with overlap >= cfg.TRAIN.BBOX_REGRESSION_THRESH. Rows that are not
    examples keep an all-zero target.

    :param rois: roidb[i]['boxes'] k * 4
    :param overlaps: roidb[i]['max_overlaps'] k * 1
    :param labels: roidb[i]['max_classes'] k * 1
    :param cfg: config object; only cfg.TRAIN.BBOX_REGRESSION_THRESH is read
    :return: targets[i][class, dx, dy, dw, dh] k * 5
    """
    # Ensure ROIs are floats. np.float was only an alias of the builtin float
    # and no longer exists in recent NumPy, so use the builtin directly.
    rois = rois.astype(float, copy=False)

    # Sanity check
    if len(rois) != len(overlaps):
        print('bbox regression: this should not happen')

    # Allocate up front so the no-ground-truth case can return it as is
    targets = np.zeros((rois.shape[0], 5), dtype=np.float32)

    # Indices of ground-truth ROIs
    gt_inds = np.where(overlaps == 1)[0]
    if len(gt_inds) == 0:
        print('something wrong : zero ground truth rois')
        # argmax over zero ground-truth columns would raise further down;
        # an image without ground truth simply has no regression targets.
        return targets

    # Indices of examples for which we try to make predictions
    ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_REGRESSION_THRESH)[0]

    # Get IoU overlap between each ex ROI and gt ROI
    ex_gt_overlaps = bbox_overlaps(rois[ex_inds, :], rois[gt_inds, :])

    # Find which gt ROI each ex ROI has max overlap with:
    # this will be the ex ROI's gt target
    gt_assignment = ex_gt_overlaps.argmax(axis=1)
    gt_rois = rois[gt_inds[gt_assignment], :]
    ex_rois = rois[ex_inds, :]

    targets[ex_inds, 0] = labels[ex_inds]
    targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
    return targets
コード例 #4
0
def RepBox(pred_boxes, gt_boxes):
    """
    Compute one box-to-box repulsion loss value per batch entry.

    For each entry, pairwise overlaps among its predicted boxes are taken,
    restricted to pairs (j, z) with z < j whose target rows in gt_boxes
    differ, and a smooth-ln penalty is averaged over the remaining positive
    overlaps.

    :param pred_boxes: per-entry predicted boxes, indexed [i] -> rows of boxes
    :param gt_boxes: per-entry target boxes, row-aligned with pred_boxes[i]
    :return: CUDA tensor of length pred_boxes.shape[0] with the per-entry loss
    """
    sigma_repbox = 0
    batch = pred_boxes.shape[0]
    loss_repbox = torch.zeros(batch).cuda()

    for i in range(batch):
        preds = pred_boxes[i]
        targets = gt_boxes[i]

        pair_count = 0
        smooth_ln = 0
        if preds.shape[0] > 0:
            overlaps = bbox_overlaps(preds, preds)
            n_rows = overlaps.shape[0]
            n_cols = overlaps.shape[1]
            for j in range(n_rows):
                for z in range(n_cols):
                    # Drop the diagonal / upper triangle, and pairs that
                    # share the same target (compared only when z < j).
                    if z >= j or int(torch.sum(targets[j] == targets[z])) == 4:
                        overlaps[j, z] = 0

            positive = overlaps[overlaps > 0]
            for k in range(positive.shape[0]):
                pair_count += 1
                value = positive[k]
                if value > sigma_repbox:
                    smooth_ln += ((value - sigma_repbox) /
                                  (1 - sigma_repbox) -
                                  math.log(1 - sigma_repbox))
                elif value <= sigma_repbox:
                    smooth_ln += -math.log(1 - value)

        if pair_count > 0:
            loss_repbox[i] = smooth_ln / pair_count

    return loss_repbox
コード例 #5
0
def RepGT(pred_boxes, gt_boxes):  # B, G   #, rois_inside_ws
    """
    Compute one repulsion-from-ground-truth loss value per batch entry.

    Each predicted box is matched to the highest-overlap row of gt_boxes[i]
    that is not identical to its own target row; a smooth-ln penalty on the
    IoG of that pair is averaged over the boxes that found such a match.

    :param pred_boxes: per-entry predicted boxes, indexed [i] -> rows of boxes
    :param gt_boxes: per-entry target boxes, row-aligned with pred_boxes[i]
    :return: CUDA tensor of length pred_boxes.shape[0] with the per-entry loss
    """
    sigma_repgt = 0.9
    batch = pred_boxes.shape[0]
    loss_repgt = torch.zeros(batch).cuda()
    for i in range(batch):
        preds = pred_boxes[i]
        targets = gt_boxes[i]

        matched = 0
        smooth_ln = 0

        if preds.shape[0] > 0:
            overlaps = bbox_overlaps(preds, targets)
            for row in range(overlaps.shape[0]):
                for col in range(overlaps.shape[1]):
                    # All four coordinates equal: same ground-truth box
                    same_target = int(
                        torch.sum(targets[row] == targets[col])) == 4
                    if same_target:
                        overlaps[row, col] = 0
            best_overlap, best_index = torch.max(overlaps, 1)
            for row in range(best_overlap.shape[0]):
                if not best_overlap[row] > 0:
                    continue
                matched += 1
                iog = IoG(preds[row], targets[best_index[row]])  # G, P

                if iog <= sigma_repgt:
                    smooth_ln += -math.log(1 - iog)
                elif iog > sigma_repgt:
                    smooth_ln += ((iog - sigma_repgt) /
                                  (1 - sigma_repgt) -
                                  math.log(1 - sigma_repgt))
        if matched > 0:
            loss_repgt[i] = smooth_ln / matched

    return loss_repgt
コード例 #6
0
def compute_bbox_regression_targets(rois, overlaps, labels):
    """
    given rois, overlaps, gt labels, compute bounding box regression targets

    Ground-truth rows are those with overlap exactly 1; example rows are those
    with overlap >= config.TRAIN.BBOX_REGRESSION_THRESH (module-level config).
    Rows that are not examples keep an all-zero target.

    :param rois: roidb[i]['boxes'] k * 4
    :param overlaps: roidb[i]['max_overlaps'] k * 1
    :param labels: roidb[i]['max_classes'] k * 1
    :return: targets[i][class, dx, dy, dw, dh] k * 5
    """
    # Ensure ROIs are floats. np.float was an alias of the builtin float and
    # has been removed from recent NumPy releases.
    rois = rois.astype(float, copy=False)

    # Sanity check
    if len(rois) != len(overlaps):
        logger.warning('bbox regression: len(rois) != len(overlaps)')

    targets = np.zeros((rois.shape[0], 5), dtype=np.float32)

    # Indices of ground-truth ROIs
    gt_inds = np.where(overlaps == 1)[0]
    # No ground-truth ROI at all
    if len(gt_inds) == 0:
        logger.warning('bbox regression: len(gt_inds) == 0')
        # argmax over zero ground-truth columns would raise below; with no
        # ground truth there are no regression targets to assign.
        return targets

    # Indices of examples for which we try to make predictions
    ex_inds = np.where(overlaps >= config.TRAIN.BBOX_REGRESSION_THRESH)[0]

    # Get IoU overlap between each ex ROI and gt ROI
    ex_gt_overlaps = bbox_overlaps(rois[ex_inds, :], rois[gt_inds, :])

    # Find which gt ROI each ex ROI has max overlap with:
    # this will be the ex ROI's gt target
    gt_assignment = ex_gt_overlaps.argmax(axis=1)
    gt_rois = rois[gt_inds[gt_assignment], :]
    ex_rois = rois[ex_inds, :]

    # Column 0 carries the class label of each example
    targets[ex_inds, 0] = labels[ex_inds]
    # Columns 1: carry the regression offsets from bbox_transform
    targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
    return targets
コード例 #7
0
def repbox(pred_boxes, gt_rois, rois_inside_ws):
    """
    Compute one box-to-box repulsion loss value per batch entry.

    For each entry i, the coordinates of pred_boxes[i] and gt_rois[i] whose
    rois_inside_ws[i] entry is non-zero are regrouped into rows of 4. Pairwise
    overlaps among the kept predicted boxes are restricted to pairs (j, z)
    with z < j and different target rows, and a smooth-ln penalty is averaged
    over the remaining positive overlaps.

    :param pred_boxes: predicted box coordinates, indexed [i, mask]
    :param gt_rois: target box coordinates, laid out like pred_boxes
    :param rois_inside_ws: weights; non-zero entries select coordinates
    :return: CUDA tensor of length pred_boxes.shape[0] with the per-entry loss
    """
    sigma_repbox = 0
    loss_repbox = torch.zeros(pred_boxes.shape[0]).cuda()

    for i in range(pred_boxes.shape[0]):
        keep = rois_inside_ws[i] != 0
        kept_pred = pred_boxes[i, keep]
        kept_gt = gt_rois[i, keep]
        # Floor division keeps the row count an int; `int(n) / 4` is a float
        # under Python 3 and view() refuses float sizes.
        boxes = Variable(kept_pred.view(kept_pred.shape[0] // 4, 4))
        gt = Variable(kept_gt.view(kept_gt.shape[0] // 4, 4))

        num_repbox = 0
        repbox_smoothln = 0
        if boxes.shape[0] > 0:
            overlaps = bbox_overlaps(boxes, boxes)
            for j in range(overlaps.shape[0]):
                for z in range(overlaps.shape[1]):
                    if z >= j:
                        # Count each unordered pair once, skip self-overlap
                        overlaps[j, z] = 0
                    elif int(torch.sum(gt[j] == gt[z])) == 4:
                        # Both boxes regress to the same target: no repulsion
                        overlaps[j, z] = 0

            iou = overlaps[overlaps > 0]
            for j in range(iou.shape[0]):
                num_repbox += 1
                if iou[j] <= sigma_repbox:
                    repbox_smoothln += -math.log(1 - iou[j])
                elif iou[j] > sigma_repbox:
                    # Linear continuation of -log(1 - x) above sigma
                    repbox_smoothln += ((iou[j] - sigma_repbox) /
                                        (1 - sigma_repbox) -
                                        math.log(1 - sigma_repbox))

        if num_repbox > 0:
            loss_repbox[i] = repbox_smoothln / num_repbox

    return loss_repbox
コード例 #8
0
        def _sample_rois(al_rois, al_scores, tr_param, gt_box, fg_rois_per_im,
                         rois_per_im, num_classes, num_pid):
            """Generate a random sample of RoIs comprising foreground and
            background examples.

            Foreground RoIs are those whose best overlap with a gt box is
            >= self.config['train_fg_thresh']; background RoIs are those whose
            best overlap lies in [train_bg_thresh_lo, train_bg_thresh_hi).

            Columns 1:5 of al_rois and columns :4 of gt_box are used as box
            coordinates, gt_box column 4 as the class label and, when present,
            gt_box column 5 as a second label (-1 marks unlabeled rows).

            Returns the tuple (label, roi, roi_score, bbox_tar,
            bbox_in_weights, p_label, tr_param) for the kept RoIs; p_label is
            None when gt_box has no column 5.
            """
            # overlaps: (rois x gt_boxes)
            overlaps = bbox_overlaps(al_rois[:, 1:5].data, gt_box[:, :4].data)
            max_overlaps, gt_assignment = overlaps.max(1)
            label = gt_box[gt_assignment, [4]]

            # Select foreground RoIs as those with >= FG_THRESH overlap
            fg_inds = (max_overlaps >=
                       self.config['train_fg_thresh']).nonzero().view(-1)

            # Select bg RoIs as those within [BG_THRESH_LO, BG_THRESH_HI).
            # Use `&` rather than `mask + mask == 2`: adding two bool masks
            # yields a bool mask again, which never equals 2, so the sum form
            # selects nothing when comparisons return bool tensors.
            bg_inds = ((max_overlaps < self.config['train_bg_thresh_hi']) &
                       (max_overlaps >= self.config['train_bg_thresh_lo'])
                       ).nonzero().view(-1)

            # Small modification to the original version where we ensure a
            # fixed number of regions are sampled
            if fg_inds.numel() > 0 and bg_inds.numel() > 0:
                fg_rois_per_im = min(fg_rois_per_im, fg_inds.numel())

                if gt_box.size(0) < fg_rois_per_im:
                    # Always keep indices 0..num_gt-1, then fill up with a
                    # random draw from the remaining foreground positions.
                    # NOTE(review): presumably the first gt_box.size(0) RoIs
                    # are the gt boxes themselves - confirm against caller.
                    gt_inds = torch.from_numpy(np.arange(
                        0, gt_box.size(0))).long().cuda()
                    fg_inds = torch.cat((gt_inds, fg_inds[torch.from_numpy(
                        npr.choice(np.arange(gt_box.size(0), fg_inds.numel()),
                                   size=int(fg_rois_per_im) - gt_box.size(0),
                                   replace=False)).long().cuda()]))
                else:
                    # view(-1) keeps the index tensors 1-D even for a single
                    # hit; squeeze() would collapse that case to 0-d, which
                    # torch.cat and npr.choice both reject.
                    lab_inds = (gt_box[:, 5] != -1).nonzero().view(-1).data
                    if -1 in gt_box[:, 5].data:
                        unlab_inds = (gt_box[:,
                                             5] == -1).nonzero().view(-1).data
                        fg_inds = torch.cat(
                            (lab_inds,
                             torch.from_numpy(
                                 npr.choice(unlab_inds.cpu().numpy(),
                                            size=fg_rois_per_im -
                                            lab_inds.numel(),
                                            replace=False)).long().cuda()))
                    else:
                        fg_inds = lab_inds

                bg_rois_per_im = rois_per_im - fg_rois_per_im
                # Sample with replacement only when there are too few bg RoIs
                to_replace = bg_inds.numel() < bg_rois_per_im
                bg_inds = bg_inds[torch.from_numpy(
                    npr.choice(np.arange(0, bg_inds.numel()),
                               size=int(bg_rois_per_im),
                               replace=to_replace)).long().cuda()]
            elif fg_inds.numel() > 0:
                # Only foreground available: fill the whole batch with it
                to_replace = fg_inds.numel() < rois_per_im
                fg_inds = fg_inds[torch.from_numpy(
                    npr.choice(np.arange(0, fg_inds.numel()),
                               size=int(rois_per_im),
                               replace=to_replace)).long().cuda()]
                fg_rois_per_im = rois_per_im
            elif bg_inds.numel() > 0:
                # Only background available: fill the whole batch with it
                to_replace = bg_inds.numel() < rois_per_im
                bg_inds = bg_inds[torch.from_numpy(
                    npr.choice(np.arange(0, bg_inds.numel()),
                               size=int(rois_per_im),
                               replace=to_replace)).long().cuda()]
                fg_rois_per_im = 0
            else:
                # NOTE(review): neither fg nor bg RoIs exist; this drops into
                # the debugger and will hang unattended runs - consider
                # raising an explicit error instead.
                import pdb
                pdb.set_trace()

            # The indices that we're selecting (both fg and bg)
            if not isinstance(fg_inds, torch.cuda.LongTensor):
                print(fg_inds, type(fg_inds))
            keep_inds = torch.cat([fg_inds, bg_inds], 0)
            # Select sampled values from various arrays:
            label = label[keep_inds].contiguous()
            # Clamp labels for the background RoIs to 0
            label[int(fg_rois_per_im):] = 0
            roi = al_rois[keep_inds].contiguous()
            roi_score = al_scores[keep_inds].contiguous()
            tr_param = tr_param[keep_inds].contiguous()

            p_label = None
            if gt_box.size(1) > 5:
                p_label = gt_box[gt_assignment, [5]]
                p_label = p_label[keep_inds].contiguous()
                # Background RoIs get num_pid as their second label
                p_label[fg_rois_per_im:] = num_pid

            bbox_target_data = _compute_targets(
                roi[:, 1:5].data, gt_box[gt_assignment[keep_inds]][:, :4].data,
                label.data)

            bbox_tar, bbox_in_weights = _get_bbox_regression_labels(
                bbox_target_data, num_classes)

            return label, roi, roi_score, bbox_tar, bbox_in_weights, p_label, \
                   tr_param
コード例 #9
0
    def anchor_target_layer(self, rpn_cls_score, gt_boxes, im_info):
        """
        Build RPN training targets for the anchors in self.anchors.

        Anchors fully inside the image are labelled 1 (positive), 0 (negative)
        or -1 (ignored) from their overlap with gt_boxes, subsampled to
        self.config['train_rpn_batchsize'], and given regression targets
        towards their best-overlapping gt box. Results are mapped back to the
        full anchor set and reshaped using the height/width read from
        rpn_cls_score.shape[1:3].

        :param rpn_cls_score: score map; only its shape (dims 1 and 2) is used
        :param gt_boxes: ground-truth boxes with at least 5 columns; the first
            4 are used as coordinates
        :param im_info: im_info[0] is used as image height, im_info[1] as width
        :return: rpn_labels, (rpn_bbox_targets, rpn_bbox_inside_weights,
            rpn_bbox_outside_weights), all as CUDA Variables
        """
        def _unmap(data, count, inds, fill=0):
            """
            Unmap a subset of item (data) back to the original set of items
            (of size count)
            """
            if len(data.shape) == 1:
                ret = np.empty((count, ), dtype=np.float32)
                ret.fill(fill)
                ret[inds] = data
            else:
                ret = np.empty((count, ) + data.shape[1:], dtype=np.float32)
                ret.fill(fill)
                ret[inds, :] = data
            return ret

        def _compute_targets(ex_rois, gt_rois):
            """Compute bounding-box regression targets for an image."""

            assert ex_rois.shape[0] == gt_rois.shape[0]
            assert ex_rois.shape[1] == 4
            assert gt_rois.shape[1] >= 5

            # add float convert
            return bbox_transform(torch.from_numpy(ex_rois),
                                  torch.from_numpy(gt_rois[:, :4])).numpy()

        all_anchors = self.anchors.data.cpu().numpy()
        gt_boxes = gt_boxes.data.cpu().numpy()
        rpn_cls_score = rpn_cls_score.data

        A = self.num_anchors
        total_anchors = all_anchors.shape[0]

        # allow boxes to sit over the edge by a small amount
        _allowed_border = 0

        # spatial size of the score map, taken from dims 1 and 2
        height, width = rpn_cls_score.shape[1:3]

        # only keep anchors inside the image
        inds_inside = np.where(
            (all_anchors[:, 0] >= -_allowed_border)
            & (all_anchors[:, 1] >= -_allowed_border)
            & (all_anchors[:, 2] < im_info[1] + _allowed_border) &  # width
            (all_anchors[:, 3] < im_info[0] + _allowed_border)  # height
        )[0]

        # keep only inside anchors
        anchors = all_anchors[inds_inside, :]

        # label: 1 is positive, 0 is negative, -1 is dont care
        labels = np.empty((len(inds_inside), ), dtype=np.float32)
        labels.fill(-1)

        # overlaps between the anchors and the gt boxes
        # overlaps (ex, gt)
        # builtin float replaces the removed np.float alias (same dtype)
        overlaps = bbox_overlaps(
            np.ascontiguousarray(anchors, dtype=float),
            np.ascontiguousarray(gt_boxes, dtype=float))
        argmax_overlaps = overlaps.argmax(axis=1)
        max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
        gt_argmax_overlaps = overlaps.argmax(axis=0)
        gt_max_overlaps = overlaps[gt_argmax_overlaps,
                                   np.arange(overlaps.shape[1])]
        # every anchor that ties for the best overlap of some gt box
        gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
        if not self.config['train_rpn_clobber_positive']:
            # assign bg labels first so that positive labels can clobber them
            # first set the negatives
            labels[max_overlaps < self.config['train_rpn_neg_overlap']] = 0

        # fg label: for each gt, anchor with highest overlap
        labels[gt_argmax_overlaps] = 1

        # fg label: above threshold IOU
        labels[max_overlaps >= self.config['train_rpn_pos_overlap']] = 1

        if self.config['train_rpn_clobber_positive']:
            # assign bg labels last so that negative labels can clobber pos
            labels[max_overlaps < self.config['train_rpn_neg_overlap']] = 0

        # subsample positive labels if we have too many
        num_fg = int(self.config['train_rpn_fg_frac'] *
                     self.config['train_rpn_batchsize'])
        fg_inds = np.where(labels == 1)[0]
        if len(fg_inds) > num_fg:
            disable_inds = npr.choice(fg_inds,
                                      size=(len(fg_inds) - num_fg),
                                      replace=False)
            labels[disable_inds] = -1

        # subsample negative labels if we have too many
        num_bg = self.config['train_rpn_batchsize'] - np.sum(labels == 1)
        bg_inds = np.where(labels == 0)[0]
        if len(bg_inds) > num_bg:
            disable_inds = npr.choice(bg_inds,
                                      size=(len(bg_inds) - num_bg),
                                      replace=False)
            labels[disable_inds] = -1

        # regression target of every inside anchor towards its best gt box
        bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

        bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
        # only the positive ones have regression targets
        bbox_inside_weights[labels == 1, :] = np.array(
            self.config['train_rpn_bbox_inside_weights'])

        bbox_outside_weights = np.zeros((len(inds_inside), 4),
                                        dtype=np.float32)
        if self.config['train_rpn_pos_weight'] < 0:
            # uniform weighting of examples (given non-uniform sampling)
            num_examples = np.sum(labels >= 0)
            positive_weights = np.ones((1, 4)) * 1.0 / num_examples
            negative_weights = np.ones((1, 4)) * 1.0 / num_examples
        else:
            assert ((self.config['train_rpn_pos_weight'] > 0) &
                    (self.config['train_rpn_pos_weight'] < 1))
            positive_weights = (self.config['train_rpn_pos_weight'] /
                                np.sum(labels == 1))
            negative_weights = ((1.0 - self.config['train_rpn_pos_weight']) /
                                np.sum(labels == 0))
        bbox_outside_weights[labels == 1, :] = positive_weights
        bbox_outside_weights[labels == 0, :] = negative_weights

        # map up to original set of anchors
        labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
        bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
        bbox_inside_weights = _unmap(bbox_inside_weights,
                                     total_anchors,
                                     inds_inside,
                                     fill=0)
        bbox_outside_weights = _unmap(bbox_outside_weights,
                                      total_anchors,
                                      inds_inside,
                                      fill=0)

        # labels
        labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
        labels = labels.reshape((1, 1, A * height, width))
        rpn_labels = Variable(torch.from_numpy(labels).float().cuda()).long()

        # bbox_targets
        bbox_targets = bbox_targets.reshape((1, height, width, A * 4))

        rpn_bbox_targets = Variable(
            torch.from_numpy(bbox_targets).float().cuda())
        # bbox_inside_weights
        bbox_inside_weights = bbox_inside_weights.reshape(
            (1, height, width, A * 4))
        rpn_bbox_inside_weights = Variable(
            torch.from_numpy(bbox_inside_weights).float().cuda())

        # bbox_outside_weights
        bbox_outside_weights = bbox_outside_weights.reshape(
            (1, height, width, A * 4))
        rpn_bbox_outside_weights = Variable(
            torch.from_numpy(bbox_outside_weights).float().cuda())

        return rpn_labels, (rpn_bbox_targets, rpn_bbox_inside_weights,
                            rpn_bbox_outside_weights)
コード例 #10
0
    def forward(self, anchors, gt_boxes, act_lens):
        """
        Build RPN training targets for every utterance in the batch.

        The same anchors are used for each utterance. Column 0 of gt_boxes is
        read as the class label and the remaining columns as the 2-value box.
        Each anchor is labelled with the class of its best-overlapping gt box
        (positive), 0 (negative) or -1 (ignored), and positives/negatives are
        randomly subsampled to cfg.TRAIN.RPN_BATCHSIZE per utterance.

        :param anchors: anchors shared by all utterances
        :param gt_boxes: batch of ground-truth rows [label, box values]
        :param act_lens: passed to get_out_utt_boxes to disable anchors that
            fall outside the utterance
        :return: rpn_labels, rpn_targets, bbox_inside_weights,
            bbox_outside_weights
        """
        # here the anchors should be the anchors for each utterance, because
        # when we calculate the training target (before RPN) for each
        # utterance, the anchors are exactly the same (different from proposals)
        batch_size = gt_boxes.size(0)
        num_anchors_per_utt = anchors.size(0)
        rpn_labels = gt_boxes.new(batch_size, anchors.size(0)).fill_(-1)
        bbox_inside_weights = gt_boxes.new(batch_size, anchors.size(0)).zero_()
        bbox_outside_weights = gt_boxes.new(batch_size, anchors.size(0)).zero_()

        overlaps = bbox_overlaps(anchors, gt_boxes[:,:, 1:])
        # batch_size * num_anchors_per_utt * num_gt_boxes
        max_overlaps, argmax_overlaps = torch.max(overlaps, 2)
        # For each anchor, we will find a max gt_boxes as its
        # potential training target

        # fg label
        rpn_labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

        # bg label
        rpn_labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP ] = 0

        # disable the anchors out of utterance
        disable_indexes = get_out_utt_boxes(anchors, act_lens, batch_size)
        rpn_labels[disable_indexes] = -1

        num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
        # counted while foreground is still marked with the provisional 1
        sum_fg = torch.sum((rpn_labels == 1).int(), 1)
        sum_bg = torch.sum((rpn_labels == 0).int(), 1)

        for i in range(batch_size):
            # subsample positive labels if we have too many
            fg_inds = torch.nonzero(rpn_labels[i] == 1).view(-1)
            bg_inds = torch.nonzero(rpn_labels[i] == 0).view(-1)
            if fg_inds.size(0) > 0:
                # replace the provisional 1 with the class label (column 0)
                # of the best-overlapping gt box
                rpn_labels[i][fg_inds] = torch.index_select(gt_boxes[i], 0, argmax_overlaps[i][fg_inds])[:, 0]
            if sum_fg[i] > num_fg:
                rand_num = torch.from_numpy(np.random.permutation(fg_inds.size(0))).type_as(gt_boxes).long()
                disable_inds = fg_inds[rand_num[:fg_inds.size(0)-num_fg]]
                rpn_labels[i][disable_inds] = -1

            # Foreground labels now hold class ids, so count positives with
            # `> 0` (as every later positive test does) rather than `== 1`,
            # which would only count anchors of class 1.
            num_bg = int(cfg.TRAIN.RPN_BATCHSIZE - torch.sum((rpn_labels[i] > 0).int()))
            if sum_bg[i] > num_bg:
                rand_num = torch.from_numpy(np.random.permutation(bg_inds.size(0))).type_as(gt_boxes).long()
                disable_inds = bg_inds[rand_num[:bg_inds.size(0)-num_bg]]
                rpn_labels[i][disable_inds] = -1

        bbox_inside_weights[rpn_labels > 0] = cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS
        bbox_inside_weights = bbox_inside_weights.view(batch_size, num_anchors_per_utt, 1).expand(batch_size, num_anchors_per_utt, 2)
        num_positive = torch.sum(rpn_labels > 0)
        num_negative = torch.sum(rpn_labels == 0)
        if cfg.DEBUG:
            print('Num positive samples: {}, num negative samples: {}'.format(num_positive, num_negative))
        # avoid dividing by zero when no anchor is positive
        if num_positive < 1:
            num_positive += 1
        positive_weights = 1.0 / num_positive.item()
        # NOTE(review): negatives are also normalised by the positive count;
        # num_negative is only used in the debug print - confirm intended.
        negative_weights = 1.0 / num_positive.item()
        bbox_outside_weights[rpn_labels > 0] = positive_weights
        bbox_outside_weights[rpn_labels == 0] = negative_weights
        bbox_outside_weights = bbox_outside_weights.view(batch_size, num_anchors_per_utt, 1).expand(batch_size, num_anchors_per_utt,2)
        # compute bbox regression target of anchors

        # here for each utterance in the batch, we only choose the best matching
        # gt_box to calculate the bbox_targets for each anchor
        # shift per-utterance gt indices so they address the flattened
        # (batch_size * num_gt_boxes) gt array
        offset = torch.arange(0, batch_size) * gt_boxes.size(1)
        argmax_overlaps = argmax_overlaps + offset.view(batch_size, 1).type_as(argmax_overlaps)
        rpn_targets = bbox_transform_batch(anchors, gt_boxes[:,:,1:].view(-1,2)[argmax_overlaps.view(-1), :].view(batch_size, -1, 2)) # num_anchors * num_gt_boxes
        return rpn_labels, rpn_targets, bbox_inside_weights, bbox_outside_weights