Example #1
def _sample_pairs(roidb, im_scale, batch_idx):
    """Generate a random sample of RoIs comprising foreground and background
    examples.
    """

    sampled_obj_boxes = roidb['obj_boxes']
    sampled_obj_rois = sampled_obj_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_obj_boxes.shape[0], 1))
    sampled_obj_rois = np.hstack((repeated_batch_idx, sampled_obj_rois))

    blob_dict = {}
    blob_dict['obj_rois'] = sampled_obj_rois

    sampled_rel_rois = sampled_obj_rois

    blob_dict['rel_rois'] = sampled_rel_rois

    prd_gt_cls = np.zeros(1, dtype=np.int32)
    prd_gt_cls[0] = roidb['prd_gt_cls']
    blob_dict['prd_gt_cls'] = prd_gt_cls

    obj_gt_cls = np.zeros(1, dtype=np.int32)
    obj_gt_cls[0] = roidb['obj_gt_cls']
    blob_dict['obj_gt_cls'] = obj_gt_cls

    return blob_dict
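
A minimal sketch of how _sample_pairs might be driven, using a made-up roidb entry and a local stand-in for blob_utils.ones (both are assumptions for illustration; the real helpers come from the surrounding codebase):

import numpy as np

def _ones(shape):
    # stand-in for blob_utils.ones, assumed to return float32 ones
    return np.ones(shape, dtype=np.float32)

roidb = {'obj_boxes': np.array([[10., 20., 50., 80.]], dtype=np.float32),
         'prd_gt_cls': 3, 'obj_gt_cls': 7}
im_scale, batch_idx = 0.5, 0

obj_rois = roidb['obj_boxes'] * im_scale
obj_rois = np.hstack((batch_idx * _ones((obj_rois.shape[0], 1)), obj_rois))
print(obj_rois)  # [[ 0.  5. 10. 25. 40.]] i.e. (batch_idx, x1, y1, x2, y2)
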
Example #2
def add_prn_blobs(blobs_out, blobs_in):
    """ Add PRN specific blobs to the input blob dictionary."""
    # Prepare the mask targets by associating one gt mask to each training roi
    # that has a fg (non-bg) class label.
    num_cls = cfg.MODEL.NUM_CLASSES
    iou_thres = cfg.PRN.IOU_THRESHOLD

    fg_inds = np.where(blobs_in['labels_int32'] > 0)[0]
    if fg_inds.shape[0] > 0:
        # Class labels for the foreground rois
        fg_labels = blobs_in['labels_int32'][fg_inds]
        # if below threshold, then set labels to 1, otherwise 0
        prn_labels = (blobs_in['mask_ious'] < iou_thres).astype(np.int32)
        # and set roi_needs_refine the same as prn_labels
        roi_needs_refine = prn_labels.copy()
        # calculate refine ratio
        refine_ratio = np.sum(roi_needs_refine,
                              keepdims=True).astype(np.float32)
        refine_ratio /= fg_inds.shape[0]
        # sometimes prn_labels might be all false, but we still need a
        # non-all-false roi_needs_refine, so set the first one to True
        if np.sum(roi_needs_refine) == 0:
            roi_needs_refine[0] = 1

    else:  # If there are no fg masks (it does happen)
        # The network cannot handle empty blobs, so we must provide labels.
        # We simply take the first bg roi, give it a -1 (ignore) label,
        # and label it with class zero (bg).
        bg_inds = np.where(blobs_in['labels_int32'] == 0)[0]
        # We give it a -1's blob (ignore label)
        prn_labels = -blob_utils.ones((1, ), int32=True)
        # We label it with class = 0 (background); int32 so it can be used
        # as an index when expanding to class-specific targets below
        fg_labels = blob_utils.zeros((1, ), int32=True)
        # and set roi_needs_refine to be 1
        roi_needs_refine = blob_utils.ones((1, ), int32=True)
        # set refine_ratio to be 0
        refine_ratio = blob_utils.zeros((1, ))

    if cfg.PRN.CLS_SPECIFIC_LABEL:
        prn_labels = _expand_to_class_specific_prn_targets(
            prn_labels, fg_labels)

    blobs_out['prn_labels_int32'] = prn_labels
    blobs_out['roi_needs_refine_int32'] = roi_needs_refine
    blobs_out['refine_ratio'] = refine_ratio
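
As a quick, self-contained sanity check of the labeling logic above (pure numpy; the 0.5 threshold is an assumption standing in for cfg.PRN.IOU_THRESHOLD):

import numpy as np

iou_thres = 0.5  # assumed value; the real one comes from cfg.PRN.IOU_THRESHOLD
mask_ious = np.array([0.3, 0.7, 0.4], dtype=np.float32)

# rois whose predicted mask IoU falls below the threshold need refinement
prn_labels = (mask_ious < iou_thres).astype(np.int32)
refine_ratio = np.sum(prn_labels, keepdims=True).astype(np.float32) / mask_ious.size
print(prn_labels, refine_ratio)  # [1 0 1] [0.6666667]
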
Example #3
def add_refine_keypoints_blobs_gaussian(blobs, roidb, fg_rois_per_image,
                                        fg_inds, im_scale, batch_idx, data):
    """Add Mask R-CNN keypoint specific blobs to the given blobs dictionary."""
    # Note: gt_inds must match how they're computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    gt_keypoints = roidb['gt_keypoints']
    # Load the kp_fg_inds generated by keypoint_rcnn.py, so that we avoid
    # mismatched keypoint_rois and refined_keypoint_rois, which would be a
    # big issue for training.
    kp_fg_inds = blobs['keypoint_fg_inds']
    if kp_fg_inds.shape[0] > 0:
        sampled_fg_rois = roidb['boxes'][kp_fg_inds]
        box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds]

        # Let's expand the rois
        up_scale = cfg.REFINENET.UP_SCALE
        inp_h, inp_w = data.shape[2], data.shape[3]
        pad_img_h, pad_img_w = inp_h / im_scale, inp_w / im_scale

        pad_fg_rois = box_utils.expand_boxes(sampled_fg_rois, up_scale)
        pad_fg_rois = box_utils.clip_boxes_to_image(pad_fg_rois, pad_img_h,
                                                    pad_img_w)

        num_keypoints = gt_keypoints.shape[2]
        sampled_keypoints = -np.ones(
            (len(pad_fg_rois), gt_keypoints.shape[1], num_keypoints),
            dtype=gt_keypoints.dtype)
        for ii in range(len(pad_fg_rois)):
            ind = box_to_gt_ind_map[ii]
            if ind >= 0:
                sampled_keypoints[ii, :, :] = gt_keypoints[gt_inds[ind], :, :]
                assert np.sum(sampled_keypoints[ii, 2, :]) > 0

        heats, weights = keypoint_utils.keypoints_to_gaussian_heatmap_labels(
            sampled_keypoints, pad_fg_rois, M=cfg.REFINENET.KRCNN.HEATMAP_SIZE)

    else:  # If there are no fg keypoint rois (it does happen)
        # The network cannot handle empty blobs, so we must provide a heatmap
        # We simply take the first bg roi, give it an all-zero heatmap, and
        # set its weights to zero (ignore label).
        roi_inds = np.where(roidb['gt_classes'] == 0)[0]
        # sampled_fg_rois is actually one random roi, but that's ok because ...
        pad_fg_rois = roidb['boxes'][roi_inds[0]].reshape((1, -1))
        # We give it an all-0's blob
        M = cfg.REFINENET.KRCNN.HEATMAP_SIZE
        heats = blob_utils.zeros((1, cfg.KRCNN.NUM_KEYPOINTS, M, M))
        # We set weights to 0 (ignore label)
        weights = blob_utils.zeros((1, cfg.KRCNN.NUM_KEYPOINTS, 1))

    pad_fg_rois *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((pad_fg_rois.shape[0], 1))
    pad_fg_rois = np.hstack((repeated_batch_idx, pad_fg_rois))

    blobs['refined_keypoint_rois'] = pad_fg_rois
    blobs['refined_keypoint_heatmaps'] = heats
    blobs['refined_keypoint_weights'] = weights
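
The roi padding step (expand then clip) is easy to reproduce in plain numpy; this sketch stands in for box_utils.expand_boxes and clip_boxes_to_image, which are assumed to behave roughly like this:

import numpy as np

def expand_boxes(boxes, scale):
    # grow each (x1, y1, x2, y2) box about its center by `scale`
    cx, cy = (boxes[:, 0] + boxes[:, 2]) / 2, (boxes[:, 1] + boxes[:, 3]) / 2
    hw = (boxes[:, 2] - boxes[:, 0]) / 2 * scale
    hh = (boxes[:, 3] - boxes[:, 1]) / 2 * scale
    return np.stack([cx - hw, cy - hh, cx + hw, cy + hh], axis=1)

b = np.array([[10., 10., 30., 20.]])
p = expand_boxes(b, 2.0)              # -> [[ 0.  5. 40. 25.]]
p[:, 0::2] = p[:, 0::2].clip(0, 49)   # clip x1, x2 to image width 50
p[:, 1::2] = p[:, 1::2].clip(0, 24)   # clip y1, y2 to image height 25
print(p)  # [[ 0.  5. 40. 24.]]
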
Example #4
def add_keypoint_rcnn_blobs(
    blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx
):
    """Add Mask R-CNN keypoint specific blobs to the given blobs dictionary."""
    # Note: gt_inds must match how they're computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    max_overlaps = roidb['max_overlaps']
    gt_keypoints = roidb['gt_keypoints']

    ind_kp = gt_inds[roidb['box_to_gt_ind_map']]
    within_box = _within_box(gt_keypoints[ind_kp, :, :], roidb['boxes'])
    vis_kp = gt_keypoints[ind_kp, 2, :] > 0
    is_visible = np.sum(np.logical_and(vis_kp, within_box), axis=1) > 0
    kp_fg_inds = np.where(
        np.logical_and(max_overlaps >= cfg.TRAIN.FG_THRESH, is_visible)
    )[0]

    kp_fg_rois_per_this_image = np.minimum(fg_rois_per_image, kp_fg_inds.size)
    if kp_fg_inds.size > kp_fg_rois_per_this_image:
        kp_fg_inds = np.random.choice(
            kp_fg_inds, size=kp_fg_rois_per_this_image, replace=False
        )

    sampled_fg_rois = roidb['boxes'][kp_fg_inds]
    box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds]

    num_keypoints = gt_keypoints.shape[2]
    sampled_keypoints = -np.ones(
        (len(sampled_fg_rois), gt_keypoints.shape[1], num_keypoints),
        dtype=gt_keypoints.dtype
    )
    for ii in range(len(sampled_fg_rois)):
        ind = box_to_gt_ind_map[ii]
        if ind >= 0:
            sampled_keypoints[ii, :, :] = gt_keypoints[gt_inds[ind], :, :]
            assert np.sum(sampled_keypoints[ii, 2, :]) > 0

    heats, weights = keypoint_utils.keypoints_to_heatmap_labels(
        sampled_keypoints, sampled_fg_rois
    )

    shape = (sampled_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS, 1)
    heats = heats.reshape(shape)
    weights = weights.reshape(shape)

    sampled_fg_rois *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_fg_rois.shape[0], 1)
    )
    sampled_fg_rois = np.hstack((repeated_batch_idx, sampled_fg_rois))

    blobs['keypoint_rois'] = sampled_fg_rois
    blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False)
    blobs['keypoint_weights'] = weights
Example #5
def _expand_to_class_specific_prn_targets(prn_labels, class_labels):
    """Expand labels from shape (#rois,) to (#rois, #classes)
    to encode class specific prn targets.
    """
    assert prn_labels.shape[0] == class_labels.shape[0]

    # Target values of -1 are "don't care" / ignore labels
    prn_targets = -blob_utils.ones(
        (prn_labels.shape[0], cfg.MODEL.NUM_CLASSES), int32=True)
    prn_targets[np.arange(prn_labels.shape[0]), class_labels] = prn_labels

    return prn_targets
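
The fancy-indexing trick in _expand_to_class_specific_prn_targets can be checked in isolation (toy sizes; -1 marks the ignore slots):

import numpy as np

num_classes = 4                                  # stand-in for cfg.MODEL.NUM_CLASSES
prn_labels = np.array([1, 0], dtype=np.int32)    # per-roi refine labels
class_labels = np.array([2, 3], dtype=np.int32)  # fg class of each roi

prn_targets = -np.ones((2, num_classes), dtype=np.int32)
prn_targets[np.arange(2), class_labels] = prn_labels
print(prn_targets)
# [[-1 -1  1 -1]
#  [-1 -1 -1  0]]
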
Example #6
def add_semantic_segms_blobs(blobs, roidb, im_scale, batch_idx, data):
    """Add Semantic Segmentation Net specific blobs to the input blob
    dictionary. Draw all gt polygons into the label map.
    """
    num_cls = cfg.MODEL.NUM_CLASSES
    rescale_factor = cfg.SEMANTIC_NET.RESCALE_FACTOR
    polys_gt_inds = np.where((roidb['gt_classes'] > 0)
                             & (roidb['is_crowd'] == 0))[0]
    polys_gt = [roidb['segms'][i] for i in polys_gt_inds]

    # Define size variables
    inp_h, inp_w = data.shape[2], data.shape[3]
    out_h, out_w = int(inp_h * rescale_factor), int(inp_w * rescale_factor)

    if polys_gt_inds.shape[0] > 0:
        # class label for the mask
        gt_class_labels = roidb['gt_classes'][polys_gt_inds]
        semantic_segms = blob_utils.zeros((num_cls, out_h, out_w), int32=True)
        # down-scaled label map scale and size
        scale = im_scale * rescale_factor
        im_h, im_w = roidb['height'], roidb['width']
        im_label_h, im_label_w = int(im_h * scale), int(im_w * scale)

        # draw each gt polygon into its class channel
        for i in range(polys_gt_inds.shape[0]):
            cls_label = gt_class_labels[i]
            poly_gt = polys_gt[i]
            # Rasterize the portion of the polygon mask within the given fg roi
            # to an im_label_h x im_label_w binary image
            mask = segm_utils.polys_to_mask_scaled(poly_gt, im_h, im_w, scale)
            mask = np.array(mask > 0, dtype=np.int32)  # Ensure it's binary
            semantic_segms[cls_label, 0:im_label_h, 0:im_label_w] = np.maximum(
                semantic_segms[cls_label, 0:im_label_h, 0:im_label_w],
                mask,
                dtype=np.int32)

        semantic_segms = np.reshape(semantic_segms,
                                    (1, num_cls * out_h * out_w))

    else:
        # The network cannot handle empty blobs, so we must provide a label
        # map; we give it an all -1's blob (ignore label)
        semantic_segms = -blob_utils.ones(
            (1, num_cls * out_h * out_w), int32=True)

    blobs['semantic_segms_int32'] = semantic_segms
    blobs['img_rois'] = np.array([batch_idx, 0, 0, inp_w - 1, inp_h - 1],
                                 dtype=np.float32)[np.newaxis, :]
Example #7
def add_classification_blobs(blobs, im_scales, roidb):
    """Add blobs needed for training classification models."""
    # Sample training RoIs from each image and append them to the blob lists
    for im_i, entry in enumerate(roidb):
        blobs['rois'].append(im_i * blob_utils.ones(
            (entry['gt_classes'].shape[0], 1)))
        blobs['labels_int32'].append(entry['gt_classes'].astype(np.int32))

    # Concat the training blob lists into tensors
    for k, v in blobs.items():
        if isinstance(v, list) and len(v) > 0:
            blobs[k] = np.concatenate(v)
    valid = True

    return valid
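
A minimal run of the same list-then-concatenate pattern with two fake roidb entries (the gt_classes values are made up):

import numpy as np

roidb = [{'gt_classes': np.array([3, 5])}, {'gt_classes': np.array([2])}]
blobs = {'rois': [], 'labels_int32': []}
for im_i, entry in enumerate(roidb):
    blobs['rois'].append(
        im_i * np.ones((entry['gt_classes'].shape[0], 1), dtype=np.float32))
    blobs['labels_int32'].append(entry['gt_classes'].astype(np.int32))
blobs = {k: np.concatenate(v) for k, v in blobs.items()}
print(blobs['rois'].ravel(), blobs['labels_int32'])  # [0. 0. 1.] [3 5 2]
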
Example #8
def _expand_to_class_specific_mask_targets(masks, mask_class_labels):
    """Expand masks from shape (#masks, M ** 2) to (#masks, #classes * M ** 2)
    to encode class specific mask targets.
    """
    assert masks.shape[0] == mask_class_labels.shape[0]
    M = cfg.MRCNN.RESOLUTION

    # Target values of -1 are "don't care" / ignore labels
    mask_targets = -blob_utils.ones(
        (masks.shape[0], cfg.MODEL.NUM_CLASSES * M**2), int32=True)

    for i in range(masks.shape[0]):
        cls = int(mask_class_labels[i])
        start = M**2 * cls
        end = start + M**2
        # Ignore background instance
        # (only happens when there are no fg samples in an image)
        if cls > 0:
            mask_targets[i, start:end] = masks[i, :]

    return mask_targets
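
The per-class slot arithmetic above is easy to verify on toy sizes (M and the class count here are made up, not the cfg values):

import numpy as np

M, num_classes = 2, 3
masks = np.array([[1, 0, 1, 1]], dtype=np.int32)  # one 2x2 mask, flattened
cls = 2
targets = -np.ones((1, num_classes * M**2), dtype=np.int32)
targets[0, M**2 * cls:M**2 * (cls + 1)] = masks[0]
print(targets)  # [[-1 -1 -1 -1 -1 -1 -1 -1  1  0  1  1]]
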
Example #10
def _sample_rois(roidb, im_scale, batch_idx):
    """Generate a random sample of RoIs comprising foreground and background
    examples.
    """
    
    rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM)  # e.g. 512

    rois_per_this_image = np.minimum(rois_per_image, len(roidb['boxes']))
    keep_inds = npr.choice(len(roidb['boxes']), size=rois_per_this_image,
                           replace=False)
    keep_inds = keep_inds.astype(np.int32)
    sampled_boxes = roidb['boxes'][keep_inds]  # e.g. (512, 4)
    
    # sampled_boxes = roidb['boxes'][:512]

    sampled_rois = sampled_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((sampled_rois.shape[0], 1))
    sampled_rois = np.hstack((repeated_batch_idx, sampled_rois))

    blob_dict = dict(rois=sampled_rois)

    return blob_dict
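
The sampling itself is just a uniform draw without replacement; a tiny reproducible sketch (sizes are illustrative, not the cfg defaults):

import numpy as np
import numpy.random as npr

npr.seed(0)  # fixed seed so the example is reproducible
boxes = np.arange(20, dtype=np.float32).reshape(5, 4)  # 5 fake proposals
rois_per_this_image = min(3, len(boxes))
keep_inds = npr.choice(len(boxes), size=rois_per_this_image, replace=False)
print(boxes[keep_inds].shape)  # (3, 4)
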
Example #11
def _expand_to_class_specific_boundary_targets(boundarys,
                                               boundary_class_labels):
    """Expand boundarys from shape (#boundarys, M ** 2) to (#boundarys, #classes * M ** 2)
    to encode class specific boundary targets.
    """
    assert boundarys.shape[0] == boundary_class_labels.shape[0]
    M = cfg.BOUNDARY.RESOLUTION

    # Target values of -1 are "don't care" / ignore labels
    boundary_targets = -blob_utils.ones(
        (boundarys.shape[0], cfg.MODEL.NUM_CLASSES * M**2), int32=True)

    for i in range(boundarys.shape[0]):
        cls = int(boundary_class_labels[i])
        start = M**2 * cls
        end = start + M**2
        # Ignore background instance
        # (only happens when there are no fg samples in an image)
        if cls > 0:
            boundary_targets[i, start:end] = boundarys[i, :]

    return boundary_targets
Example #12
def _get_gt_rois(roidb, im_scale, batch_idx):
    """Get ground truth rois, and the corresponding labels.
    """
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    # just get all the labels
    sampled_labels = roidb['gt_classes'][gt_inds]
    sampled_boxes = roidb['boxes'][gt_inds, :]

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_rois = sampled_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_rois.shape[0], 1))
    sampled_rois = np.hstack((repeated_batch_idx, sampled_rois))
    im_height = np.round(roidb['height'] * im_scale)
    im_width = np.round(roidb['width'] * im_scale)
    im_info = np.array([[im_height, im_width, im_scale]], dtype=np.float32)

    # Base Fast R-CNN blobs
    blob_dict = dict(labels_int32=sampled_labels.astype(np.int32, copy=False),
                     rois=sampled_rois,
                     im_info=im_info)

    return blob_dict
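
For reference, im_info packs (height, width, scale) with the scaled size rounded, so a 640x480 image at im_scale 0.6 (made-up numbers) gives:

import numpy as np

im_scale = 0.6
im_height, im_width = np.round(480 * im_scale), np.round(640 * im_scale)
print(np.array([[im_height, im_width, im_scale]], dtype=np.float32))
# [[288.  384.    0.6]]
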
Example #13
def add_mask_rcnn_blobs(blobs, sampled_boxes, roidb, im_scale, batch_idx):
    """Add Mask R-CNN specific blobs to the input blob dictionary."""
    # Prepare the mask targets by associating one gt mask to each training roi
    # that has a fg (non-bg) class label.
    M = cfg.MRCNN.RESOLUTION
    polys_gt_inds = np.where((roidb['gt_classes'] > 0) &
                             (roidb['is_crowd'] == 0))[0]
    polys_gt = [roidb['segms'][i] for i in polys_gt_inds]
    boxes_from_polys = segm_utils.polys_to_boxes(polys_gt)
    # boxes_from_polys = [roidb['boxes'][i] for i in polys_gt_inds]
    fg_inds = np.where(blobs['labels_int32'] > 0)[0]
    roi_has_mask = blobs['labels_int32'].copy()
    roi_has_mask[roi_has_mask > 0] = 1

    if fg_inds.shape[0] > 0:
        # Class labels for the foreground rois
        mask_class_labels = blobs['labels_int32'][fg_inds]
        masks = blob_utils.zeros((fg_inds.shape[0], M**2), int32=True)

        # Find overlap between all foreground rois and the bounding boxes
        # enclosing each segmentation
        rois_fg = sampled_boxes[fg_inds]
        overlaps_bbfg_bbpolys = box_utils.bbox_overlaps(
            rois_fg.astype(np.float32, copy=False),
            boxes_from_polys.astype(np.float32, copy=False))
        # Map from each fg rois to the index of the mask with highest overlap
        # (measured by bbox overlap)
        fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1)

        # add fg targets
        for i in range(rois_fg.shape[0]):
            fg_polys_ind = fg_polys_inds[i]
            poly_gt = polys_gt[fg_polys_ind]
            roi_fg = rois_fg[i]
            # Rasterize the portion of the polygon mask within the given fg roi
            # to an M x M binary image
            mask = segm_utils.polys_to_mask_wrt_box(poly_gt, roi_fg, M)
            mask = np.array(mask > 0, dtype=np.int32)  # Ensure it's binary
            masks[i, :] = np.reshape(mask, M**2)
    else:  # If there are no fg masks (it does happen)
        # The network cannot handle empty blobs, so we must provide a mask
        # We simply take the first bg roi, give it an all -1's mask (ignore
        # label), and label it with class zero (bg).
        bg_inds = np.where(blobs['labels_int32'] == 0)[0]
        # rois_fg is actually one background roi, but that's ok because ...
        rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1))
        # We give it a -1's blob (ignore label)
        masks = -blob_utils.ones((1, M**2), int32=True)
        # We label it with class = 0 (background)
        mask_class_labels = blob_utils.zeros((1, ))
        # Mark that the first roi has a mask
        roi_has_mask[0] = 1

    if cfg.MRCNN.CLS_SPECIFIC_MASK:
        masks = _expand_to_class_specific_mask_targets(masks,
                                                       mask_class_labels)

    # Scale rois_fg and format as (batch_idx, x1, y1, x2, y2)
    rois_fg *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((rois_fg.shape[0], 1))
    rois_fg = np.hstack((repeated_batch_idx, rois_fg))

    # Update blobs dict with Mask R-CNN blobs
    blobs['mask_rois'] = rois_fg
    blobs['roi_has_mask_int32'] = roi_has_mask
    blobs['masks_int32'] = masks
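
The core of the fg branch is the IoU argmax that pairs every fg roi with the gt mask it overlaps most. A self-contained numpy version of that pairing (the iou helper below is a simplified stand-in for box_utils.bbox_overlaps, which in Detectron-style code treats coordinates as inclusive pixel indices):

import numpy as np

def iou(a, b):
    # pairwise IoU between (N, 4) and (M, 4) boxes in (x1, y1, x2, y2)
    tl = np.maximum(a[:, None, :2], b[None, :, :2])
    br = np.minimum(a[:, None, 2:], b[None, :, 2:])
    wh = np.clip(br - tl, 0, None)
    inter = wh[..., 0] * wh[..., 1]
    area_a = (a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1])
    area_b = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
    return inter / (area_a[:, None] + area_b[None, :] - inter)

rois_fg = np.array([[0., 0., 10., 10.], [8., 8., 20., 20.]])
boxes_from_polys = np.array([[0., 0., 9., 9.], [10., 10., 20., 20.]])
fg_polys_inds = np.argmax(iou(rois_fg, boxes_from_polys), axis=1)
print(fg_polys_inds)  # [0 1] -> each fg roi takes the best-overlapping gt mask
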
Example #14
def add_charmask_rcnn_blobs(blobs, sampled_boxes, gt_boxes, gt_inds, roidb, im_scale, batch_idx):
    """Add Mask R-CNN specific blobs to the input blob dictionary."""
    # Prepare the mask targets by associating one gt mask to each training roi
    # that has a fg (non-bg) class label.
    is_e2e = cfg.MRCNN.IS_E2E
    M_HEIGHT = cfg.MRCNN.RESOLUTION_H
    M_WIDTH = cfg.MRCNN.RESOLUTION_W
    mask_rois_per_this_image = cfg.MRCNN.MASK_BATCH_SIZE_PER_IM
    polys_gt_inds = np.where(
        (roidb['gt_classes'] > 0) & (roidb['is_crowd'] == 0)
    )[0]
    polys_gt = [roidb['segms'][i] for i in polys_gt_inds]
    chars_gts = roidb['charboxes']
    boxes_from_polys = segm_utils.polys_to_boxes(polys_gt)
    if DEBUG:
        img_path = roidb['image']
        img = Image.open(img_path)
        # img = blobs['data'][0]
        # img = img.transpose((1,2,0))
        # img  += cfg.PIXEL_MEANS
        # img = img.astype(np.int8)
        # img = Image.fromarray(img)

    if is_e2e:
        fg_inds = np.where(blobs['labels_int32'] > 0)[0]
        if fg_inds.size > mask_rois_per_this_image:
            fg_inds = npr.choice(
                fg_inds, size=mask_rois_per_this_image, replace=False
            )
        roi_has_mask = np.ones((fg_inds.shape[0], ), dtype=np.int32)

        if fg_inds.shape[0] > 0:
            # Class labels for the foreground rois
            mask_class_labels = blobs['labels_int32'][fg_inds]
            masks = blob_utils.zeros((fg_inds.shape[0], 2, M_HEIGHT*M_WIDTH), int32=True)
            mask_weights = np.zeros((fg_inds.shape[0], M_HEIGHT*M_WIDTH), dtype=np.float32)
            char_boxes = np.zeros((fg_inds.shape[0], M_HEIGHT*M_WIDTH, 4), dtype=np.float32)
            char_boxes_inside_weight = np.zeros((fg_inds.shape[0], M_HEIGHT*M_WIDTH, 4), dtype=np.float32)
            char_boxes_outside_weight = np.zeros((fg_inds.shape[0], M_HEIGHT*M_WIDTH, 4), dtype=np.float32)

            # Find overlap between all foreground rois and the bounding boxes
            # enclosing each segmentation
            rois_fg = sampled_boxes[fg_inds]
            overlaps_bbfg_bbpolys = box_utils.bbox_overlaps(
                rois_fg.astype(np.float32, copy=False),
                boxes_from_polys.astype(np.float32, copy=False)
            )
            # Map from each fg rois to the index of the mask with highest overlap
            # (measured by bbox overlap)
            fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1)

            # add fg targets
            for i in range(rois_fg.shape[0]):
                fg_polys_ind = fg_polys_inds[i]
                poly_gt = polys_gt[fg_polys_ind]
                indexes_rec_rois_gt_chars = np.where(chars_gts[:, 9] == fg_polys_ind)
                chars_gt = chars_gts[indexes_rec_rois_gt_chars, :9]
                roi_fg = rois_fg[i]
                # Rasterize the portion of the polygon mask within the given fg roi
                # to an M_HEIGHT x M_WIDTH binary image
                mask, mask_weight, char_box, char_box_inside_weight = segm_utils.polys_to_mask_wrt_box_rec(chars_gt.copy(), poly_gt, roi_fg.copy(), M_HEIGHT, M_WIDTH, weight_wh=cfg.MRCNN.WEIGHT_WH)
                if DEBUG:
                    draw = ImageDraw.Draw(img)
                    draw.rectangle([(roi_fg[0],roi_fg[1]), (roi_fg[2],roi_fg[3])])
                    img.save('./tests/image.jpg')
                    _visu_global_map(mask[0,:,:].copy(), './tests/proposals_visu_global.jpg')
                    _visu_char_map(mask[1,:,:].copy(), './tests/proposals_visu_char.jpg')
                    _visu_char_box(char_box, char_box_inside_weight, './tests/char_box.jpg', M_HEIGHT, M_WIDTH)
                masks[i, 0, :] = np.reshape(mask[0,:,:], M_HEIGHT*M_WIDTH)
                masks[i, 1, :] = np.reshape(mask[1,:,:], M_HEIGHT*M_WIDTH)
                mask_weights[i, :] = np.reshape(mask_weight, M_HEIGHT*M_WIDTH)
                char_boxes[i, :, :] = np.reshape(char_box, (M_HEIGHT*M_WIDTH, 4))
                char_boxes_inside_weight[i, :, :] = np.reshape(char_box_inside_weight, (M_HEIGHT*M_WIDTH, 4))
        else:  # If there are no fg masks (it does happen)
            # The network cannot handle empty blobs, so we must provide a mask
            # We simply take the first bg roi, give it an all -1's mask (ignore
            # label), and label it with class zero (bg).
            bg_inds = np.where(blobs['labels_int32'] == 0)[0]
            # rois_fg is actually one background roi, but that's ok because ...
            rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1))
            # We give it a -1's blob (ignore label)
            masks = -blob_utils.ones((1, 2, M_HEIGHT*M_WIDTH), int32=True)
            mask_weights = -blob_utils.ones((1, 2, M_HEIGHT*M_WIDTH), int32=True)
            char_boxes = -np.ones((1, M_HEIGHT*M_WIDTH, 4), dtype=np.float32)
            char_boxes_inside_weight = np.zeros((1, M_HEIGHT*M_WIDTH, 4),
                                                dtype=np.float32)
            # We label it with class = 0 (background)
            mask_class_labels = blob_utils.zeros((1, ))
            # Mark that the first roi has a mask
            roi_has_mask[0] = 1
    else:
        fg_inds = gt_inds
        roi_has_mask = np.ones((fg_inds.shape[0], ), dtype=np.int32)

        if fg_inds.shape[0] > 0:
            # Class labels for the foreground rois
            mask_class_labels = np.ones((fg_inds.shape[0], ), dtype=np.int32)
            masks = blob_utils.zeros((fg_inds.shape[0], 2, M_HEIGHT*M_WIDTH), int32=True)
            char_boxes = np.zeros((fg_inds.shape[0], M_HEIGHT*M_WIDTH, 4), dtype=np.float32)
            char_boxes_inside_weight = np.zeros((fg_inds.shape[0], M_HEIGHT*M_WIDTH, 4), dtype=np.float32)
            char_boxes_outside_weight = np.zeros((fg_inds.shape[0], M_HEIGHT*M_WIDTH, 4), dtype=np.float32)
            mask_weights = np.zeros((fg_inds.shape[0], M_HEIGHT*M_WIDTH),
                                    dtype=np.float32)

            rois_fg = gt_boxes
            # print(gt_boxes.shape[0])
            # add fg targets
            for i in range(rois_fg.shape[0]):
                fg_polys_ind = fg_inds[i]
                poly_gt = polys_gt[fg_polys_ind]
                indexes_rec_rois_gt_chars = np.where(chars_gts[:, 9] == fg_polys_ind)
                chars_gt = chars_gts[indexes_rec_rois_gt_chars, :9]
                roi_fg = rois_fg[i]
                # Rasterize the portion of the polygon mask within the given fg roi
                # to an M_HEIGHT x M_WIDTH binary image
                mask, mask_weight, char_box, char_box_inside_weight = \
                    segm_utils.polys_to_mask_wrt_box_rec(
                        chars_gt, poly_gt, roi_fg, M_HEIGHT, M_WIDTH,
                        weight_wh=cfg.MRCNN.WEIGHT_WH)
                if DEBUG:
                    _visu_char_box(char_box, char_box_inside_weight, './tests/char_box.jpg', M_HEIGHT, M_WIDTH)
                mask = np.array(mask, dtype=np.int32)  # Ensure it's binary
                masks[i, 0, :] = np.reshape(mask[0,:,:], M_HEIGHT*M_WIDTH)
                masks[i, 1, :] = np.reshape(mask[1,:,:], M_HEIGHT*M_WIDTH)
                mask_weights[i, :] = np.reshape(mask_weight, M_HEIGHT*M_WIDTH)
                char_boxes[i, :, :] = np.reshape(char_box, (M_HEIGHT*M_WIDTH, 4))
                char_boxes_inside_weight[i, :, :] = np.reshape(char_box_inside_weight, (M_HEIGHT*M_WIDTH, 4))
        else:  # If there are no fg masks (it does happen)
            # The network cannot handle empty blobs, so we must provide a mask
            # We simply take the first bg roi, give it an all -1's mask (ignore
            # label), and label it with class zero (bg).
            bg_inds = np.where(blobs['labels_int32'] == 0)[0]
            # rois_fg is actually one background roi, but that's ok because ...
            rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1))
            # We give it a -1's blob (ignore label)
            masks = -blob_utils.ones((1, 2, M_HEIGHT*M_WIDTH), int32=True)
            mask_weights = -blob_utils.ones((1, 2, M_HEIGHT*M_WIDTH), int32=True)
            char_boxes = -np.ones((1, M_HEIGHT*M_WIDTH, 4), dtype=np.float32)
            char_boxes_inside_weight = np.zeros((1, M_HEIGHT*M_WIDTH, 4),
                                                dtype=np.float32)
            # We label it with class = 0 (background)
            mask_class_labels = blob_utils.zeros((1, ))
            # Mark that the first roi has a mask
            roi_has_mask[0] = 1


    # Scale rois_fg and format as (batch_idx, x1, y1, x2, y2)
    rois_fg *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((rois_fg.shape[0], 1))
    rois_fg = np.hstack((repeated_batch_idx, rois_fg))

    char_boxes_outside_weight = np.array(
        char_boxes_inside_weight > 0, dtype=char_boxes_inside_weight.dtype
    )

    # Update blobs dict with Mask R-CNN blobs
    blobs['mask_rois'] = rois_fg
    blobs['roi_has_mask_int32'] = roi_has_mask
    blobs['masks_global_int32'] = masks[:, 0, :]
    blobs['masks_char_int32'] = masks[:, 1, :].reshape((-1, M_HEIGHT, M_WIDTH))
    blobs['masks_char_weight'] = mask_weights
    blobs['char_bbox_targets'] = char_boxes.reshape((-1,4))
    blobs['char_bbox_inside_weights'] = char_boxes_inside_weight.reshape((-1,4))
    blobs['char_bbox_outside_weights'] = char_boxes_outside_weight.reshape((-1,4))
Example #15
def _sample_pairs(roidb, im_scale, batch_idx):
    """Generate a random sample of RoIs comprising foreground and background
    examples.
    """
    fg_pairs_per_image = cfg.TRAIN.FG_REL_SIZE_PER_IM
    pairs_per_image = int(
        cfg.TRAIN.FG_REL_SIZE_PER_IM /
        cfg.TRAIN.FG_REL_FRACTION)  # need many more pairs since it's quadratic
    max_pair_overlaps = roidb['max_pair_overlaps']

    gt_pair_inds = np.where(max_pair_overlaps > 1.0 - 1e-4)[0]
    fg_pair_inds = np.where((max_pair_overlaps >= cfg.TRAIN.FG_THRESH)
                            & (max_pair_overlaps <= 1.0 - 1e-4))[0]

    fg_pairs_per_this_image = np.minimum(fg_pairs_per_image,
                                         gt_pair_inds.size + fg_pair_inds.size)
    # Sample foreground regions without replacement
    if fg_pair_inds.size > 0:
        fg_pair_inds = npr.choice(fg_pair_inds,
                                  size=(fg_pairs_per_this_image -
                                        gt_pair_inds.size),
                                  replace=False)
    fg_pair_inds = np.append(fg_pair_inds, gt_pair_inds)

    # Label is the class each RoI has max overlap with
    fg_prd_labels = roidb['max_prd_classes'][fg_pair_inds]
    blob_dict = dict(
        fg_prd_labels_int32=fg_prd_labels.astype(np.int32, copy=False))
    if cfg.MODEL.USE_BG:
        bg_pair_inds = np.where(
            (max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0]

        # Compute number of background RoIs to take from this image (guarding
        # against there being fewer than desired)
        bg_pairs_per_this_image = pairs_per_image - fg_pairs_per_this_image
        bg_pairs_per_this_image = np.minimum(bg_pairs_per_this_image,
                                             bg_pair_inds.size)
        # Sample background regions without replacement
        if bg_pair_inds.size > 0:
            bg_pair_inds = npr.choice(bg_pair_inds,
                                      size=bg_pairs_per_this_image,
                                      replace=False)
        keep_pair_inds = np.append(fg_pair_inds, bg_pair_inds)
        all_prd_labels = np.zeros(keep_pair_inds.size, dtype=np.int32)
        all_prd_labels[:fg_pair_inds.size] = fg_prd_labels + 1  # class should start from 1
    else:
        keep_pair_inds = fg_pair_inds
        all_prd_labels = fg_prd_labels
    blob_dict['all_prd_labels_int32'] = all_prd_labels.astype(np.int32,
                                                              copy=False)
    blob_dict['fg_size'] = np.array(
        [fg_pair_inds.size], dtype=np.int32
    )  # this is used to check if there is at least one fg to learn

    sampled_sbj_boxes = roidb['sbj_boxes'][keep_pair_inds]
    sampled_obj_boxes = roidb['obj_boxes'][keep_pair_inds]
    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_sbj_rois = sampled_sbj_boxes * im_scale
    sampled_obj_rois = sampled_obj_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (keep_pair_inds.shape[0], 1))
    sampled_sbj_rois = np.hstack((repeated_batch_idx, sampled_sbj_rois))
    sampled_obj_rois = np.hstack((repeated_batch_idx, sampled_obj_rois))
    blob_dict['sbj_rois'] = sampled_sbj_rois
    blob_dict['obj_rois'] = sampled_obj_rois
    sampled_rel_rois = box_utils_rel.rois_union(sampled_sbj_rois,
                                                sampled_obj_rois)
    blob_dict['rel_rois'] = sampled_rel_rois
    if cfg.MODEL.USE_SPATIAL_FEAT:
        sampled_spt_feat = box_utils_rel.get_spt_features(
            sampled_sbj_boxes, sampled_obj_boxes, roidb['width'],
            roidb['height'])
        blob_dict['spt_feat'] = sampled_spt_feat
    if cfg.MODEL.USE_FREQ_BIAS:
        sbj_labels = roidb['max_sbj_classes'][keep_pair_inds]
        obj_labels = roidb['max_obj_classes'][keep_pair_inds]
        blob_dict['all_sbj_labels_int32'] = sbj_labels.astype(np.int32,
                                                              copy=False)
        blob_dict['all_obj_labels_int32'] = obj_labels.astype(np.int32,
                                                              copy=False)
    if (cfg.MODEL.USE_NODE_CONTRASTIVE_LOSS
            or cfg.MODEL.USE_NODE_CONTRASTIVE_SO_AWARE_LOSS
            or cfg.MODEL.USE_NODE_CONTRASTIVE_P_AWARE_LOSS):
        nodes_per_image = cfg.MODEL.NODE_SAMPLE_SIZE
        max_sbj_overlaps = roidb['max_sbj_overlaps']
        max_obj_overlaps = roidb['max_obj_overlaps']
        # sbj
        # A natural assumption here: each positive sbj has at least one
        # positive obj
        sbj_pos_pair_pos_inds = np.where(
            (max_pair_overlaps >= cfg.TRAIN.FG_THRESH))[0]
        sbj_pos_obj_pos_pair_neg_inds = np.where(
            (max_sbj_overlaps >= cfg.TRAIN.FG_THRESH)
            & (max_obj_overlaps >= cfg.TRAIN.FG_THRESH)
            & (max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0]
        sbj_pos_obj_neg_pair_neg_inds = np.where(
            (max_sbj_overlaps >= cfg.TRAIN.FG_THRESH)
            & (max_obj_overlaps < cfg.TRAIN.FG_THRESH)
            & (max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0]
        if sbj_pos_pair_pos_inds.size > 0:
            sbj_pos_pair_pos_inds = npr.choice(
                sbj_pos_pair_pos_inds,
                size=int(min(nodes_per_image, sbj_pos_pair_pos_inds.size)),
                replace=False)
        if sbj_pos_obj_pos_pair_neg_inds.size > 0:
            sbj_pos_obj_pos_pair_neg_inds = npr.choice(
                sbj_pos_obj_pos_pair_neg_inds,
                size=int(
                    min(nodes_per_image, sbj_pos_obj_pos_pair_neg_inds.size)),
                replace=False)
        sbj_pos_pair_neg_inds = sbj_pos_obj_pos_pair_neg_inds
        if nodes_per_image - sbj_pos_obj_pos_pair_neg_inds.size > 0 and sbj_pos_obj_neg_pair_neg_inds.size > 0:
            sbj_pos_obj_neg_pair_neg_inds = npr.choice(
                sbj_pos_obj_neg_pair_neg_inds,
                size=int(
                    min(nodes_per_image - sbj_pos_obj_pos_pair_neg_inds.size,
                        sbj_pos_obj_neg_pair_neg_inds.size)),
                replace=False)
            sbj_pos_pair_neg_inds = np.append(sbj_pos_pair_neg_inds,
                                              sbj_pos_obj_neg_pair_neg_inds)
        sbj_pos_inds = np.append(sbj_pos_pair_pos_inds, sbj_pos_pair_neg_inds)
        binary_labels_sbj_pos = np.zeros(sbj_pos_inds.size, dtype=np.int32)
        binary_labels_sbj_pos[:sbj_pos_pair_pos_inds.size] = 1
        blob_dict['binary_labels_sbj_pos_int32'] = binary_labels_sbj_pos.astype(
            np.int32, copy=False)
        prd_pos_labels_sbj_pos = roidb['max_prd_classes'][sbj_pos_pair_pos_inds]
        prd_labels_sbj_pos = np.zeros(sbj_pos_inds.size, dtype=np.int32)
        prd_labels_sbj_pos[:sbj_pos_pair_pos_inds.size] = prd_pos_labels_sbj_pos + 1
        blob_dict['prd_labels_sbj_pos_int32'] = prd_labels_sbj_pos.astype(
            np.int32, copy=False)
        sbj_labels_sbj_pos = roidb['max_sbj_classes'][sbj_pos_inds] + 1
        # 1. set all obj labels > 0
        obj_labels_sbj_pos = roidb['max_obj_classes'][sbj_pos_inds] + 1
        # 2. find those negative obj
        max_obj_overlaps_sbj_pos = roidb['max_obj_overlaps'][sbj_pos_inds]
        obj_neg_inds_sbj_pos = np.where(
            max_obj_overlaps_sbj_pos < cfg.TRAIN.FG_THRESH)[0]
        obj_labels_sbj_pos[obj_neg_inds_sbj_pos] = 0
        blob_dict['sbj_labels_sbj_pos_int32'] = sbj_labels_sbj_pos.astype(
            np.int32, copy=False)
        blob_dict['obj_labels_sbj_pos_int32'] = obj_labels_sbj_pos.astype(
            np.int32, copy=False)
        # this is for freq bias in RelDN
        blob_dict['sbj_labels_sbj_pos_fg_int32'] = roidb['max_sbj_classes'][
            sbj_pos_inds].astype(np.int32, copy=False)
        blob_dict['obj_labels_sbj_pos_fg_int32'] = roidb['max_obj_classes'][
            sbj_pos_inds].astype(np.int32, copy=False)

        sampled_sbj_boxes_sbj_pos = roidb['sbj_boxes'][sbj_pos_inds]
        sampled_obj_boxes_sbj_pos = roidb['obj_boxes'][sbj_pos_inds]
        # Scale rois and format as (batch_idx, x1, y1, x2, y2)
        sampled_sbj_rois_sbj_pos = sampled_sbj_boxes_sbj_pos * im_scale
        sampled_obj_rois_sbj_pos = sampled_obj_boxes_sbj_pos * im_scale
        repeated_batch_idx = batch_idx * blob_utils.ones(
            (sbj_pos_inds.shape[0], 1))
        sampled_sbj_rois_sbj_pos = np.hstack(
            (repeated_batch_idx, sampled_sbj_rois_sbj_pos))
        sampled_obj_rois_sbj_pos = np.hstack(
            (repeated_batch_idx, sampled_obj_rois_sbj_pos))
        blob_dict['sbj_rois_sbj_pos'] = sampled_sbj_rois_sbj_pos
        blob_dict['obj_rois_sbj_pos'] = sampled_obj_rois_sbj_pos
        sampled_rel_rois_sbj_pos = box_utils_rel.rois_union(
            sampled_sbj_rois_sbj_pos, sampled_obj_rois_sbj_pos)
        blob_dict['rel_rois_sbj_pos'] = sampled_rel_rois_sbj_pos
        _, inds_unique_sbj_pos, inds_reverse_sbj_pos = np.unique(
            sampled_sbj_rois_sbj_pos,
            return_index=True,
            return_inverse=True,
            axis=0)
        assert inds_reverse_sbj_pos.shape[0] == sampled_sbj_rois_sbj_pos.shape[0]
        blob_dict['inds_unique_sbj_pos'] = inds_unique_sbj_pos
        blob_dict['inds_reverse_sbj_pos'] = inds_reverse_sbj_pos
        if cfg.MODEL.USE_SPATIAL_FEAT:
            sampled_spt_feat_sbj_pos = box_utils_rel.get_spt_features(
                sampled_sbj_boxes_sbj_pos, sampled_obj_boxes_sbj_pos,
                roidb['width'], roidb['height'])
            blob_dict['spt_feat_sbj_pos'] = sampled_spt_feat_sbj_pos
        # obj
        # A natural assumption here: each positive obj has at least one
        # positive sbj
        obj_pos_pair_pos_inds = np.where(
            (max_pair_overlaps >= cfg.TRAIN.FG_THRESH))[0]
        obj_pos_sbj_pos_pair_neg_inds = np.where(
            (max_obj_overlaps >= cfg.TRAIN.FG_THRESH)
            & (max_sbj_overlaps >= cfg.TRAIN.FG_THRESH)
            & (max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0]
        obj_pos_sbj_neg_pair_neg_inds = np.where(
            (max_obj_overlaps >= cfg.TRAIN.FG_THRESH)
            & (max_sbj_overlaps < cfg.TRAIN.FG_THRESH)
            & (max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0]
        if obj_pos_pair_pos_inds.size > 0:
            obj_pos_pair_pos_inds = npr.choice(
                obj_pos_pair_pos_inds,
                size=int(min(nodes_per_image, obj_pos_pair_pos_inds.size)),
                replace=False)
        if obj_pos_sbj_pos_pair_neg_inds.size > 0:
            obj_pos_sbj_pos_pair_neg_inds = npr.choice(
                obj_pos_sbj_pos_pair_neg_inds,
                size=int(
                    min(nodes_per_image, obj_pos_sbj_pos_pair_neg_inds.size)),
                replace=False)
        obj_pos_pair_neg_inds = obj_pos_sbj_pos_pair_neg_inds
        if nodes_per_image - obj_pos_sbj_pos_pair_neg_inds.size > 0 and obj_pos_sbj_neg_pair_neg_inds.size > 0:
            obj_pos_sbj_neg_pair_neg_inds = npr.choice(
                obj_pos_sbj_neg_pair_neg_inds,
                size=int(
                    min(nodes_per_image - obj_pos_sbj_pos_pair_neg_inds.size,
                        obj_pos_sbj_neg_pair_neg_inds.size)),
                replace=False)
            obj_pos_pair_neg_inds = np.append(obj_pos_pair_neg_inds,
                                              obj_pos_sbj_neg_pair_neg_inds)
        obj_pos_inds = np.append(obj_pos_pair_pos_inds, obj_pos_pair_neg_inds)
        binary_labels_obj_pos = np.zeros(obj_pos_inds.size, dtype=np.int32)
        binary_labels_obj_pos[:obj_pos_pair_pos_inds.size] = 1
        blob_dict['binary_labels_obj_pos_int32'] = binary_labels_obj_pos.astype(
            np.int32, copy=False)
        prd_pos_labels_obj_pos = roidb['max_prd_classes'][obj_pos_pair_pos_inds]
        prd_labels_obj_pos = np.zeros(obj_pos_inds.size, dtype=np.int32)
        prd_labels_obj_pos[:obj_pos_pair_pos_inds.size] = prd_pos_labels_obj_pos + 1
        blob_dict['prd_labels_obj_pos_int32'] = prd_labels_obj_pos.astype(
            np.int32, copy=False)
        obj_labels_obj_pos = roidb['max_obj_classes'][obj_pos_inds] + 1
        # 1. set all sbj labels > 0
        sbj_labels_obj_pos = roidb['max_sbj_classes'][obj_pos_inds] + 1
        # 2. find those negative sbj
        max_sbj_overlaps_obj_pos = roidb['max_sbj_overlaps'][obj_pos_inds]
        sbj_neg_inds_obj_pos = np.where(
            max_sbj_overlaps_obj_pos < cfg.TRAIN.FG_THRESH)[0]
        sbj_labels_obj_pos[sbj_neg_inds_obj_pos] = 0
        blob_dict['sbj_labels_obj_pos_int32'] = sbj_labels_obj_pos.astype(
            np.int32, copy=False)
        blob_dict['obj_labels_obj_pos_int32'] = obj_labels_obj_pos.astype(
            np.int32, copy=False)
        # this is for freq bias in RelDN
        blob_dict['sbj_labels_obj_pos_fg_int32'] = roidb['max_sbj_classes'][
            obj_pos_inds].astype(np.int32, copy=False)
        blob_dict['obj_labels_obj_pos_fg_int32'] = roidb['max_obj_classes'][
            obj_pos_inds].astype(np.int32, copy=False)

        sampled_sbj_boxes_obj_pos = roidb['sbj_boxes'][obj_pos_inds]
        sampled_obj_boxes_obj_pos = roidb['obj_boxes'][obj_pos_inds]
        # Scale rois and format as (batch_idx, x1, y1, x2, y2)
        sampled_sbj_rois_obj_pos = sampled_sbj_boxes_obj_pos * im_scale
        sampled_obj_rois_obj_pos = sampled_obj_boxes_obj_pos * im_scale
        repeated_batch_idx = batch_idx * blob_utils.ones(
            (obj_pos_inds.shape[0], 1))
        sampled_sbj_rois_obj_pos = np.hstack(
            (repeated_batch_idx, sampled_sbj_rois_obj_pos))
        sampled_obj_rois_obj_pos = np.hstack(
            (repeated_batch_idx, sampled_obj_rois_obj_pos))
        blob_dict['sbj_rois_obj_pos'] = sampled_sbj_rois_obj_pos
        blob_dict['obj_rois_obj_pos'] = sampled_obj_rois_obj_pos
        sampled_rel_rois_obj_pos = box_utils_rel.rois_union(
            sampled_sbj_rois_obj_pos, sampled_obj_rois_obj_pos)
        blob_dict['rel_rois_obj_pos'] = sampled_rel_rois_obj_pos
        _, inds_unique_obj_pos, inds_reverse_obj_pos = np.unique(
            sampled_obj_rois_obj_pos,
            return_index=True,
            return_inverse=True,
            axis=0)
        assert inds_reverse_obj_pos.shape[0] == sampled_obj_rois_obj_pos.shape[0]
        blob_dict['inds_unique_obj_pos'] = inds_unique_obj_pos
        blob_dict['inds_reverse_obj_pos'] = inds_reverse_obj_pos
        if cfg.MODEL.USE_SPATIAL_FEAT:
            sampled_spt_feat_obj_pos = box_utils_rel.get_spt_features(
                sampled_sbj_boxes_obj_pos, sampled_obj_boxes_obj_pos,
                roidb['width'], roidb['height'])
            blob_dict['spt_feat_obj_pos'] = sampled_spt_feat_obj_pos

    return blob_dict
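
rel_rois above come from box_utils_rel.rois_union; under the (batch_idx, x1, y1, x2, y2) layout used throughout, a plausible numpy rendering of that union is (a sketch, not the actual implementation):

import numpy as np

def rois_union(sbj_rois, obj_rois):
    # assumes both arrays share the same batch index column
    xy_min = np.minimum(sbj_rois[:, 1:3], obj_rois[:, 1:3])
    xy_max = np.maximum(sbj_rois[:, 3:5], obj_rois[:, 3:5])
    return np.hstack((sbj_rois[:, :1], xy_min, xy_max))

s = np.array([[0., 0., 0., 10., 10.]])
o = np.array([[0., 5., 5., 20., 15.]])
print(rois_union(s, o))  # [[ 0.  0.  0. 20. 15.]]
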
    def _forward(self,
                 data,
                 im_info,
                 dataset_name=None,
                 roidb=None,
                 use_gt_labels=False,
                 include_feat=False,
                 **rpn_kwargs):
        im_data = data
        if self.training:
            roidb = list(map(lambda x: blob_utils.deserialize(x)[0], roidb))
        if dataset_name is not None:
            dataset_name = blob_utils.deserialize(dataset_name)
        else:
            dataset_name = cfg.TRAIN.DATASETS[0] if self.training \
                else cfg.TEST.DATASETS[0]  # assuming only one dataset per run

        device_id = im_data.get_device()

        return_dict = {}  # A dict to collect return variables

        blob_conv = self.Conv_Body(im_data)
        blob_conv_prd = self.Prd_RCNN.Conv_Body(im_data)

        if cfg.FPN.FPN_ON:
            # Retain only the blobs that will be used for RoI heads. `blob_conv` may include
            # extra blobs that are used for RPN proposals, but not for RoI heads.
            blob_conv = blob_conv[-self.num_roi_levels:]
            blob_conv_prd = blob_conv_prd[-self.num_roi_levels:]

        if not cfg.TRAIN.USE_GT_BOXES:
            rpn_ret = self.RPN(blob_conv, im_info, roidb)

            if cfg.MODEL.SHARE_RES5 and self.training:
                box_feat, res5_feat = self.Box_Head(blob_conv,
                                                    rpn_ret,
                                                    use_relu=True)
            else:
                box_feat = self.Box_Head(blob_conv, rpn_ret, use_relu=True)
            cls_score, bbox_pred = self.Box_Outs(box_feat)

        # now go through the predicate branch
        use_relu = not cfg.MODEL.NO_FC7_RELU
        if self.training:
            if cfg.TRAIN.USE_GT_BOXES:
                # we always feed one image per batch during training
                assert len(roidb) == 1
                im_scale = im_info.data.numpy()[:, 2][0]
                im_w = im_info.data.numpy()[:, 1][0]
                im_h = im_info.data.numpy()[:, 0][0]
                sbj_boxes = roidb[0]['sbj_gt_boxes']
                obj_boxes = roidb[0]['obj_gt_boxes']
                sbj_all_boxes = _augment_gt_boxes_by_perturbation(
                    sbj_boxes, im_w, im_h)
                obj_all_boxes = _augment_gt_boxes_by_perturbation(
                    obj_boxes, im_w, im_h)
                det_all_boxes = np.vstack((sbj_all_boxes, obj_all_boxes))
                det_all_boxes = np.unique(det_all_boxes, axis=0)
                det_all_rois = det_all_boxes * im_scale
                repeated_batch_idx = 0 * blob_utils.ones(
                    (det_all_rois.shape[0], 1))
                det_all_rois = np.hstack((repeated_batch_idx, det_all_rois))
                rel_ret = self.RelPN(det_all_rois, None, None, im_info,
                                     dataset_name, roidb)
            else:
                fg_inds = np.where(rpn_ret['labels_int32'] > 0)[0]
                det_rois = rpn_ret['rois'][fg_inds]
                det_labels = rpn_ret['labels_int32'][fg_inds]
                det_scores = F.softmax(cls_score[fg_inds], dim=1)
                rel_ret = self.RelPN(det_rois, det_labels, det_scores, im_info,
                                     dataset_name, roidb)
            sbj_feat = self.Box_Head(blob_conv,
                                     rel_ret,
                                     rois_name='sbj_rois',
                                     use_relu=use_relu)
            obj_feat = self.Box_Head(blob_conv,
                                     rel_ret,
                                     rois_name='obj_rois',
                                     use_relu=use_relu)
        else:
            if roidb is not None:
                im_scale = im_info.data.numpy()[:, 2][0]
                im_w = im_info.data.numpy()[:, 1][0]
                im_h = im_info.data.numpy()[:, 0][0]
                sbj_boxes = roidb['sbj_gt_boxes']
                obj_boxes = roidb['obj_gt_boxes']
                sbj_rois = sbj_boxes * im_scale
                obj_rois = obj_boxes * im_scale
                repeated_batch_idx = 0 * blob_utils.ones(
                    (sbj_rois.shape[0], 1))
                sbj_rois = np.hstack((repeated_batch_idx, sbj_rois))
                obj_rois = np.hstack((repeated_batch_idx, obj_rois))
                rel_rois = box_utils.rois_union(sbj_rois, obj_rois)
                rel_ret = {}
                rel_ret['sbj_rois'] = sbj_rois
                rel_ret['obj_rois'] = obj_rois
                rel_ret['rel_rois'] = rel_rois
                if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS:
                    lvl_min = cfg.FPN.ROI_MIN_LEVEL
                    lvl_max = cfg.FPN.ROI_MAX_LEVEL
                    rois_blob_names = ['sbj_rois', 'obj_rois', 'rel_rois']
                    for rois_blob_name in rois_blob_names:
                        # Add per FPN level roi blobs named like: <rois_blob_name>_fpn<lvl>
                        target_lvls = fpn_utils.map_rois_to_fpn_levels(
                            rel_ret[rois_blob_name][:, 1:5], lvl_min, lvl_max)
                        fpn_utils.add_multilevel_roi_blobs(
                            rel_ret, rois_blob_name, rel_ret[rois_blob_name],
                            target_lvls, lvl_min, lvl_max)
                if use_gt_labels:
                    sbj_labels = roidb['sbj_gt_classes']  # start from 0
                    obj_labels = roidb['obj_gt_classes']  # start from 0
                    sbj_scores = np.ones_like(sbj_labels, dtype=np.float32)
                    obj_scores = np.ones_like(obj_labels, dtype=np.float32)
                else:
                    sbj_det_feat = self.Box_Head(blob_conv,
                                                 rel_ret,
                                                 rois_name='sbj_rois',
                                                 use_relu=True)
                    sbj_cls_scores, _ = self.Box_Outs(sbj_det_feat)
                    sbj_cls_scores = sbj_cls_scores.data.cpu().numpy()
                    obj_det_feat = self.Box_Head(blob_conv,
                                                 rel_ret,
                                                 rois_name='obj_rois',
                                                 use_relu=True)
                    obj_cls_scores, _ = self.Box_Outs(obj_det_feat)
                    obj_cls_scores = obj_cls_scores.data.cpu().numpy()
                    sbj_labels = np.argmax(sbj_cls_scores[:, 1:], axis=1)
                    obj_labels = np.argmax(obj_cls_scores[:, 1:], axis=1)
                    sbj_scores = np.amax(sbj_cls_scores[:, 1:], axis=1)
                    obj_scores = np.amax(obj_cls_scores[:, 1:], axis=1)
                rel_ret['sbj_scores'] = sbj_scores.astype(np.float32,
                                                          copy=False)
                rel_ret['obj_scores'] = obj_scores.astype(np.float32,
                                                          copy=False)
                rel_ret['sbj_labels'] = sbj_labels.astype(
                    np.int32, copy=False) + 1  # need to start from 1
                rel_ret['obj_labels'] = obj_labels.astype(
                    np.int32, copy=False) + 1  # need to start from 1
                rel_ret['all_sbj_labels_int32'] = sbj_labels.astype(np.int32,
                                                                    copy=False)
                rel_ret['all_obj_labels_int32'] = obj_labels.astype(np.int32,
                                                                    copy=False)
                sbj_feat = self.Box_Head(blob_conv,
                                         rel_ret,
                                         rois_name='sbj_rois',
                                         use_relu=use_relu)
                obj_feat = self.Box_Head(blob_conv,
                                         rel_ret,
                                         rois_name='obj_rois',
                                         use_relu=use_relu)
            else:
                score_thresh = cfg.TEST.SCORE_THRESH
                while score_thresh >= -1e-06:  # a negative value very close to 0.0
                    det_rois, det_labels, det_scores = \
                        self.prepare_det_rois(rpn_ret['rois'], cls_score, bbox_pred, im_info, score_thresh)
                    rel_ret = self.RelPN(det_rois, det_labels, det_scores,
                                         im_info, dataset_name, roidb)
                    valid_len = len(rel_ret['rel_rois'])
                    if valid_len > 0:
                        break
                    logger.info(
                        'Got {} rel_rois when score_thresh={}, changing to {}'.
                        format(valid_len, score_thresh, score_thresh - 0.01))
                    score_thresh -= 0.01
                det_feat = self.Box_Head(blob_conv,
                                         rel_ret,
                                         rois_name='det_rois',
                                         use_relu=use_relu)
                sbj_feat = det_feat[rel_ret['sbj_inds']]
                obj_feat = det_feat[rel_ret['obj_inds']]

        rel_feat = self.Prd_RCNN.Box_Head(blob_conv_prd,
                                          rel_ret,
                                          rois_name='rel_rois',
                                          use_relu=use_relu)

        concat_feat = torch.cat((sbj_feat, rel_feat, obj_feat), dim=1)

        if cfg.MODEL.USE_FREQ_BIAS or cfg.MODEL.RUN_BASELINE or cfg.MODEL.USE_SEM_CONCAT:
            sbj_labels = rel_ret['all_sbj_labels_int32']
            obj_labels = rel_ret['all_obj_labels_int32']
        else:
            sbj_labels = None
            obj_labels = None

        # when MODEL.USE_SEM_CONCAT, memory runs out if the whole batch is fed
        # at once, so we feed it in chunks of gn_size if it's big
        gn_size = 1000
        if cfg.MODEL.USE_SEM_CONCAT and concat_feat.shape[0] > gn_size:
            group = int(math.ceil(concat_feat.shape[0] / gn_size))
            prd_cls_scores = None
            sbj_cls_scores = None
            obj_cls_scores = None
            for i in range(group):
                end = int(min((i + 1) * gn_size, concat_feat.shape[0]))
                concat_feat_i = concat_feat[i * gn_size:end]
                sbj_labels_i = sbj_labels[
                    i * gn_size:end] if sbj_labels is not None else None
                obj_labels_i = obj_labels[
                    i * gn_size:end] if obj_labels is not None else None
                sbj_feat_i = sbj_feat[i * gn_size:end]
                obj_feat_i = obj_feat[i * gn_size:end]
                prd_cls_scores_i, sbj_cls_scores_i, obj_cls_scores_i = \
                    self.RelDN(concat_feat_i, sbj_labels_i, obj_labels_i, sbj_feat_i, obj_feat_i)
                if prd_cls_scores is None:
                    prd_cls_scores = prd_cls_scores_i
                    sbj_cls_scores = sbj_cls_scores_i
                    obj_cls_scores = obj_cls_scores_i
                else:
                    prd_cls_scores = torch.cat(
                        (prd_cls_scores, prd_cls_scores_i))
                    sbj_cls_scores = torch.cat(
                        (sbj_cls_scores, sbj_cls_scores_i
                         )) if sbj_cls_scores_i is not None else sbj_cls_scores
                    obj_cls_scores = torch.cat(
                        (obj_cls_scores, obj_cls_scores_i
                         )) if obj_cls_scores_i is not None else obj_cls_scores
        else:
            prd_cls_scores, sbj_cls_scores, obj_cls_scores = \
                    self.RelDN(concat_feat, sbj_labels, obj_labels, sbj_feat, obj_feat)

        if self.training:
            return_dict['losses'] = {}
            return_dict['metrics'] = {}
            if not cfg.TRAIN.USE_GT_BOXES:
                # rpn loss
                rpn_kwargs.update(
                    dict((k, rpn_ret[k]) for k in rpn_ret.keys()
                         if (k.startswith('rpn_cls_logits')
                             or k.startswith('rpn_bbox_pred'))))
                loss_rpn_cls, loss_rpn_bbox = rpn_heads.generic_rpn_losses(
                    **rpn_kwargs)
                if cfg.FPN.FPN_ON:
                    for i, lvl in enumerate(
                            range(cfg.FPN.RPN_MIN_LEVEL,
                                  cfg.FPN.RPN_MAX_LEVEL + 1)):
                        return_dict['losses']['loss_rpn_cls_fpn%d' %
                                              lvl] = loss_rpn_cls[i]
                        return_dict['losses']['loss_rpn_bbox_fpn%d' %
                                              lvl] = loss_rpn_bbox[i]
                else:
                    return_dict['losses']['loss_rpn_cls'] = loss_rpn_cls
                    return_dict['losses']['loss_rpn_bbox'] = loss_rpn_bbox
                # bbox loss
                loss_cls, loss_bbox, accuracy_cls = fast_rcnn_heads.fast_rcnn_losses(
                    cls_score, bbox_pred, rpn_ret['labels_int32'],
                    rpn_ret['bbox_targets'], rpn_ret['bbox_inside_weights'],
                    rpn_ret['bbox_outside_weights'])
                return_dict['losses']['loss_cls'] = loss_cls
                return_dict['losses']['loss_bbox'] = loss_bbox
                return_dict['metrics']['accuracy_cls'] = accuracy_cls
            loss_cls_prd, accuracy_cls_prd = reldn_heads.reldn_losses(
                prd_cls_scores,
                rel_ret['all_prd_labels_int32'],
                weight=self.prd_weights)
            return_dict['losses']['loss_cls_prd'] = loss_cls_prd
            return_dict['metrics']['accuracy_cls_prd'] = accuracy_cls_prd
            if cfg.MODEL.USE_SEPARATE_SO_SCORES:
                loss_cls_sbj, accuracy_cls_sbj = reldn_heads.reldn_losses(
                    sbj_cls_scores,
                    rel_ret['all_sbj_labels_int32'],
                    weight=self.obj_weights)
                return_dict['losses']['loss_cls_sbj'] = loss_cls_sbj
                return_dict['metrics']['accuracy_cls_sbj'] = accuracy_cls_sbj
                loss_cls_obj, accuracy_cls_obj = reldn_heads.reldn_losses(
                    obj_cls_scores,
                    rel_ret['all_obj_labels_int32'],
                    weight=self.obj_weights)
                return_dict['losses']['loss_cls_obj'] = loss_cls_obj
                return_dict['metrics']['accuracy_cls_obj'] = accuracy_cls_obj

            if cfg.TRAIN.HUBNESS:
                loss_hubness_prd = reldn_heads.add_hubness_loss(prd_cls_scores)
                loss_hubness_sbj = reldn_heads.add_hubness_loss(sbj_cls_scores)
                loss_hubness_obj = reldn_heads.add_hubness_loss(obj_cls_scores)
                return_dict['losses']['loss_hubness_prd'] = loss_hubness_prd
                return_dict['losses']['loss_hubness_sbj'] = loss_hubness_sbj
                return_dict['losses']['loss_hubness_obj'] = loss_hubness_obj

            # PyTorch 0.4 bug when gathering scalar (0-dim) tensors
            for k, v in return_dict['losses'].items():
                return_dict['losses'][k] = v.unsqueeze(0)
            for k, v in return_dict['metrics'].items():
                return_dict['metrics'][k] = v.unsqueeze(0)
        else:
            # Testing
            return_dict['sbj_rois'] = rel_ret['sbj_rois']
            return_dict['obj_rois'] = rel_ret['obj_rois']
            return_dict['sbj_labels'] = rel_ret['sbj_labels']
            return_dict['obj_labels'] = rel_ret['obj_labels']
            return_dict['sbj_scores'] = rel_ret['sbj_scores']
            return_dict['sbj_scores_out'] = sbj_cls_scores
            return_dict['obj_scores'] = rel_ret['obj_scores']
            return_dict['obj_scores_out'] = obj_cls_scores
            return_dict['prd_scores'] = prd_cls_scores
            if include_feat:
                return_dict['sbj_feat'] = sbj_feat
                return_dict['obj_feat'] = obj_feat
                return_dict['prd_feat'] = concat_feat

        return return_dict
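
A minimal sketch of the chunked-evaluation pattern used above: run a head over fixed-size row groups and concatenate the outputs, which bounds peak activation memory for large numbers of relation pairs. `run_in_groups` and `head_fn` are illustrative names, not identifiers from this codebase.

import torch

def run_in_groups(head_fn, feat, gn_size=1000):
    """Apply head_fn to feat in row chunks of at most gn_size, then
    concatenate the results along dim 0 (slicing clamps the last chunk)."""
    outs = []
    for start in range(0, feat.shape[0], gn_size):
        outs.append(head_fn(feat[start:start + gn_size]))
    return torch.cat(outs, dim=0)

# Example: scores = run_in_groups(lambda f: f @ torch.ones(f.shape[1], 1),
#                                 torch.randn(2500, 8))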
    def _forward(self,
                 data,
                 im_info,
                 do_vis=False,
                 dataset_name=None,
                 roidb=None,
                 use_gt_labels=False,
                 **rpn_kwargs):
        im_data = data
        if self.training:
            roidb = list(map(lambda x: blob_utils.deserialize(x)[0], roidb))
        if dataset_name is not None:
            dataset_name = blob_utils.deserialize(dataset_name)
        else:
            dataset_name = cfg.TRAIN.DATASETS[
                0] if self.training else cfg.TEST.DATASETS[
                    0]  # assuming only one dataset per run

        device_id = im_data.get_device()

        return_dict = {}  # A dict to collect return variables

        blob_conv = self.Conv_Body(im_data)
        if not cfg.MODEL.USE_REL_PYRAMID:
            blob_conv_prd = self.Prd_RCNN.Conv_Body(im_data)

        rpn_ret = self.RPN(blob_conv, im_info, roidb)

        if cfg.FPN.FPN_ON:
            # Retain only the blobs that will be used for RoI heads. `blob_conv` may include
            # extra blobs that are used for RPN proposals, but not for RoI heads.
            blob_conv = blob_conv[-self.num_roi_levels:]
            if not cfg.MODEL.USE_REL_PYRAMID:
                blob_conv_prd = blob_conv_prd[-self.num_roi_levels:]
            else:
                blob_conv_prd = self.RelPyramid(blob_conv)

        if cfg.MODEL.SHARE_RES5 and self.training:
            box_feat, res5_feat = self.Box_Head(blob_conv,
                                                rpn_ret,
                                                use_relu=True)
        else:
            box_feat = self.Box_Head(blob_conv, rpn_ret, use_relu=True)
        cls_score, bbox_pred = self.Box_Outs(box_feat)

        # now go through the predicate branch
        use_relu = not cfg.MODEL.NO_FC7_RELU
        if self.training:
            fg_inds = np.where(rpn_ret['labels_int32'] > 0)[0]
            det_rois = rpn_ret['rois'][fg_inds]
            det_labels = rpn_ret['labels_int32'][fg_inds]
            det_scores = F.softmax(cls_score[fg_inds], dim=1)
            rel_ret = self.RelPN(det_rois, det_labels, det_scores, im_info,
                                 dataset_name, roidb)
            if cfg.MODEL.ADD_SO_SCORES:
                sbj_feat = self.S_Head(blob_conv,
                                       rel_ret,
                                       rois_name='sbj_rois',
                                       use_relu=use_relu)
                obj_feat = self.O_Head(blob_conv,
                                       rel_ret,
                                       rois_name='obj_rois',
                                       use_relu=use_relu)
            else:
                sbj_feat = self.Box_Head(blob_conv,
                                         rel_ret,
                                         rois_name='sbj_rois',
                                         use_relu=use_relu)
                obj_feat = self.Box_Head(blob_conv,
                                         rel_ret,
                                         rois_name='obj_rois',
                                         use_relu=use_relu)
            if cfg.MODEL.USE_NODE_CONTRASTIVE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_SO_AWARE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_P_AWARE_LOSS:
                if cfg.MODEL.ADD_SO_SCORES:
                    # sbj
                    sbj_feat_sbj_pos = self.S_Head(
                        blob_conv,
                        rel_ret,
                        rois_name='sbj_rois_sbj_pos',
                        use_relu=use_relu)
                    obj_feat_sbj_pos = self.O_Head(
                        blob_conv,
                        rel_ret,
                        rois_name='obj_rois_sbj_pos',
                        use_relu=use_relu)
                    # obj
                    sbj_feat_obj_pos = self.S_Head(
                        blob_conv,
                        rel_ret,
                        rois_name='sbj_rois_obj_pos',
                        use_relu=use_relu)
                    obj_feat_obj_pos = self.O_Head(
                        blob_conv,
                        rel_ret,
                        rois_name='obj_rois_obj_pos',
                        use_relu=use_relu)
                else:
                    # sbj
                    sbj_feat_sbj_pos = self.Box_Head(
                        blob_conv,
                        rel_ret,
                        rois_name='sbj_rois_sbj_pos',
                        use_relu=use_relu)
                    obj_feat_sbj_pos = self.Box_Head(
                        blob_conv,
                        rel_ret,
                        rois_name='obj_rois_sbj_pos',
                        use_relu=use_relu)
                    # obj
                    sbj_feat_obj_pos = self.Box_Head(
                        blob_conv,
                        rel_ret,
                        rois_name='sbj_rois_obj_pos',
                        use_relu=use_relu)
                    obj_feat_obj_pos = self.Box_Head(
                        blob_conv,
                        rel_ret,
                        rois_name='obj_rois_obj_pos',
                        use_relu=use_relu)
        else:
            if roidb is not None:
                im_scale = im_info.data.numpy()[:, 2][0]
                im_w = im_info.data.numpy()[:, 1][0]
                im_h = im_info.data.numpy()[:, 0][0]
                sbj_boxes = roidb['sbj_gt_boxes']
                obj_boxes = roidb['obj_gt_boxes']
                sbj_rois = sbj_boxes * im_scale
                obj_rois = obj_boxes * im_scale
                repeated_batch_idx = 0 * blob_utils.ones(
                    (sbj_rois.shape[0], 1))
                sbj_rois = np.hstack((repeated_batch_idx, sbj_rois))
                obj_rois = np.hstack((repeated_batch_idx, obj_rois))
                rel_rois = box_utils_rel.rois_union(sbj_rois, obj_rois)
                rel_ret = {}
                rel_ret['sbj_rois'] = sbj_rois
                rel_ret['obj_rois'] = obj_rois
                rel_ret['rel_rois'] = rel_rois
                if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS:
                    lvl_min = cfg.FPN.ROI_MIN_LEVEL
                    lvl_max = cfg.FPN.ROI_MAX_LEVEL
                    rois_blob_names = ['sbj_rois', 'obj_rois', 'rel_rois']
                    for rois_blob_name in rois_blob_names:
                        # Add per FPN level roi blobs named like: <rois_blob_name>_fpn<lvl>
                        target_lvls = fpn_utils.map_rois_to_fpn_levels(
                            rel_ret[rois_blob_name][:, 1:5], lvl_min, lvl_max)
                        fpn_utils.add_multilevel_roi_blobs(
                            rel_ret, rois_blob_name, rel_ret[rois_blob_name],
                            target_lvls, lvl_min, lvl_max)
                sbj_det_feat = self.Box_Head(blob_conv,
                                             rel_ret,
                                             rois_name='sbj_rois',
                                             use_relu=True)
                sbj_cls_scores, _ = self.Box_Outs(sbj_det_feat)
                sbj_cls_scores = sbj_cls_scores.data.cpu().numpy()
                obj_det_feat = self.Box_Head(blob_conv,
                                             rel_ret,
                                             rois_name='obj_rois',
                                             use_relu=True)
                obj_cls_scores, _ = self.Box_Outs(obj_det_feat)
                obj_cls_scores = obj_cls_scores.data.cpu().numpy()
                if use_gt_labels:
                    sbj_labels = roidb['sbj_gt_classes']  # start from 0
                    obj_labels = roidb['obj_gt_classes']  # start from 0
                    sbj_scores = np.ones_like(sbj_labels, dtype=np.float32)
                    obj_scores = np.ones_like(obj_labels, dtype=np.float32)
                else:
                    sbj_labels = np.argmax(sbj_cls_scores[:, 1:], axis=1)
                    obj_labels = np.argmax(obj_cls_scores[:, 1:], axis=1)
                    sbj_scores = np.amax(sbj_cls_scores[:, 1:], axis=1)
                    obj_scores = np.amax(obj_cls_scores[:, 1:], axis=1)
                rel_ret['sbj_scores'] = sbj_scores.astype(np.float32,
                                                          copy=False)
                rel_ret['obj_scores'] = obj_scores.astype(np.float32,
                                                          copy=False)
                rel_ret['sbj_labels'] = sbj_labels.astype(
                    np.int32, copy=False) + 1  # need to start from 1
                rel_ret['obj_labels'] = obj_labels.astype(
                    np.int32, copy=False) + 1  # need to start from 1
                rel_ret['all_sbj_labels_int32'] = sbj_labels.astype(np.int32,
                                                                    copy=False)
                rel_ret['all_obj_labels_int32'] = obj_labels.astype(np.int32,
                                                                    copy=False)
                if cfg.MODEL.USE_SPATIAL_FEAT:
                    spt_feat = box_utils_rel.get_spt_features(
                        sbj_boxes, obj_boxes, im_w, im_h)
                    rel_ret['spt_feat'] = spt_feat
                if cfg.MODEL.ADD_SO_SCORES:
                    sbj_feat = self.S_Head(blob_conv,
                                           rel_ret,
                                           rois_name='sbj_rois',
                                           use_relu=use_relu)
                    obj_feat = self.O_Head(blob_conv,
                                           rel_ret,
                                           rois_name='obj_rois',
                                           use_relu=use_relu)
                else:
                    sbj_feat = self.Box_Head(blob_conv,
                                             rel_ret,
                                             rois_name='sbj_rois',
                                             use_relu=use_relu)
                    obj_feat = self.Box_Head(blob_conv,
                                             rel_ret,
                                             rois_name='obj_rois',
                                             use_relu=use_relu)
            else:
                score_thresh = cfg.TEST.SCORE_THRESH
                while score_thresh >= -1e-06:  # a negative value very close to 0.0
                    det_rois, det_labels, det_scores = \
                        self.prepare_det_rois(rpn_ret['rois'], cls_score, bbox_pred, im_info, score_thresh)
                    rel_ret = self.RelPN(det_rois, det_labels, det_scores,
                                         im_info, dataset_name, roidb)
                    valid_len = len(rel_ret['rel_rois'])
                    if valid_len > 0:
                        break
                    logger.info(
                        'Got {} rel_rois when score_thresh={}, changing to {}'.
                        format(valid_len, score_thresh, score_thresh - 0.01))
                    score_thresh -= 0.01
                if cfg.MODEL.ADD_SO_SCORES:
                    det_s_feat = self.S_Head(blob_conv,
                                             rel_ret,
                                             rois_name='det_rois',
                                             use_relu=use_relu)
                    det_o_feat = self.O_Head(blob_conv,
                                             rel_ret,
                                             rois_name='det_rois',
                                             use_relu=use_relu)
                    sbj_feat = det_s_feat[rel_ret['sbj_inds']]
                    obj_feat = det_o_feat[rel_ret['obj_inds']]
                else:
                    det_feat = self.Box_Head(blob_conv,
                                             rel_ret,
                                             rois_name='det_rois',
                                             use_relu=use_relu)
                    sbj_feat = det_feat[rel_ret['sbj_inds']]
                    obj_feat = det_feat[rel_ret['obj_inds']]

        rel_feat = self.Prd_RCNN.Box_Head(blob_conv_prd,
                                          rel_ret,
                                          rois_name='rel_rois',
                                          use_relu=use_relu)

        spo_feat = torch.cat((sbj_feat, rel_feat, obj_feat), dim=1)
        if cfg.MODEL.USE_SPATIAL_FEAT:
            spt_feat = rel_ret['spt_feat']
        else:
            spt_feat = None
        if cfg.MODEL.USE_FREQ_BIAS or cfg.MODEL.RUN_BASELINE:
            sbj_labels = rel_ret['all_sbj_labels_int32']
            obj_labels = rel_ret['all_obj_labels_int32']
        else:
            sbj_labels = None
            obj_labels = None

        # prd_scores is the visual scores. See reldn_heads.py
        prd_scores, prd_bias_scores, prd_spt_scores, ttl_cls_scores, sbj_cls_scores, obj_cls_scores = \
            self.RelDN(spo_feat, spt_feat, sbj_labels, obj_labels, sbj_feat, obj_feat)

        if self.training:
            return_dict['losses'] = {}
            return_dict['metrics'] = {}
            # rpn loss
            rpn_kwargs.update(
                dict((k, rpn_ret[k]) for k in rpn_ret.keys()
                     if (k.startswith('rpn_cls_logits')
                         or k.startswith('rpn_bbox_pred'))))
            loss_rpn_cls, loss_rpn_bbox = rpn_heads.generic_rpn_losses(
                **rpn_kwargs)
            if cfg.FPN.FPN_ON:
                for i, lvl in enumerate(
                        range(cfg.FPN.RPN_MIN_LEVEL,
                              cfg.FPN.RPN_MAX_LEVEL + 1)):
                    return_dict['losses']['loss_rpn_cls_fpn%d' %
                                          lvl] = loss_rpn_cls[i]
                    return_dict['losses']['loss_rpn_bbox_fpn%d' %
                                          lvl] = loss_rpn_bbox[i]
            else:
                return_dict['losses']['loss_rpn_cls'] = loss_rpn_cls
                return_dict['losses']['loss_rpn_bbox'] = loss_rpn_bbox
            # bbox loss
            loss_cls, loss_bbox, accuracy_cls = fast_rcnn_heads.fast_rcnn_losses(
                cls_score, bbox_pred, rpn_ret['labels_int32'],
                rpn_ret['bbox_targets'], rpn_ret['bbox_inside_weights'],
                rpn_ret['bbox_outside_weights'])
            return_dict['losses']['loss_cls'] = loss_cls
            return_dict['losses']['loss_bbox'] = loss_bbox
            return_dict['metrics']['accuracy_cls'] = accuracy_cls

            if cfg.MODEL.USE_FREQ_BIAS and not cfg.MODEL.ADD_SCORES_ALL:
                loss_cls_bias, accuracy_cls_bias = reldn_heads.reldn_losses(
                    prd_bias_scores, rel_ret['all_prd_labels_int32'])
                return_dict['losses']['loss_cls_bias'] = loss_cls_bias
                return_dict['metrics']['accuracy_cls_bias'] = accuracy_cls_bias
            if cfg.MODEL.USE_SPATIAL_FEAT and not cfg.MODEL.ADD_SCORES_ALL:
                loss_cls_spt, accuracy_cls_spt = reldn_heads.reldn_losses(
                    prd_spt_scores, rel_ret['all_prd_labels_int32'])
                return_dict['losses']['loss_cls_spt'] = loss_cls_spt
                return_dict['metrics']['accuracy_cls_spt'] = accuracy_cls_spt
            if cfg.MODEL.ADD_SCORES_ALL:
                loss_cls_ttl, accuracy_cls_ttl = reldn_heads.reldn_losses(
                    ttl_cls_scores, rel_ret['all_prd_labels_int32'])
                return_dict['losses']['loss_cls_ttl'] = loss_cls_ttl
                return_dict['metrics']['accuracy_cls_ttl'] = accuracy_cls_ttl
            else:
                loss_cls_prd, accuracy_cls_prd = reldn_heads.reldn_losses(
                    prd_scores, rel_ret['all_prd_labels_int32'])
                return_dict['losses']['loss_cls_prd'] = loss_cls_prd
                return_dict['metrics']['accuracy_cls_prd'] = accuracy_cls_prd
            if cfg.MODEL.USE_NODE_CONTRASTIVE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_SO_AWARE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_P_AWARE_LOSS:
                # sbj
                rel_feat_sbj_pos = self.Prd_RCNN.Box_Head(
                    blob_conv_prd,
                    rel_ret,
                    rois_name='rel_rois_sbj_pos',
                    use_relu=use_relu)
                spo_feat_sbj_pos = torch.cat(
                    (sbj_feat_sbj_pos, rel_feat_sbj_pos, obj_feat_sbj_pos),
                    dim=1)
                if cfg.MODEL.USE_SPATIAL_FEAT:
                    spt_feat_sbj_pos = rel_ret['spt_feat_sbj_pos']
                else:
                    spt_feat_sbj_pos = None
                if cfg.MODEL.USE_FREQ_BIAS or cfg.MODEL.RUN_BASELINE:
                    sbj_labels_sbj_pos_fg = rel_ret[
                        'sbj_labels_sbj_pos_fg_int32']
                    obj_labels_sbj_pos_fg = rel_ret[
                        'obj_labels_sbj_pos_fg_int32']
                else:
                    sbj_labels_sbj_pos_fg = None
                    obj_labels_sbj_pos_fg = None
                _, prd_bias_scores_sbj_pos, _, ttl_cls_scores_sbj_pos, _, _ = \
                    self.RelDN(spo_feat_sbj_pos, spt_feat_sbj_pos, sbj_labels_sbj_pos_fg, obj_labels_sbj_pos_fg, sbj_feat_sbj_pos, obj_feat_sbj_pos)
                # obj
                rel_feat_obj_pos = self.Prd_RCNN.Box_Head(
                    blob_conv_prd,
                    rel_ret,
                    rois_name='rel_rois_obj_pos',
                    use_relu=use_relu)
                spo_feat_obj_pos = torch.cat(
                    (sbj_feat_obj_pos, rel_feat_obj_pos, obj_feat_obj_pos),
                    dim=1)
                if cfg.MODEL.USE_SPATIAL_FEAT:
                    spt_feat_obj_pos = rel_ret['spt_feat_obj_pos']
                else:
                    spt_feat_obj_pos = None
                if cfg.MODEL.USE_FREQ_BIAS or cfg.MODEL.RUN_BASELINE:
                    sbj_labels_obj_pos_fg = rel_ret[
                        'sbj_labels_obj_pos_fg_int32']
                    obj_labels_obj_pos_fg = rel_ret[
                        'obj_labels_obj_pos_fg_int32']
                else:
                    sbj_labels_obj_pos_fg = None
                    obj_labels_obj_pos_fg = None
                _, prd_bias_scores_obj_pos, _, ttl_cls_scores_obj_pos, _, _ = \
                    self.RelDN(spo_feat_obj_pos, spt_feat_obj_pos, sbj_labels_obj_pos_fg, obj_labels_obj_pos_fg, sbj_feat_obj_pos, obj_feat_obj_pos)
                if cfg.MODEL.USE_NODE_CONTRASTIVE_LOSS:
                    loss_contrastive_sbj, loss_contrastive_obj = reldn_heads.reldn_contrastive_losses(
                        ttl_cls_scores_sbj_pos, ttl_cls_scores_obj_pos,
                        rel_ret)
                    return_dict['losses'][
                        'loss_contrastive_sbj'] = loss_contrastive_sbj * cfg.MODEL.NODE_CONTRASTIVE_WEIGHT
                    return_dict['losses'][
                        'loss_contrastive_obj'] = loss_contrastive_obj * cfg.MODEL.NODE_CONTRASTIVE_WEIGHT
                if cfg.MODEL.USE_NODE_CONTRASTIVE_SO_AWARE_LOSS:
                    loss_so_contrastive_sbj, loss_so_contrastive_obj = reldn_heads.reldn_so_contrastive_losses(
                        ttl_cls_scores_sbj_pos, ttl_cls_scores_obj_pos,
                        rel_ret)
                    return_dict['losses'][
                        'loss_so_contrastive_sbj'] = loss_so_contrastive_sbj * cfg.MODEL.NODE_CONTRASTIVE_SO_AWARE_WEIGHT
                    return_dict['losses'][
                        'loss_so_contrastive_obj'] = loss_so_contrastive_obj * cfg.MODEL.NODE_CONTRASTIVE_SO_AWARE_WEIGHT
                if cfg.MODEL.USE_NODE_CONTRASTIVE_P_AWARE_LOSS:
                    loss_p_contrastive_sbj, loss_p_contrastive_obj = reldn_heads.reldn_p_contrastive_losses(
                        ttl_cls_scores_sbj_pos, ttl_cls_scores_obj_pos,
                        prd_bias_scores_sbj_pos, prd_bias_scores_obj_pos,
                        rel_ret)
                    return_dict['losses'][
                        'loss_p_contrastive_sbj'] = loss_p_contrastive_sbj * cfg.MODEL.NODE_CONTRASTIVE_P_AWARE_WEIGHT
                    return_dict['losses'][
                        'loss_p_contrastive_obj'] = loss_p_contrastive_obj * cfg.MODEL.NODE_CONTRASTIVE_P_AWARE_WEIGHT

            # PyTorch 0.4 bug when gathering scalar (0-dim) tensors
            for k, v in return_dict['losses'].items():
                return_dict['losses'][k] = v.unsqueeze(0)
            for k, v in return_dict['metrics'].items():
                return_dict['metrics'][k] = v.unsqueeze(0)
        else:
            # Testing
            return_dict['sbj_rois'] = rel_ret['sbj_rois']
            return_dict['obj_rois'] = rel_ret['obj_rois']
            return_dict['sbj_labels'] = rel_ret['sbj_labels']
            return_dict['obj_labels'] = rel_ret['obj_labels']
            return_dict['sbj_scores'] = rel_ret['sbj_scores']
            return_dict['obj_scores'] = rel_ret['obj_scores']
            return_dict['prd_scores'] = prd_scores
            if cfg.MODEL.USE_FREQ_BIAS:
                return_dict['prd_scores_bias'] = prd_bias_scores
            if cfg.MODEL.USE_SPATIAL_FEAT:
                return_dict['prd_scores_spt'] = prd_spt_scores
            if cfg.MODEL.ADD_SCORES_ALL:
                return_dict['prd_ttl_scores'] = ttl_cls_scores
            if do_vis:
                return_dict['blob_conv'] = blob_conv
                return_dict['blob_conv_prd'] = blob_conv_prd

        return return_dict
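
The gt-box path above builds relation RoIs with box_utils_rel.rois_union. A hedged sketch of what that helper plausibly computes, inferred only from its usage here on (batch_idx, x1, y1, x2, y2) rows; the repo's actual implementation may differ:

import numpy as np

def rois_union(sbj_rois, obj_rois):
    """Tightest box enclosing each aligned sbj/obj pair; column 0 is the
    batch index, assumed equal for the two inputs."""
    assert sbj_rois.shape == obj_rois.shape
    union = sbj_rois.copy()
    union[:, 1] = np.minimum(sbj_rois[:, 1], obj_rois[:, 1])  # x1
    union[:, 2] = np.minimum(sbj_rois[:, 2], obj_rois[:, 2])  # y1
    union[:, 3] = np.maximum(sbj_rois[:, 3], obj_rois[:, 3])  # x2
    union[:, 4] = np.maximum(sbj_rois[:, 4], obj_rois[:, 4])  # y2
    return union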
Exemple #18
0
    def _forward(self, data, im_info, do_vis=False, dataset_name=None, roidb=None, use_gt_labels=False, **rpn_kwargs):
        im_data = data
        if self.training:
            roidb = list(map(lambda x: blob_utils.deserialize(x)[0], roidb))  # only supports a single GPU
        if dataset_name is not None:
            dataset_name = blob_utils.deserialize(dataset_name)
        else:
            dataset_name = cfg.TRAIN.DATASETS[0] if self.training else cfg.TEST.DATASETS[0]  # assuming only one dataset per run

        device_id = im_data.get_device()

        return_dict = {}  # A dict to collect return variables

        blob_conv = self.Conv_Body(im_data)
        # if not cfg.MODEL.USE_REL_PYRAMID:
        #     blob_conv_prd = self.Prd_RCNN.Conv_Body(im_data)

        if self.training:
            # Collect scaled gt boxes (batch index prepended) and their
            # classes across all images in the minibatch.
            gt_rois = np.empty((0, 5), dtype=np.float32)
            gt_classes = np.empty((0,), dtype=np.int64)
            for i, r in enumerate(roidb):
                rois_i = r['boxes'] * im_info[i, 2]
                rois_i = np.hstack((i * blob_utils.ones((rois_i.shape[0], 1)), rois_i))
                gt_rois = np.append(gt_rois, rois_i, axis=0)
                gt_classes = np.append(gt_classes, r['gt_classes'], axis=0)

        if self.training or roidb is None:
            rpn_ret = self.RPN(blob_conv, im_info, roidb)

        if cfg.FPN.FPN_ON:
            # Retain only the blobs that will be used for RoI heads. `blob_conv` may include
            # extra blobs that are used for RPN proposals, but not for RoI heads.
            blob_conv = blob_conv[-self.num_roi_levels:]
            # if not cfg.MODEL.USE_REL_PYRAMID:
            #     blob_conv_prd = blob_conv_prd[-self.num_roi_levels:]
            # else:
            #     blob_conv_prd = self.RelPyramid(blob_conv)

        if self.training or roidb is None:
            if cfg.MODEL.SHARE_RES5 and self.training:
                box_feat, res5_feat = self.Box_Head(blob_conv, rpn_ret, use_relu=True)
            else:
                box_feat = self.Box_Head(blob_conv, rpn_ret, use_relu=True)
            cls_score, bbox_pred = self.Box_Outs(box_feat)

        # now go through the predicate branch
        use_relu = not cfg.MODEL.NO_FC7_RELU
        if self.training:
            score_thresh = cfg.TEST.SCORE_THRESH
            cls_score = F.softmax(cls_score, -1)
            while score_thresh >= -1e-06:  # a negative value very close to 0.0
                det_rois, det_labels, det_scores, det_dists, det_boxes_all = \
                    self.prepare_det_rois(rpn_ret['rois'], cls_score, bbox_pred, im_info, score_thresh)
                # Drop degenerate detections with zero area.
                real_area = (det_rois[:, 3] - det_rois[:, 1]) * (det_rois[:, 4] - det_rois[:, 2])
                non_zero_area_inds = np.where(real_area > 0)[0]
                det_rois = det_rois[non_zero_area_inds]
                det_labels = det_labels[non_zero_area_inds]
                det_scores = det_scores[non_zero_area_inds]
                det_dists = det_dists[non_zero_area_inds]
                det_boxes_all = det_boxes_all[non_zero_area_inds]
                # rel_ret = self.RelPN(det_rois, det_labels, det_scores, im_info, dataset_name, roidb)
                valid_len = len(det_rois)
                if valid_len > 0:
                    break
                logger.info('Got {} det_rois when score_thresh={}, changing to {}'.format(
                    valid_len, score_thresh, score_thresh - 0.01))
                score_thresh -= 0.01
            # Assign each detection the class of its highest-IoU gt box in the
            # same image; detections below FG_THRESH are labeled background (0).
            ious = box_utils.bbox_overlaps(det_rois[:, 1:], gt_rois[:, 1:]) * \
                (det_rois[:, 0][:, None] == gt_rois[:, 0][None, :])
            det_labels_gt = gt_classes[ious.argmax(-1)]
            det_labels_gt[ious.max(-1) < cfg.TRAIN.FG_THRESH] = 0

        else:
            if roidb is not None:
                im_scale = im_info.data.numpy()[:, 2][0]
                im_w = im_info.data.numpy()[:, 1][0]
                im_h = im_info.data.numpy()[:, 0][0]

                # Evaluate directly on ground-truth boxes. gt_rois/gt_classes
                # are rebuilt from roidb here (mirroring the sibling _forward
                # variants), since the RPN/sampling path that defines them in
                # training is skipped on this branch.
                gt_rois = roidb['boxes'] * im_scale
                gt_classes = roidb['gt_classes']
                repeated_batch_idx = 0 * blob_utils.ones((gt_rois.shape[0], 1))
                gt_rois = np.hstack((repeated_batch_idx, gt_rois))
                det_rois = gt_rois

                fpn_ret = {'gt_rois': gt_rois}
                if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS:
                    lvl_min = cfg.FPN.ROI_MIN_LEVEL
                    lvl_max = cfg.FPN.ROI_MAX_LEVEL
                    rois_blob_names = ['gt_rois']
                    for rois_blob_name in rois_blob_names:
                        # Add per FPN level roi blobs named like: <rois_blob_name>_fpn<lvl>
                        target_lvls = fpn_utils.map_rois_to_fpn_levels(
                            fpn_ret[rois_blob_name][:, 1:5], lvl_min, lvl_max)
                        fpn_utils.add_multilevel_roi_blobs(
                            fpn_ret, rois_blob_name, fpn_ret[rois_blob_name], target_lvls,
                            lvl_min, lvl_max)
                det_feats = self.Box_Head(blob_conv, fpn_ret,
                                          rois_name='gt_rois', use_relu=True)
                det_dists, _ = self.Box_Outs(det_feats)
                # Scores/labels taken from the max non-bg logit, as in the
                # other gt-box branches of this file; no box regression here.
                det_scores = det_dists[:, 1:].max(-1)[0].data.cpu().numpy()
                det_boxes_all = None
                if use_gt_labels:
                    det_labels_gt = gt_classes
                    det_labels = gt_classes
                else:
                    det_labels = det_dists[:, 1:].max(-1)[1].data.cpu().numpy() + 1
            else:
                score_thresh = cfg.TEST.SCORE_THRESH
                while score_thresh >= -1e-06:  # a negative value very close to 0.0
                    det_rois, det_labels, det_scores, det_dists, det_boxes_all = \
                        self.prepare_det_rois(rpn_ret['rois'], cls_score, bbox_pred, im_info, score_thresh)
                    real_area = (det_rois[:, 3] - det_rois[:, 1]) * (det_rois[:, 4] - det_rois[:, 2])
                    non_zero_area_inds = np.where(real_area > 0)[0]
                    det_rois = det_rois[non_zero_area_inds]
                    det_labels = det_labels[non_zero_area_inds]
                    det_scores = det_scores[non_zero_area_inds]
                    det_dists = det_dists[non_zero_area_inds]
                    det_boxes_all = det_boxes_all[non_zero_area_inds]
                    # rel_ret = self.RelPN(det_rois, det_labels, det_scores, im_info, dataset_name, roidb)
                    valid_len = len(det_rois)
                    if valid_len > 0:
                        break
                    logger.info('Got {} det_rois when score_thresh={}, changing to {}'.format(
                        valid_len, score_thresh, score_thresh - 0.01))
                    score_thresh -= 0.01

        return_dict['det_rois'] = det_rois
        num_rois = det_rois.shape[0]
        if not isinstance(det_dists, torch.Tensor):
            assert det_dists.shape[0] == num_rois
            det_dists = torch.from_numpy(det_dists).float().cuda(device_id)

        return_dict['det_dists'] = det_dists
        return_dict['det_scores'] = det_scores
        return_dict['blob_conv'] = blob_conv
        return_dict['det_boxes_all'] = det_boxes_all
        if det_boxes_all is not None:
            assert det_boxes_all.shape[0] == num_rois
        return_dict['det_labels'] = det_labels
        # return_dict['blob_conv_prd'] = blob_conv_prd

        if self.training or use_gt_labels:
            return_dict['det_labels_gt'] = det_labels_gt

        return return_dict
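
Both branches of the forward pass above share a score-threshold back-off: detection is retried with the threshold lowered by 0.01 until at least one RoI survives, so downstream heads never receive an empty blob. A generic sketch of the pattern, with `detect` as a hypothetical stand-in for self.prepare_det_rois:

def detect_with_backoff(detect, score_thresh, step=0.01):
    """Retry detect() with a progressively lower threshold until it
    yields at least one detection or the threshold drops below zero."""
    dets = []
    while score_thresh >= -1e-6:
        dets = detect(score_thresh)
        if len(dets) > 0:
            break
        score_thresh -= step
    return dets, score_thresh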
def _sample_rois(roidb, im_scale, batch_idx, stage=0):
    """Generate a random sample of RoIs comprising foreground and background
    examples.
    """
    rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM)
    fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))
    max_overlaps = roidb['max_overlaps']

    # SNIP
    if cfg.FAST_RCNN.SNIP:
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        for i in range(len(gt_inds)):
            gt_box = roidb['boxes'][gt_inds[i]]
            width = gt_box[2] - gt_box[0]
            height = gt_box[3] - gt_box[1]
            RES = np.sqrt(width * height) * im_scale
            box_to_gt_ind_map = roidb['box_to_gt_ind_map']
            if not (RES > cfg.FAST_RCNN.RES_LO and RES <= cfg.FAST_RCNN.RES_HI):
                ids = np.where(box_to_gt_ind_map == gt_inds[i])[0]
                for roi_id in ids:
                    if max_overlaps[roi_id] > cfg.FAST_RCNN.SNIP_NEG_THRESH:
                        # create an exception (neither fg/bg)
                        max_overlaps[roi_id] = cfg.FAST_RCNN.SNIP_TARGET_THRESH
        for i in range(len(gt_inds)):
            max_overlaps[gt_inds[i]] = 1.0

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps > cfg.TRAIN.FG_THRESH + stage * 0.1)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size)
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(
            fg_inds, size=fg_rois_per_this_image, replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI + stage * 0.1) &
                       (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(
            bg_inds, size=bg_rois_per_this_image, replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Label is the class each RoI has max overlap with
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0  # Label bg RoIs with class 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if 'bbox_targets' not in roidb:
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        gt_boxes = roidb['boxes'][gt_inds, :]
        gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
        bbox_targets = _compute_targets(
            sampled_boxes, gt_boxes[gt_assignments, :], sampled_labels, stage)
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_targets, stage)
    else:
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(
            roidb['bbox_targets'][keep_inds, :], stage)

    bbox_outside_weights = np.array(
        bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype)

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_rois = sampled_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((sampled_rois.shape[0], 1))
    sampled_rois = np.hstack((repeated_batch_idx, sampled_rois))

    # Base Fast R-CNN blobs
    blob_dict = dict(
        labels_int32=sampled_labels.astype(np.int32, copy=False),
        rois=sampled_rois,
        bbox_targets=bbox_targets,
        bbox_inside_weights=bbox_inside_weights,
        bbox_outside_weights=bbox_outside_weights)

    # Optionally add Mask R-CNN blobs
    if cfg.MODEL.MASK_ON:
        roi_data.mask_rcnn.add_mask_rcnn_blobs(blob_dict, sampled_boxes, roidb,
                                               im_scale, batch_idx)

    # Optionally add Keypoint R-CNN blobs
    if cfg.MODEL.KEYPOINTS_ON:
        roi_data.keypoint_rcnn.add_keypoint_rcnn_blobs(
            blob_dict, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx)

    return blob_dict
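
The stage argument above shifts both the fg and bg overlap boundaries by 0.1 per stage, in the spirit of cascade-style resampling. A standalone illustration with assumed thresholds (FG_THRESH = 0.5, BG_THRESH_HI = 0.5, BG_THRESH_LO = 0.0), not the repo's actual config:

import numpy as np

max_overlaps = np.array([0.05, 0.45, 0.55, 0.65, 0.75])
for stage in range(3):
    fg = np.where(max_overlaps > 0.5 + stage * 0.1)[0]
    bg = np.where((max_overlaps < 0.5 + stage * 0.1) & (max_overlaps >= 0.0))[0]
    print(stage, fg.tolist(), bg.tolist())
# stage 0 -> fg [2, 3, 4], bg [0, 1]
# stage 1 -> fg [3, 4],    bg [0, 1, 2]
# stage 2 -> fg [4],       bg [0, 1, 2, 3]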
def _sample_rois(roidb, im_scale, batch_idx):
    """Generate a random sample of RoIs comprising foreground and background
    examples.
    """
    rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM)
    fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))
    max_overlaps = roidb['max_overlaps']

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size)
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(fg_inds,
                             size=fg_rois_per_this_image,
                             replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI)
                       & (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds,
                             size=bg_rois_per_this_image,
                             replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)

    if cfg.TEST.TAGGING or cfg.MODEL.TAGGING:
        # Override keep_inds so the rois and labels_int32 keep their original
        # order (no shuffling); every box is kept as a foreground RoI.
        keep_inds = np.arange(len(roidb['boxes']))
        fg_rois_per_this_image = len(roidb['boxes'])
        bg_rois_per_this_image = 0

    if cfg.MODEL.TAGGING:
        assert bg_rois_per_this_image == 0

    # Label is the class each RoI has max overlap with
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0  # Label bg RoIs with class 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if 'bbox_targets' not in roidb:
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        gt_boxes = roidb['boxes'][gt_inds, :]
        gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
        bbox_targets = _compute_targets(sampled_boxes,
                                        gt_boxes[gt_assignments, :],
                                        sampled_labels)
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_targets)
    else:
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(
            roidb['bbox_targets'][keep_inds, :])

    bbox_outside_weights = np.array(bbox_inside_weights > 0,
                                    dtype=bbox_inside_weights.dtype)

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_rois = sampled_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_rois.shape[0], 1))
    sampled_rois = np.hstack((repeated_batch_idx, sampled_rois))

    # Base Fast R-CNN blobs
    blob_dict = dict(labels_int32=sampled_labels.astype(np.int32, copy=False),
                     rois=sampled_rois,
                     bbox_targets=bbox_targets,
                     bbox_inside_weights=bbox_inside_weights,
                     bbox_outside_weights=bbox_outside_weights)

    # Optionally add Mask R-CNN blobs
    if cfg.MODEL.MASK_ON:
        roi_data.mask_rcnn.add_mask_rcnn_blobs(blob_dict, sampled_boxes, roidb,
                                               im_scale, batch_idx)

    # Optionally add Keypoint R-CNN blobs
    if cfg.MODEL.KEYPOINTS_ON:
        roi_data.keypoint_rcnn.add_keypoint_rcnn_blobs(blob_dict, roidb,
                                                       fg_rois_per_image,
                                                       fg_inds, im_scale,
                                                       batch_idx)

    return blob_dict
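
A standalone illustration of the labeling step shared by both samplers above: class labels are gathered in keep_inds order (foreground first) and everything after the fg block is zeroed out as background. Arrays here are illustrative, not repo data:

import numpy as np

max_classes = np.array([7, 3, 5, 2, 9])
fg_inds, bg_inds = np.array([0, 2]), np.array([1, 4])
keep_inds = np.append(fg_inds, bg_inds)        # [0, 2, 1, 4]
sampled_labels = max_classes[keep_inds]        # [7, 5, 3, 9]
sampled_labels[len(fg_inds):] = 0              # [7, 5, 0, 0]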
def add_keypoint_rcnn_blobs(
        blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx):
    # Note: gt_inds must match how they're computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    max_overlaps = roidb['max_overlaps']
    gt_keypoints = roidb['gt_keypoints']

    ind_kp = gt_inds[roidb['box_to_gt_ind_map']]
    within_box = _within_box(gt_keypoints[ind_kp, :, :], roidb['boxes'])
    vis_kp = gt_keypoints[ind_kp, 2, :] > 0
    is_visible = np.sum(np.logical_and(vis_kp, within_box), axis=1) > 0
    kp_fg_inds = np.where(
        np.logical_and(max_overlaps >= cfg.TRAIN.FG_THRESH, is_visible))[0]

    kp_fg_rois_per_this_image = np.minimum(
        fg_rois_per_image, kp_fg_inds.size)
    if kp_fg_inds.size > kp_fg_rois_per_this_image:
        kp_fg_inds = np.random.choice(
            kp_fg_inds, size=kp_fg_rois_per_this_image, replace=False)

    if kp_fg_inds.shape[0] == 0:
        kp_fg_inds = gt_inds
    sampled_fg_rois = roidb['boxes'][kp_fg_inds]
    box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds]

    num_keypoints = gt_keypoints.shape[-1]
    sampled_keypoints = -np.ones(
        (len(sampled_fg_rois), gt_keypoints.shape[1], num_keypoints),
        dtype=gt_keypoints.dtype)
    for ii in range(len(sampled_fg_rois)):
        ind = box_to_gt_ind_map[ii]
        if ind >= 0:
            sampled_keypoints[ii, :, :] = gt_keypoints[gt_inds[ind], :, :]
            # assert np.sum(sampled_keypoints[ii, 2, :]) > 0

    all_heats = []
    all_weights = []
    time_dim = sampled_fg_rois.shape[-1] // 4
    per_frame_nkps = num_keypoints // time_dim
    for t in range(time_dim):
        heats, weights = keypoint_utils.keypoints_to_heatmap_labels(
            sampled_keypoints[..., t * per_frame_nkps: (t + 1) * per_frame_nkps],
            sampled_fg_rois[..., t * 4: (t + 1) * 4])
        all_heats.append(heats)
        all_weights.append(weights)
    heats = np.concatenate(all_heats, axis=-1)
    weights = np.concatenate(all_weights, axis=-1)

    shape = (sampled_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS * time_dim, 1)
    heats = heats.reshape(shape)
    weights = weights.reshape(shape)

    sampled_fg_rois *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_fg_rois.shape[0], 1))
    sampled_fg_rois = np.hstack((repeated_batch_idx, sampled_fg_rois))

    blobs['keypoint_rois'] = sampled_fg_rois
    blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False)
    blobs['keypoint_weights'] = weights
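
The heatmap loop above assumes rois and keypoints are stacked over time: each roi row holds T boxes of 4 coordinates and the keypoint array holds T frames of K keypoints each. A shape-only sketch with illustrative numbers (not repo data):

import numpy as np

T, K, N = 3, 17, 2                        # frames, keypoints per frame, rois
rois = np.zeros((N, 4 * T))
kps = np.zeros((N, 3, K * T))             # rows are (x, y, visibility)
for t in range(T):
    kps_t = kps[..., t * K:(t + 1) * K]       # (N, 3, K) for frame t
    rois_t = rois[..., t * 4:(t + 1) * 4]     # (N, 4) box for frame t
    assert kps_t.shape == (N, 3, K) and rois_t.shape == (N, 4)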
Exemple #22
0
def add_refine_global_mask_blobs(blobs, sampled_boxes, roidb, im_scale,
                                 batch_idx, data):
    """Add RefineNet Mask specific blobs to the input blob dictionary."""
    # Prepare the mask targets by associating one gt mask to each training roi
    # that has a fg (non-bg) class label.
    dst_scale = cfg.REFINENET.SPATIAL_SCALE
    polys_gt_inds = np.where((roidb['gt_classes'] > 0)
                             & (roidb['is_crowd'] == 0))[0]
    polys_gt = [roidb['segms'][i] for i in polys_gt_inds]
    boxes_from_polys = segm_utils.polys_to_boxes(polys_gt)
    fg_inds = np.where(blobs['labels_int32'] > 0)[0]
    roi_has_mask = blobs['labels_int32'].copy()
    roi_has_mask[roi_has_mask > 0] = 1

    # Define size variables
    inp_h, inp_w = data.shape[2], data.shape[3]
    out_h, out_w = int(inp_h * dst_scale), int(inp_w * dst_scale)

    if fg_inds.shape[0] > 0:
        # Class labels for the foreground rois
        mask_class_labels = blobs['labels_int32'][fg_inds]
        masks = blob_utils.zeros((fg_inds.shape[0], out_h, out_w), int32=True)

        # Find overlap between all foreground rois and the bounding boxes
        # enclosing each segmentation
        rois_fg = sampled_boxes[fg_inds]
        overlaps_bbfg_bbpolys = box_utils.bbox_overlaps(
            rois_fg.astype(np.float32, copy=False),
            boxes_from_polys.astype(np.float32, copy=False))
        # Map each fg roi to the index of the mask with highest overlap
        # (measured by bbox overlap)
        fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1)

        # narrow scale and size
        scale = im_scale * dst_scale
        im_h, im_w = roidb['height'], roidb['width']
        im_label_h, im_label_w = int(im_h * scale), int(im_w * scale)

        # add fg targets
        for i in range(rois_fg.shape[0]):
            fg_polys_ind = fg_polys_inds[i]
            poly_gt = polys_gt[fg_polys_ind]
            roi_fg = rois_fg[i]
            # Rasterize the portion of the polygon mask within the given fg roi
            # to an im_label_h x im_label_w binary image
            mask = segm_utils.polys_to_mask_scaled(poly_gt, im_h, im_w, scale)
            mask = np.array(mask > 0, dtype=np.int32)  # Ensure it's binary
            masks[i, 0:im_label_h, 0:im_label_w] = mask

        masks = np.reshape(masks, (-1, out_h * out_w))

    else:  # If there are no fg masks (it does happen)
        # The network cannot handle empty blobs, so we must provide a mask
        # We simply take the first bg roi, give it an all -1's mask (ignore
        # label), and label it with class zero (bg).
        bg_inds = np.where(blobs['labels_int32'] == 0)[0]
        # rois_fg is actually one background roi, but that's ok because ...
        rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1))
        # We give it an all -1's blob (ignore label)
        masks = -blob_utils.ones((1, out_h * out_w), int32=True)
        # We label it with class = 0 (background)
        mask_class_labels = blob_utils.zeros((1, ))
        # Mark that the first roi has a mask
        roi_has_mask[0] = 1

    if cfg.MRCNN.CLS_SPECIFIC_MASK:
        masks = _expand_to_class_specific_mask_targets(masks,
                                                       mask_class_labels)

    # Scale rois_fg and format as (batch_idx, x1, y1, x2, y2)
    rois_fg *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((rois_fg.shape[0], 1))
    rois_fg = np.hstack((repeated_batch_idx, rois_fg))

    # Update blobs dict with Refine-Net blobs
    blobs['refined_mask_rois'] = rois_fg
    blobs['roi_has_refined_mask_int32'] = roi_has_mask
    blobs['refined_masks_int32'] = masks
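
The size bookkeeping above rasterizes each mask at scale = im_scale * REFINENET.SPATIAL_SCALE and pastes it into the top-left of an (out_h, out_w) canvas derived from the padded input blob, so the scaled image always fits inside the canvas. A numeric sketch with assumed values (note the int() truncation):

inp_h, inp_w = 800, 1344                  # padded input blob (assumed)
dst_scale = 0.25                          # assumed REFINENET.SPATIAL_SCALE
out_h, out_w = int(inp_h * dst_scale), int(inp_w * dst_scale)  # 200, 336
im_h, im_w = 600, 1000                    # original image size (assumed)
im_scale = inp_h / im_h                   # 800 / 600
scale = im_scale * dst_scale
im_label_h, im_label_w = int(im_h * scale), int(im_w * scale)  # 199, 333
assert im_label_h <= out_h and im_label_w <= out_w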
    def _forward(self,
                 data,
                 im_info,
                 do_vis=False,
                 dataset_name=None,
                 roidb=None,
                 use_gt_labels=False,
                 **rpn_kwargs):
        im_data = data
        if self.training:
            roidb = list(map(lambda x: blob_utils.deserialize(x)[0], roidb))
        if dataset_name is not None:
            dataset_name = blob_utils.deserialize(dataset_name)
        else:
            dataset_name = cfg.TRAIN.DATASETS[
                0] if self.training else cfg.TEST.DATASETS[
                    0]  # assuming only one dataset per run

        device_id = im_data.get_device()

        return_dict = {}  # A dict to collect return variables

        blob_conv = self.Conv_Body(im_data)

        if self.training:
            gt_rois = roidb[0]['boxes'] * im_info[0, 2].data.cpu().numpy()
            gt_classes = roidb[0]['gt_classes']
            sbj_gt_boxes = roidb[0]['sbj_gt_boxes']
            obj_gt_boxes = roidb[0]['obj_gt_boxes']

        rpn_ret = self.RPN(blob_conv, im_info, roidb)

        if cfg.FPN.FPN_ON:
            # Retain only the blobs that will be used for RoI heads. `blob_conv` may include
            # extra blobs that are used for RPN proposals, but not for RoI heads.
            blob_conv = blob_conv[-self.num_roi_levels:]

        if cfg.MODEL.SHARE_RES5 and self.training:
            box_feat, res5_feat = self.Box_Head(blob_conv,
                                                rpn_ret,
                                                use_relu=True)
        else:
            box_feat = self.Box_Head(blob_conv, rpn_ret, use_relu=True)
        cls_score, bbox_pred = self.Box_Outs(box_feat)

        # now go through the predicate branch
        use_relu = not cfg.MODEL.NO_FC7_RELU
        if self.training:
            fg_inds = np.where(rpn_ret['labels_int32'] > 0)[0]
            det_rois = rpn_ret['rois'][fg_inds]
            det_labels = rpn_ret['labels_int32'][fg_inds]
            det_scores = F.softmax(cls_score[fg_inds], dim=1)
            rel_ret = self.RelPN(det_rois, det_labels, det_scores, im_info,
                                 dataset_name, roidb)

            select_inds = np.array([])
            repeated_batch_idx = 0 * blob_utils.ones((gt_rois.shape[0], 1))
            select_rois = np.hstack((repeated_batch_idx, gt_rois))
            select_feat = self.detector_feature_map(blob_conv,
                                                    select_rois,
                                                    use_relu=True)
            select_dists, _ = self.Box_Outs(select_feat)
            select_dists = F.softmax(select_dists, -1)
            select_labels = (
                select_dists[:, 1:].max(-1)[1].data.cpu().numpy() + 1)
            select_gt_labels = gt_classes

            sbj_feat = self.Box_Head_sg(blob_conv,
                                        rel_ret,
                                        rois_name='sbj_rois',
                                        use_relu=True)
            obj_feat = self.Box_Head_sg(blob_conv,
                                        rel_ret,
                                        rois_name='obj_rois',
                                        use_relu=True)

        else:
            if roidb is not None:
                im_scale = im_info.data.numpy()[:, 2][0]
                im_w = im_info.data.numpy()[:, 1][0]
                im_h = im_info.data.numpy()[:, 0][0]
                gt_rois = roidb['boxes'] * im_scale

                sbj_boxes = roidb['sbj_gt_boxes']
                obj_boxes = roidb['obj_gt_boxes']
                sbj_rois = sbj_boxes * im_scale
                obj_rois = obj_boxes * im_scale
                repeated_batch_idx = 0 * blob_utils.ones(
                    (sbj_rois.shape[0], 1))
                sbj_rois = np.hstack((repeated_batch_idx, sbj_rois))
                obj_rois = np.hstack((repeated_batch_idx, obj_rois))

                if gt_rois.size > 0:
                    repeated_batch_idx = 0 * blob_utils.ones(
                        (gt_rois.shape[0], 1))
                    select_rois = np.hstack((repeated_batch_idx, gt_rois))

                    select_feat = self.detector_feature_map(blob_conv,
                                                            select_rois,
                                                            use_relu=True)
                    select_dists, _ = self.Box_Outs(select_feat)
                    select_labels = self.get_nms_preds(select_dists,
                                                       select_rois,
                                                       softmax=False)
                    select_inds = np.arange(0, select_labels.shape[0]).astype(
                        np.int64)

                    rel_ret = self.EdgePN(select_rois, select_labels,
                                          select_dists, im_info, dataset_name,
                                          None)

                    det_feat_sg = self.Box_Head_sg(blob_conv,
                                                   rel_ret,
                                                   rois_name='det_rois',
                                                   use_relu=True)

                    det_labels = select_labels.copy()
                    det_scores = select_dists[:, 1:].max(
                        -1)[0].data.cpu().numpy()
                    min_ious = np.minimum(
                        box_utils.bbox_overlaps(
                            select_rois[:, 1:][rel_ret['sbj_inds']],
                            sbj_rois[:, 1:]),
                        box_utils.bbox_overlaps(
                            select_rois[:, 1:][rel_ret['obj_inds']],
                            obj_rois[:, 1:]))
                    match_indices = np.where(min_ious.max(-1) >= 0.5)[0]
                    # Keep only the pairs whose subject and object both match
                    # a ground-truth box with IoU >= 0.5.
                    for k in ('sbj_inds', 'obj_inds', 'sbj_rois', 'obj_rois',
                              'rel_rois', 'sbj_labels', 'obj_labels',
                              'sbj_scores', 'obj_scores'):
                        rel_ret[k] = rel_ret[k][match_indices]

                    sbj_feat = det_feat_sg[rel_ret['sbj_inds']]
                    obj_feat = det_feat_sg[rel_ret['obj_inds']]

                else:
                    score_thresh = cfg.TEST.SCORE_THRESH
                    while score_thresh >= -1e-06:  # a negative value very close to 0.0
                        det_rois, det_labels, det_scores = \
                            self.prepare_det_rois(rpn_ret['rois'], cls_score, bbox_pred, im_info, score_thresh)
                        rel_ret = self.RelPN(det_rois, det_labels, det_scores,
                                             im_info, dataset_name, None)
                        valid_len = len(rel_ret['rel_rois'])
                        if valid_len > 0:
                            break
                        logger.info(
                            'Got {} rel_rois when score_thresh={}, changing to {}'
                            .format(valid_len, score_thresh,
                                    score_thresh - 0.01))
                        score_thresh -= 0.01
                    det_feat = None
                    # Rank detections that appear in at least one relation by
                    # score and keep the 10 highest for message passing.
                    valid_inds = np.unique(
                        np.concatenate(
                            (rel_ret['sbj_inds'], rel_ret['obj_inds']), 0))
                    valid_sort_inds = valid_inds[np.argsort(
                        -det_scores[valid_inds])]

                    select_inds = valid_sort_inds[:10]
                    select_rois = det_rois[select_inds]

                    det_feat = self.Box_Head(blob_conv,
                                             rel_ret,
                                             rois_name='det_rois',
                                             use_relu=True)
                    det_dists, _ = self.Box_Outs(det_feat)
                    select_dists = det_dists[select_inds]
                    select_labels = det_labels[select_inds].copy()
            else:
                score_thresh = cfg.TEST.SCORE_THRESH
                while score_thresh >= -1e-06:  # a negative value very close to 0.0
                    det_rois, det_labels, det_scores = \
                        self.prepare_det_rois(rpn_ret['rois'], cls_score, bbox_pred, im_info, score_thresh)
                    rel_ret = self.RelPN(det_rois, det_labels, det_scores,
                                         im_info, dataset_name, roidb)
                    valid_len = len(rel_ret['rel_rois'])
                    if valid_len > 0:
                        break
                    logger.info(
                        'Got {} rel_rois when score_thresh={}, changing to {}'.
                        format(valid_len, score_thresh, score_thresh - 0.01))
                    score_thresh -= 0.01
                det_feat = None
                valid_inds = np.unique(
                    np.concatenate((rel_ret['sbj_inds'], rel_ret['obj_inds']),
                                   0))

                valid_sort_inds = valid_inds[np.argsort(
                    -det_scores[valid_inds])]

                select_inds = valid_sort_inds
                select_rois = det_rois[select_inds]

                det_feat_sg = self.Box_Head_sg(blob_conv,
                                               rel_ret,
                                               rois_name='det_rois',
                                               use_relu=True)
                sbj_feat = det_feat_sg[rel_ret['sbj_inds']]
                obj_feat = det_feat_sg[rel_ret['obj_inds']]

                if det_feat is None:
                    det_feat = self.Box_Head(blob_conv,
                                             rel_ret,
                                             rois_name='det_rois',
                                             use_relu=True)
                det_dists, _ = self.Box_Outs(det_feat)
                select_dists = det_dists[select_inds]
                select_labels = det_labels[select_inds].copy()

        if select_inds.size > 2 or self.training:
            entity_fmap = self.obj_feature_map(blob_conv.detach(),
                                               select_rois,
                                               use_relu=True)
            entity_feat0 = self.merge_obj_feats(entity_fmap, select_rois,
                                                select_dists.detach(), im_info)
            edge_ret = self.EdgePN(select_rois, select_labels, select_dists,
                                   im_info, dataset_name, None)
            edge_feat = self.get_phr_feats(
                self.visual_rep(blob_conv,
                                edge_ret,
                                device_id,
                                use_relu=use_relu))
            edge_inds = np.stack(
                (edge_ret['sbj_rois'][:, 0].astype(edge_ret['sbj_inds'].dtype),
                 edge_ret['sbj_inds'], edge_ret['obj_inds']), -1)

            im_inds = select_rois[:, 0].astype(edge_inds.dtype)
            entity_feat = self.obj_mps1(entity_feat0, edge_feat, im_inds,
                                        edge_inds)
            entity_feat = self.obj_mps2(entity_feat, edge_feat, im_inds,
                                        edge_inds)

            entity_cls_score = self.ObjClassifier(entity_feat)

            if not self.training:
                select_labels_pred = self.get_nms_preds(
                    entity_cls_score, select_rois)

                det_labels[select_inds] = select_labels_pred
                if use_gt_labels:
                    det_labels[select_inds] = roidb['gt_classes']
                select_twod_inds = np.arange(
                    0, select_labels_pred.shape[0]
                ) * cfg.MODEL.NUM_CLASSES + select_labels_pred
                select_scores = F.softmax(
                    entity_cls_score,
                    -1).view(-1)[select_twod_inds].data.cpu().numpy()

                det_scores[select_inds] = select_scores
                if use_gt_labels:
                    det_scores[select_inds] = np.ones_like(select_scores)

        rel_feat = self.visual_rep(blob_conv,
                                   rel_ret,
                                   device_id,
                                   use_relu=use_relu)

        if not self.training:
            sbj_labels = det_labels[rel_ret['sbj_inds']]
            obj_labels = det_labels[rel_ret['obj_inds']]
            rel_ret['sbj_labels'] = sbj_labels
            rel_ret['obj_labels'] = obj_labels
            rel_ret['sbj_scores'] = det_scores[rel_ret['sbj_inds']]
            rel_ret['obj_scores'] = det_scores[rel_ret['obj_inds']]
        else:
            sbj_labels = rel_ret['all_sbj_labels_int32'] + 1
            obj_labels = rel_ret['all_obj_labels_int32'] + 1

        sbj_embed = self.ori_embed[sbj_labels].clone().cuda(device_id)
        obj_embed = self.ori_embed[obj_labels].clone().cuda(device_id)
        sbj_pos = torch.from_numpy(
            self.get_obj_pos(rel_ret['sbj_rois'],
                             im_info)).float().cuda(device_id)
        obj_pos = torch.from_numpy(
            self.get_obj_pos(rel_ret['obj_rois'],
                             im_info)).float().cuda(device_id)

        prod = self.sbj_map(torch.cat(
            (sbj_feat, sbj_embed, sbj_pos), -1)) * self.obj_map(
                torch.cat((obj_feat, obj_embed, obj_pos), -1))

        prd_scores = self.rel_compress(rel_feat * prod)

        if cfg.MODEL.USE_FREQ_BIAS:

            sbj_labels = torch.from_numpy(sbj_labels).long().cuda(device_id)
            obj_labels = torch.from_numpy(obj_labels).long().cuda(device_id)

            prd_bias_scores = self.freq_bias.rel_index_with_labels(
                torch.stack((sbj_labels - 1, obj_labels - 1), 1))

            prd_scores += prd_bias_scores

        if not self.training:
            prd_scores = F.softmax(prd_scores, -1)

        if self.training:
            return_dict['losses'] = {}
            return_dict['metrics'] = {}

            imp_gamma = get_importance_factor(select_rois, sbj_gt_boxes,
                                              obj_gt_boxes, im_info)
            # rpn loss
            rpn_kwargs.update(
                dict((k, rpn_ret[k]) for k in rpn_ret.keys()
                     if (k.startswith('rpn_cls_logits')
                         or k.startswith('rpn_bbox_pred'))))
            loss_rpn_cls, loss_rpn_bbox = rpn_heads.generic_rpn_losses(
                **rpn_kwargs)
            if cfg.FPN.FPN_ON:
                for i, lvl in enumerate(
                        range(cfg.FPN.RPN_MIN_LEVEL,
                              cfg.FPN.RPN_MAX_LEVEL + 1)):
                    return_dict['losses']['loss_rpn_cls_fpn%d' %
                                          lvl] = loss_rpn_cls[i]
                    return_dict['losses']['loss_rpn_bbox_fpn%d' %
                                          lvl] = loss_rpn_bbox[i]
            else:
                return_dict['losses']['loss_rpn_cls'] = loss_rpn_cls
                return_dict['losses']['loss_rpn_bbox'] = loss_rpn_bbox
            # bbox loss
            loss_cls, loss_bbox, accuracy_cls = fast_rcnn_heads.fast_rcnn_losses(
                cls_score, bbox_pred, rpn_ret['labels_int32'],
                rpn_ret['bbox_targets'], rpn_ret['bbox_inside_weights'],
                rpn_ret['bbox_outside_weights'])
            return_dict['losses']['loss_cls'] = loss_cls
            return_dict['losses']['loss_bbox'] = loss_bbox
            return_dict['metrics']['accuracy_cls'] = accuracy_cls

            loss_cls_prd, accuracy_cls_prd = reldn_heads.reldn_losses(
                prd_scores, rel_ret['all_prd_labels_int32'])
            return_dict['losses']['loss_cls_prd'] = loss_cls_prd
            return_dict['metrics']['accuracy_cls_prd'] = accuracy_cls_prd

            loss_cls_entity, accuracy_cls_entity = refine_obj_feats.entity_losses_imp(
                entity_cls_score, select_gt_labels, imp_gamma)
            return_dict['losses']['loss_cls_entity'] = loss_cls_entity
            return_dict['metrics']['accuracy_cls_entity'] = accuracy_cls_entity

            # Work around a PyTorch 0.4 bug when gathering scalar (0-dim) tensors
            for k, v in return_dict['losses'].items():
                return_dict['losses'][k] = v.unsqueeze(0)
            for k, v in return_dict['metrics'].items():
                return_dict['metrics'][k] = v.unsqueeze(0)
        else:
            # Testing
            return_dict['sbj_rois'] = rel_ret['sbj_rois']
            return_dict['obj_rois'] = rel_ret['obj_rois']
            return_dict['sbj_labels'] = rel_ret['sbj_labels']
            return_dict['obj_labels'] = rel_ret['obj_labels']
            return_dict['sbj_scores'] = rel_ret['sbj_scores']
            return_dict['obj_scores'] = rel_ret['obj_scores']
            return_dict['prd_scores'] = prd_scores

            if do_vis:
                return_dict['blob_conv'] = blob_conv

        return return_dict
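At test time the loops above keep lowering the detection score threshold by 0.01 until the relation proposal network returns at least one pair. A minimal standalone sketch of that relaxation, with a hypothetical detect_pairs callable standing in for the prepare_det_rois + RelPN calls:

def relax_score_thresh(detect_pairs, start_thresh):
    """Lower the threshold in 0.01 steps until at least one pair is found."""
    score_thresh = start_thresh
    while score_thresh >= -1e-06:  # a negative value very close to 0.0
        rel_rois = detect_pairs(score_thresh)
        if len(rel_rois) > 0:
            return rel_rois, score_thresh
        score_thresh -= 0.01
    return [], score_thresh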
Exemple #24
0
def _sample_rois(roidb, im_scale, batch_idx):
    """Generate a random sample of RoIs comprising foreground and background
    examples.
    """
    max_overlaps = roidb['max_overlaps']
    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]

    if cfg.TRAIN.JOINT_SELECTIVE_FG:
        # EDIT: selective foreground sampling for dataset-0
        dataset_idx = roidb['dataset_id'][0]
        if dataset_idx == 0:
            print('Selective foreground sampling')
            # Only fg rois in minibatch for "dataset-0":
            fg_rois_per_image = int(
                np.round(cfg.TRAIN.FG_FRACTION * cfg.TRAIN.BATCH_SIZE_PER_IM))
            fg_rois_per_this_image = np.minimum(fg_rois_per_image,
                                                fg_inds.size)
            # Sample foreground regions without replacement
            if fg_inds.size > 0:
                fg_inds = npr.choice(fg_inds,
                                     size=fg_rois_per_this_image,
                                     replace=False)
            # Setting rois_per_image = fg_rois_per_this_image makes
            # bg_rois_per_this_image = 0 below (no bg rois in this batch)
            rois_per_image = fg_rois_per_this_image
        else:
            # for "dataset-1", ensure correct ratio of fg:bg
            fg_rois_per_image = int(
                np.round(cfg.TRAIN.FG_FRACTION * cfg.TRAIN.BATCH_SIZE_PER_IM))

            bg_rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM -
                                    fg_rois_per_image)
            # Increase the batch size (RoIs per image) to accommodate twice the number of bg rois
            rois_per_image = int(
                cfg.TRAIN.BATCH_SIZE_PER_IM) + bg_rois_per_image
            fg_rois_per_this_image = np.minimum(fg_rois_per_image,
                                                fg_inds.size)
            # Sample foreground regions without replacement
            if fg_inds.size > 0:
                fg_inds = npr.choice(fg_inds,
                                     size=fg_rois_per_this_image,
                                     replace=False)
    else:
        # Default fg:bg rois sampling
        rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM)
        fg_rois_per_image = int(
            np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))
        # Guard against the case when an image has fewer than fg_rois_per_image
        # foreground RoIs
        fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size)
        # Sample foreground regions without replacement
        if fg_inds.size > 0:
            fg_inds = npr.choice(fg_inds,
                                 size=fg_rois_per_this_image,
                                 replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI)
                       & (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds,
                             size=bg_rois_per_this_image,
                             replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Label is the class each RoI has max overlap with
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0  # Label bg RoIs with class 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if cfg.TRAIN.GT_SCORES:
        # EDIT: soft labels
        sampled_scores = roidb['max_scores'][keep_inds]
        sampled_gt_source = roidb['max_gt_source'][keep_inds]
        sampled_scores[fg_rois_per_this_image:] = 0
        sampled_gt_source[fg_rois_per_this_image:] = 0
        if roidb['dataset_id'][0] == 0:
            # sanity-check for the unlabeled dataset case (assumed "dataset-0")
            # TODO: check whether this should use > instead of >=
            assert all((sampled_scores >= 0) == (sampled_labels >= 0))
            assert len(sampled_gt_source) == len(sampled_scores)

    if 'bbox_targets' not in roidb:
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        gt_boxes = roidb['boxes'][gt_inds, :]
        gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
        bbox_targets = _compute_targets(sampled_boxes,
                                        gt_boxes[gt_assignments, :],
                                        sampled_labels)
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_targets)
    else:
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(
            roidb['bbox_targets'][keep_inds, :])

    bbox_outside_weights = np.array(bbox_inside_weights > 0,
                                    dtype=bbox_inside_weights.dtype)

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_rois = sampled_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_rois.shape[0], 1))
    sampled_rois = np.hstack((repeated_batch_idx, sampled_rois))

    # Base Fast R-CNN blobs
    blob_dict = dict(labels_int32=sampled_labels.astype(np.int32, copy=False),
                     rois=sampled_rois,
                     bbox_targets=bbox_targets,
                     bbox_inside_weights=bbox_inside_weights,
                     bbox_outside_weights=bbox_outside_weights)

    # EDIT: joint training
    if cfg.TRAIN.JOINT_TRAINING:
        blob_dict['dataset_id'] = np.full_like(sampled_labels,
                                               roidb['dataset_id'][0],
                                               dtype=np.int32)

    # EDIT: soft labels
    if cfg.TRAIN.GT_SCORES:
        blob_dict['gt_scores'] = sampled_scores.astype(np.float32, copy=False)
        blob_dict['gt_source'] = sampled_gt_source.astype(np.int32, copy=False)
        blob_dict['dataset_id'] = np.full_like(sampled_scores,
                                               roidb['dataset_id'][0],
                                               dtype=np.int32)

    # Optionally add Mask R-CNN blobs
    if cfg.MODEL.MASK_ON:
        roi_data.mask_rcnn.add_mask_rcnn_blobs(blob_dict, sampled_boxes, roidb,
                                               im_scale, batch_idx)

    # Optionally add Keypoint R-CNN blobs
    if cfg.MODEL.KEYPOINTS_ON:
        roi_data.keypoint_rcnn.add_keypoint_rcnn_blobs(blob_dict, roidb,
                                                       fg_rois_per_image,
                                                       fg_inds, im_scale,
                                                       batch_idx)

    return blob_dict
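The default branch above draws at most FG_FRACTION x BATCH_SIZE_PER_IM foreground rois and fills the rest of the minibatch with background. A self-contained sketch of that sampling path, with the usual cfg.TRAIN values assumed as keyword defaults:

import numpy as np
import numpy.random as npr

def sample_fg_bg(max_overlaps, batch_size=512, fg_fraction=0.25,
                 fg_thresh=0.5, bg_thresh_hi=0.5, bg_thresh_lo=0.0):
    """Toy fg:bg sampler mirroring the default path above."""
    fg_quota = int(np.round(fg_fraction * batch_size))
    fg_inds = np.where(max_overlaps >= fg_thresh)[0]
    fg_take = min(fg_quota, fg_inds.size)
    if fg_inds.size > 0:
        fg_inds = npr.choice(fg_inds, size=fg_take, replace=False)
    bg_inds = np.where((max_overlaps < bg_thresh_hi) &
                       (max_overlaps >= bg_thresh_lo))[0]
    bg_take = min(batch_size - fg_take, bg_inds.size)
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds, size=bg_take, replace=False)
    return np.append(fg_inds, bg_inds)  # fg first, then bg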
def sample_rois(roidb, im_scale, batch_idx, pos_iou):
    """Generate a random sample of RoIs comprising foreground and background examples.
    """
    rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM)
    fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION *
                                     rois_per_image))  # 0.25 x 512 by default
    max_overlaps = roidb['max_overlaps']

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= pos_iou)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size)
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(fg_inds,
                             size=fg_rois_per_this_image,
                             replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < pos_iou) &  # [0.0, 0.5) by default
                       (max_overlaps >= 0))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds,
                             size=bg_rois_per_this_image,
                             replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Label is the class each RoI has max overlap with
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0  # Label bg RoIs with class 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if 'bbox_targets' not in roidb:
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        gt_boxes = roidb['boxes'][gt_inds, :]

        if len(gt_inds) > 0:  # guard against images with no gt boxes
            gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
            bbox_targets = compute_targets(sampled_boxes,
                                           gt_boxes[gt_assignments, :],
                                           sampled_labels)
            bbox_targets, bbox_inside_weights = expand_bbox_targets(
                bbox_targets)
        else:  # all-negative image
            # generate dummy gt boxes (all rois are bg here, so the targets
            # carry zero weight)
            gt_boxes = sampled_boxes.copy()
            bbox_targets = compute_targets(sampled_boxes, gt_boxes,
                                           sampled_labels)
            bbox_targets, bbox_inside_weights = expand_bbox_targets(
                bbox_targets)
    else:
        # LJ: this branch should never be entered; break for inspection if it is
        pdb.set_trace()
        bbox_targets, bbox_inside_weights = expand_bbox_targets(
            roidb['bbox_targets'][keep_inds, :])

    bbox_outside_weights = np.array(bbox_inside_weights > 0,
                                    dtype=bbox_inside_weights.dtype)

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_rois = sampled_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_rois.shape[0], 1))
    sampled_rois = np.hstack((repeated_batch_idx, sampled_rois))

    # Base Fast R-CNN blobs
    blob_dict = dict(labels_int32=sampled_labels.astype(np.int32, copy=False),
                     rois=sampled_rois,
                     bbox_targets=bbox_targets,
                     bbox_inside_weights=bbox_inside_weights,
                     bbox_outside_weights=bbox_outside_weights)

    return blob_dict
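The bbox_outside_weights computed above simply mirror the nonzero pattern of the inside weights, so only the coordinates of sampled foreground rois contribute to the regression loss. A toy check with hypothetical two-roi weights:

import numpy as np

bbox_inside_weights = np.array([[1., 1., 1., 1.],   # one fg roi
                                [0., 0., 0., 0.]])  # one bg roi
bbox_outside_weights = np.array(bbox_inside_weights > 0,
                                dtype=bbox_inside_weights.dtype)
# bbox_outside_weights equals bbox_inside_weights here: ones for the fg
# roi, zeros for the bg roi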
Exemple #26
0
def _sample_rois(roidb, im_scale, batch_idx):
    """Generate a random sample of RoIs comprising foreground and background
    examples.
    """
    rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM)
    fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))
    max_overlaps = roidb['max_overlaps']

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size)
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(
            fg_inds, size=fg_rois_per_this_image, replace=False
        )

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where(
        (max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
        (max_overlaps >= cfg.TRAIN.BG_THRESH_LO)
    )[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(
            bg_inds, size=bg_rois_per_this_image, replace=False
        )

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Label is the class each RoI has max overlap with
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0  # Label bg RoIs with class 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if 'bbox_targets' not in roidb:
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        gt_boxes = roidb['boxes'][gt_inds, :]
        gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
        bbox_targets = _compute_targets(
            sampled_boxes, gt_boxes[gt_assignments, :], sampled_labels
        )
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_targets)
    else:
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(
            roidb['bbox_targets'][keep_inds, :]
        )

    bbox_outside_weights = np.array(
        bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype
    )

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_rois = sampled_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((sampled_rois.shape[0], 1))
    sampled_rois = np.hstack((repeated_batch_idx, sampled_rois))

    # Base Fast R-CNN blobs
    blob_dict = dict(
        labels_int32=sampled_labels.astype(np.int32, copy=False),
        rois=sampled_rois,
        bbox_targets=bbox_targets,
        bbox_inside_weights=bbox_inside_weights,
        bbox_outside_weights=bbox_outside_weights
    )

    # Optionally add Mask R-CNN blobs
    if cfg.MODEL.MASK_ON:
        roi_data.mask_rcnn.add_mask_rcnn_blobs(
            blob_dict, sampled_boxes, roidb, im_scale, batch_idx
        )

    # Optionally add Keypoint R-CNN blobs
    if cfg.MODEL.KEYPOINTS_ON:
        roi_data.keypoint_rcnn.add_keypoint_rcnn_blobs(
            blob_dict, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx
        )

    return blob_dict
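_expand_bbox_targets itself is not shown in these snippets. A hedged numpy sketch of what it is assumed to do (ignoring any class-agnostic-regression variant): scatter each roi's 4-d regression target into the four columns owned by its class and set matching inside weights.

import numpy as np

def expand_bbox_targets_sketch(bbox_target_data, num_classes):
    # bbox_target_data: (N, 5) rows of (class, tx, ty, tw, th)
    clss = bbox_target_data[:, 0].astype(int)
    bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
    bbox_inside_weights = np.zeros_like(bbox_targets)
    for i in np.where(clss > 0)[0]:
        start = 4 * clss[i]
        bbox_targets[i, start:start + 4] = bbox_target_data[i, 1:]
        bbox_inside_weights[i, start:start + 4] = 1.0
    return bbox_targets, bbox_inside_weights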
Exemple #27
0
def add_refine_local_mask_blobs(blobs, sampled_boxes, roidb, im_scale,
                                batch_idx, data):
    """Add RefineNet Mask specific blobs to the input blob dictionary."""
    # Prepare the mask targets by associating one gt mask to each training roi
    # that has a fg (non-bg) class label.
    M = cfg.REFINENET.RESOLUTION
    up_scale = cfg.REFINENET.UP_SCALE
    polys_gt_inds = np.where((roidb['gt_classes'] > 0)
                             & (roidb['is_crowd'] == 0))[0]
    gt_classes = roidb['gt_classes'][polys_gt_inds]
    polys_gt = [roidb['segms'][i] for i in polys_gt_inds]
    boxes_from_polys = segm_utils.polys_to_boxes(polys_gt)
    fg_inds = np.where(blobs['labels_int32'] > 0)[0]
    roi_has_mask = blobs['labels_int32'].copy()
    roi_has_mask[roi_has_mask > 0] = 1

    # Define size variables
    inp_h, inp_w = data.shape[2], data.shape[3]
    pad_img_h, pad_img_w = inp_h / im_scale, inp_w / im_scale

    if fg_inds.shape[0] > 0:
        # Class labels for the foreground rois
        mask_class_labels = blobs['labels_int32'][fg_inds]
        masks = blob_utils.zeros((fg_inds.shape[0], M**2), int32=True)

        # Find overlap between all foreground rois and the bounding boxes
        # enclosing each segmentation
        rois_fg = sampled_boxes[fg_inds]
        overlaps_bbfg_bbpolys = box_utils.bbox_overlaps(
            rois_fg.astype(np.float32, copy=False),
            boxes_from_polys.astype(np.float32, copy=False))
        # Map from each fg rois to the index of the mask with highest overlap
        # (measured by bbox overlap)
        fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1)

        # Expand the foreground rois by a factor of up_scale and
        # clip by the padded image boundary
        pad_rois_fg = box_utils.expand_boxes(rois_fg, up_scale)
        pad_rois_fg = box_utils.clip_boxes_to_image(pad_rois_fg, pad_img_h,
                                                    pad_img_w)

        if cfg.REFINENET.ONLY_USE_CROWDED_SAMPLES:
            # Only use crowded samples to train the RefineNet
            THRES = cfg.REFINENET.OVERLAP_THRESHOLD
            for i in range(rois_fg.shape[0]):
                overlap = overlaps_bbfg_bbpolys[i]
                if np.sum(overlap > THRES) > 1:
                    # if multiple instances overlap this roi, use it for training
                    fg_polys_ind = fg_polys_inds[i]
                    poly_gt = polys_gt[fg_polys_ind]
                    pad_roi_fg = pad_rois_fg[i]
                    # Rasterize the portion of the polygon mask within the given fg roi
                    # to an M x M binary image
                    mask = segm_utils.polys_to_mask_wrt_box(
                        poly_gt, pad_roi_fg, M)
                    mask = np.array(mask > 0,
                                    dtype=np.int32)  # Ensure it's binary
                    masks[i, :] = np.reshape(mask, M**2)

                else:  # Only one instance, then set label to be -1 (ignored)
                    masks[i, :] = -1
                    mask_class_labels[i] = 0
        elif cfg.REFINENET.ASSIGN_LARGER_WEIGHT_FOR_CROWDED_SAMPLES:
            loss_weights = blob_utils.ones((rois_fg.shape[0], ))
            for i in range(rois_fg.shape[0]):
                fg_polys_ind = fg_polys_inds[i]
                poly_gt = polys_gt[fg_polys_ind]
                pad_roi_fg = pad_rois_fg[i]
                class_label = mask_class_labels[i]

                # Rasterize the portion of the polygon mask within the given
                # fg roi to an M x M binary image
                mask = segm_utils.polys_to_mask_wrt_box(poly_gt, pad_roi_fg, M)
                mask = np.array(mask > 0, dtype=np.int32)  # Ensure it's binary
                masks[i, :] = np.reshape(mask, M**2)

                # Now determine the weight for each roi: if any other instance
                # of the same class overlaps it, we expect it to be a hard
                # sample and assign this RoI a larger weight
                for j in range(len(polys_gt)):
                    if j == fg_polys_ind:
                        continue
                    if gt_classes[j] == class_label:  # only same class is valid
                        mask = segm_utils.polys_to_mask_wrt_box(
                            polys_gt[j], pad_roi_fg, M)
                        # and check whether any part of it falls inside the bbox
                        is_inside_bbox = (np.sum(mask) > 0)
                        if is_inside_bbox:
                            loss_weights[i] = cfg.REFINENET.WEIGHT_LOSS_CROWDED
                            break  # early stop

        else:
            # add fg targets
            for i in range(rois_fg.shape[0]):
                fg_polys_ind = fg_polys_inds[i]
                poly_gt = polys_gt[fg_polys_ind]
                pad_roi_fg = pad_rois_fg[i]
                # Rasterize the portion of the polygon mask within the given fg roi
                # to an M x M binary image
                mask = segm_utils.polys_to_mask_wrt_box(poly_gt, pad_roi_fg, M)
                mask = np.array(mask > 0, dtype=np.int32)  # Ensure it's binary
                masks[i, :] = np.reshape(mask, M**2)

    else:  # If there are no fg masks (it does happen)
        # The network cannot handle empty blobs, so we must provide a mask.
        # We simply take the first bg roi, give it an all -1's mask (ignore
        # label), and label it with class zero (bg).
        bg_inds = np.where(blobs['labels_int32'] == 0)[0]
        # pad_rois_fg is actually one background roi, but that's ok because ...
        pad_rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1))
        # We give it an all -1's blob (ignore label)
        masks = -blob_utils.ones((1, M**2), int32=True)
        # We label it with class = 0 (background)
        mask_class_labels = blob_utils.zeros((1, ))
        # Mark that the first roi has a mask
        roi_has_mask[0] = 1

    if cfg.MRCNN.CLS_SPECIFIC_MASK:
        masks = _expand_to_class_specific_mask_targets(masks,
                                                       mask_class_labels)

    # Scale rois_fg and format as (batch_idx, x1, y1, x2, y2)
    pad_rois_fg = (pad_rois_fg.astype(np.float32)) * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((pad_rois_fg.shape[0], 1))
    pad_rois_fg = np.hstack((repeated_batch_idx, pad_rois_fg)).astype(np.int32)

    # Update blobs dict with Refine-Net blobs
    blobs['refined_mask_rois'] = pad_rois_fg
    blobs['roi_has_refined_mask_int32'] = roi_has_mask
    blobs['refined_masks_int32'] = masks

    if cfg.REFINENET.ASSIGN_LARGER_WEIGHT_FOR_CROWDED_SAMPLES:
        blobs['loss_weights'] = loss_weights
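The RefineNet path above enlarges each fg roi by up_scale and clips it to the padded image via box_utils.expand_boxes and clip_boxes_to_image. A minimal numpy sketch of that expand-and-clip step, assuming (x1, y1, x2, y2) boxes:

import numpy as np

def expand_and_clip(boxes, up_scale, img_h, img_w):
    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5 * up_scale
    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5 * up_scale
    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5
    out = np.stack((x_c - w_half, y_c - h_half,
                    x_c + w_half, y_c + h_half), axis=1)
    out[:, 0::2] = np.clip(out[:, 0::2], 0, img_w - 1)  # clip x1, x2
    out[:, 1::2] = np.clip(out[:, 1::2], 0, img_h - 1)  # clip y1, y2
    return out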
Exemple #28
0
def _sample_rois_gan(roidb, im_scale, batch_idx, flags):
    """Generate a random sample of RoIs comprising foreground and background
    examples.
    """
    assert isinstance(flags, ModeFlags)

    # Collect gt boxes and sample only those that fulfill the area-threshold criterion
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    gt_boxes = roidb['boxes'][gt_inds, :]

    if cfg.DEBUG:
        logger.info("sample from {} gt boxes".format(len(gt_boxes)))
        areas_gt, _ = box_utils.boxes_area(gt_boxes)
        areas_gt = np.sqrt(areas_gt)
        print("gt-boxes: area_thres: {} vs areas: {}".format(
            cfg.GAN.AREA_THRESHOLD, areas_gt))

    gt_keep_inds = []
    if cfg.GAN.AREA_THRESHOLD > 0:
        area_thres = 1.0 * cfg.GAN.AREA_THRESHOLD * cfg.GAN.AREA_THRESHOLD  # no scaling, as rois are scaled later

        if flags.fake_mode:
            #  for fake samples: keep only samples with area < area-threshold
            gt_keep_inds = gt_inds[box_utils.filter_large_boxes_area(
                gt_boxes, max_area=area_thres)]
        elif flags.real_mode:
            # for real samples: keep only samples with area >= area-threshold
            gt_keep_inds = gt_inds[box_utils.filter_small_boxes_area(
                gt_boxes, min_area=area_thres)]
        elif flags.real_fake_mode:
            gt_keep_inds = gt_inds

    if flags.train_generator:
        rois_per_image = int(cfg.GAN.TRAIN.BATCH_SIZE_PER_IM_G)
        fg_rois_per_image = int(
            np.round(cfg.GAN.TRAIN.FG_FRACTION_G * rois_per_image))
    elif flags.train_discriminator:  # discriminator
        rois_per_image = int(cfg.GAN.TRAIN.BATCH_SIZE_PER_IM_D)
        fg_rois_per_image = int(
            np.round(cfg.GAN.TRAIN.FG_FRACTION_D * rois_per_image))
    elif flags.train_pre:
        rois_per_image = int(cfg.GAN.TRAIN.BATCH_SIZE_PER_IM_PRE)
        fg_rois_per_image = int(
            np.round(cfg.GAN.TRAIN.FG_FRACTION_PRE * rois_per_image))

    max_overlaps = roidb['max_overlaps']

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]

    # With an area threshold, select only indices of boxes whose corresponding
    # ground-truth box fulfills the criterion, i.e. whose assigned gt index is
    # in gt_keep_inds

    if cfg.GAN.AREA_THRESHOLD > 0:

        fg_inds = np.asarray([
            x for x in fg_inds
            if gt_inds[roidb['box_to_gt_ind_map'][x]] in gt_keep_inds
        ]).astype(int)

        if cfg.DEBUG:
            fg_boxes = gt_boxes[
                gt_inds[roidb['box_to_gt_ind_map'][fg_inds]], :]
            areas_fg, _ = box_utils.boxes_area(fg_boxes)
            areas_fg = np.sqrt(areas_fg)
            print("fg-after: area_thres: {} vs areas: {}".format(
                cfg.GAN.AREA_THRESHOLD, areas_fg))

    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = np.minimum(fg_rois_per_image, int(fg_inds.size))

    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(fg_inds,
                             size=fg_rois_per_this_image,
                             replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI)
                       & (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds,
                             size=bg_rois_per_this_image,
                             replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)

    # Label is the class each RoI has max overlap with
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0  # Label bg RoIs with class 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if 'bbox_targets' not in roidb:
        gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
        bbox_targets = _compute_targets(sampled_boxes,
                                        gt_boxes[gt_assignments, :],
                                        sampled_labels)
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_targets)
    else:
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(
            roidb['bbox_targets'][keep_inds, :])

    bbox_outside_weights = np.array(bbox_inside_weights > 0,
                                    dtype=bbox_inside_weights.dtype)

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_rois = sampled_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_rois.shape[0], 1))
    sampled_rois = np.hstack((repeated_batch_idx, sampled_rois))

    if not cfg.RPN.RPN_ON:  # FAST-RCNN training
        # need to unsqueeze things for functionality in loader / minibatch.py ...
        sampled_rois = np.expand_dims(sampled_rois, axis=0)
        sampled_labels = np.expand_dims(sampled_labels, axis=0)
        bbox_targets = np.expand_dims(bbox_targets, axis=0)
        bbox_outside_weights = np.expand_dims(bbox_outside_weights, axis=0)
        bbox_inside_weights = np.expand_dims(bbox_inside_weights, axis=0)

    # Base Fast R-CNN blobs
    blob_dict = dict(labels_int32=sampled_labels.astype(np.int32, copy=False),
                     rois=sampled_rois,
                     bbox_targets=bbox_targets,
                     bbox_inside_weights=bbox_inside_weights,
                     bbox_outside_weights=bbox_outside_weights)

    return blob_dict
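The GAN sampler above splits gt boxes by area: fake mode keeps only boxes below the area threshold, real mode keeps those at or above it. A hedged stand-in for the two box_utils filters (ignoring any +1 pixel convention boxes_area may use):

import numpy as np

def filter_by_area(boxes, area_thres, keep_small):
    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    cond = areas < area_thres if keep_small else areas >= area_thres
    return np.where(cond)[0]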
def _sample_rois_balance_sample(roidb, im_scale, batch_idx):
    """Generate a random sample of RoIs comprising foreground and background
    examples.
    """
    rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM)
    fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))
    max_overlaps = roidb['max_overlaps']

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size)

    # Balance Sample Start
    gt_label = roidb['gt_classes']
    gt_assignment = roidb['box_to_gt_ind_map']
    gt_set = np.unique(gt_label)
    gt_set = gt_set[np.where(gt_set > 0)]

    sample_count = {}
    roi_max_num = {}
    gt_pair = np.zeros((2, len(gt_set)), dtype=np.uint8)

    for temp_index, temp_label in enumerate(gt_set):
        sample_count[temp_label] = 0
        temp_num1 = len(np.where(gt_label == temp_label)[0])
        gt_pair[0, temp_index] = temp_label
        gt_pair[1, temp_index] = temp_num1

    average_label_num = math.ceil(fg_rois_per_this_image / float(len(gt_set)))
    gt_num_sort = np.argsort(gt_pair[1, :])
    fg_remain = fg_rois_per_this_image
    # Visit classes from rarest to most common: each takes at most the running
    # average, and the average is recomputed over the remaining classes
    for ii, cls_idx in enumerate(gt_num_sort):
        dispatch = min(gt_pair[1, cls_idx], average_label_num)
        roi_max_num[gt_pair[0, cls_idx]] = dispatch
        fg_remain -= dispatch
        if len(gt_num_sort) - ii - 1 == 0:
            continue
        average_label_num = math.ceil(fg_remain / float(len(gt_num_sort) - ii - 1))

    new_fg_inds = []

    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        np.random.shuffle(fg_inds)
        for ii in range(len(fg_inds)):
            label_temp = gt_label[gt_assignment[fg_inds[ii]]]
            if sample_count[label_temp] < roi_max_num[label_temp]:
                new_fg_inds.append(fg_inds[ii])
                sample_count[label_temp] += 1
        new_fg_inds = np.array(new_fg_inds)
        fg_inds = new_fg_inds


    # Balance Sample End

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                       (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(
            bg_inds, size=bg_rois_per_this_image, replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Label is the class each RoI has max overlap with
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0  # Label bg RoIs with class 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if 'bbox_targets' not in roidb:
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        gt_boxes = roidb['boxes'][gt_inds, :]
        gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
        bbox_targets = _compute_targets(
            sampled_boxes, gt_boxes[gt_assignments, :], sampled_labels)
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_targets)
    else:
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(
            roidb['bbox_targets'][keep_inds, :])

    bbox_outside_weights = np.array(
        bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype)

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_rois = sampled_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((sampled_rois.shape[0], 1))
    sampled_rois = np.hstack((repeated_batch_idx, sampled_rois))

    # Base Fast R-CNN blobs
    blob_dict = dict(
        labels_int32=sampled_labels.astype(np.int32, copy=False),
        rois=sampled_rois,
        bbox_targets=bbox_targets,
        bbox_inside_weights=bbox_inside_weights,
        bbox_outside_weights=bbox_outside_weights)

    # Optionally add Mask R-CNN blobs
    if cfg.MODEL.MASK_ON:
        roi_data.mask_rcnn.add_mask_rcnn_blobs(blob_dict, sampled_boxes, roidb,
                                               im_scale, batch_idx)

    # Optionally add Keypoint R-CNN blobs
    if cfg.MODEL.KEYPOINTS_ON:
        roi_data.keypoint_rcnn.add_keypoint_rcnn_blobs(
            blob_dict, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx)

    return blob_dict
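The quota-dispatch loop above spreads the foreground budget across classes by visiting them from rarest to most common; each class takes min(count, running average) and the average is recomputed over the remaining classes. A compact standalone sketch of the same idea:

import math
import numpy as np

def class_quotas(counts, budget):
    order = np.argsort(counts)  # rarest class first
    quotas = np.zeros_like(counts)
    remaining = budget
    for pos, cls in enumerate(order):
        avg = math.ceil(remaining / float(len(order) - pos))
        quotas[cls] = min(counts[cls], avg)
        remaining -= quotas[cls]
    return quotas

# class_quotas(np.array([1, 10, 3]), 8) -> array([1, 4, 3])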
def add_mask_rcnn_blobs(blobs, sampled_boxes, roidb, im_scale, batch_idx):
    """Add Mask R-CNN specific blobs to the input blob dictionary."""
    # Prepare the mask targets by associating one gt mask to each training roi
    # that has a fg (non-bg) class label.
    M = cfg.MRCNN.RESOLUTION
    polys_gt_inds = np.where(
        (roidb['gt_classes'] > 0) & (roidb['is_crowd'] == 0)
    )[0]
    polys_gt = [roidb['segms'][i] for i in polys_gt_inds]
    boxes_from_polys = segm_utils.polys_to_boxes(polys_gt)
    # Keep only a subset of classes (set A in the paper) for mask training
    if cfg.TRAIN.MRCNN_FILTER_LABELS:
        keep_label_set = set(cfg.TRAIN.MRCNN_LABELS_TO_KEEP)
        labels_int32 = blobs['labels_int32']
        labels_int32_keep = np.array(
            [(l if l in keep_label_set else 0) for l in labels_int32],
            dtype=labels_int32.dtype)
    else:
        labels_int32_keep = blobs['labels_int32']
    fg_inds = np.where(labels_int32_keep > 0)[0]
    roi_has_mask = labels_int32_keep.copy()
    roi_has_mask[roi_has_mask > 0] = 1

    if fg_inds.shape[0] > 0:
        # Class labels for the foreground rois
        mask_class_labels = blobs['labels_int32'][fg_inds]
        masks = blob_utils.zeros((fg_inds.shape[0], M**2), int32=True)

        # Find overlap between all foreground rois and the bounding boxes
        # enclosing each segmentation
        rois_fg = sampled_boxes[fg_inds]
        overlaps_bbfg_bbpolys = box_utils.bbox_overlaps(
            rois_fg.astype(np.float32, copy=False),
            boxes_from_polys.astype(np.float32, copy=False)
        )
        # Map from each fg rois to the index of the mask with highest overlap
        # (measured by bbox overlap)
        fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1)

        # add fg targets
        for i in range(rois_fg.shape[0]):
            fg_polys_ind = fg_polys_inds[i]
            poly_gt = polys_gt[fg_polys_ind]
            roi_fg = rois_fg[i]
            # Rasterize the portion of the polygon mask within the given fg roi
            # to an M x M binary image
            mask = segm_utils.polys_to_mask_wrt_box(poly_gt, roi_fg, M)
            mask = np.array(mask > 0, dtype=np.int32)  # Ensure it's binary
            masks[i, :] = np.reshape(mask, M**2)
    else:  # If there are no fg masks (it does happen)
        # The network cannot handle empty blobs, so we must provide a mask
        # We simply take the first bg roi, given it an all -1's mask (ignore
        # label), and label it with class zero (bg).
        bg_inds = np.where(blobs['labels_int32'] == 0)[0]
        # rois_fg is actually one background roi, but that's ok because ...
        rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1))
        # We give it an -1's blob (ignore label)
        masks = -blob_utils.ones((1, M**2), int32=True)
        # We label it with class = 0 (background)
        mask_class_labels = blob_utils.zeros((1, ))
        # Mark that the first roi has a mask
        roi_has_mask[0] = 1

    if cfg.MRCNN.CLS_SPECIFIC_MASK:
        masks = _expand_to_class_specific_mask_targets(masks, mask_class_labels)

    # Scale rois_fg and format as (batch_idx, x1, y1, x2, y2)
    rois_fg *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((rois_fg.shape[0], 1))
    rois_fg = np.hstack((repeated_batch_idx, rois_fg))

    # Update blobs dict with Mask R-CNN blobs
    blobs['mask_rois'] = rois_fg
    blobs['roi_has_mask_int32'] = roi_has_mask
    blobs['masks_int32'] = masks
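_expand_to_class_specific_mask_targets is assumed to scatter each M x M target into the slice owned by its class and leave every other class at -1 (ignore label). A minimal numpy sketch under that assumption:

import numpy as np

def expand_masks_sketch(masks, mask_class_labels, num_classes, M):
    # masks: (N, M**2) targets; output: (N, num_classes * M**2)
    out = -np.ones((masks.shape[0], num_classes * M**2), dtype=np.int32)
    for i, cls in enumerate(mask_class_labels.astype(int)):
        if cls > 0:
            start = cls * M**2
            out[i, start:start + M**2] = masks[i, :]
    return out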
Exemple #31
0
def _sample_pairs(roidb, im_scale, batch_idx):
    """Generate a random sample of RoIs comprising foreground and background
    examples.
    """
    fg_pairs_per_image = cfg.TRAIN.FG_REL_SIZE_PER_IM
    pairs_per_image = int(
        cfg.TRAIN.FG_REL_SIZE_PER_IM /
        cfg.TRAIN.FG_REL_FRACTION)  # need much more pairs since it's quadratic
    max_pair_overlaps = roidb['max_pair_overlaps']

    gt_pair_inds = np.where(max_pair_overlaps > 1.0 - 1e-4)[0]
    fg_pair_inds = np.where((max_pair_overlaps >= cfg.TRAIN.FG_THRESH)
                            & (max_pair_overlaps <= 1.0 - 1e-4))[0]

    fg_pairs_per_this_image = np.minimum(fg_pairs_per_image,
                                         gt_pair_inds.size + fg_pair_inds.size)
    # Sample foreground pairs without replacement
    if fg_pair_inds.size > 0:
        fg_pair_inds = npr.choice(fg_pair_inds,
                                  size=(fg_pairs_per_this_image -
                                        gt_pair_inds.size),
                                  replace=False)
    fg_pair_inds = np.append(fg_pair_inds, gt_pair_inds)

    # Label is the class each RoI has max overlap with
    fg_prd_labels = roidb['max_prd_classes'][fg_pair_inds]
    blob_dict = dict(
        fg_prd_labels_int32=fg_prd_labels.astype(np.int32, copy=False))

    bg_pair_inds = np.where((max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0]

    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_pairs_per_this_image = pairs_per_image - fg_pairs_per_this_image
    bg_pairs_per_this_image = np.minimum(bg_pairs_per_this_image,
                                         bg_pair_inds.size)
    # Sample background pairs without replacement
    if bg_pair_inds.size > 0:
        bg_pair_inds = npr.choice(bg_pair_inds,
                                  size=bg_pairs_per_this_image,
                                  replace=False)
    keep_pair_inds = np.append(fg_pair_inds, bg_pair_inds)
    all_prd_labels = np.zeros(keep_pair_inds.size, dtype=np.int32)
    # predicate classes start from 1; 0 is background
    all_prd_labels[:fg_pair_inds.size] = fg_prd_labels + 1

    blob_dict['all_prd_labels_int32'] = all_prd_labels.astype(np.int32,
                                                              copy=False)
    blob_dict['fg_size'] = np.array(
        [fg_pair_inds.size], dtype=np.int32
    )  # this is used to check if there is at least one fg to learn

    sampled_sbj_boxes = roidb['sbj_boxes'][keep_pair_inds]
    sampled_obj_boxes = roidb['obj_boxes'][keep_pair_inds]
    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_sbj_rois = sampled_sbj_boxes * im_scale
    sampled_obj_rois = sampled_obj_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (keep_pair_inds.shape[0], 1))
    sampled_sbj_rois = np.hstack((repeated_batch_idx, sampled_sbj_rois))
    sampled_obj_rois = np.hstack((repeated_batch_idx, sampled_obj_rois))
    blob_dict['sbj_rois'] = sampled_sbj_rois
    blob_dict['obj_rois'] = sampled_obj_rois
    sampled_rel_rois = box_utils.rois_union(sampled_sbj_rois, sampled_obj_rois)
    blob_dict['rel_rois'] = sampled_rel_rois
    if cfg.MODEL.USE_FREQ_BIAS or cfg.MODEL.USE_SEPARATE_SO_SCORES:
        sbj_labels = roidb['max_sbj_classes'][keep_pair_inds]
        obj_labels = roidb['max_obj_classes'][keep_pair_inds]
        blob_dict['all_sbj_labels_int32'] = sbj_labels.astype(np.int32,
                                                              copy=False)
        blob_dict['all_obj_labels_int32'] = obj_labels.astype(np.int32,
                                                              copy=False)

    return blob_dict
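box_utils.rois_union is assumed to return, for each pair, the tightest box covering both the subject and object roi while preserving the shared batch-index column. A sketch under that assumption:

import numpy as np

def rois_union_sketch(rois1, rois2):
    # rois: (N, 5) rows of (batch_idx, x1, y1, x2, y2)
    assert (rois1[:, 0] == rois2[:, 0]).all()
    xy_min = np.minimum(rois1[:, 1:3], rois2[:, 1:3])
    xy_max = np.maximum(rois1[:, 3:5], rois2[:, 3:5])
    return np.hstack((rois1[:, :1], xy_min, xy_max))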
Exemple #32
0
def add_keypoint_rcnn_blobs_sigmoid(
    blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx
):
    """Add Mask R-CNN keypoint specific blobs to the given blobs dictionary."""
    # Note: gt_inds must match how they're computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    max_overlaps = roidb['max_overlaps']
    gt_keypoints = roidb['gt_keypoints']
    M = cfg.KRCNN.HEATMAP_SIZE

    ind_kp = gt_inds[roidb['box_to_gt_ind_map']]
    within_box = _within_box(gt_keypoints[ind_kp, :, :], roidb['boxes'])
    vis_kp = gt_keypoints[ind_kp, 2, :] > 0
    is_visible = np.sum(np.logical_and(vis_kp, within_box), axis=1) > 0
    kp_fg_inds = np.where(
        np.logical_and(max_overlaps >= cfg.TRAIN.FG_THRESH, is_visible)
    )[0]

    kp_fg_rois_per_this_image = np.minimum(fg_rois_per_image, kp_fg_inds.size)
    if kp_fg_inds.size > kp_fg_rois_per_this_image:
        kp_fg_inds = np.random.choice(
            kp_fg_inds, size=kp_fg_rois_per_this_image, replace=False
        )

    if kp_fg_inds.shape[0] > 0:
        sampled_fg_rois = roidb['boxes'][kp_fg_inds]
        box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds]

        num_keypoints = gt_keypoints.shape[2]
        sampled_keypoints = -np.ones(
            (len(sampled_fg_rois), gt_keypoints.shape[1], num_keypoints),
            dtype=gt_keypoints.dtype
        )
        for ii in range(len(sampled_fg_rois)):
            ind = box_to_gt_ind_map[ii]
            if ind >= 0:
                sampled_keypoints[ii, :, :] = gt_keypoints[gt_inds[ind], :, :]
                assert np.sum(sampled_keypoints[ii, 2, :]) > 0

        heats, weights = keypoint_utils.keypoints_to_sigmoid_heatmap_labels(
            sampled_keypoints, sampled_fg_rois, M=cfg.KRCNN.HEATMAP_SIZE
        )

        shape = sampled_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS
        heats = heats.reshape((shape, M**2))
        weights = weights.reshape((shape, 1))

    else:  # If there are no fg keypoint rois (it does happen)
        # The network cannot handle empty blobs, so we must provide a heatmap.
        # We simply take the first bg roi, give it an all -1's heatmap (ignore
        # label), and set its weights to zero.
        roi_inds = np.where(roidb['gt_classes'] == 0)[0]
        # sampled_fg_rois is actually one random roi, but that's ok because ...
        sampled_fg_rois = roidb['boxes'][roi_inds[0]].reshape((1, -1))
        # We give it an all -1's blob (ignore label)
        heats = (-1) * blob_utils.ones((1 * cfg.KRCNN.NUM_KEYPOINTS, M**2))
        # We set weights to 0 (ignore label)
        weights = blob_utils.zeros((1 * cfg.KRCNN.NUM_KEYPOINTS, 1))

    sampled_fg_rois *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_fg_rois.shape[0], 1)
    )
    sampled_fg_rois = np.hstack((repeated_batch_idx, sampled_fg_rois))

    blobs['keypoint_rois'] = sampled_fg_rois
    blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False)
    blobs['keypoint_weights'] = weights

    # Since this function may randomly sample a subset of boxes as the rois,
    # we need to make sure the refined_keypoint_rois use the same subset,
    # so we also pass out the indices of that subset.
    blobs['keypoint_fg_inds'] = kp_fg_inds.astype(np.int32, copy=False)
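_within_box (used above) is assumed to test, per keypoint, whether its (x, y) location falls inside the matching box; a keypoint then counts as visible only if this holds and its annotated visibility flag is positive. A hedged numpy sketch:

import numpy as np

def within_box_sketch(points, boxes):
    # points: (N, 3, K) with rows (x, y, vis); boxes: (N, 4) as (x1, y1, x2, y2)
    x, y = points[:, 0, :], points[:, 1, :]
    return ((x >= boxes[:, 0, None]) & (x <= boxes[:, 2, None]) &
            (y >= boxes[:, 1, None]) & (y <= boxes[:, 3, None]))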
    def forward(self,
                frame_feat=None,
                obj_feat=None,
                human_mask=None,
                human_box=None,
                roidb=None,
                roi=None,
                batch=1,
                no_dropout=False,
                full_batch=False,
                binary_label=None):
        device_id = obj_feat.get_device()

        B = frame_feat.shape[0]

        obj_label = torch.IntTensor([db['obj_gt_cls']
                                     for db in roidb]).cuda(device_id)
        prd_label = torch.IntTensor([db['prd_gt_cls']
                                     for db in roidb]).cuda(device_id)

        # -------------------------------------------------------------------------------------------------------------------
        # obj visual and human visual
        # -------------------------------------------------------------------------------------------------------------------
        obj_inter = self.obj_feats(obj_feat)
        obj_hidden = self.obj_feats_2(obj_inter)
        obj_hidden_norm = F.normalize(obj_hidden, p=2, dim=1)

        densepose_mask = human_mask
        densepose_mask = densepose_mask.view(-1, 1, densepose_mask.shape[1],
                                             densepose_mask.shape[2])
        densepose_mask_conv = self.human_mask_conv(densepose_mask)

        roi_feature = RoIAlignFunction(7, 7, 1. / 8, 0.0)(
            densepose_mask_conv,
            Variable(torch.from_numpy(roi)).cuda(device_id))
        roi_feature = roi_feature.view(-1, 64 * 49)
        densepose_mask_hidden = self.human_mask_feats(roi_feature)
        densepose_mask_hidden_norm = F.normalize(densepose_mask_hidden,
                                                 p=2,
                                                 dim=1)

        ## ----------------------------------------------------------------------------------------------------------------------
        ## extract the features of densepose bounding boxes
        ## ----------------------------------------------------------------------------------------------------------------------
        human_boxes = []
        for batch_idx, tem_human_box in enumerate(human_box):
            repeated_batch_idx = batch_idx * blob_utils.ones(
                (tem_human_box.shape[0], 1))
            tem_human_box = np.hstack(
                (repeated_batch_idx, tem_human_box[:, 1:]))
            human_boxes.extend(tem_human_box)
        densepose_roi = np.array(human_boxes)

        densepose_roi_feature = RoIAlignFunction(7, 7, 1. / 8, 0.0)(
            densepose_mask_conv,
            Variable(torch.from_numpy(densepose_roi)).cuda(device_id))
        densepose_roi_feature = densepose_roi_feature.view(-1, 64 * 49)
        densepose_box_feature_hidden = self.human_mask_feats(
            densepose_roi_feature)
        densepose_box_feature_hidden_norm = F.normalize(
            densepose_box_feature_hidden, p=2, dim=1)

        # -------------------------------------------------------------------------------------------------------------------
        # obj text and prd text
        # -------------------------------------------------------------------------------------------------------------------
        ## obj text
        obj_text_vecs = self.obj_vecs[obj_label]
        obj_text_vecs = Variable(
            torch.from_numpy(obj_text_vecs.astype('float32'))).cuda(device_id)
        if obj_text_vecs.dim() == 1:
            obj_text_vecs = obj_text_vecs.view(1, -1)

        obj_text_hidden = self.obj_text_feats(obj_text_vecs)
        obj_text_hidden_norm = F.normalize(obj_text_hidden, p=2,
                                           dim=1)  # (#prd, 1024)

        ## prd text
        prd_text_vecs = self.prd_vecs[prd_label]
        prd_text_vecs = Variable(
            torch.from_numpy(prd_text_vecs.astype('float32'))).cuda(device_id)
        if prd_text_vecs.dim() == 1:
            prd_text_vecs = prd_text_vecs.view(1, -1)

        prd_text_hidden = self.prd_text_feats(prd_text_vecs)
        prd_text_hidden_norm = F.normalize(prd_text_hidden, p=2, dim=1)

        # -------------------------------------------------------------------------------------------------------------------
        # video binary loss, text match video
        # -------------------------------------------------------------------------------------------------------------------
        if cfg.BINARY_LOSS:
            frame_feat_binary = self.frame_fc_binary(frame_feat)
            frame_feat_binary = torch.cat(
                [frame_feat_binary, obj_text_hidden, prd_text_hidden], dim=-1)
            frame_feat_binary = self.frame_fc_cat_binary(frame_feat_binary)
            frame_feat_binary = frame_feat_binary.mean(0)
            frame_feat_binary_pred = self.binary_classifier(frame_feat_binary)
            frame_feat_binary_pred = frame_feat_binary_pred.view(1, -1)
            video_binary_loss = F.cross_entropy(frame_feat_binary_pred,
                                                binary_label)
            video_binary_loss = 10 * video_binary_loss
        else:
            video_binary_loss = torch.tensor([0]).cuda(device_id)

        # -------------------------------------------------------------------------------------------------------------------
        # concatenate visual obj + obj text + prd text --> weight
        # -------------------------------------------------------------------------------------------------------------------
        if roi is None:
            obj_text_hidden_norm_expn = obj_text_hidden_norm.expand(
                obj_hidden_norm.shape[0], obj_text_hidden_norm.shape[1])
            prd_text_hidden_norm_expn = prd_text_hidden_norm.expand(
                obj_hidden_norm.shape[0], prd_text_hidden_norm.shape[1])

            densepose_mask_hidden_norm_expn = densepose_mask_hidden_norm.expand(
                obj_hidden_norm.shape[0], densepose_mask_hidden_norm.shape[1])
            densepose_box_feature_hidden_norm_expn = densepose_box_feature_hidden_norm.expand(
                densepose_box_feature_hidden_norm.shape[0],
                densepose_box_feature_hidden_norm.shape[1])
        else:
            gather_obj_index = torch.Tensor(roi[:, 0:1]).long().repeat(
                1, obj_text_hidden_norm.shape[1]).cuda(device_id)
            gather_prd_index = torch.Tensor(roi[:, 0:1]).long().repeat(
                1, prd_text_hidden_norm.shape[1]).cuda(device_id)
            obj_text_hidden_norm_expn = torch.gather(obj_text_hidden_norm, 0,
                                                     gather_obj_index)
            prd_text_hidden_norm_expn = torch.gather(prd_text_hidden_norm, 0,
                                                     gather_prd_index)

            gather_obj_index = torch.Tensor(
                densepose_roi[:, 0:1]).long().repeat(
                    1, obj_text_hidden_norm.shape[1]).cuda(device_id)
            gather_prd_index = torch.Tensor(
                densepose_roi[:, 0:1]).long().repeat(
                    1, prd_text_hidden_norm.shape[1]).cuda(device_id)
            densepose_obj_text_hidden_norm_expn = torch.gather(
                obj_text_hidden_norm, 0, gather_obj_index)
            densepose_prd_text_hidden_norm_expn = torch.gather(
                prd_text_hidden_norm, 0, gather_prd_index)

        if cfg.HUMAN_OBJ_SPATIAL:
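            # fuse object and human (DensePose mask) features element-wise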
            obj_hidden_norm = torch.max(obj_hidden_norm,
                                        densepose_mask_hidden_norm)
            obj_hidden = torch.max(obj_hidden, densepose_mask_hidden)

        # -------------------------------------------------------------------------------------------------------------------
        # COM_WEIGHT
        # -------------------------------------------------------------------------------------------------------------------
        if cfg.COM_WEIGHT == 'cat_video_soft_attention':
            frame_feat = self.frame_fc(frame_feat)

            query = self.soft_attention_fc2(
                F.relu(self.soft_attention_fc1(frame_feat)))
            key = self.soft_attention_fc4(
                F.relu(self.soft_attention_fc3(frame_feat)))
            sim = query[:, None, :] * key[None, :, :]
            sim = F.softmax(sim.sum(dim=-1), dim=-1)  # (T, T) frame-to-frame
            value = self.soft_attention_fc6(
                F.relu(self.soft_attention_fc5(frame_feat)))
            # Apply the attention weights to the value vectors:
            # frame_feat[i] = sum_j sim[i, j] * value[j]. (Broadcasting
            # value[:, None, :] and summing over dim 1 would cancel against
            # the softmax normalization; mm weights value[j] as intended.)
            frame_feat = torch.mm(sim, value)
            frame_feat_norm = F.normalize(frame_feat, p=2, dim=1)
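            # Frame-level self-attention: a_ij = softmax_j(q_i . k_j) and
            # frame_feat[i] = sum_j a_ij * v_j, so each frame descriptor mixes
            # in temporal context from every frame before being gathered back
            # to its ROIs below.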

            gather_index = torch.Tensor(roi[:, 0:1]).long().repeat(
                1, frame_feat_norm.shape[1]).cuda(device_id)
            frame_feat_norm_obj = torch.gather(frame_feat_norm, 0,
                                               gather_index)
            gather_index = torch.Tensor(densepose_roi[:, 0:1]).long().repeat(
                1, frame_feat_norm.shape[1]).cuda(device_id)
            frame_feat_norm_human = torch.gather(frame_feat_norm, 0,
                                                 gather_index)

            concated_vecs = torch.cat(
                (obj_hidden_norm, obj_text_hidden_norm_expn,
                 prd_text_hidden_norm_expn, frame_feat_norm_obj),
                dim=1)
            densepose_concated_vecs = torch.cat(
                (densepose_box_feature_hidden_norm,
                 densepose_obj_text_hidden_norm_expn,
                 densepose_prd_text_hidden_norm_expn, frame_feat_norm_human),
                dim=1)

            roi_weights = self.roi_weights_net_obj(concated_vecs)
            roi_weights_human = self.roi_weights_net_human(
                densepose_concated_vecs)

        # -------------------------------------------------------------------------------------------------------------------
        # aggregate ROI features with the learned weights
        # -------------------------------------------------------------------------------------------------------------------
        if roi is None:
            roi_weights = F.softmax(roi_weights, dim=0)
            roi_weights = roi_weights.view(1, -1)
            obj_hidden_weighted = torch.mm(roi_weights, obj_hidden)

            roi_weights_human = F.softmax(roi_weights_human, dim=0)
            roi_weights_human = roi_weights_human.view(1, -1)
            densepose_box_feature_hidden_weighted = torch.mm(
                roi_weights_human, densepose_box_feature_hidden)
        else:
            roi_weights_unpacked = roi_weights.view(
                -1, cfg.TRAIN.BATCH_SIZE_PER_IM, 1)
            roi_weights_human_unpacked = roi_weights_human.view(
                -1, cfg.MAX_NUM_HUMAN, 1)

            roi_weights_ori = roi_weights.view(-1, cfg.TRAIN.BATCH_SIZE_PER_IM)
            roi_weights_human_ori = roi_weights_human.view(
                -1, cfg.MAX_NUM_HUMAN)

            if not no_dropout:
                roi_weights_unpacked = self.dropout(roi_weights_unpacked)
                roi_weights_human_unpacked = self.dropout(
                    roi_weights_human_unpacked)

            roi_weights_unpacked = F.softmax(roi_weights_unpacked, dim=1)
            roi_weights_human_unpacked = F.softmax(roi_weights_human_unpacked,
                                                   dim=1)

            ## ---------------------------------------------------------------------------------------------------------
            ## weighted feature aggregation
            ## ---------------------------------------------------------------------------------------------------------
            obj_hidden_unpacked = obj_hidden.view(-1,
                                                  cfg.TRAIN.BATCH_SIZE_PER_IM,
                                                  obj_hidden.size(1))
            densepose_box_feature_hidden_unpacked = densepose_box_feature_hidden.view(
                -1, cfg.MAX_NUM_HUMAN, densepose_box_feature_hidden.size(1))

            obj_hidden_weighted = torch.sum(roi_weights_unpacked *
                                            obj_hidden_unpacked,
                                            dim=1)
            obj_hidden_human_weighted = torch.sum(
                roi_weights_human_unpacked *
                densepose_box_feature_hidden_unpacked,
                dim=1)
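            # attention-weighted pooling: each frame's ROI (and human ROI)
            # features collapse to a single vector via the learned weights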

            if cfg.VIDEO_LOSS == 'contrastive_max_plus':
                obj_feat = obj_hidden
                obj_hidden_video_unpacked = obj_feat.view(
                    -1, cfg.VIDEO_FRAME, cfg.TRAIN.BATCH_SIZE_PER_IM,
                    obj_feat.shape[1])
                # obj_hidden_video_unpacked = F.normalize(obj_hidden_video_unpacked, p=2, dim=3) * 4
                roi_weights_unpacked_batch = roi_weights_unpacked.view(
                    -1, cfg.VIDEO_FRAME, cfg.TRAIN.BATCH_SIZE_PER_IM, 1)
                # idx = torch.max(roi_weights_unpacked_batch, dim=2)[1][:, :, None, :]
                sort_idx = torch.sort(roi_weights_unpacked_batch, dim=2)[1]
                idx_select = sort_idx[:, :, -1:].repeat(
                    1, 1, 1, obj_feat.shape[1])
                anchor_embed = torch.gather(obj_hidden_video_unpacked, 2,
                                            idx_select)
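                # anchor_embed keeps, for every frame, the feature of its
                # highest-weighted ROI (last index after the ascending sort)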

                # Randomly sample a positive pair of frames for positive samples
                permute = torch.randperm(cfg.VIDEO_FRAME).cuda(device_id)
                pos_embed = anchor_embed[:, permute]

                permute = torch.randperm(
                    cfg.TRAIN.BATCH_SIZE_PER_IM).cuda(device_id)
                obj_hidden_video_permute = obj_hidden_video_unpacked[:, :,
                                                                     permute]
                neg_sample = 15

                # slice the shuffled ROI axis so the negatives are a random
                # subset of boxes rather than always the first neg_sample
                neg_embed = obj_hidden_video_permute[:, :, :neg_sample]

                pos_dot = (pos_embed * anchor_embed).sum(dim=3)
                neg_dot = (neg_embed * anchor_embed).sum(dim=3)

                # InfoNCE: -pos + logsumexp([pos, negs])
                logits = torch.cat([pos_dot, neg_dot], dim=-1)
                video_loss = -pos_dot.view(
                    -1, cfg.VIDEO_FRAME) + torch.logsumexp(logits, dim=-1)

                select_frames = max(int(cfg.VIDEO_FRAME * 0.7), 1)
                # keep the 70% lowest-loss frames per video
                video_loss, _ = torch.sort(video_loss, dim=1)
                video_loss = cfg.VIDEO_WEIGHT * video_loss[:, :select_frames].mean()
            else:
                video_loss = torch.zeros(1)[0].cuda(device_id)

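        # Contrastive object/predicate matching: pull the attention-pooled
        # visual features toward the ground-truth word embeddings and push
        # them away from randomly sampled class embeddings.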
        if cfg.OBJ_LOSS == 'contrastive_objloss':
            hidden_weighted_obj = obj_hidden_weighted
            hidden_weighted_prd = obj_hidden_human_weighted

            nsample = 15
            word_embed_contrast_obj = self.word_embed_contrast_obj(
                obj_text_vecs)
            word_embed_contrast_prd = self.word_embed_contrast_prd(
                prd_text_vecs)

            ## neg obj
            n_obj = self.obj_vecs.shape[0]
            neg_sample = np.random.choice(np.arange(n_obj, dtype=np.int32),
                                          size=(obj_text_vecs.shape[0] *
                                                nsample, ))
            neg_embed_obj = self.obj_vecs[neg_sample]
            neg_embed_obj = neg_embed_obj.reshape(
                (int(obj_text_vecs.shape[0]), nsample, 300))
            neg_embed_obj = Variable(
                torch.from_numpy(
                    neg_embed_obj.astype('float32'))).cuda(device_id)

            ## neg prd
            n_prd = self.prd_vecs.shape[0]
            neg_sample = np.random.choice(np.arange(n_prd, dtype=np.int32),
                                          size=(prd_text_vecs.shape[0] *
                                                nsample, ))
            neg_embed_prd = self.prd_vecs[neg_sample]
            neg_embed_prd = neg_embed_prd.reshape(
                (int(prd_text_vecs.shape[0]), nsample, 300))
            neg_embed_prd = Variable(
                torch.from_numpy(
                    neg_embed_prd.astype('float32'))).cuda(device_id)
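            # negatives are word vectors of nsample randomly drawn classes;
            # the draw can occasionally include the positive class, a common
            # approximation in sampled contrastive objectives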

            ## embed neg obj and prd
            neg_embed_contrast_obj = self.word_embed_contrast_obj(
                neg_embed_obj)
            neg_embed_contrast_prd = self.word_embed_contrast_prd(
                neg_embed_prd)

            # same InfoNCE form as the video loss: -pos + logsumexp([pos, negs])
            pos_dot = (hidden_weighted_obj[:, None, :] *
                       word_embed_contrast_obj[:, None, :]).sum(dim=2)
            neg_dot = (hidden_weighted_obj[:, None, :] *
                       neg_embed_contrast_obj).sum(dim=2)
            logits = torch.cat([pos_dot, neg_dot], dim=-1)
            obj_loss = -pos_dot.view(-1, cfg.VIDEO_FRAME) + torch.logsumexp(
                logits, dim=-1).view(-1, cfg.VIDEO_FRAME)

            pos_dot = (hidden_weighted_prd[:, None, :] *
                       word_embed_contrast_prd[:, None, :]).sum(dim=2)
            neg_dot = (hidden_weighted_prd[:, None, :] *
                       neg_embed_contrast_prd).sum(dim=2)
            logits = torch.cat([pos_dot, neg_dot], dim=-1)
            prd_loss = -pos_dot.view(-1, cfg.VIDEO_FRAME) + torch.logsumexp(
                logits, dim=-1).view(-1, cfg.VIDEO_FRAME)

            # select_frames = max(int(cfg.VIDEO_FRAME * 0.5), 1)
            select_frames = cfg.VIDEO_FRAME

            obj_loss, _ = torch.sort(obj_loss, dim=-1)
            obj_loss = obj_loss[:, :select_frames]
            # obj_loss = torch.clamp(obj_loss, 0, 1e5)
            obj_loss = obj_loss.mean()
            obj_scores = None

            prd_loss, _ = torch.sort(prd_loss, dim=-1)
            prd_loss = prd_loss[:, :select_frames]
            prd_loss = prd_loss.mean()
        else:
            # zero defaults when the contrastive object loss is disabled, so
            # the return below never sees undefined names
            obj_loss = torch.zeros(1)[0].cuda(device_id)
            prd_loss = torch.zeros(1)[0].cuda(device_id)

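        # For softmax-normalized weights w (sum_i w_i = 1), ||w||_2 is largest
        # when w is one-hot, so minimizing -log(mean ||w||_2) rewards peaked
        # ROI attention.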
        if cfg.WEIGHT_REG == 'L2':
            weight_loss = torch.norm(roi_weights_unpacked, 2.0, 1)
            weight_loss = -cfg.L2_WEIGHT * torch.log(weight_loss.mean())

            weight_human_loss = torch.norm(roi_weights_human_unpacked, 2.0, 1)
            weight_human_loss = -cfg.L2_WEIGHT * torch.log(
                weight_human_loss.mean())
        else:
            # zero defaults when no weight regularizer is configured
            weight_loss = torch.zeros(1)[0].cuda(device_id)
            weight_human_loss = torch.zeros(1)[0].cuda(device_id)

        cls_prediction = {}
        if not self.training and cfg.BINARY_LOSS:
            cls_prediction['binary_pred'] = frame_feat_binary_pred

        if cfg.BINARY_LOSS:
            # scale the matching losses by the predicted match probability;
            # dim is passed explicitly (F.softmax without dim is deprecated)
            loss_scale = F.softmax(frame_feat_binary_pred, dim=1)[0][1]
            obj_loss = obj_loss * loss_scale
            prd_loss = prd_loss * loss_scale

        return (obj_loss, prd_loss, weight_loss, weight_human_loss, video_loss,
                video_binary_loss, roi_weights_unpacked,
                roi_weights_human_unpacked, densepose_roi, roi_weights_ori,
                roi_weights_human_ori, cls_prediction)
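
The video, object, and predicate losses above are all instances of the same InfoNCE objective: -log(exp(pos) / (exp(pos) + sum_k exp(neg_k))) = -pos + logsumexp([pos, negs]). Below is a minimal self-contained sketch of that pattern; the function name and shapes are illustrative, not part of the original code:

import torch

def info_nce(anchor, positive, negatives):
    """InfoNCE over dot-product similarities.

    anchor, positive: (N, D); negatives: (N, K, D).
    """
    pos_logit = (anchor * positive).sum(dim=-1, keepdim=True)   # (N, 1)
    neg_logits = (anchor.unsqueeze(1) * negatives).sum(dim=-1)  # (N, K)
    logits = torch.cat([pos_logit, neg_logits], dim=-1)         # (N, 1 + K)
    # -pos + logsumexp([pos, negs]); equivalent to cross-entropy with the
    # positive sitting at class index 0
    return (torch.logsumexp(logits, dim=-1) - pos_logit.squeeze(-1)).mean()

anchor = torch.randn(8, 300)
positive = torch.randn(8, 300)
negatives = torch.randn(8, 15, 300)
loss = info_nce(anchor, positive, negatives)  # scalar tensor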