def build_detection_targets(rpn_rois, gt_class_ids, gt_boxes, gt_masks):
    """Generate targets for training Stage 2 classifier and mask heads.

    This is not used in normal training. It's useful for debugging or to
    train the Mask RCNN heads without using the RPN head. Batch size, class
    count, mask geometry and normalisation constants are read from
    ``hyper_parameters.FLAGS`` (TRAIN_ROIS_PER_IMAGE, ROI_POSITIVE_RATIO,
    NUM_CLASSES, BBOX_STD_DEV, MASK_SHAPE, USE_MINI_MASK, IMAGE_SHAPE).

    Inputs:
    rpn_rois: [N, (y1, x1, y2, x2)] proposal boxes. Must be non-empty.
    gt_class_ids: [instance count] Integer class IDs (dtype int32).
    gt_boxes: [instance count, (y1, x1, y2, x2)] (dtype int32).
    gt_masks: [height, width, instance count] Ground truth masks
              (dtype bool). Can be full size or mini-masks.

    Returns:
    rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)]
    class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
    bboxes: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (y, x, log(h), log(w))].
            Class-specific bbox refinements.
    masks: [TRAIN_ROIS_PER_IMAGE, height, width, NUM_CLASSES]. Class
           specific masks cropped to bbox boundaries and resized to neural
           network output size.

    Raises:
    AssertionError: if there are no ROIs, an input has an unexpected dtype,
        no instance has a positive class ID, or the final ROI count does not
        equal TRAIN_ROIS_PER_IMAGE.
    """
    assert rpn_rois.shape[0] > 0
    assert gt_class_ids.dtype == np.int32, "Expected int but got {}".format(
        gt_class_ids.dtype)
    assert gt_boxes.dtype == np.int32, "Expected int but got {}".format(
        gt_boxes.dtype)
    assert gt_masks.dtype == np.bool_, "Expected bool but got {}".format(
        gt_masks.dtype)

    # Keep only the instances that carry a positive class ID.
    instance_ids = np.where(gt_class_ids > 0)[0]
    assert instance_ids.shape[0] > 0, "Image must contain instances."
    gt_class_ids = gt_class_ids[instance_ids]
    gt_boxes = gt_boxes[instance_ids]
    gt_masks = gt_masks[:, :, instance_ids]

    flags = hyper_parameters.FLAGS
    rois_per_image = flags.TRAIN_ROIS_PER_IMAGE

    # Areas of ROIs and ground truth boxes.
    rpn_rois_area = (rpn_rois[:, 2] - rpn_rois[:, 0]) * \
        (rpn_rois[:, 3] - rpn_rois[:, 1])
    gt_boxes_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) * \
        (gt_boxes[:, 3] - gt_boxes[:, 1])

    # Overlap matrix [rpn_rois, gt_boxes], filled one GT box (column) at a time.
    overlaps = np.zeros([rpn_rois.shape[0], gt_boxes.shape[0]])
    for i in range(gt_boxes.shape[0]):
        overlaps[:, i] = utils.compute_iou(
            gt_boxes[i], rpn_rois, gt_boxes_area[i], rpn_rois_area)

    # Assign every ROI to its best-overlapping GT box.
    rpn_rois_iou_argmax = np.argmax(overlaps, axis=1)
    rpn_rois_iou_max = overlaps[np.arange(overlaps.shape[0]),
                                rpn_rois_iou_argmax]
    rpn_roi_gt_boxes = gt_boxes[rpn_rois_iou_argmax]
    rpn_roi_gt_class_ids = gt_class_ids[rpn_rois_iou_argmax]

    # Positive ROIs have >= 0.5 overlap with a GT box, the rest are negative.
    # (A strict ">" here would silently drop ROIs whose overlap is exactly
    # 0.5 from both pools.)
    fg_ids = np.where(rpn_rois_iou_max >= 0.5)[0]
    bg_ids = np.where(rpn_rois_iou_max < 0.5)[0]

    # Subsample foreground ROIs down to the configured share of the batch.
    fg_count = int(flags.ROI_POSITIVE_RATIO * rois_per_image)
    if fg_ids.shape[0] > fg_count:
        keep_fg_ids = np.random.choice(fg_ids, fg_count, replace=False)
    else:
        keep_fg_ids = fg_ids

    # Fill what is left with background ROIs.
    remaining = rois_per_image - keep_fg_ids.shape[0]
    if bg_ids.shape[0] > remaining:
        keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
    else:
        keep_bg_ids = bg_ids

    keep = np.concatenate([keep_fg_ids, keep_bg_ids])

    # Still short? Pad the batch with repeated background ROIs.
    # NOTE: np.random.choice raises ValueError here if no background ROI
    # was kept, since there is nothing to repeat.
    remaining = rois_per_image - keep.shape[0]
    if remaining > 0:
        keep_extra_ids = np.random.choice(keep_bg_ids, remaining, replace=True)
        keep = np.concatenate([keep, keep_extra_ids])
    assert keep.shape[0] == rois_per_image, \
        "keep doesn't match ROI batch size {}, {}".format(
            keep.shape[0], rois_per_image)

    # Background ROIs get no GT box and class 0.
    rpn_roi_gt_boxes[keep_bg_ids, :] = 0
    rpn_roi_gt_class_ids[keep_bg_ids] = 0

    # Gather the per-ROI assignments for the kept ROIs.
    rois = rpn_rois[keep]
    roi_gt_boxes = rpn_roi_gt_boxes[keep]
    roi_gt_class_ids = rpn_roi_gt_class_ids[keep]
    roi_gt_assignment = rpn_rois_iou_argmax[keep]

    # Class-aware bbox deltas; only the slot of the assigned class is filled.
    bboxes = np.zeros([rois_per_image, flags.NUM_CLASSES, 4], dtype=np.float32)
    pos_ids = np.where(roi_gt_class_ids > 0)[0]
    # Slice the four coordinate columns. "[pos_ids][:4]" would instead take
    # the first four *rows* and break as soon as there are more positives.
    bboxes[pos_ids, roi_gt_class_ids[pos_ids]] = utils.box_refinement(
        rois[pos_ids], roi_gt_boxes[pos_ids, :4])
    # Normalize bbox refinements.
    bboxes /= flags.BBOX_STD_DEV

    # Class-specific target masks.
    masks = np.zeros((rois_per_image,
                      flags.MASK_SHAPE[0],
                      flags.MASK_SHAPE[1],
                      flags.NUM_CLASSES), dtype=np.float32)
    for i in pos_ids:
        # i is a NumPy integer (from np.where), not a Python int.
        class_id = roi_gt_class_ids[i]
        assert class_id > 0, "class id must be greater than 0"
        gt_id = roi_gt_assignment[i]
        class_mask = gt_masks[:, :, gt_id]

        if flags.USE_MINI_MASK:
            # Create a mask placeholder, the size of the image.
            placeholder = np.zeros(flags.IMAGE_SHAPE[:2], dtype=bool)
            # GT box
            gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[gt_id]
            gt_w = gt_x2 - gt_x1
            gt_h = gt_y2 - gt_y1
            # Resize the mini mask to the size of the GT box and paste it
            # at the box location.
            placeholder[gt_y1:gt_y2, gt_x1:gt_x2] = \
                np.round(utils.resize(class_mask, (gt_h, gt_w))).astype(bool)
            class_mask = placeholder

        # Crop the ROI out of the mask and resize it to the target size.
        y1, x1, y2, x2 = rois[i].astype(np.int32)
        m = class_mask[y1:y2, x1:x2]
        mask = utils.resize(m, flags.MASK_SHAPE)
        masks[i, :, :, class_id] = mask

    return rois, roi_gt_class_ids, bboxes, masks
def _resize_mask_nearest(mask, output_shape):
    """Resize a 2-D mask to output_shape (height, width), nearest-neighbour."""
    out_h, out_w = int(output_shape[0]), int(output_shape[1])
    in_h, in_w = mask.shape[:2]
    if in_h == 0 or in_w == 0 or out_h <= 0 or out_w <= 0:
        # Nothing to sample from (or nothing to produce): return an empty mask.
        return np.zeros((max(out_h, 0), max(out_w, 0)), dtype=mask.dtype)
    # Sample the source at the centre of every destination pixel.
    rows = ((np.arange(out_h) + 0.5) * in_h / out_h).astype(np.intp)
    cols = ((np.arange(out_w) + 0.5) * in_w / out_w).astype(np.intp)
    rows = np.minimum(rows, in_h - 1)
    cols = np.minimum(cols, in_w - 1)
    return mask[rows[:, np.newaxis], cols]


def build_detection_targets(rpn_rois, gt_class_ids, gt_boxes, gt_masks, config):
    """Generate targets for training Stage 2 classifier and mask heads.

    This is not used in normal training. It's useful for debugging or to
    train the Mask RCNN heads without using the RPN head.

    Inputs:
    rpn_rois: [N, (y1, x1, y2, x2)] proposal boxes. Must be non-empty.
    gt_class_ids: [instance count] Integer class IDs (dtype int32).
    gt_boxes: [instance count, (y1, x1, y2, x2)] (dtype int32).
    gt_masks: [height, width, instance count] Ground truth masks
              (dtype bool). Can be full size or mini-masks.
    config: object providing TRAIN_ROIS_PER_IMAGE, ROI_POSITIVE_RATIO,
            NUM_CLASSES, BBOX_STD_DEV, MASK_SHAPE, USE_MINI_MASK and
            IMAGE_SHAPE.

    Returns:
    rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)]
    class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
    bboxes: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (y, x, log(h), log(w))].
            Class-specific bbox refinements.
    masks: [TRAIN_ROIS_PER_IMAGE, height, width, NUM_CLASSES]. Class
           specific masks cropped to bbox boundaries and resized to neural
           network output size. Values are 0 or 1.

    Raises:
    AssertionError: if there are no ROIs, an input has an unexpected dtype,
        no instance has a positive class ID, or the ROI batch cannot be
        filled to TRAIN_ROIS_PER_IMAGE.
    """
    assert rpn_rois.shape[0] > 0
    assert gt_class_ids.dtype == np.int32, "Expected int but got {}".format(
        gt_class_ids.dtype)
    assert gt_boxes.dtype == np.int32, "Expected int but got {}".format(
        gt_boxes.dtype)
    assert gt_masks.dtype == np.bool_, "Expected bool but got {}".format(
        gt_masks.dtype)

    # It's common to add GT Boxes to ROIs but we don't do that here because
    # according to XinLei Chen's paper, it doesn't help.

    # Trim empty padding in gt_boxes and gt_masks parts: keep only the
    # instances with a positive class ID.
    instance_ids = np.where(gt_class_ids > 0)[0]
    assert instance_ids.shape[0] > 0, "Image must contain instances."
    gt_class_ids = gt_class_ids[instance_ids]
    gt_boxes = gt_boxes[instance_ids]
    gt_masks = gt_masks[:, :, instance_ids]

    # Compute areas of ROIs and ground truth boxes.
    rpn_roi_area = (rpn_rois[:, 2] - rpn_rois[:, 0]) * \
        (rpn_rois[:, 3] - rpn_rois[:, 1])
    gt_box_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) * \
        (gt_boxes[:, 3] - gt_boxes[:, 1])

    # Compute overlaps [rpn_rois, gt_boxes], one GT box (column) at a time.
    overlaps = np.zeros((rpn_rois.shape[0], gt_boxes.shape[0]))
    for i in range(overlaps.shape[1]):
        overlaps[:, i] = utils.compute_iou(
            gt_boxes[i], rpn_rois, gt_box_area[i], rpn_roi_area)

    # Assign ROIs to GT boxes.
    rpn_roi_iou_argmax = np.argmax(overlaps, axis=1)
    rpn_roi_iou_max = overlaps[np.arange(overlaps.shape[0]),
                               rpn_roi_iou_argmax]
    # GT box assigned to each ROI.
    rpn_roi_gt_boxes = gt_boxes[rpn_roi_iou_argmax]
    rpn_roi_gt_class_ids = gt_class_ids[rpn_roi_iou_argmax]

    # Positive ROIs are those with >= 0.5 IoU with a GT box. (A strict ">"
    # would drop ROIs at exactly 0.5 from both the fg and the bg pool.)
    fg_ids = np.where(rpn_roi_iou_max >= 0.5)[0]

    # Negative ROIs are those with max IoU < 0.5.
    # TODO: decide whether to restrict negatives to IoU in [0.1, 0.5)
    # (hard example mining).
    bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]

    # Subsample ROIs. The foreground share is config.ROI_POSITIVE_RATIO.
    # FG
    fg_roi_count = int(config.TRAIN_ROIS_PER_IMAGE * config.ROI_POSITIVE_RATIO)
    if fg_ids.shape[0] > fg_roi_count:
        keep_fg_ids = np.random.choice(fg_ids, fg_roi_count, replace=False)
    else:
        keep_fg_ids = fg_ids
    # BG
    remaining = config.TRAIN_ROIS_PER_IMAGE - keep_fg_ids.shape[0]
    if bg_ids.shape[0] > remaining:
        keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
    else:
        keep_bg_ids = bg_ids
    # Combine indices of ROIs to keep.
    keep = np.concatenate([keep_fg_ids, keep_bg_ids])

    # Need more?
    remaining = config.TRAIN_ROIS_PER_IMAGE - keep.shape[0]
    if remaining > 0:
        # Looks like we don't have enough samples to maintain the desired
        # balance. Reduce requirements and fill in the rest. This is
        # likely different from the Mask RCNN paper.
        if keep.shape[0] == 0:
            # Neither fg nor bg samples were selected. Retry with the plain
            # < 0.5 background pool; this only differs from the pool above
            # if the stricter hard-example-mining threshold is enabled.
            bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
            assert bg_ids.shape[0] >= remaining
            keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
            assert keep_bg_ids.shape[0] == remaining
            keep = np.concatenate([keep, keep_bg_ids])
        else:
            # Fill the rest with repeated bg rois.
            # NOTE: np.random.choice raises ValueError here if no bg ROI
            # was kept, since there is nothing to repeat.
            keep_extra_ids = np.random.choice(
                keep_bg_ids, remaining, replace=True)
            keep = np.concatenate([keep, keep_extra_ids])
    assert keep.shape[0] == config.TRAIN_ROIS_PER_IMAGE, \
        "keep doesn't match ROI batch size {}, {}".format(
            keep.shape[0], config.TRAIN_ROIS_PER_IMAGE)

    # Reset the gt boxes assigned to BG ROIs.
    rpn_roi_gt_boxes[keep_bg_ids, :] = 0
    rpn_roi_gt_class_ids[keep_bg_ids] = 0

    # For each kept ROI, assign a class_id, and for FG ROIs also add bbox
    # refinement.
    rois = rpn_rois[keep]
    roi_gt_boxes = rpn_roi_gt_boxes[keep]
    roi_gt_class_ids = rpn_roi_gt_class_ids[keep]
    roi_gt_assignment = rpn_roi_iou_argmax[keep]

    # Class-aware bbox deltas. [y, x, log(h), log(w)]
    bboxes = np.zeros((config.TRAIN_ROIS_PER_IMAGE,
                       config.NUM_CLASSES, 4), dtype=np.float32)
    pos_ids = np.where(roi_gt_class_ids > 0)[0]
    bboxes[pos_ids, roi_gt_class_ids[pos_ids]] = utils.box_refinement(
        rois[pos_ids], roi_gt_boxes[pos_ids, :4])
    # Normalize bbox refinements.
    bboxes /= config.BBOX_STD_DEV

    # Generate class-specific target masks.
    masks = np.zeros((config.TRAIN_ROIS_PER_IMAGE,
                      config.MASK_SHAPE[0],
                      config.MASK_SHAPE[1],
                      config.NUM_CLASSES), dtype=np.float32)
    for i in pos_ids:
        class_id = roi_gt_class_ids[i]
        assert class_id > 0, "class id must be greater than 0"
        gt_id = roi_gt_assignment[i]
        class_mask = gt_masks[:, :, gt_id]

        if config.USE_MINI_MASK:
            # Create a mask placeholder, the size of the image.
            placeholder = np.zeros(config.IMAGE_SHAPE[:2], dtype=bool)
            # GT box
            gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[gt_id]
            gt_w = gt_x2 - gt_x1
            gt_h = gt_y2 - gt_y1
            # Resize the mini mask to the size of the GT box and paste it
            # at the box location. Nearest-neighbour sampling keeps the
            # mask boolean; scipy.misc.imresize is not used because it was
            # removed from SciPy and rescaled constant masks to all zeros.
            placeholder[gt_y1:gt_y2, gt_x1:gt_x2] = _resize_mask_nearest(
                class_mask, (gt_h, gt_w)).astype(bool)
            class_mask = placeholder

        # Pick part of the mask and resize it. An ROI that collapses to
        # zero height or width after the integer cast yields an empty mask.
        y1, x1, y2, x2 = rois[i].astype(np.int32)
        m = class_mask[y1:y2, x1:x2]
        mask = _resize_mask_nearest(m, config.MASK_SHAPE)
        masks[i, :, :, class_id] = mask

    return rois, roi_gt_class_ids, bboxes, masks
def detection_target_layer(proposals, gt_class_ids, gt_boxes, gt_masks, config):
    """Subsample proposals and generate target box refinement, class IDs
    and masks for each.

    Only a batch size of 1 is supported: the leading batch dimension is
    squeezed away from every input and the outputs carry no batch dimension.

    Inputs:
    proposals: [batch, N, (y1, x1, y2, x2)] in normalized coordinates. Might
               be zero padded if there are not enough proposals.
    gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs. Negative IDs
                  mark crowd boxes.
    gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized
              coordinates.
    gt_masks: ground truth masks of boolean type.
              NOTE(review): after the batch dimension is removed the code
              indexes the FIRST dimension by instance, i.e. it appears to
              expect [batch, MAX_GT_INSTANCES, height, width] - confirm
              against the caller.
    config: object providing GPU_COUNT, TRAIN_ROIS_PER_IMAGE,
            ROI_POSITIVE_RATIO, BBOX_STD_DEV, USE_MINI_MASK and MASK_SHAPE.

    Returns: Target ROIs and corresponding class IDs, bounding box shifts,
    and masks. R is the number of sampled positive plus negative ROIs; the
    outputs are not padded to TRAIN_ROIS_PER_IMAGE, and all four are empty
    tensors when no positive ROI exists.
    rois: [R, (y1, x1, y2, x2)] in normalized coordinates.
    target_class_ids: [R]. Integer class IDs, 0 for negative ROIs.
    target_deltas: [R, 4] bbox refinements divided by BBOX_STD_DEV,
                   zeros for negative ROIs.
    target_mask: [R, MASK_SHAPE[0], MASK_SHAPE[1]]. Masks cropped to bbox
                 boundaries and resized to neural network output size,
                 zeros for negative ROIs.
    """

    def _zeros(*shape):
        # Constant float zeros, moved to the GPU when one is configured.
        z = Variable(torch.zeros(*shape), requires_grad=False)
        return z.cuda() if config.GPU_COUNT else z

    # Currently only supports batchsize 1
    proposals = proposals.squeeze(0)
    gt_class_ids = gt_class_ids.squeeze(0)
    gt_boxes = gt_boxes.squeeze(0)
    gt_masks = gt_masks.squeeze(0)

    # Handle COCO crowds
    # A crowd box in COCO is a bounding box around several instances. Exclude
    # them from training. A crowd box is given a negative class ID.
    # Emptiness is tested with numel(): the truthiness of .size() depends on
    # whether an empty nonzero() result is 0-d or has shape (0, 1).
    crowd_ix = torch.nonzero(gt_class_ids < 0)
    if crowd_ix.numel() > 0:
        crowd_ix = crowd_ix[:, 0]
        non_crowd_ix = torch.nonzero(gt_class_ids > 0)[:, 0]
        crowd_boxes = gt_boxes[crowd_ix.data, :]
        gt_class_ids = gt_class_ids[non_crowd_ix.data]
        gt_boxes = gt_boxes[non_crowd_ix.data, :]
        gt_masks = gt_masks[non_crowd_ix.data, :]

        # Compute overlaps with crowd boxes [proposals, crowds]
        crowd_overlaps = bbox_overlaps(proposals, crowd_boxes)
        crowd_iou_max = torch.max(crowd_overlaps, dim=1)[0]
        no_crowd_bool = crowd_iou_max < 0.001
    else:
        # No crowd boxes: every proposal is eligible as a negative.
        no_crowd_bool = Variable(
            torch.ByteTensor(proposals.size()[0] * [True]),
            requires_grad=False)
        if config.GPU_COUNT:
            no_crowd_bool = no_crowd_bool.cuda()

    # Compute overlaps matrix [proposals, gt_boxes]
    overlaps = bbox_overlaps(proposals, gt_boxes)

    # Determine positive and negative ROIs
    roi_iou_max = torch.max(overlaps, dim=1)[0]

    # 1. Positive ROIs are those with >= 0.5 IoU with a GT box
    positive_roi_bool = roi_iou_max >= 0.5

    # Subsample ROIs. The positive share is config.ROI_POSITIVE_RATIO.
    # Positive ROIs
    positive_indices = torch.nonzero(positive_roi_bool)
    if positive_indices.numel() > 0:
        positive_indices = positive_indices[:, 0]

        positive_count = int(config.TRAIN_ROIS_PER_IMAGE *
                             config.ROI_POSITIVE_RATIO)
        rand_idx = torch.randperm(positive_indices.size()[0])
        rand_idx = rand_idx[:positive_count]
        if config.GPU_COUNT:
            rand_idx = rand_idx.cuda()
        positive_indices = positive_indices[rand_idx]
        positive_count = positive_indices.size()[0]
        positive_rois = proposals[positive_indices.data, :]

        # Assign positive ROIs to GT boxes.
        positive_overlaps = overlaps[positive_indices.data, :]
        roi_gt_box_assignment = torch.max(positive_overlaps, dim=1)[1]
        roi_gt_boxes = gt_boxes[roi_gt_box_assignment.data, :]
        roi_gt_class_ids = gt_class_ids[roi_gt_box_assignment.data]

        # Compute bbox refinement for positive ROIs
        deltas = Variable(
            utils.box_refinement(positive_rois.data, roi_gt_boxes.data),
            requires_grad=False)
        std_dev = Variable(torch.from_numpy(config.BBOX_STD_DEV).float(),
                           requires_grad=False)
        if config.GPU_COUNT:
            std_dev = std_dev.cuda()
        deltas /= std_dev

        # Assign positive ROIs to GT masks
        roi_masks = gt_masks[roi_gt_box_assignment.data, :, :]

        # Compute mask targets
        boxes = positive_rois
        if config.USE_MINI_MASK:
            # Transform ROI coordinates from normalized image space
            # to normalized mini-mask space.
            y1, x1, y2, x2 = positive_rois.chunk(4, dim=1)
            gt_y1, gt_x1, gt_y2, gt_x2 = roi_gt_boxes.chunk(4, dim=1)
            gt_h = gt_y2 - gt_y1
            gt_w = gt_x2 - gt_x1
            y1 = (y1 - gt_y1) / gt_h
            x1 = (x1 - gt_x1) / gt_w
            y2 = (y2 - gt_y1) / gt_h
            x2 = (x2 - gt_x1) / gt_w
            boxes = torch.cat([y1, x1, y2, x2], dim=1)
        # One crop per selected mask: crop k is taken from roi_masks[k].
        box_ids = Variable(torch.arange(roi_masks.size()[0]),
                           requires_grad=False).int()
        if config.GPU_COUNT:
            box_ids = box_ids.cuda()
        masks = Variable(
            CropAndResizeFunction(config.MASK_SHAPE[0],
                                  config.MASK_SHAPE[1],
                                  0)(roi_masks.unsqueeze(1), boxes,
                                     box_ids).data,
            requires_grad=False)
        masks = masks.squeeze(1)

        # Threshold mask pixels at 0.5 to have GT masks be 0 or 1 to use with
        # binary cross entropy loss.
        masks = torch.round(masks)
    else:
        positive_count = 0

    # 2. Negative ROIs are those with < 0.5 with every GT box. Skip crowds.
    negative_roi_bool = roi_iou_max < 0.5
    negative_roi_bool = negative_roi_bool & no_crowd_bool
    # Negative ROIs. Add enough to maintain positive:negative ratio.
    # Negatives are only sampled when at least one positive ROI exists.
    negative_indices = torch.nonzero(negative_roi_bool)
    if negative_indices.numel() > 0 and positive_count > 0:
        negative_indices = negative_indices[:, 0]
        r = 1.0 / config.ROI_POSITIVE_RATIO
        negative_count = int(r * positive_count - positive_count)
        rand_idx = torch.randperm(negative_indices.size()[0])
        rand_idx = rand_idx[:negative_count]
        if config.GPU_COUNT:
            rand_idx = rand_idx.cuda()
        negative_indices = negative_indices[rand_idx]
        negative_count = negative_indices.size()[0]
        negative_rois = proposals[negative_indices.data, :]
    else:
        negative_count = 0

    # Append negative ROIs and pad bbox deltas and masks that
    # are not used for negative ROIs with zeros.
    if positive_count > 0 and negative_count > 0:
        rois = torch.cat((positive_rois, negative_rois), dim=0)
        roi_gt_class_ids = torch.cat(
            [roi_gt_class_ids, _zeros(negative_count).int()], dim=0)
        deltas = torch.cat([deltas, _zeros(negative_count, 4)], dim=0)
        masks = torch.cat(
            [masks, _zeros(negative_count,
                           config.MASK_SHAPE[0],
                           config.MASK_SHAPE[1])], dim=0)
    elif positive_count > 0:
        rois = positive_rois
    elif negative_count > 0:
        # Not reachable while negatives require positive_count > 0; kept so
        # the function stays correct if that requirement is relaxed.
        # Class IDs are integers and deltas are floats, matching the mixed
        # positive/negative branch above.
        rois = negative_rois
        roi_gt_class_ids = _zeros(negative_count).int()
        deltas = _zeros(negative_count, 4)
        masks = _zeros(negative_count,
                       config.MASK_SHAPE[0],
                       config.MASK_SHAPE[1])
    else:
        # Nothing to train on: return empty tensors.
        rois = Variable(torch.FloatTensor(), requires_grad=False)
        roi_gt_class_ids = Variable(torch.IntTensor(), requires_grad=False)
        deltas = Variable(torch.FloatTensor(), requires_grad=False)
        masks = Variable(torch.FloatTensor(), requires_grad=False)
        if config.GPU_COUNT:
            rois = rois.cuda()
            roi_gt_class_ids = roi_gt_class_ids.cuda()
            deltas = deltas.cuda()
            masks = masks.cuda()

    return rois, roi_gt_class_ids, deltas, masks