def get_location_info(human_boxes, object_boxes, union_boxes):
    """Assemble pairwise location features for human, object and union boxes.

    All three inputs must have exactly 4 columns. The function calls
    ``box_utils.bbox_transform_inv`` on the (human, object), (human, union)
    and (object, union) pairs, in that order, and joins the three results
    along axis 1.
    """
    assert (human_boxes.shape[1]
            == object_boxes.shape[1]
            == union_boxes.shape[1]
            == 4)

    box_pairs = (
        (human_boxes, object_boxes),
        (human_boxes, union_boxes),
        (object_boxes, union_boxes),
    )
    location_parts = tuple(
        box_utils.bbox_transform_inv(src, dst) for src, dst in box_pairs)
    return np.concatenate(location_parts, axis=1)
def test_bbox_dataset_to_prediction_roundtrip(self):
    """Round-trip a dataset box through delta encoding and decoding.

    A ground-truth box in the dataset (x1, y1, w, h) layout is converted to
    the internal (x1, y1, x2, y2) layout, encoded as deltas relative to
    nearby proposals, decoded back into predicted boxes, converted to
    (x1, y1, w, h) again and compared with the ground truth through the
    COCO API. Every prediction is expected to reach an IoU of 1.
    """
    reg_weights = (5, 5, 10, 10)

    # Ground truth as it would be read from a dataset: (x1, y1, w, h).
    gt_xywh = [10, 20, 100, 150]
    # The same box in the internal (x1, y1, x2, y2) layout.
    gt_xyxy = box_utils.xywh_to_xyxy(gt_xywh)

    # Proposal boxes generated around the ground-truth box.
    proposals_xyxy = random_boxes(gt_xyxy, 10, 10)

    # Encode: proposal-to-gt transformation deltas.
    encoded = box_utils.bbox_transform_inv(
        proposals_xyxy, np.array([gt_xyxy]), weights=reg_weights)

    # Decode: apply the deltas to the proposals to get predicted boxes.
    decoded_xyxy = box_utils.bbox_transform(
        proposals_xyxy, encoded, weights=reg_weights)
    decoded_xywh = box_utils.xyxy_to_xywh(decoded_xyxy)

    # IoU through the COCO API; none of the boxes is flagged as crowd.
    crowd_flags = [0] * decoded_xywh.shape[0]
    overlaps = COCOmask.iou(decoded_xywh, np.array([gt_xywh]), crowd_flags)
    np.testing.assert_array_almost_equal(overlaps, np.ones(overlaps.shape))
def _compute_targets(entry):
    """Compute bounding-box regression targets for one image entry.

    Returns a float32 array with one row per box in ``entry['boxes']``,
    laid out as (class, tx, ty, tw, th). Rows remain zero for boxes whose
    ``max_overlaps`` is below ``cfg.TRAIN.BBOX_THRESH``; the whole array is
    zero when the entry has no non-crowd ground-truth box.
    """
    boxes = entry['boxes']
    max_overlaps = entry['max_overlaps']
    max_classes = entry['max_classes']

    # Targets has format (class, tx, ty, tw, th)
    targets = np.zeros((boxes.shape[0], 5), dtype=np.float32)

    # Ground-truth rows: labelled with a class > 0 and not marked as crowd.
    is_gt = (entry['gt_classes'] > 0) & (entry['is_crowd'] == 0)
    gt_idx = np.where(is_gt)[0]
    if len(gt_idx) == 0:
        # Nothing to regress towards in this image.
        return targets

    # Boxes for which a regression target is produced.
    ex_idx = np.where(max_overlaps >= cfg.TRAIN.BBOX_THRESH)[0]
    example_boxes = boxes[ex_idx, :]

    # IoU between every example box and every ground-truth box.
    iou = box_utils.bbox_overlaps(
        example_boxes.astype(dtype=np.float32, copy=False),
        boxes[gt_idx, :].astype(dtype=np.float32, copy=False))

    # Each example regresses towards the gt box it overlaps the most.
    matched_gt_boxes = boxes[gt_idx[iou.argmax(axis=1)], :]

    # Use class "1" for all boxes if using class_agnostic_bbox_reg
    if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG:
        targets[ex_idx, 0] = 1
    else:
        targets[ex_idx, 0] = max_classes[ex_idx]
    targets[ex_idx, 1:] = box_utils.bbox_transform_inv(
        example_boxes, matched_gt_boxes, cfg.MODEL.BBOX_REG_WEIGHTS)
    return targets
def _compute_targets(entry):
    """Build the (class, tx, ty, tw, th) regression targets of an image.

    The result is a float32 array with as many rows as ``entry['boxes']``.
    Only boxes with ``max_overlaps >= cfg.TRAIN.BBOX_THRESH`` receive
    non-zero rows; if the image has no non-crowd ground truth the array is
    returned all-zero.
    """
    rois = entry['boxes']
    roi_overlaps = entry['max_overlaps']
    roi_labels = entry['max_classes']

    n_rois = rois.shape[0]
    out = np.zeros((n_rois, 5), dtype=np.float32)

    # Indices of ground-truth ROIs (positive class, not crowd).
    gt_rows = np.where(
        (entry['gt_classes'] > 0) & (entry['is_crowd'] == 0))[0]
    if len(gt_rows) == 0:
        # Bail if the image has no ground-truth ROIs
        return out

    # Indices of examples for which we try to make predictions.
    ex_rows = np.where(roi_overlaps >= cfg.TRAIN.BBOX_THRESH)[0]

    # Overlap of each example ROI with each gt ROI; the arg-max along the
    # gt axis selects the regression target of every example.
    pairwise_iou = box_utils.bbox_overlaps(
        rois[ex_rows, :].astype(dtype=np.float32, copy=False),
        rois[gt_rows, :].astype(dtype=np.float32, copy=False))
    best_gt = pairwise_iou.argmax(axis=1)
    target_rois = rois[gt_rows[best_gt], :]
    source_rois = rois[ex_rows, :]

    # Class column: a constant 1 under class-agnostic regression,
    # otherwise the max-overlap class of each example.
    if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG:
        class_column = 1
    else:
        class_column = roi_labels[ex_rows]
    out[ex_rows, 0] = class_column
    out[ex_rows, 1:] = box_utils.bbox_transform_inv(
        source_rois, target_rois, cfg.MODEL.BBOX_REG_WEIGHTS)
    return out
def test_bbox_dataset_to_prediction_roundtrip(self):
    """Verify that dataset -> deltas -> prediction -> dataset is lossless.

    Steps: take a ground-truth box in (x1, y1, w, h) form, convert it to
    (x1, y1, x2, y2), compute deltas from nearby proposals to it, apply the
    deltas back to the proposals, convert the predictions to (x1, y1, w, h)
    and measure their IoU with the ground truth using the COCO API. All
    IoUs should equal 1.
    """
    delta_weights = (5, 5, 10, 10)

    # 1/ box as stored by the dataset, (x1, y1, w, h)
    dataset_box = [10, 20, 100, 150]
    dataset_box_arr = np.array([dataset_box])

    # 2/ internal (x1, y1, x2, y2) representation
    internal_box = box_utils.xywh_to_xyxy(dataset_box)

    # 3/ proposals in the neighbourhood of the ground truth
    nearby = random_boxes(internal_box, 10, 10)

    # 4/ proposal-to-gt deltas, then 5/ proposals + deltas -> predictions
    predictions = box_utils.bbox_transform(
        nearby,
        box_utils.bbox_transform_inv(
            nearby, np.array([internal_box]), weights=delta_weights),
        weights=delta_weights)

    # 6/ predictions back in the dataset layout
    predictions_xywh = box_utils.xyxy_to_xywh(predictions)

    # 7/ IoU via the COCO API, with every box marked as non-crowd
    is_crowd = [int(False) for _ in range(predictions_xywh.shape[0])]
    iou_matrix = COCOmask.iou(predictions_xywh, dataset_box_arr, is_crowd)
    np.testing.assert_array_almost_equal(
        iou_matrix, np.ones(iou_matrix.shape))
def _compute_action_targets(person_rois, gt_boxes, role_ids):
    '''
    Compute action regression targets and their weights.

    :param person_rois: rois assigned to gt acting-human, n * 4
    :param gt_boxes: all gt boxes in one image
    :param role_ids: person_rois_num * action_cls_num * NUM_TARGET_OBJECT_TYPES,
        the gt-box id of the role object of every person roi; entries that
        are not greater than -1 are treated as "no role object"
    :return: tuple (targets, weights), both float32 and reshaped to
        (-1, NUM_ACTION_CLASSES * NUM_TARGET_OBJECT_TYPES * 4). Weights are 1
        where a role object exists and 0 elsewhere.
    '''
    n_person = person_rois.shape[0]
    n_gt = gt_boxes.shape[0]
    assert n_person == role_ids.shape[0]

    # ToDo: should use cfg.MODEL.BBOX_REG_WEIGHTS?
    # Pair every person roi with every gt box: row (i * n_gt + j) holds the
    # transform from person i to gt box j, regrouped as (person, gt, coords).
    all_pairs = box_utils.bbox_transform_inv(
        np.repeat(person_rois, n_gt, axis=0),
        np.tile(gt_boxes, (n_person, 1)),
        (1., 1., 1., 1.))
    all_pairs = all_pairs.reshape(n_person, n_gt, -1)

    # e.g. (person_num: 16, action_num: 26, role_cls: 2, relative_location: 4)
    target_shape = (
        role_ids.shape[0], role_ids.shape[1], role_ids.shape[2], 4)
    human_action_targets = np.zeros(target_shape, dtype=np.float32)
    action_target_weights = np.zeros_like(
        human_action_targets, dtype=np.float32)

    # Positions (person, action, role) that do have a role object.
    has_role = np.where(role_ids > -1)
    role_gt_ids = role_ids[has_role].astype(int)
    human_action_targets[has_role] = all_pairs[has_role[0], role_gt_ids]
    action_target_weights[has_role] = 1.

    flat_width = (cfg.VCOCO.NUM_ACTION_CLASSES
                  * cfg.VCOCO.NUM_TARGET_OBJECT_TYPES * 4)
    return (human_action_targets.reshape(-1, flat_width),
            action_target_weights.reshape(-1, flat_width))
def _compute_targets(ex_rois, gt_rois, labels):
    """Compute bounding-box regression targets for an image.

    ``ex_rois`` and ``gt_rois`` must have the same number of rows and 4
    columns each. Returns a float32 array whose first column is ``labels``
    and whose remaining columns are the output of
    ``box_utils.bbox_transform_inv`` with ``cfg.MODEL.BBOX_REG_WEIGHTS``.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    deltas = box_utils.bbox_transform_inv(
        ex_rois, gt_rois, cfg.MODEL.BBOX_REG_WEIGHTS)
    label_column = labels[:, np.newaxis]
    stacked = np.hstack((label_column, deltas))
    return stacked.astype(np.float32, copy=False)
def _compute_targets(ex_rois, gt_rois, labels):
    """Compute bounding-box regression targets for an image.

    Args:
        ex_rois: example boxes, one per row, 4 columns.
        gt_rois: target boxes, same number of rows as ``ex_rois``, 4 columns.
        labels: 1-D array with one class label per row. It is NOT modified
            by this function.

    Returns:
        float32 array whose first column is the class label and whose
        remaining columns are the output of ``box_utils.bbox_transform_inv``
        with ``cfg.MODEL.BBOX_REG_WEIGHTS``. When
        ``cfg.MODEL.CLS_AGNOSTIC_BBOX_REG`` is set, labels above 1 are
        reported as 1 in that first column.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    targets = box_utils.bbox_transform_inv(
        ex_rois, gt_rois, cfg.MODEL.BBOX_REG_WEIGHTS)

    # Use class "1" for all fg boxes if using class_agnostic_bbox_reg.
    # Clip into a new array: writing the result back into ``labels``
    # (``out=labels``) would silently overwrite the caller's class labels,
    # which may still be needed as classification targets.
    if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG:
        labels = labels.clip(max=1)

    return np.hstack((labels[:, np.newaxis], targets)).astype(
        np.float32, copy=False)
def test_bbox_transform_and_inverse(self):
    """Check that ``bbox_transform`` undoes ``bbox_transform_inv``.

    Deltas are computed from source boxes to destination boxes (both
    produced by ``random_boxes``) and then applied back to the sources; the
    reconstruction must match the destinations to 5 decimals.
    """
    reg_weights = (5, 5, 10, 10)

    sources = random_boxes([10, 10, 20, 20], 1, 10)
    destinations = random_boxes([10, 10, 20, 20], 1, 10)

    encoded = box_utils.bbox_transform_inv(
        sources, destinations, weights=reg_weights)
    reconstructed = box_utils.bbox_transform(
        sources, encoded, weights=reg_weights)

    np.testing.assert_array_almost_equal(
        destinations, reconstructed, decimal=5)
def _compute_targets(ex_rois, gt_rois, labels):
    """Compute bounding-box regression targets for an image.

    Only the row counts of ``ex_rois`` and ``gt_rois`` are checked. The
    column count is deliberately not asserted to be 4: with tubes the rois
    can be wider, and ``bbox_transform_inv`` can handle tubes.

    Returns a float32 array with ``labels`` as the first column followed by
    the transform output.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]

    reg_weights = cfg.MODEL.BBOX_REG_WEIGHTS
    deltas = box_utils.bbox_transform_inv(ex_rois, gt_rois, reg_weights)

    columns = (labels[:, np.newaxis], deltas)
    return np.hstack(columns).astype(np.float32, copy=False)
def test_bbox_transform_and_inverse(self):
    """Encoding boxes as deltas and decoding them must be an identity.

    The deltas from ``src`` to ``dst`` given by ``bbox_transform_inv`` are
    fed to ``bbox_transform`` together with ``src``; the output is compared
    with ``dst`` at 5-decimal precision.
    """
    delta_weights = (5, 5, 10, 10)
    src = random_boxes([10, 10, 20, 20], 1, 10)
    dst = random_boxes([10, 10, 20, 20], 1, 10)

    dst_roundtrip = box_utils.bbox_transform(
        src,
        box_utils.bbox_transform_inv(src, dst, weights=delta_weights),
        weights=delta_weights,
    )

    np.testing.assert_array_almost_equal(dst, dst_roundtrip, decimal=5)
def _compute_targets(ex_rois, gt_rois, labels):
    """Compute bounding-box regression targets for an image.

    Requires one gt roi per example roi and 4 columns in both arrays. The
    float32 result places ``labels`` in column 0, followed by the output of
    ``box_utils.bbox_transform_inv``.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    regression = box_utils.bbox_transform_inv(
        ex_rois, gt_rois, cfg.MODEL.BBOX_REG_WEIGHTS
    )
    labelled = np.hstack((labels[:, np.newaxis], regression))
    return labelled.astype(np.float32, copy=False)
def _compute_targets(ex_rois, gt_rois, labels):
    """Compute bounding-box regression targets for an image.

    The rois may be tubes rather than plain 4-column boxes, so only the
    number of rows is validated; ``bbox_transform_inv`` can handle tubes and
    the former 4-column assertions no longer apply.

    Returns ``labels`` (as a column) stacked in front of the transform
    output, cast to float32.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]

    transform = box_utils.bbox_transform_inv(
        ex_rois, gt_rois, cfg.MODEL.BBOX_REG_WEIGHTS)
    label_col = labels[:, np.newaxis]
    result = np.hstack((label_col, transform))
    return result.astype(np.float32, copy=False)
def _compute_targets(ex_rois, gt_rois, labels):
    """Compute bounding-box regression targets for an image.

    Args:
        ex_rois: example boxes, shape (N, 4).
        gt_rois: matched target boxes, shape (N, 4).
        labels: 1-D array of N class labels. Left untouched; the clipped
            labels used under class-agnostic regression live only in the
            returned array.

    Returns:
        float32 array with the class label in column 0 and the output of
        ``box_utils.bbox_transform_inv`` (using
        ``cfg.MODEL.BBOX_REG_WEIGHTS``) in the remaining columns.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    targets = box_utils.bbox_transform_inv(
        ex_rois, gt_rois, cfg.MODEL.BBOX_REG_WEIGHTS)

    target_labels = labels
    if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG:
        # Use class "1" for all fg boxes. ``clip`` without ``out=`` returns
        # a new array, so the caller's ``labels`` keep their real classes
        # instead of being overwritten in place.
        target_labels = labels.clip(max=1)

    return np.hstack((target_labels[:, np.newaxis], targets)).astype(
        np.float32, copy=False)
def get_pair_feature(boxes1, boxes2):
    """Build a spatial feature for each (boxes1[i], boxes2[i]) pair.

    The feature is the full ``bbox_transform_inv(boxes1, boxes2)`` output
    followed by the first two columns of the reverse transform
    ``bbox_transform_inv(boxes2, boxes1)``.
    """
    forward_delta = bbox_transform_inv(boxes1, boxes2)
    reverse_delta = bbox_transform_inv(boxes2, boxes1)
    # Only the leading two columns of the reverse direction are kept.
    reverse_head = reverse_delta[:, :2]
    return np.hstack((forward_delta, reverse_head))
def generate_triplets(rois, rois_human_inds, rois_object_inds,
                      rois_to_gt_ind, gt_role_id, batch_idx):
    """Enumerate human-object roi pairs and sample fg/bg training triplets.

    :param rois: roi array; column 0 is dropped before any box computation
        (presumably the batch index -- confirm against the caller)
    :param rois_human_inds: indices into ``rois`` of the human candidates
    :param rois_object_inds: indices into ``rois`` of the object candidates
    :param rois_to_gt_ind: maps a roi index to its gt box index
    :param gt_role_id: gt role ids, passed to ``generate_action_mat``
    :param batch_idx: value written into column 0 of the union boxes
    :return: dict with, for every kept triplet,
        ``human_inds`` / ``object_inds`` (positions within
        ``rois_human_inds`` / ``rois_object_inds``, not roi indices),
        ``union_boxes`` (batch_idx column + union box),
        ``action_labels``, ``spatial_info`` (human -> object
        ``bbox_transform_inv`` output) and ``interaction_affinity``
        (True when any action label is positive).
    """
    # ToDo: cfg
    # NOTE(review): the fg share (1/4) and the 3:1 bg:fg ratio below are
    # hard-coded; cfg.VCOCO.FG_TRIPLETS_FRACTION is not consulted. Wiring it
    # in would change sampling, so it is left as is.
    triplets_num_per_image = cfg.VCOCO.TRIPLETS_NUM_PER_IM

    # label matrix, N x N x 26 x 2
    gt_action_mat = generate_action_mat(gt_role_id)

    # All (human, object) combinations, flattened in row-major order.
    human_rois_inds, object_rois_inds = np.meshgrid(
        np.arange(rois_human_inds.size),
        np.arange(rois_object_inds.size),
        indexing='ij')
    human_rois_inds = human_rois_inds.reshape(-1)
    object_rois_inds = object_rois_inds.reshape(-1)

    # Roi index of the human / object member of every pair.
    pair_human_rois = rois_human_inds[human_rois_inds]
    pair_object_rois = rois_object_inds[object_rois_inds]

    # triplet labels: (hN' x oN') x 26 x 2
    action_labels = gt_action_mat[rois_to_gt_ind[pair_human_rois],
                                  rois_to_gt_ind[pair_object_rois]]

    # Keep only the (action, role) slots enabled in the action mask
    # (convert to 24-class): (hN' x oN') x 24
    interaction_action_mask = np.array(cfg.VCOCO.ACTION_MASK).T
    mask_rows, mask_cols = np.where(interaction_action_mask > 0)
    action_labels = action_labels[:, mask_rows, mask_cols]

    # interaction_affinity: (hN' x oN'), True if any action is positive
    interaction_affinity = np.any(
        action_labels.reshape(action_labels.shape[0], -1) > 0, 1)

    # info for training
    union_boxes = box_utils.get_union_box(
        rois[pair_human_rois][:, 1:],
        rois[pair_object_rois][:, 1:])
    batch_column = batch_idx * np.ones(
        (union_boxes.shape[0], 1), dtype=union_boxes.dtype)
    union_boxes = np.concatenate((batch_column, union_boxes), axis=1)
    relative_location = box_utils.bbox_transform_inv(
        rois[pair_human_rois][:, 1:],
        rois[pair_object_rois][:, 1:])

    # sample fg/bg triplets: fg pairs carry at least one positive action
    fg_triplets_inds = np.where(np.sum(action_labels, axis=1) > 0)[0]
    bg_triplets_inds = np.setdiff1d(
        np.arange(action_labels.shape[0]), fg_triplets_inds)

    # At most a quarter of the per-image budget goes to foreground.
    fg_triplets_num_this_image = min(
        int(triplets_num_per_image * 1 / 4.), fg_triplets_inds.size)
    if fg_triplets_inds.size > 0:
        fg_triplets_inds = npr.choice(
            fg_triplets_inds, size=fg_triplets_num_this_image, replace=False)

    # Three backgrounds per foreground, but at least one background when
    # any is available (covers images without foreground pairs).
    bg_triplets_num_this_image = min(
        max(fg_triplets_num_this_image * 3, 1), bg_triplets_inds.size)
    if bg_triplets_inds.size > 0 and bg_triplets_num_this_image > 0:
        bg_triplets_inds = npr.choice(
            bg_triplets_inds, size=bg_triplets_num_this_image, replace=False)
        keep_triplets_inds = np.concatenate(
            (fg_triplets_inds, bg_triplets_inds))
    else:
        keep_triplets_inds = fg_triplets_inds

    return dict(
        human_inds=human_rois_inds[keep_triplets_inds],
        object_inds=object_rois_inds[keep_triplets_inds],
        union_boxes=union_boxes[keep_triplets_inds],
        action_labels=action_labels[keep_triplets_inds],
        spatial_info=relative_location[keep_triplets_inds],
        interaction_affinity=interaction_affinity[keep_triplets_inds],
    )
def compute_targets(ex_rois, gt_rois, weights=(1.0, 1.0, 1.0, 1.0)):
    """Compute bounding-box regression targets for an image.

    Thin wrapper around ``box_utils.bbox_transform_inv`` that returns its
    result as float32 (no copy is made if it already is float32).
    """
    deltas = box_utils.bbox_transform_inv(ex_rois, gt_rois, weights)
    return deltas.astype(np.float32, copy=False)
def forward(self, probs, anchor_deltas, img_info):
    """Turn RPN outputs into scored, NMS-filtered region proposals.

    Args:
        probs (Tensor): Classification probability of the anchors.
        anchor_deltas (Tensor): Anchor regression deltas.
        img_info (Tensor[3]): (height, width, scale)

    Returns:
        proposals (Tensor[N, 5]): Predicted region proposals in
            (0, x1, y1, x2, y2) format. 0 means these proposals are from the
            first image in the batch.

    Pipeline:
        1. Place the A base anchors on every (H, W) cell and apply the
           predicted regression deltas.
        2. Clip the resulting boxes to the image.
        3. Drop boxes whose width or height is below the (scaled) minimum.
        4-5. Sort by score and keep the top ``RPN_PRE_NMS_TOP_N``.
        6-8. Run NMS and keep the top ``RPN_POST_NMS_TOP_N``.
    """
    assert probs.size(0) == 1, "Single batch only."

    phase_cfg = cfg["TRAIN" if self.training else "TEST"]
    pre_nms_topN = phase_cfg.RPN_PRE_NMS_TOP_N
    post_nms_topN = phase_cfg.RPN_POST_NMS_TOP_N
    nms_thresh = phase_cfg.RPN_NMS_THRESH
    min_size = phase_cfg.RPN_MIN_SIZE

    A = self.num_anchors

    # Channels [0, A) are the bg probs; channels from A on are the fg probs,
    # which are the scores we rank by.
    probs = probs[:, A:, :, :]

    # ---- 1. Proposals from shifted anchors and regression deltas ----
    height, width = probs.shape[-2:]

    # One shift per feature-map cell (NOTE: torch.meshgrid is different from
    # np.meshgrid, hence the (y, x) argument order).
    x_offsets = torch.arange(0, width) * self.feat_stride
    y_offsets = torch.arange(0, height) * self.feat_stride
    grid_y, grid_x = torch.meshgrid(y_offsets, x_offsets)
    flat_x = grid_x.contiguous().view(-1)
    flat_y = grid_y.contiguous().view(-1)
    shifts = torch.stack((flat_x, flat_y, flat_x, flat_y), dim=1)
    shifts = shifts.type_as(probs)

    # A anchors (1, A, 4) + K shifts (K, 1, 4) -> (K, A, 4) -> (K * A, 4)
    K = shifts.size(0)
    self.anchors = self.anchors.type_as(probs)
    anchors = self.anchors.view(1, A, 4) + shifts.view(K, 1, 4)
    anchors = anchors.view(K * A, 4)

    # Bring the deltas into the same order as the anchors:
    # (1, 4 * A, H, W) -> (1, H, W, 4 * A) -> (H * W * A, 4)
    anchor_deltas = anchor_deltas.permute(0, 2, 3, 1).contiguous()
    anchor_deltas = anchor_deltas.view(-1, 4)

    # Safe-guard for unexpected large dw or dh value.
    # Since our proposals are only human, some background region features
    # will never receive gradients from bbox regression. Thus their
    # predictions may drift away. The clamp is applied in place.
    anchor_deltas[:, 2:].clamp_(-10, 10)

    # Same reordering for the scores:
    # (1, A, H, W) -> (1, H, W, A) -> (H * W * A, 1)
    probs = probs.permute(0, 2, 3, 1).contiguous().view(-1, 1)

    # Convert anchors into proposals via regression deltas
    proposals = bbox_transform_inv(anchors, anchor_deltas)

    # ---- 2. Clip predicted proposals to image ----
    proposals = clip_boxes(proposals, img_info[:2])

    # ---- 3. Remove boxes that are too small ----
    # min_size is expressed in original-image pixels, so it is scaled by the
    # input image scale stored in img_info[2].
    box_w = proposals[:, 2] - proposals[:, 0] + 1
    box_h = proposals[:, 3] - proposals[:, 1] + 1
    min_size = min_size * img_info[2]
    big_enough = (box_w >= min_size) & (box_h >= min_size)
    keep = torch.nonzero(big_enough)[:, 0]
    proposals = proposals[keep]
    probs = probs[keep]

    # ---- 4./5. Sort by score, keep the top pre_nms_topN (e.g. 6000) ----
    order = probs.view(-1).argsort(descending=True)
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals, probs = proposals[order], probs[order]

    # ---- 6./7./8. NMS (e.g. threshold = 0.7), keep after_nms_topN ----
    keep = nms(proposals, probs.squeeze(1), nms_thresh)
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals, probs = proposals[keep], probs[keep]

    # proposals: [img_id, x1, y1, x2, y2]
    # Our RPN implementation only supports a single input image, so all
    # img_ids are 0.
    img_ids = torch.zeros(proposals.size(0), 1).type_as(probs)
    return torch.cat((img_ids, proposals), dim=1)
def compute_targets(ex_rois, gt_rois, weights=(1.0, 1.0, 1.0, 1.0)):
    """Compute bounding-box regression targets for an image.

    Delegates to ``box_utils.bbox_transform_inv(ex_rois, gt_rois, weights)``
    and casts the outcome to float32 without copying when the dtype already
    matches.
    """
    raw_targets = box_utils.bbox_transform_inv(ex_rois, gt_rois, weights)
    float_targets = raw_targets.astype(np.float32, copy=False)
    return float_targets