def CornerLoss(self, gt_bboxes, predicted_bboxes, symmetric=True):
  """Corner-distance regression loss between predicted and ground truth boxes.

  Both sets of boxes are expanded to their 8 corners with
  `geometry.BBoxCorners`. The Euclidean distance between every predicted
  corner and its ground truth counterpart is passed through
  `self.ScaledHuberLoss` (with an all-zero label tensor), and the resulting
  per-corner values are summed for each box. This is the corner loss used as
  an alternative box-residual regression loss in the Frustum-PointNets
  paper [1].

  [1] Frustum PointNets for 3D Object Detection from RGB-D Data
      https://arxiv.org/pdf/1711.08488.pdf

  Args:
    gt_bboxes: tf.float32 of shape [..., 7] holding the ground truth bbox
      parameters (x, y, z, dx, dy, dz, phi). The first dimension is kept as
      the batch dimension; all remaining box dimensions are flattened
      internally.
    predicted_bboxes: tf.float32 with the same shape as gt_bboxes holding the
      predicted bbox parameters.
    symmetric: boolean. When True, the loss is also evaluated against the
      ground truth box with pi added to phi (i.e. rotated 180 degrees), and
      the smaller of the two losses is returned for each box.

  Returns:
    tf.float32 Tensor of shape [...] (the shape of gt_bboxes without its last
    dimension), one corner loss value per bbox.
  """
  full_shape = py_utils.GetShape(gt_bboxes)
  flat_shape = [full_shape[0], -1, 7]

  gt_bboxes = tf.reshape(gt_bboxes, flat_shape)
  predicted_bboxes = tf.reshape(predicted_bboxes, flat_shape)

  gt_corners = geometry.BBoxCorners(gt_bboxes)
  predicted_corners = geometry.BBoxCorners(predicted_bboxes)

  def _SummedCornerLoss(target_corners):
    """Sums the per-corner Huber loss of the predictions against a target."""
    distances = tf.norm(predicted_corners - target_corners, axis=-1)
    per_corner = self.ScaledHuberLoss(
        labels=tf.zeros_like(distances), predictions=distances)
    return tf.reduce_sum(per_corner, axis=-1)

  loss = _SummedCornerLoss(gt_corners)

  if symmetric:
    # Evaluate the loss a second time against the ground truth flipped by
    # 180 degrees (only phi changes) and keep whichever loss is smaller.
    half_turn = tf.constant(
        [[[0., 0., 0., 0., 0., 0., np.pi]]], dtype=tf.float32)
    flipped_gt_corners = geometry.BBoxCorners(gt_bboxes + half_turn)
    loss = tf.minimum(loss, _SummedCornerLoss(flipped_gt_corners))

  # Restore the caller's leading dimensions.
  return tf.reshape(loss, full_shape[:-1])
def testBBoxCorners(self):
  # Four boxes arranged as a [2, 2, 7] tensor: each batch entry holds two
  # identical boxes. This exercises both the batch dimension and the
  # number-of-boxes dimension of BBoxCorners.
  axis_aligned_box = [1, 2, 3, 4, 3, 6, 0.]
  quarter_turn_box = [1, 2, 3, 4, 3, 6, np.pi / 2.]
  bboxes = tf.constant([[axis_aligned_box, axis_aligned_box],
                        [quarter_turn_box, quarter_turn_box]])
  corners = geometry.BBoxCorners(bboxes)

  # Expected (min, max) of the corner coordinates along x, y and z, one row
  # per batch entry. The second row is the first box rotated by 90 degrees,
  # which swaps the x and y extents around the box center.
  expected_extrema = [
      ((-1, 3), (0.5, 3.5), (0, 6)),
      ((-0.5, 2.5), (0, 4), (0, 6)),
  ]

  with self.session() as sess:
    corners_np = sess.run(corners)
    self.assertEqual((2, 2, 8, 3), corners_np.shape)

    for batch_idx, per_axis_extrema in enumerate(expected_extrema):
      for box_idx in [0, 1]:
        for axis, (lowest, highest) in enumerate(per_axis_extrema):
          coords = corners_np[batch_idx, box_idx, :, axis]
          self.assertAllClose(lowest, np.min(coords))
          self.assertAllClose(highest, np.max(coords))
def testVeloToImagePlaneTransformation(self):
  # Projects the 3D box of the first labeled object into the image plane and
  # checks that its 2D extent matches the labeled 2D bbox.
  calib = kitti_data.LoadCalibrationFile(self._calib_file)
  first_object = kitti_data.LoadLabelFile(self._label_file)[0]

  bbox3d = kitti_data._KITTIObjectToBBox3D(
      first_object, kitti_data.CameraToVeloTransformation(calib))

  # Expand the single box (with batch and box axes added) into its corners.
  corners = geometry.BBoxCorners(tf.constant([[bbox3d]], dtype=tf.float32))
  with self.session():
    corners_xyz = self.evaluate(corners).reshape([8, 3])

  # Append a column of ones to obtain homogeneous coordinates, then apply
  # the velo to image plane transformation.
  homogeneous = np.concatenate([corners_xyz, np.ones((8, 1))], axis=-1)
  velo_to_img = kitti_data.VeloToImagePlaneTransformation(calib)
  projected = homogeneous.dot(velo_to_img.T)

  # Dividing x and y by the last coordinate recovers pixel locations.
  pixels = projected[:, :2] / projected[:, 2:3]

  # The 2D bbox is the extent of the projected corners.
  min_x, min_y = pixels.min(axis=0)
  max_x, max_y = pixels.max(axis=0)
  bbox = [min_x, min_y, max_x, max_y]  # left, top, right, bottom.

  # This should correspond to the GT bbox in obj['bbox']. atol=0.1 is used
  # because the values should agree to the nearest pixel.
  self.assertAllClose(bbox, first_object['bbox'], atol=0.1)
def _Extract(self, features):
  """Builds fixed-size label tensors for one example from parsed features.

  Every per-object tensor is padded or trimmed along its leading axis to
  `p.max_num_objects` with `py_utils.PadOrTrimTo`.

  Args:
    features: mapping from feature name to parsed feature. The keys read here
      are 'image/source_id', 'object/image/bbox/{xmin,xmax,ymin,ymax}',
      'object/velo/bbox/{dim_xyz,xyz,phi}', 'object/has_3d_info',
      'object/occlusion', 'object/truncation', 'object/label' and
      'transform/velo_to_image_plane'.
      NOTE(review): 'object/label' is accessed through `.values`, so it is
      presumably a sparse tensor -- confirm against the feature spec.

  Returns:
    A py_utils.NestedMap with the fields source_id, bboxes_count, bboxes,
    bboxes_padding, bboxes_3d, bboxes_3d_mask, unfiltered_bboxes_3d_mask,
    bboxes3d_proj_to_image_plane, bboxes_td, bboxes_td_mask,
    bboxes_3d_num_points, labels, texts, box_image_height, occlusion,
    truncation and difficulties.
  """
  p = self.params
  max_objects = p.max_num_objects

  def _PadObjects(tensor, trailing_dims=()):
    """Pads/trims `tensor` to [max_num_objects] + trailing_dims."""
    return py_utils.PadOrTrimTo(tensor, [max_objects] + list(trailing_dims))

  def _DenseReshaped(key, shape):
    """Densifies features[key] and reshapes it to `shape`."""
    return tf.reshape(_Dense(features[key]), shape)

  source_id = py_utils.HasShape(features['image/source_id'], [])

  # 2D bounding boxes in image coordinates, stored as (ymin, xmin, ymax, xmax).
  xmin = _Dense(features['object/image/bbox/xmin'])
  xmax = _Dense(features['object/image/bbox/xmax'])
  ymin = _Dense(features['object/image/bbox/ymin'])
  ymax = _Dense(features['object/image/bbox/ymax'])
  image_bboxes = tf.stack([ymin, xmin, ymax, xmax], axis=1)
  bboxes_count = tf.shape(image_bboxes)[0]
  bboxes = _PadObjects(image_bboxes, [4])
  # Padding is 0 for the first bboxes_count slots and 1 for the rest.
  bboxes_padding = 1.0 - _PadObjects(tf.ones([bboxes_count]))

  # 3D bounding boxes, laid out as [x, y, z, dx, dy, dz, phi].
  dim_xyz = _DenseReshaped('object/velo/bbox/dim_xyz', [-1, 3])
  loc_xyz = _DenseReshaped('object/velo/bbox/xyz', [-1, 3])
  phi = _DenseReshaped('object/velo/bbox/phi', [-1, 1])
  unpadded_bboxes_3d = tf.concat([loc_xyz, dim_xyz, phi], axis=1)

  # Top-down boxes built from the x/y center and x/y extent of each 3D box,
  # stored as (ymin, xmin, ymax, xmax).
  cx, cy, _, dx, dy, _, _ = tf.unstack(unpadded_bboxes_3d, num=7, axis=-1)
  top_down_edges = [cy - dy / 2, cx - dx / 2, cy + dy / 2, cx + dx / 2]
  bboxes_td = _PadObjects(tf.stack(top_down_edges, axis=-1), [4])

  has_3d_info = tf.cast(_Dense(features['object/has_3d_info']), tf.float32)
  bboxes_3d_mask = _PadObjects(has_3d_info)
  bboxes_td_mask = bboxes_3d_mask

  # Inputs to the difficulty computation: image-space box height, occlusion
  # and truncation, each zeroed out for objects without 3D info.
  box_image_height = _PadObjects(ymax - ymin) * bboxes_3d_mask

  # Occlusion level from 0 to 3; 0 means fully visible, 1 means partly.
  occlusion = _DenseReshaped('object/occlusion', [-1])
  occlusion = _PadObjects(tf.cast(occlusion, tf.float32)) * bboxes_3d_mask

  # Truncation: 0 -> not truncated, 1.0 -> truncated.
  truncation = _DenseReshaped('object/truncation', [-1])
  truncation = _PadObjects(truncation) * bboxes_3d_mask

  difficulties = _PadObjects(
      ComputeKITTIDifficulties(box_image_height, occlusion, truncation))

  # BBoxCorners is called with a leading batch axis of size one; the single
  # batch entry is taken back out afterwards.
  bbox3d_corners = geometry.BBoxCorners(unpadded_bboxes_3d[tf.newaxis, ...])[0]

  # Project the 3D bbox corners to the image plane and regroup the result as
  # [num_objects, 8 corners per object, (x, y)].
  velo_to_image_plane = features['transform/velo_to_image_plane']
  projected_corners = geometry.PointsToImagePlane(
      tf.reshape(bbox3d_corners, [-1, 3]), velo_to_image_plane)
  projected_corners = tf.reshape(projected_corners, [-1, 8, 2])
  bboxes3d_proj_to_image_plane = _PadObjects(projected_corners, [8, 2])

  # Class names and their integer ids.
  texts = features['object/label'].values
  labels = ops.static_map_string_int(x=texts, keys=self.KITTI_CLASS_NAMES)
  labels = _PadObjects(labels)
  texts = _PadObjects(texts)

  # Label filtering is expressed purely through bboxes_padding,
  # bboxes_3d_mask and bboxes_td_mask; no rows are removed.
  if p.filter_labels is None:
    filtered_bboxes_3d_mask = bboxes_3d_mask
  else:
    valid_labels = tf.constant([p.filter_labels])
    is_valid_label = tf.reduce_any(
        tf.equal(tf.expand_dims(labels, 1), valid_labels), axis=1)
    bbox_mask = tf.cast(is_valid_label, tf.float32)
    # A slot stays unpadded only if it was unpadded and its label is kept.
    bboxes_padding = 1 - bbox_mask * (1 - bboxes_padding)
    filtered_bboxes_3d_mask = bboxes_3d_mask * bbox_mask
    bboxes_td_mask = bboxes_td_mask * bbox_mask

  # Placeholder for the number of laser points that reside within each 3D
  # bounding box. It must be filled in outside of this function, based on
  # the loaded 3D laser points.
  bboxes_3d_num_points = _PadObjects(tf.zeros([max_objects], dtype=tf.int32))

  # The 3D boxes are padded last because the unpadded boxes are needed above.
  bboxes_3d = _PadObjects(unpadded_bboxes_3d, [7])

  return py_utils.NestedMap(
      source_id=source_id,
      bboxes_count=bboxes_count,
      bboxes=bboxes,
      bboxes_padding=bboxes_padding,
      bboxes_3d=bboxes_3d,
      bboxes_3d_mask=filtered_bboxes_3d_mask,
      unfiltered_bboxes_3d_mask=bboxes_3d_mask,
      bboxes3d_proj_to_image_plane=bboxes3d_proj_to_image_plane,
      bboxes_td=bboxes_td,
      bboxes_td_mask=bboxes_td_mask,
      bboxes_3d_num_points=bboxes_3d_num_points,
      labels=labels,
      texts=texts,
      box_image_height=box_image_height,
      occlusion=occlusion,
      truncation=truncation,
      difficulties=difficulties)
def ProcessOutputs(self, input_batch, model_outputs):
  """Produce additional decoder outputs for KITTI.

  The predicted 3D boxes are converted to corners, projected onto the image
  plane, reduced to clipped 2D boxes and tested against the camera frustum.
  The results are returned together with ground truth and calibration
  tensors copied from the input batch.

  Args:
    input_batch: A .NestedMap of the inputs to the model. This method reads
      `decoder_copy.labels`, `decoder_copy.lasers`, `decoder_copy.images` and
      `labels.bboxes_3d_num_points` from it.
    model_outputs: A .NestedMap of the outputs of the model, including:

      - per_class_predicted_bboxes: [batch, num_classes, num_boxes, 7] float
        Tensor with per class 3D (7 DOF) bounding boxes.

      - per_class_predicted_bbox_scores: [batch, num_classes, num_boxes]
        float Tensor with per class, per box scores.

      - per_class_valid_mask: [batch, num_classes, num_boxes] masking Tensor
        indicating which boxes were still kept after NMS for each class.

      Only per_class_predicted_bboxes is read by this method.

  Returns:
    A NestedMap of additional decoder outputs needed for
    PostProcessDecodeOut.
  """
  p = self.params

  per_class_bboxes = model_outputs.per_class_predicted_bboxes
  batch_size, num_classes, num_boxes, _ = py_utils.GetShape(per_class_bboxes)
  flattened_num_boxes = num_classes * num_boxes

  decoder_inputs = input_batch.decoder_copy
  input_labels = decoder_inputs.labels
  input_lasers = decoder_inputs.lasers
  input_images = decoder_inputs.images

  # Leading dimensions with the class and box axes merged / kept separate.
  flat_dims = [batch_size, flattened_num_boxes]
  per_class_dims = [batch_size, num_classes, num_boxes]

  with tf.device('/cpu:0'):
    # Turn the predicted boxes into their corners and project those onto the
    # image plane. The projection can be used to:
    #
    # A) Visualize bounding boxes (2d or 3d) on the camera image.
    #
    # B) Compute the height of the predicted boxes to filter 'too small'
    #    boxes as is done in the KITTI eval.
    flat_bboxes = tf.reshape(per_class_bboxes, flat_dims + [7])
    corners_3d = py_utils.HasShape(
        geometry.BBoxCorners(flat_bboxes), flat_dims + [8, 3])

    utils_3d = detection_3d_lib.Utils3D()
    corners_image = py_utils.HasShape(
        utils_3d.CornersToImagePlane(corners_3d,
                                     input_images.velo_to_image_plane),
        flat_dims + [8, 2])

    # 2D boxes from the projected corners, clipped so they remain within the
    # image coordinates.
    clipped_bbox2d = py_utils.HasShape(
        self._BBox2DImage(corners_image, input_images), flat_dims + [4])

    # Mask used to filter out bounding boxes that are 'outside the frustum'.
    frustum_mask = self._CreateFrustumMask(corners_image, clipped_bbox2d,
                                           input_images.height,
                                           input_images.width)

    # Split the merged axis back into [batch_size, num_classes, num_boxes, ...].
    corners_image = tf.reshape(corners_image, per_class_dims + [8, 2])
    clipped_bbox2d = tf.reshape(clipped_bbox2d, per_class_dims + [4])
    frustum_mask = tf.reshape(frustum_mask, per_class_dims)

  # NOTE(review): the CPU device scope is assumed to end here; confirm
  # against the original indentation of this method.
  ret = py_utils.NestedMap({
      # For mAP eval
      'source_ids': input_labels.source_id,
      'difficulties': input_labels.difficulties,
      # NOTE(review): read from input_batch.labels rather than the
      # decoder_copy labels used elsewhere -- confirm this is intentional.
      'num_points_in_bboxes': input_batch.labels.bboxes_3d_num_points,
      # For exporting.
      'velo_to_image_plane': input_images.velo_to_image_plane,
      'velo_to_camera': input_images.velo_to_camera,
      # Predictions.
      'bbox_corners_image': corners_image,
      'bbox2d_corners_image': clipped_bbox2d,
      'frustum_mask': frustum_mask,
      # Ground truth.
      'bboxes_3d': input_labels.bboxes_3d,
      'bboxes_3d_mask': input_labels.bboxes_3d_mask,
      'unfiltered_bboxes_3d_mask': input_labels.unfiltered_bboxes_3d_mask,
      'labels': input_labels.labels,
  })

  ret.update(
      self._SampleLaserForVisualization(input_lasers.points_xyz,
                                        input_lasers.points_padding))

  if p.summarize_boxes_on_image:
    ret.camera_images = input_images.image
  return ret
def CornerLoss(self, gt_bboxes, predicted_bboxes):
  """Corner regularization loss.

  An alternative regression loss for box residuals, used in the
  Frustum-PointNets paper [1]. Predicted and ground truth boxes are expanded
  to their 8 corners with `geometry.BBoxCorners`, and the Euclidean
  corner-to-corner distances are summed per box. The same sum is computed
  against the ground truth with pi added to phi (rotated 180 degrees). The
  smaller of the two sums is passed to `self.ScaledHuberLoss` with an
  all-zero label tensor.

  [1] Frustum PointNets for 3D Object Detection from RGB-D Data
      https://arxiv.org/pdf/1711.08488.pdf

  TODO(bcyang): support arbitrary input shapes [..., 7].

  Args:
    gt_bboxes: tf.float32 of shape [batch_size, num_centers,
      num_anchor_bboxes_per_center, 7] which contains (x, y, z, dx, dy, dz,
      phi), corresponding to ground truth bbox parameters.
    predicted_bboxes: tf.float32 of same shape as gt_bboxes containing
      predicted bbox parameters.

  Returns:
    tf.float32 Tensor of shape [batch_size, num_centers,
    num_anchor_bboxes_per_center] where each entry contains the corner loss
    for the corresponding bbox.
  """
  batch_size, num_centers, num_anchor_bboxes_per_center = py_utils.GetShape(
      gt_bboxes, 3)
  output_shape = [batch_size, num_centers, num_anchor_bboxes_per_center]

  # Both inputs must be 4-D with a trailing dimension of 7.
  gt_bboxes = py_utils.HasShape(gt_bboxes, output_shape + [7])
  predicted_bboxes = py_utils.HasShape(predicted_bboxes, output_shape + [7])

  # Merge the center and anchor axes into a single box axis.
  flat_shape = [batch_size, num_centers * num_anchor_bboxes_per_center, 7]
  gt_bboxes = tf.reshape(gt_bboxes, flat_shape)
  predicted_bboxes = tf.reshape(predicted_bboxes, flat_shape)

  # Ground truth flipped by 180 degrees: only phi changes.
  half_turn = tf.constant(
      [[[0., 0., 0., 0., 0., 0., np.pi]]], dtype=tf.float32)
  flipped_gt_bboxes = gt_bboxes + half_turn

  gt_corners = geometry.BBoxCorners(gt_bboxes)
  flipped_gt_corners = geometry.BBoxCorners(flipped_gt_bboxes)
  predicted_corners = geometry.BBoxCorners(predicted_bboxes)

  def _TotalCornerDistance(target_corners):
    """Sums, per box, the distances between predicted and target corners."""
    per_corner = tf.norm(predicted_corners - target_corners, axis=-1)
    return tf.reduce_sum(per_corner, axis=-1)

  total_dist = _TotalCornerDistance(gt_corners)
  flipped_total_dist = _TotalCornerDistance(flipped_gt_corners)
  closest_dist = tf.minimum(total_dist, flipped_total_dist)

  loss = self.ScaledHuberLoss(
      labels=tf.zeros_like(total_dist), predictions=closest_dist)
  return tf.reshape(loss, output_shape)