def single_image_nms_fn(args):
    """Run multiclass NMS for one image and pad the result to a fixed size.

    Relies on `q`, `num_classes`, `score_thresh`, `iou_thresh`,
    `max_size_per_class`, `max_total_size`, `clip_window` and
    `change_coordinate_frame` from the enclosing scope.

    Args:
        args: A 4-element sequence unpacked as
            (boxes, scores, masks, num_valid_boxes) for a single image.
            Only the first `num_valid_boxes` entries along the leading
            axis of boxes, scores and masks are passed on to NMS.

    Returns:
        The list `[nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
        num_detections]`. The first four entries are read from the box list
        after it has been padded or clipped to `max_total_size`;
        `num_detections` is the box count of the NMS output before that
        padding/clipping step.
    """
    boxes, scores, masks, num_valid = args

    # Static (Python-level) mask dimensions, needed to restore the shape
    # information that the dynamic slice below discards.
    mask_height = masks.shape[2].value
    mask_width = masks.shape[3].value

    # Keep only the leading `num_valid` rows; -1 keeps every other axis whole.
    valid_boxes = tf.slice(boxes, 3 * [0], tf.stack([num_valid, -1, -1]))
    boxes = tf.reshape(valid_boxes, [-1, q, 4])

    valid_scores = tf.slice(scores, [0, 0], tf.stack([num_valid, -1]))
    scores = tf.reshape(valid_scores, [-1, num_classes])

    valid_masks = tf.slice(
        masks, 4 * [0], tf.stack([num_valid, -1, -1, -1]))
    masks = tf.reshape(valid_masks, [-1, q, mask_height, mask_width])

    nmsed_boxlist = multiclass_non_max_suppression(
        boxes,
        scores,
        score_thresh,
        iou_thresh,
        max_size_per_class,
        max_total_size,
        masks=masks,
        clip_window=clip_window,
        change_coordinate_frame=change_coordinate_frame)

    # Fixed-size output so per-image results can be combined by the caller.
    padded = box_list_ops.pad_or_clip_box_list(nmsed_boxlist, max_total_size)
    num_detections = nmsed_boxlist.num_boxes()

    return [
        padded.get(),
        padded.get_field(fields.BoxListFields.scores),
        padded.get_field(fields.BoxListFields.classes),
        padded.get_field(fields.BoxListFields.masks),
        num_detections,
    ]
def test_pad_box_list(self):
    """Padding a 2-box list to 4 boxes appends zero rows to every field."""
    corners = tf.constant(
        [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.5, 0.5]], tf.float32)
    boxlist = box_list.BoxList(corners)
    boxlist.add_field('classes', tf.constant([0, 1]))
    boxlist.add_field('scores', tf.constant([0.75, 0.2]))

    num_boxes = 4
    padded = box_list_ops.pad_or_clip_box_list(boxlist, num_boxes)

    # The two original rows come first, followed by two all-zero rows.
    want_boxes = [
        [0.1, 0.1, 0.4, 0.4],
        [0.1, 0.1, 0.5, 0.5],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
    ]
    want_classes = [0, 1, 0, 0]
    want_scores = [0.75, 0.2, 0, 0]

    with self.test_session() as sess:
        got_boxes, got_classes, got_scores = sess.run([
            padded.get(),
            padded.get_field('classes'),
            padded.get_field('scores'),
        ])
        self.assertAllClose(want_boxes, got_boxes)
        self.assertAllEqual(want_classes, got_classes)
        self.assertAllClose(want_scores, got_scores)
def compute_loss():
    """Build the transcription loss for the sampled detections.

    Uses `detection_boxlist`, `sampled_indices`, `rpn_features_to_crop`
    and `self` from the enclosing scope.

    Returns:
        A two-element list: the value of `self.loss` applied to the
        transcription predictions, followed by the tuple
        `(transcriptions_dict, eval_metric_ops)` produced by
        `self._predict_lstm`.
    """
    sampled = box_list_ops.boolean_mask(detection_boxlist, sampled_indices)
    # Pad or clip to exactly `self.batch_size` boxes.
    padded = box_list_ops.pad_or_clip_box_list(
        sampled, num_boxes=self.batch_size)

    boxes = padded.get()
    transcriptions = padded.get_field(fields.BoxListFields.transcription)
    scores = padded.get_field(fields.BoxListFields.scores)

    # Count is taken from the un-padded list and capped at the batch size.
    num_detections = tf.minimum(sampled.num_boxes(), self.batch_size)

    transcriptions_dict, eval_metric_ops = self._predict_lstm(
        rpn_features_to_crop,
        boxes,
        transcriptions,
        scores,
        num_detections)

    loss = self.loss(transcriptions_dict)
    return [loss, (transcriptions_dict, eval_metric_ops)]
def _single_image_nms_fn(args):
    """Runs multiclass NMS on one image and returns fixed-size outputs.

    Reads `q`, `num_classes`, `additional_fields`, `use_static_shapes`,
    `score_thresh`, `iou_thresh`, `max_size_per_class`, `max_total_size`
    and `change_coordinate_frame` from the enclosing scope.

    Args:
        args: A sequence of tensors laid out as
            [boxes, scores, masks, clip_window, *additional_fields,
            num_valid_boxes], where:
            boxes - A [num_anchors, q, 4] float32 tensor containing
                detections. If `q` is 1 the same boxes are used for all
                classes; if `q` equals the number of classes,
                class-specific boxes are used.
            scores - A [num_anchors, num_classes] float32 tensor with the
                scores of each of the `num_anchors` detections.
            masks - A [num_anchors, q, mask_height, mask_width] float32
                tensor containing box masks. `q` is either the number of
                classes or 1, depending on whether a separate mask is
                predicted per class.
            clip_window - A 1D float32 tensor of the form
                [ymin, xmin, ymax, xmax] giving the window to clip the
                boxes to.
            additional_fields - (optional) A variable number of float32
                tensors, each of size [num_anchors, ...]. They are paired,
                in order, with the keys of the enclosing
                `additional_fields`.
            num_valid_boxes - An int32 tensor with the number of valid
                boxes for this image. It is used as a single slice extent,
                so it is a per-image value; entries beyond it are treated
                as zero padding and ignored.

    Returns:
        A list `[nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
        *nmsed_additional_fields, num_detections]`:
        nmsed_boxes - A [max_detections, 4] float32 tensor containing the
            non-max suppressed boxes.
        nmsed_scores - A [max_detections] float32 tensor containing the
            scores for the boxes.
        nmsed_classes - A [max_detections] float32 tensor containing the
            class for the boxes.
        nmsed_masks - A [max_detections, mask_height, mask_width] float32
            tensor containing the mask of each selected box.
        nmsed_additional_fields - One tensor of size [max_detections, ...]
            per input additional field, in the same key order.
        num_detections - The number of valid detections reported by
            `multiclass_non_max_suppression` for this image. Only that
            many leading entries of the other outputs are valid; the rest
            are zero paddings.
    """
    boxes, scores, masks, clip_window = args[:4]
    num_valid = args[-1]
    # Everything between the four fixed inputs and the trailing count is an
    # additional field, matched positionally to the enclosing keys.
    extra_fields = dict(zip(additional_fields, args[4:-1]))

    if use_static_shapes:
        # Shapes must stay static, so instead of slicing away the padded
        # rows their scores are overwritten with the smallest float32.
        scores_shape = tf.shape(scores)
        is_valid_row = tf.less(tf.range(scores_shape[0]), num_valid)
        lowest_scores = tf.fill(scores_shape, np.finfo('float32').min)
        scores = tf.where(is_valid_row, scores, lowest_scores)
    else:
        # Dynamic shapes: physically drop the padded rows. The reshape
        # restores the static inner dimensions lost by the dynamic slice.
        mask_height = masks.shape[2].value
        mask_width = masks.shape[3].value

        valid_boxes = tf.slice(
            boxes, 3 * [0], tf.stack([num_valid, -1, -1]))
        boxes = tf.reshape(valid_boxes, [-1, q, 4])

        valid_scores = tf.slice(scores, [0, 0], tf.stack([num_valid, -1]))
        scores = tf.reshape(valid_scores, [-1, num_classes])

        valid_masks = tf.slice(
            masks, 4 * [0], tf.stack([num_valid, -1, -1, -1]))
        masks = tf.reshape(valid_masks, [-1, q, mask_height, mask_width])

        # Values are replaced for existing keys only, so updating the dict
        # while iterating over it is safe.
        for key, tensor in extra_fields.items():
            static_shape = tensor.get_shape()
            rank = len(static_shape)
            valid_rows = tf.slice(
                tensor,
                rank * [0],
                tf.stack([num_valid] + (rank - 1) * [-1]))
            inner_dims = [dim.value for dim in static_shape[1:]]
            extra_fields[key] = tf.reshape(valid_rows, [-1] + inner_dims)

    nmsed_boxlist, num_valid_nms_boxes = multiclass_non_max_suppression(
        boxes,
        scores,
        score_thresh,
        iou_thresh,
        max_size_per_class,
        max_total_size,
        clip_window=clip_window,
        change_coordinate_frame=change_coordinate_frame,
        masks=masks,
        pad_to_max_output_size=use_static_shapes,
        additional_fields=extra_fields)

    # With static shapes NMS is asked to pad its own output
    # (pad_to_max_output_size); otherwise pad/clip here.
    if not use_static_shapes:
        nmsed_boxlist = box_list_ops.pad_or_clip_box_list(
            nmsed_boxlist, max_total_size)

    num_detections = num_valid_nms_boxes
    outputs = [
        nmsed_boxlist.get(),
        nmsed_boxlist.get_field(fields.BoxListFields.scores),
        nmsed_boxlist.get_field(fields.BoxListFields.classes),
        nmsed_boxlist.get_field(fields.BoxListFields.masks),
    ]
    outputs.extend(nmsed_boxlist.get_field(key) for key in extra_fields)
    outputs.append(num_detections)
    return outputs
def batch_multiclass_non_max_suppression(boxes,
                                         scores,
                                         score_thresh,
                                         iou_thresh,
                                         max_size_per_class,
                                         max_total_size=0,
                                         clip_window=None,
                                         change_coordinate_frame=False,
                                         num_valid_boxes=None,
                                         masks=None,
                                         scope=None):
    """Multi-class version of non maximum suppression that operates on a batch.

    This op is similar to `multiclass_non_max_suppression` but operates on a
    batch of boxes and scores. See documentation for
    `multiclass_non_max_suppression` for details.

    Args:
        boxes: A [batch_size, num_anchors, q, 4] float32 tensor containing
            detections. If `q` is 1 then same boxes are used for all classes
            otherwise, if `q` is equal to number of classes, class-specific
            boxes are used.
        scores: A [batch_size, num_anchors, num_classes] float32 tensor
            containing the scores for each of the `num_anchors` detections.
        score_thresh: scalar threshold for score (low scoring boxes are
            removed).
        iou_thresh: scalar threshold for IOU (new boxes that have high IOU
            overlap with previously selected boxes are removed).
        max_size_per_class: maximum number of retained boxes per class.
        max_total_size: maximum number of boxes retained over all classes.
            By default returns all boxes retained after capping boxes per
            class.
            NOTE(review): every image's result is also padded/clipped to
            `max_total_size` before stacking, so the default of 0
            presumably yields empty outputs -- confirm against
            `box_list_ops.pad_or_clip_box_list`.
        clip_window: A float32 tensor of the form
            [y_min, x_min, y_max, x_max] representing the window to clip
            boxes to before performing non-max suppression.
        change_coordinate_frame: Whether to normalize coordinates after
            clipping relative to clip_window (this can only be set to True
            if a clip_window is provided).
        num_valid_boxes: (optional) a Tensor of type `int32`. A 1-D tensor
            of shape [batch_size] representing the number of valid boxes to
            be considered for each image in the batch. This parameter
            allows for ignoring zero paddings.
        masks: (optional) a [batch_size, num_anchors, q, mask_height,
            mask_width] float32 tensor containing box masks. `q` can be
            either number of classes or 1 depending on whether a separate
            mask is predicted per class.
        scope: tf scope name.

    Returns:
        A dictionary containing the following entries:
        'detection_boxes': A [batch_size, max_detections, 4] float32 tensor
            containing the non-max suppressed boxes.
        'detection_scores': A [batch_size, max_detections] float32 tensor
            containing the scores for the boxes.
        'detection_classes': A [batch_size, max_detections] float32 tensor
            containing the class for boxes.
        'num_detections': A [batch_size] float32 tensor indicating the
            number of valid detections per batch item. Only the top
            num_detections[i] entries in nms_boxes[i], nms_scores[i] and
            nms_class[i] are valid. The rest of the entries are zero
            paddings.
        'detection_masks': (optional) a [batch_size, max_detections,
            mask_height, mask_width] float32 tensor containing masks for
            each selected box. Only present when `masks` is given.

    Raises:
        ValueError: if the third dimension of `boxes` is neither 1 nor equal
            to the third dimension of `scores`. Per the original
            documentation, also if iou_thresh is not in [0, 1] or if input
            boxlist does not have a valid scores field (presumably raised
            by `multiclass_non_max_suppression`).
    """
    q = boxes.shape[2].value
    num_classes = scores.shape[2].value
    if q != 1 and q != num_classes:
        raise ValueError('third dimension of boxes must be either 1 or equal '
                         'to the third dimension of scores')

    with tf.name_scope(scope, 'BatchMultiClassNonMaxSuppression'):
        per_image_boxes_list = tf.unstack(boxes)
        per_image_scores_list = tf.unstack(scores)
        batch_size = len(per_image_boxes_list)

        # Missing optional inputs become per-image `None` placeholders so a
        # single zip() can drive the loop below.
        if num_valid_boxes is not None:
            num_valid_boxes_list = tf.unstack(num_valid_boxes)
        else:
            num_valid_boxes_list = batch_size * [None]
        if masks is not None:
            per_image_masks_list = tf.unstack(masks)
        else:
            per_image_masks_list = batch_size * [None]

        detection_boxes_list = []
        detection_scores_list = []
        detection_classes_list = []
        num_detections_list = []
        detection_masks_list = []

        # The per-image count gets its own name so that it does not shadow
        # the `num_valid_boxes` argument.
        for (per_image_boxes, per_image_scores, per_image_masks,
             per_image_num_valid) in zip(per_image_boxes_list,
                                         per_image_scores_list,
                                         per_image_masks_list,
                                         num_valid_boxes_list):
            if per_image_num_valid is not None:
                # Drop zero-padded rows; the reshape restores the static
                # inner dimensions lost by the dynamic slice.
                per_image_boxes = tf.reshape(
                    tf.slice(per_image_boxes, 3 * [0],
                             tf.stack([per_image_num_valid, -1, -1])),
                    [-1, q, 4])
                per_image_scores = tf.reshape(
                    tf.slice(per_image_scores, [0, 0],
                             tf.stack([per_image_num_valid, -1])),
                    [-1, num_classes])
                if masks is not None:
                    per_image_masks = tf.reshape(
                        tf.slice(per_image_masks, 4 * [0],
                                 tf.stack([per_image_num_valid, -1, -1, -1])),
                        [-1, q, masks.shape[3].value, masks.shape[4].value])

            nmsed_boxlist = multiclass_non_max_suppression(
                per_image_boxes,
                per_image_scores,
                score_thresh,
                iou_thresh,
                max_size_per_class,
                max_total_size,
                masks=per_image_masks,
                clip_window=clip_window,
                change_coordinate_frame=change_coordinate_frame)

            # Count is taken before padding. `tf.cast` replaces the
            # deprecated `tf.to_float`; the result is still float32.
            num_detections_list.append(
                tf.cast(nmsed_boxlist.num_boxes(), tf.float32))

            # Fixed size per image so the results can be stacked.
            padded_boxlist = box_list_ops.pad_or_clip_box_list(
                nmsed_boxlist, max_total_size)
            detection_boxes_list.append(padded_boxlist.get())
            detection_scores_list.append(
                padded_boxlist.get_field(fields.BoxListFields.scores))
            detection_classes_list.append(
                padded_boxlist.get_field(fields.BoxListFields.classes))
            if masks is not None:
                detection_masks_list.append(
                    padded_boxlist.get_field(fields.BoxListFields.masks))

        nms_dict = {
            'detection_boxes': tf.stack(detection_boxes_list),
            'detection_scores': tf.stack(detection_scores_list),
            'detection_classes': tf.stack(detection_classes_list),
            'num_detections': tf.stack(num_detections_list),
        }
        if masks is not None:
            nms_dict['detection_masks'] = tf.stack(detection_masks_list)
        return nms_dict