def test_fails_with_nested_input(self): def fn(input_tensor): return input_tensor input_tensor1 = tf.constant([1]) input_tensor2 = tf.constant([2]) with self.assertRaisesRegexp( ValueError, '`elems` must be a Tensor or list of Tensors.'): shape_utils.static_or_dynamic_map_fn( fn, [input_tensor1, [input_tensor2]], dtype=tf.float32)
def test_with_multiple_dynamic_shapes(self): def fn(elems): input_tensor, scalar_index_tensor = elems return tf.reshape(tf.slice(input_tensor, scalar_index_tensor, [1]), []) input_tensor = tf.placeholder(tf.float32, shape=(None, 3)) scalar_index_tensor = tf.placeholder(tf.int32, shape=(None, 1)) map_fn_output = shape_utils.static_or_dynamic_map_fn( fn, [input_tensor, scalar_index_tensor], dtype=tf.float32) op_names = [op.name for op in tf.get_default_graph().get_operations()] self.assertTrue(any(['map' == op_name[:3] for op_name in op_names])) with self.test_session() as sess: result1 = sess.run( map_fn_output, feed_dict={ input_tensor: [[1, 2, 3], [4, 5, -1], [0, 6, 9]], scalar_index_tensor: [[0], [2], [1]], }) result2 = sess.run( map_fn_output, feed_dict={ input_tensor: [[-1, 1, 0], [3, 9, 30]], scalar_index_tensor: [[1], [0]] }) self.assertAllEqual(result1, [1, -1, 6]) self.assertAllEqual(result2, [1, 3])
def graph_fn(): input_tensor = tf.constant([[1, 2, 3], [4, 5, -1], [0, 6, 9]], dtype=tf.float32) scalar_index_tensor = tf.constant([[0], [2], [1]], dtype=tf.int32) map_fn_output = shape_utils.static_or_dynamic_map_fn( fn, [input_tensor, scalar_index_tensor], dtype=tf.float32) return map_fn_output
def normalized_to_image_coordinates(normalized_boxes, image_shape, parallel_iterations=32): """Converts a batch of boxes from normal to image coordinates. Args: normalized_boxes: a float32 tensor of shape [None, num_boxes, 4] in normalized coordinates. image_shape: a float32 tensor of shape [4] containing the image shape. parallel_iterations: parallelism for the map_fn op. Returns: absolute_boxes: a float32 tensor of shape [None, num_boxes, 4] containg the boxes in image coordinates. """ def _to_absolute_coordinates(normalized_boxes): return box_list_ops.to_absolute_coordinates( box_list.BoxList(normalized_boxes), image_shape[1], image_shape[2], check_range=False).get() absolute_boxes = shape_utils.static_or_dynamic_map_fn( _to_absolute_coordinates, elems=(normalized_boxes), dtype=tf.float32, parallel_iterations=parallel_iterations, back_prop=True) return absolute_boxes
def normalized_to_image_coordinates(normalized_boxes, image_shape, parallel_iterations=32): """Converts a batch of boxes from normal to image coordinates. Args: normalized_boxes: a tensor of shape [None, num_boxes, 4] in normalized coordinates. The dtype of this tensor must support tf.mul. image_shape: a tensor of shape [4] containing the image shape, with same dtype as `normalized_boxes`. parallel_iterations: parallelism for the map_fn op. Returns: absolute_boxes: a tensor of shape [None, num_boxes, 4] containing the boxes in image coordinates, with same dtype as `normalized_boxes`. """ x_scale = tf.cast(image_shape[2], normalized_boxes.dtype) y_scale = tf.cast(image_shape[1], normalized_boxes.dtype) def _to_absolute_coordinates(normalized_boxes): y_min, x_min, y_max, x_max = tf.split( value=normalized_boxes, num_or_size_splits=4, axis=1) y_min = y_scale * y_min y_max = y_scale * y_max x_min = x_scale * x_min x_max = x_scale * x_max scaled_boxes = tf.concat([y_min, x_min, y_max, x_max], 1) return scaled_boxes absolute_boxes = shape_utils.static_or_dynamic_map_fn( _to_absolute_coordinates, elems=(normalized_boxes), dtype=normalized_boxes.dtype, parallel_iterations=parallel_iterations, back_prop=True) return absolute_boxes
def normalized_to_image_coordinates(normalized_boxes, image_shape, parallel_iterations=32): """Converts a batch of boxes from normal to image coordinates. Args: normalized_boxes: a float32 tensor of shape [None, num_boxes, 4] in normalized coordinates. image_shape: a float32 tensor of shape [4] containing the image shape. parallel_iterations: parallelism for the map_fn op. Returns: absolute_boxes: a float32 tensor of shape [None, num_boxes, 4] containg the boxes in image coordinates. """ def _to_absolute_coordinates(normalized_boxes): return box_list_ops.to_absolute_coordinates( box_list.BoxList(normalized_boxes), image_shape[1], image_shape[2], check_range=False).get() absolute_boxes = shape_utils.static_or_dynamic_map_fn( _to_absolute_coordinates, elems=(normalized_boxes), dtype=tf.float32, parallel_iterations=parallel_iterations, back_prop=True) return absolute_boxes
def normalized_to_image_coordinates(normalized_boxes, image_shape, parallel_iterations=32): """Converts a batch of boxes from normal to image coordinates. Args: normalized_boxes: a float32 tensor of shape [None, num_boxes, 4] in normalized coordinates. image_shape: a float32 tensor of shape [4] containing the image shape. parallel_iterations: parallelism for the map_fn op. Returns: absolute_boxes: a float32 tensor of shape [None, num_boxes, 4] containing the boxes in image coordinates. """ x_scale = tf.cast(image_shape[2], tf.float32) y_scale = tf.cast(image_shape[1], tf.float32) def _to_absolute_coordinates(normalized_boxes): y_min, x_min, y_max, x_max = tf.split( value=normalized_boxes, num_or_size_splits=4, axis=1) y_min = y_scale * y_min y_max = y_scale * y_max x_min = x_scale * x_min x_max = x_scale * x_max scaled_boxes = tf.concat([y_min, x_min, y_max, x_max], 1) return scaled_boxes absolute_boxes = shape_utils.static_or_dynamic_map_fn( _to_absolute_coordinates, elems=(normalized_boxes), dtype=tf.float32, parallel_iterations=parallel_iterations, back_prop=True) return absolute_boxes
def _tf_example_input_placeholder(input_shape=None): """Returns input that accepts a batch of strings with tf examples. Args: input_shape: the shape to resize the output decoded images to (optional). Returns: a tuple of input placeholder and the output decoded images. """ batch_tf_example_placeholder = tf.placeholder(tf.string, shape=[None], name='tf_example') def decode(tf_example_string_tensor): tensor_dict = tf_example_decoder.TfExampleDecoder().decode( tf_example_string_tensor) image_tensor = tensor_dict[fields.InputDataFields.image] if input_shape is not None: image_tensor = tf.image.resize(image_tensor, input_shape[1:3]) return image_tensor return (batch_tf_example_placeholder, shape_utils.static_or_dynamic_map_fn( decode, elems=batch_tf_example_placeholder, dtype=tf.uint8, parallel_iterations=32, back_prop=False))
def test_with_static_shape(self): def fn(input_tensor): return tf.reduce_sum(input_tensor) input_tensor = tf.constant([[1, 2], [3, 1], [0, 4]], dtype=tf.float32) map_fn_output = shape_utils.static_or_dynamic_map_fn(fn, input_tensor) op_names = [op.name for op in tf.get_default_graph().get_operations()] self.assertTrue(all(['map' != op_name[:3] for op_name in op_names])) with self.test_session() as sess: result = sess.run(map_fn_output) self.assertAllEqual(result, [3, 4, 4])
def test_with_multiple_static_shapes(self): def fn(elems): input_tensor, scalar_index_tensor = elems return tf.reshape(tf.slice(input_tensor, scalar_index_tensor, [1]), []) input_tensor = tf.constant([[1, 2, 3], [4, 5, -1], [0, 6, 9]], dtype=tf.float32) scalar_index_tensor = tf.constant([[0], [2], [1]], dtype=tf.int32) map_fn_output = shape_utils.static_or_dynamic_map_fn( fn, [input_tensor, scalar_index_tensor], dtype=tf.float32) op_names = [op.name for op in tf.get_default_graph().get_operations()] self.assertTrue(all(['map' != op_name[:3] for op_name in op_names])) with self.test_session() as sess: result = sess.run(map_fn_output) self.assertAllEqual(result, [1, -1, 6])
def batch_position_sensitive_crop_regions(images, boxes, crop_size, num_spatial_bins, global_pool, parallel_iterations=64): """Position sensitive crop with batches of images and boxes. This op is exactly like `position_sensitive_crop_regions` below but operates on batches of images and boxes. See `position_sensitive_crop_regions` function below for the operation applied per batch element. Args: images: A `Tensor`. Must be one of the following types: `uint8`, `int8`, `int16`, `int32`, `int64`, `half`, `float32`, `float64`. A 4-D tensor of shape `[batch, image_height, image_width, depth]`. Both `image_height` and `image_width` need to be positive. boxes: A `Tensor` of type `float32`. A 3-D tensor of shape `[batch, num_boxes, 4]`. Each box is specified in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the `[0, 1]` interval of normalized image height is mapped to `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in which case the sampled crop is an up-down flipped version of the original image. The width dimension is treated similarly. crop_size: See `position_sensitive_crop_regions` below. num_spatial_bins: See `position_sensitive_crop_regions` below. global_pool: See `position_sensitive_crop_regions` below. parallel_iterations: Number of batch items to process in parallel. Returns: """ def _position_sensitive_crop_fn(inputs): images, boxes = inputs return position_sensitive_crop_regions( images, boxes, crop_size=crop_size, num_spatial_bins=num_spatial_bins, global_pool=global_pool) return shape_utils.static_or_dynamic_map_fn( _position_sensitive_crop_fn, elems=[images, boxes], dtype=tf.float32, parallel_iterations=parallel_iterations)
def batch_position_sensitive_crop_regions(images, boxes, crop_size, num_spatial_bins, global_pool, parallel_iterations=64): """Position sensitive crop with batches of images and boxes. This op is exactly like `position_sensitive_crop_regions` below but operates on batches of images and boxes. See `position_sensitive_crop_regions` function below for the operation applied per batch element. Args: images: A `Tensor`. Must be one of the following types: `uint8`, `int8`, `int16`, `int32`, `int64`, `half`, `float32`, `float64`. A 4-D tensor of shape `[batch, image_height, image_width, depth]`. Both `image_height` and `image_width` need to be positive. boxes: A `Tensor` of type `float32`. A 3-D tensor of shape `[batch, num_boxes, 4]`. Each box is specified in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the `[0, 1]` interval of normalized image height is mapped to `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in which case the sampled crop is an up-down flipped version of the original image. The width dimension is treated similarly. crop_size: See `position_sensitive_crop_regions` below. num_spatial_bins: See `position_sensitive_crop_regions` below. global_pool: See `position_sensitive_crop_regions` below. parallel_iterations: Number of batch items to process in parallel. Returns: """ def _position_sensitive_crop_fn(inputs): images, boxes = inputs return position_sensitive_crop_regions( images, boxes, crop_size=crop_size, num_spatial_bins=num_spatial_bins, global_pool=global_pool) return shape_utils.static_or_dynamic_map_fn( _position_sensitive_crop_fn, elems=[images, boxes], dtype=tf.float32, parallel_iterations=parallel_iterations)
def giou(boxes1, boxes2): """Computes generalized IOU between two tensors. Each box should be represented as [ymin, xmin, ymax, xmax]. Args: boxes1: a tensor with shape [num_boxes, 4] boxes2: a tensor with shape [num_boxes, 4] Returns: a tensor of shape [num_boxes] containing GIoUs """ def _two_boxes_giou(boxes): """Compute giou between two boxes.""" boxes1, boxes2 = boxes pred_ymin, pred_xmin, pred_ymax, pred_xmax = tf.unstack(boxes1) gt_ymin, gt_xmin, gt_ymax, gt_xmax = tf.unstack(boxes2) gt_area = (gt_ymax - gt_ymin) * (gt_xmax - gt_xmin) pred_area = (pred_ymax - pred_ymin) * (pred_xmax - pred_xmin) x1_i = tf.maximum(pred_xmin, gt_xmin) x2_i = tf.minimum(pred_xmax, gt_xmax) y1_i = tf.maximum(pred_ymin, gt_ymin) y2_i = tf.minimum(pred_ymax, gt_ymax) intersection_area = tf.maximum(0.0, y2_i - y1_i) * tf.maximum(0.0, x2_i - x1_i) x1_c = tf.minimum(pred_xmin, gt_xmin) x2_c = tf.maximum(pred_xmax, gt_xmax) y1_c = tf.minimum(pred_ymin, gt_ymin) y2_c = tf.maximum(pred_ymax, gt_ymax) hull_area = (y2_c - y1_c) * (x2_c - x1_c) union_area = gt_area + pred_area - intersection_area iou = tf.where( tf.equal(union_area, 0.0), 0.0, intersection_area / union_area) giou_ = iou - tf.where(hull_area > 0.0, (hull_area - union_area) / hull_area, iou) return giou_ return shape_utils.static_or_dynamic_map_fn(_two_boxes_giou, [boxes1, boxes2])
def test_with_dynamic_shape(self): def fn(input_tensor): return tf.reduce_sum(input_tensor) input_tensor = tf.placeholder(tf.float32, shape=(None, 2)) map_fn_output = shape_utils.static_or_dynamic_map_fn(fn, input_tensor) op_names = [op.name for op in tf.get_default_graph().get_operations()] self.assertTrue(any(['map' == op_name[:3] for op_name in op_names])) with self.test_session() as sess: result1 = sess.run( map_fn_output, feed_dict={ input_tensor: [[1, 2], [3, 1], [0, 4]]}) result2 = sess.run( map_fn_output, feed_dict={ input_tensor: [[-1, 1], [0, 9]]}) self.assertAllEqual(result1, [3, 4, 4]) self.assertAllEqual(result2, [0, 9])
def _tf_example_input_placeholder(): """Returns input that accepts a batch of strings with tf examples. Returns: a tuple of input placeholder and the output decoded images. """ batch_tf_example_placeholder = tf.placeholder( tf.string, shape=[None], name='tf_example') def decode(tf_example_string_tensor): tensor_dict = tf_example_decoder.TfExampleDecoder().decode( tf_example_string_tensor) image_tensor = tensor_dict[fields.InputDataFields.image] return image_tensor return (batch_tf_example_placeholder, shape_utils.static_or_dynamic_map_fn( decode, elems=batch_tf_example_placeholder, dtype=tf.uint8, parallel_iterations=32, back_prop=False))
def preprocess(self, inputs): """Feature-extractor specific preprocessing. SSD meta architecture uses a default clip_window of [0, 0, 1, 1] during post-processing. On calling `preprocess` method, clip_window gets updated based on `true_image_shapes` returned by `image_resizer_fn`. Args: inputs: a [batch, height_in, width_in, channels] float tensor representing a batch of images with values between 0 and 255.0. Returns: preprocessed_inputs: a [batch, height_out, width_out, channels] float tensor representing a batch of images. true_image_shapes: int32 tensor of shape [batch, 3] where each row is of the form [height, width, channels] indicating the shapes of true images in the resized images, as resized images can be padded with zeros. Raises: ValueError: if inputs tensor does not have type tf.float32 """ if inputs.dtype is not tf.float32: raise ValueError('`preprocess` expects a tf.float32 tensor') with tf.device('/cpu:0'): #Put Preprocessing on CPU #ADDED BY GUSTAVZ with tf.name_scope('Preprocessor'): # TODO(jonathanhuang): revisit whether to always use batch size as # the number of parallel iterations vs allow for dynamic batching. outputs = shape_utils.static_or_dynamic_map_fn( self._image_resizer_fn, elems=inputs, dtype=[tf.float32, tf.int32]) resized_inputs = outputs[0] true_image_shapes = outputs[1] return (self._feature_extractor.preprocess(resized_inputs), true_image_shapes)
def preprocess(self, inputs): """Feature-extractor specific preprocessing. SSD meta architecture uses a default clip_window of [0, 0, 1, 1] during post-processing. On calling `preprocess` method, clip_window gets updated based on `true_image_shapes` returned by `image_resizer_fn`. Args: inputs: a [batch, height_in, width_in, channels] float tensor representing a batch of images with values between 0 and 255.0. Returns: preprocessed_inputs: a [batch, height_out, width_out, channels] float tensor representing a batch of images. true_image_shapes: int32 tensor of shape [batch, 3] where each row is of the form [height, width, channels] indicating the shapes of true images in the resized images, as resized images can be padded with zeros. Raises: ValueError: if inputs tensor does not have type tf.float32 """ if inputs.dtype is not tf.float32: raise ValueError('`preprocess` expects a tf.float32 tensor') with tf.name_scope('Preprocessor'): # TODO: revisit whether to always use batch size as # the number of parallel iterations vs allow for dynamic batching. outputs = shape_utils.static_or_dynamic_map_fn( self._image_resizer_fn, elems=inputs, dtype=[tf.float32, tf.int32]) resized_inputs = outputs[0] true_image_shapes = outputs[1] return (self._feature_extractor.preprocess(resized_inputs), true_image_shapes)
def batch_multiclass_non_max_suppression(boxes, boxes_3d, scores, score_thresh, iou_thresh, max_size_per_class, max_total_size=0, clip_window=None, change_coordinate_frame=False, num_valid_boxes=None, additional_fields=None, scope=None, use_static_shapes=False, parallel_iterations=32, use_class_agnostic_nms=False, max_classes_per_detection=1): """Multi-class version of non maximum suppression that operates on a batch. This op is similar to `multiclass_non_max_suppression` but operates on a batch of boxes and scores. See documentation for `multiclass_non_max_suppression` for details. Args: boxes: A [batch_size, num_anchors, q, 4] float32 tensor containing detections. If `q` is 1 then same boxes are used for all classes otherwise, if `q` is equal to number of classes, class-specific boxes are used. scores: A [batch_size, num_anchors, num_classes] float32 tensor containing the scores for each of the `num_anchors` detections. The scores have to be non-negative when use_static_shapes is set True. score_thresh: scalar threshold for score (low scoring boxes are removed). iou_thresh: scalar threshold for IOU (new boxes that have high IOU overlap with previously selected boxes are removed). max_size_per_class: maximum number of retained boxes per class. max_total_size: maximum number of boxes retained over all classes. By default returns all boxes retained after capping boxes per class. clip_window: A float32 tensor of shape [batch_size, 4] where each entry is of the form [y_min, x_min, y_max, x_max] representing the window to clip boxes to before performing non-max suppression. This argument can also be a tensor of shape [4] in which case, the same clip window is applied to all images in the batch. If clip_widow is None, all boxes are used to perform non-max suppression. change_coordinate_frame: Whether to normalize coordinates after clipping relative to clip_window (this can only be set to True if a clip_window is provided) num_valid_boxes: (optional) a Tensor of type `int32`. A 1-D tensor of shape [batch_size] representing the number of valid boxes to be considered for each image in the batch. This parameter allows for ignoring zero paddings. masks: (optional) a [batch_size, num_anchors, q, mask_height, mask_width] float32 tensor containing box masks. `q` can be either number of classes or 1 depending on whether a separate mask is predicted per class. additional_fields: (optional) If not None, a dictionary that maps keys to tensors whose dimensions are [batch_size, num_anchors, ...]. scope: tf scope name. use_static_shapes: If true, the output nmsed boxes are padded to be of length `max_size_per_class` and it doesn't clip boxes to max_total_size. Defaults to false. parallel_iterations: (optional) number of batch items to process in parallel. use_class_agnostic_nms: If true, this uses class-agnostic non max suppression max_classes_per_detection: Maximum number of retained classes per detection box in class-agnostic NMS. Returns: 'nmsed_boxes': A [batch_size, max_detections, 4] float32 tensor containing the non-max suppressed boxes. 'nmsed_scores': A [batch_size, max_detections] float32 tensor containing the scores for the boxes. 'nmsed_classes': A [batch_size, max_detections] float32 tensor containing the class for boxes. 'nmsed_masks': (optional) a [batch_size, max_detections, mask_height, mask_width] float32 tensor containing masks for each selected box. This is set to None if input `masks` is None. 'nmsed_additional_fields': (optional) a dictionary of [batch_size, max_detections, ...] float32 tensors corresponding to the tensors specified in the input `additional_fields`. This is not returned if input `additional_fields` is None. 'num_detections': A [batch_size] int32 tensor indicating the number of valid detections per batch item. Only the top num_detections[i] entries in nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the entries are zero paddings. Raises: ValueError: if `q` in boxes.shape is not 1 or not equal to number of classes as inferred from scores.shape. """ q = shape_utils.get_dim_as_int(boxes.shape[2]) num_classes = shape_utils.get_dim_as_int(scores.shape[2]) if q != 1 and q != num_classes: raise ValueError('third dimension of boxes must be either 1 or equal ' 'to the third dimension of scores') if change_coordinate_frame and clip_window is None: raise ValueError( 'if change_coordinate_frame is True, then a clip_window' 'must be specified.') # Create ordered dictionary using the sorted keys from # additional fields to ensure getting the same key value assignment # in _single_image_nms_fn(). The dictionary is thus a sorted version of # additional_fields. if additional_fields is None: ordered_additional_fields = {} else: ordered_additional_fields = collections.OrderedDict( sorted(additional_fields.items(), key=lambda item: item[0])) del additional_fields with tf.name_scope(scope, 'BatchMultiClassNonMaxSuppression'): boxes_shape = boxes.shape batch_size = shape_utils.get_dim_as_int(boxes_shape[0]) num_anchors = shape_utils.get_dim_as_int(boxes_shape[1]) if batch_size is None: batch_size = tf.shape(boxes)[0] if num_anchors is None: num_anchors = tf.shape(boxes)[1] # If num valid boxes aren't provided, create one and mark all boxes as # valid. if num_valid_boxes is None: num_valid_boxes = tf.ones([batch_size], dtype=tf.int32) * num_anchors if clip_window is None: clip_window = tf.stack([ tf.reduce_min(boxes[:, :, :, 0]), tf.reduce_min(boxes[:, :, :, 1]), tf.reduce_max(boxes[:, :, :, 2]), tf.reduce_max(boxes[:, :, :, 3]) ]) if clip_window.shape.ndims == 1: clip_window = tf.tile(tf.expand_dims(clip_window, 0), [batch_size, 1]) def _single_image_nms_fn(args): """Runs NMS on a single image and returns padded output. Args: args: A list of tensors consisting of the following: per_image_boxes - A [num_anchors, q, 4] float32 tensor containing detections. If `q` is 1 then same boxes are used for all classes otherwise, if `q` is equal to number of classes, class-specific boxes are used. per_image_scores - A [num_anchors, num_classes] float32 tensor containing the scores for each of the `num_anchors` detections. per_image_masks - A [num_anchors, q, mask_height, mask_width] float32 tensor containing box masks. `q` can be either number of classes or 1 depending on whether a separate mask is predicted per class. per_image_clip_window - A 1D float32 tensor of the form [ymin, xmin, ymax, xmax] representing the window to clip the boxes to. per_image_additional_fields - (optional) A variable number of float32 tensors each with size [num_anchors, ...]. per_image_num_valid_boxes - A tensor of type `int32`. A 1-D tensor of shape [batch_size] representing the number of valid boxes to be considered for each image in the batch. This parameter allows for ignoring zero paddings. Returns: 'nmsed_boxes': A [max_detections, 4] float32 tensor containing the non-max suppressed boxes. 'nmsed_scores': A [max_detections] float32 tensor containing the scores for the boxes. 'nmsed_classes': A [max_detections] float32 tensor containing the class for boxes. 'nmsed_masks': (optional) a [max_detections, mask_height, mask_width] float32 tensor containing masks for each selected box. This is set to None if input `masks` is None. 'nmsed_additional_fields': (optional) A variable number of float32 tensors each with size [max_detections, ...] corresponding to the input `per_image_additional_fields`. 'num_detections': A [batch_size] int32 tensor indicating the number of valid detections per batch item. Only the top num_detections[i] entries in nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the entries are zero paddings. """ per_image_boxes = args[0] per_image_boxes_3d = args[1] per_image_scores = args[2] per_image_clip_window = args[3] # Make sure that the order of elements passed in args is aligned with # the iteration order of ordered_additional_fields per_image_additional_fields = { key: value for key, value in zip(ordered_additional_fields, args[4:-1]) } per_image_num_valid_boxes = args[-1] if use_static_shapes: total_proposals = tf.shape(per_image_scores) per_image_scores = tf.where( tf.less(tf.range(total_proposals[0]), per_image_num_valid_boxes), per_image_scores, tf.fill(total_proposals, np.finfo('float32').min)) else: per_image_boxes = tf.reshape( tf.slice(per_image_boxes, 3 * [0], tf.stack([per_image_num_valid_boxes, -1, -1])), [-1, q, 4]) per_image_boxes_3d = tf.reshape( tf.slice(per_image_boxes_3d, 3 * [0], tf.stack([per_image_num_valid_boxes, -1, -1])), [-1, q, 6]) per_image_scores = tf.reshape( tf.slice(per_image_scores, [0, 0], tf.stack([per_image_num_valid_boxes, -1])), [-1, num_classes]) if per_image_additional_fields is not None: for key, tensor in per_image_additional_fields.items(): additional_field_shape = tensor.get_shape() additional_field_dim = len(additional_field_shape) per_image_additional_fields[key] = tf.reshape( tf.slice( per_image_additional_fields[key], additional_field_dim * [0], tf.stack([per_image_num_valid_boxes] + (additional_field_dim - 1) * [-1])), [-1] + [ shape_utils.get_dim_as_int(dim) for dim in additional_field_shape[1:] ]) if use_class_agnostic_nms: nmsed_boxlist, num_valid_nms_boxes = class_agnostic_non_max_suppression( per_image_boxes, per_image_boxes_3d, per_image_scores, score_thresh, iou_thresh, max_classes_per_detection, max_total_size, clip_window=per_image_clip_window, change_coordinate_frame=change_coordinate_frame, pad_to_max_output_size=use_static_shapes, additional_fields=per_image_additional_fields) else: nmsed_boxlist, num_valid_nms_boxes = multiclass_non_max_suppression( per_image_boxes, per_image_boxes_3d, per_image_scores, score_thresh, iou_thresh, max_size_per_class, max_total_size, clip_window=per_image_clip_window, change_coordinate_frame=change_coordinate_frame, pad_to_max_output_size=use_static_shapes, additional_fields=per_image_additional_fields) if not use_static_shapes: nmsed_boxlist = box_list_ops.pad_or_clip_box_list( nmsed_boxlist, max_total_size) num_detections = num_valid_nms_boxes nmsed_boxes = nmsed_boxlist.get() nmsed_boxes_3d = nmsed_boxlist.get_field( fields.BoxListFields.boxes_3d) nmsed_scores = nmsed_boxlist.get_field(fields.BoxListFields.scores) nmsed_classes = nmsed_boxlist.get_field( fields.BoxListFields.classes) nmsed_additional_fields = [] # Sorting is needed here to ensure that the values stored in # nmsed_additional_fields are always kept in the same order # across different execution runs. for key in sorted(per_image_additional_fields.keys()): nmsed_additional_fields.append(nmsed_boxlist.get_field(key)) return ( [nmsed_boxes, nmsed_boxes_3d, nmsed_scores, nmsed_classes] + nmsed_additional_fields + [num_detections]) num_additional_fields = 0 if ordered_additional_fields: num_additional_fields = len(ordered_additional_fields) num_nmsed_outputs = 4 + num_additional_fields batch_outputs = shape_utils.static_or_dynamic_map_fn( _single_image_nms_fn, elems=([boxes, boxes_3d, scores, clip_window] + list(ordered_additional_fields.values()) + [num_valid_boxes]), dtype=(num_nmsed_outputs * [tf.float32] + [tf.int32]), parallel_iterations=parallel_iterations) batch_nmsed_boxes = batch_outputs[0] batch_nmsed_boxes_3d = batch_outputs[1] batch_nmsed_scores = batch_outputs[2] batch_nmsed_classes = batch_outputs[3] batch_nmsed_values = batch_outputs[4:-1] batch_nmsed_additional_fields = {} if num_additional_fields > 0: # Sort the keys to ensure arranging elements in same order as # in _single_image_nms_fn. batch_nmsed_keys = list(ordered_additional_fields.keys()) for i in range(len(batch_nmsed_keys)): batch_nmsed_additional_fields[ batch_nmsed_keys[i]] = batch_nmsed_values[i] batch_num_detections = batch_outputs[-1] if not ordered_additional_fields: batch_nmsed_additional_fields = None return (batch_nmsed_boxes, batch_nmsed_boxes_3d, batch_nmsed_scores, batch_nmsed_classes, batch_nmsed_additional_fields, batch_num_detections)
def graph_fn(): input_tensor = tf.constant([[1, 2], [3, 1], [0, 4]], dtype=tf.float32) return shape_utils.static_or_dynamic_map_fn(fn, input_tensor)
def loss(self, prediction_dict, true_image_shapes, scope=None): """Compute scalar loss tensors with respect to provided groundtruth. Calling this function requires that groundtruth tensors have been provided via the provide_groundtruth function. Args: prediction_dict: a dictionary holding prediction tensors with 1) box_encodings: 3-D float tensor of shape [batch_size, num_anchors, box_code_dimension] containing predicted boxes. 2) class_predictions_with_background: 3-D float tensor of shape [batch_size, num_anchors, num_classes+1] containing class predictions (logits) for each of the anchors. Note that this tensor *includes* background class predictions. true_image_shapes: int32 tensor of shape [batch, 3] where each row is of the form [height, width, channels] indicating the shapes of true images in the resized images, as resized images can be padded with zeros. scope: Optional scope name. Returns: a dictionary mapping loss keys (`localization_loss` and `classification_loss`) to scalar tensors representing corresponding loss values. """ with tf.name_scope(scope, 'Loss', prediction_dict.values()): keypoints = None if self.groundtruth_has_field(fields.BoxListFields.keypoints): keypoints = self.groundtruth_lists( fields.BoxListFields.keypoints) weights = None if self.groundtruth_has_field(fields.BoxListFields.weights): weights = self.groundtruth_lists(fields.BoxListFields.weights) (batch_cls_targets, batch_cls_weights, batch_reg_targets, batch_reg_weights, match_list) = self._assign_targets( self.groundtruth_lists(fields.BoxListFields.boxes), self.groundtruth_lists(fields.BoxListFields.classes), keypoints, weights) if self._add_summaries: self._summarize_target_assignment( self.groundtruth_lists(fields.BoxListFields.boxes), match_list) if self._random_example_sampler: batch_sampled_indicator = tf.to_float( shape_utils.static_or_dynamic_map_fn( self._minibatch_subsample_fn, [batch_cls_targets, batch_cls_weights], dtype=tf.bool, parallel_iterations=self._parallel_iterations, back_prop=True)) batch_reg_weights = tf.multiply(batch_sampled_indicator, batch_reg_weights) batch_cls_weights = tf.multiply(batch_sampled_indicator, batch_cls_weights) location_losses = self._localization_loss( prediction_dict['box_encodings'], batch_reg_targets, ignore_nan_targets=True, weights=batch_reg_weights) cls_losses = ops.reduce_sum_trailing_dimensions( self._classification_loss( prediction_dict['class_predictions_with_background'], batch_cls_targets, weights=batch_cls_weights), ndims=2) if self._hard_example_miner: (localization_loss, classification_loss) = self._apply_hard_mining( location_losses, cls_losses, prediction_dict, match_list) if self._add_summaries: self._hard_example_miner.summarize() else: if self._add_summaries: class_ids = tf.argmax(batch_cls_targets, axis=2) flattened_class_ids = tf.reshape(class_ids, [-1]) flattened_classification_losses = tf.reshape( cls_losses, [-1]) self._summarize_anchor_classification_loss( flattened_class_ids, flattened_classification_losses) localization_loss = tf.reduce_sum(location_losses) classification_loss = tf.reduce_sum(cls_losses) # Optionally normalize by number of positive matches normalizer = tf.constant(1.0, dtype=tf.float32) if self._normalize_loss_by_num_matches: normalizer = tf.maximum( tf.to_float(tf.reduce_sum(batch_reg_weights)), 1.0) localization_loss_normalizer = normalizer if self._normalize_loc_loss_by_codesize: localization_loss_normalizer *= self._box_coder.code_size localization_loss = tf.multiply((self._localization_loss_weight / localization_loss_normalizer), localization_loss, name='localization_loss') classification_loss = tf.multiply( (self._classification_loss_weight / normalizer), classification_loss, name='classification_loss') loss_dict = { str(localization_loss.op.name): localization_loss, str(classification_loss.op.name): classification_loss } return loss_dict
def result_dict_for_batched_example(images, keys, detections, groundtruth=None, class_agnostic=False, scale_to_absolute=False, original_image_spatial_shapes=None, true_image_shapes=None, max_gt_boxes=None): """Merges all detection and groundtruth information for a single example. Note that evaluation tools require classes that are 1-indexed, and so this function performs the offset. If `class_agnostic` is True, all output classes have label 1. Args: images: A single 4D uint8 image tensor of shape [batch_size, H, W, C]. keys: A [batch_size] string tensor with image identifier. detections: A dictionary of detections, returned from DetectionModel.postprocess(). groundtruth: (Optional) Dictionary of groundtruth items, with fields: 'groundtruth_boxes': [batch_size, max_number_of_boxes, 4] float32 tensor of boxes, in normalized coordinates. 'groundtruth_classes': [batch_size, max_number_of_boxes] int64 tensor of 1-indexed classes. 'groundtruth_area': [batch_size, max_number_of_boxes] float32 tensor of bbox area. (Optional) 'groundtruth_is_crowd':[batch_size, max_number_of_boxes] int64 tensor. (Optional) 'groundtruth_difficult': [batch_size, max_number_of_boxes] int64 tensor. (Optional) 'groundtruth_group_of': [batch_size, max_number_of_boxes] int64 tensor. (Optional) 'groundtruth_instance_masks': 4D int64 tensor of instance masks (Optional). class_agnostic: Boolean indicating whether the detections are class-agnostic (i.e. binary). Default False. scale_to_absolute: Boolean indicating whether boxes and keypoints should be scaled to absolute coordinates. Note that for IoU based evaluations, it does not matter whether boxes are expressed in absolute or relative coordinates. Default False. original_image_spatial_shapes: A 2D int32 tensor of shape [batch_size, 2] used to resize the image. When set to None, the image size is retained. true_image_shapes: A 2D int32 tensor of shape [batch_size, 3] containing the size of the unpadded original_image. max_gt_boxes: [batch_size] tensor representing the maximum number of groundtruth boxes to pad. Returns: A dictionary with: 'original_image': A [batch_size, H, W, C] uint8 image tensor. 'original_image_spatial_shape': A [batch_size, 2] tensor containing the original image sizes. 'true_image_shape': A [batch_size, 3] tensor containing the size of the unpadded original_image. 'key': A [batch_size] string tensor with image identifier. 'detection_boxes': [batch_size, max_detections, 4] float32 tensor of boxes, in normalized or absolute coordinates, depending on the value of `scale_to_absolute`. 'detection_scores': [batch_size, max_detections] float32 tensor of scores. 'detection_classes': [batch_size, max_detections] int64 tensor of 1-indexed classes. 'detection_masks': [batch_size, max_detections, H, W] float32 tensor of binarized masks, reframed to full image masks. 'num_detections': [batch_size] int64 tensor containing number of valid detections. 'groundtruth_boxes': [batch_size, num_boxes, 4] float32 tensor of boxes, in normalized or absolute coordinates, depending on the value of `scale_to_absolute`. (Optional) 'groundtruth_classes': [batch_size, num_boxes] int64 tensor of 1-indexed classes. (Optional) 'groundtruth_area': [batch_size, num_boxes] float32 tensor of bbox area. (Optional) 'groundtruth_is_crowd': [batch_size, num_boxes] int64 tensor. (Optional) 'groundtruth_difficult': [batch_size, num_boxes] int64 tensor. (Optional) 'groundtruth_group_of': [batch_size, num_boxes] int64 tensor. (Optional) 'groundtruth_instance_masks': 4D int64 tensor of instance masks (Optional). 'num_groundtruth_boxes': [batch_size] tensor containing the maximum number of groundtruth boxes per image. Raises: ValueError: if original_image_spatial_shape is not 2D int32 tensor of shape [2]. ValueError: if true_image_shapes is not 2D int32 tensor of shape [3]. """ label_id_offset = 1 # Applying label id offset (b/63711816) input_data_fields = fields.InputDataFields if original_image_spatial_shapes is None: original_image_spatial_shapes = tf.tile( tf.expand_dims(tf.shape(images)[1:3], axis=0), multiples=[tf.shape(images)[0], 1]) else: if (len(original_image_spatial_shapes.shape) != 2 and original_image_spatial_shapes.shape[1] != 2): raise ValueError( '`original_image_spatial_shape` should be a 2D tensor of shape ' '[batch_size, 2].') if true_image_shapes is None: true_image_shapes = tf.tile(tf.expand_dims(tf.shape(images)[1:4], axis=0), multiples=[tf.shape(images)[0], 1]) else: if (len(true_image_shapes.shape) != 2 and true_image_shapes.shape[1] != 3): raise ValueError('`true_image_shapes` should be a 2D tensor of ' 'shape [batch_size, 3].') output_dict = { input_data_fields.original_image: images, input_data_fields.key: keys, input_data_fields.original_image_spatial_shape: (original_image_spatial_shapes), input_data_fields.true_image_shape: true_image_shapes } detection_fields = fields.DetectionResultFields detection_boxes = detections[detection_fields.detection_boxes] detection_scores = detections[detection_fields.detection_scores] num_detections = tf.cast(detections[detection_fields.num_detections], dtype=tf.int32) if class_agnostic: detection_classes = tf.ones_like(detection_scores, dtype=tf.int64) else: detection_classes = ( tf.to_int64(detections[detection_fields.detection_classes]) + label_id_offset) if scale_to_absolute: output_dict[detection_fields.detection_boxes] = ( shape_utils.static_or_dynamic_map_fn( _scale_box_to_absolute, elems=[detection_boxes, original_image_spatial_shapes], dtype=tf.float32)) else: output_dict[detection_fields.detection_boxes] = detection_boxes output_dict[detection_fields.detection_classes] = detection_classes output_dict[detection_fields.detection_scores] = detection_scores output_dict[detection_fields.num_detections] = num_detections if detection_fields.detection_masks in detections: detection_masks = detections[detection_fields.detection_masks] # TODO(rathodv): This should be done in model's postprocess # function ideally. output_dict[detection_fields.detection_masks] = ( shape_utils.static_or_dynamic_map_fn( _resize_detection_masks, elems=[ detection_boxes, detection_masks, original_image_spatial_shapes ], dtype=tf.uint8)) if detection_fields.detection_keypoints in detections: detection_keypoints = detections[detection_fields.detection_keypoints] output_dict[detection_fields.detection_keypoints] = detection_keypoints if scale_to_absolute: output_dict[detection_fields.detection_keypoints] = ( shape_utils.static_or_dynamic_map_fn( _scale_keypoint_to_absolute, elems=[detection_keypoints, original_image_spatial_shapes], dtype=tf.float32)) if groundtruth: if max_gt_boxes is None: if input_data_fields.num_groundtruth_boxes in groundtruth: max_gt_boxes = groundtruth[ input_data_fields.num_groundtruth_boxes] else: raise ValueError( 'max_gt_boxes must be provided when processing batched examples.' ) if input_data_fields.groundtruth_instance_masks in groundtruth: masks = groundtruth[input_data_fields.groundtruth_instance_masks] groundtruth[input_data_fields.groundtruth_instance_masks] = ( shape_utils.static_or_dynamic_map_fn( _resize_groundtruth_masks, elems=[masks, original_image_spatial_shapes], dtype=tf.uint8)) output_dict.update(groundtruth) if scale_to_absolute: groundtruth_boxes = groundtruth[ input_data_fields.groundtruth_boxes] output_dict[input_data_fields.groundtruth_boxes] = ( shape_utils.static_or_dynamic_map_fn( _scale_box_to_absolute, elems=[groundtruth_boxes, original_image_spatial_shapes], dtype=tf.float32)) # For class-agnostic models, groundtruth classes all become 1. if class_agnostic: groundtruth_classes = groundtruth[ input_data_fields.groundtruth_classes] groundtruth_classes = tf.ones_like(groundtruth_classes, dtype=tf.int64) output_dict[ input_data_fields.groundtruth_classes] = groundtruth_classes output_dict[input_data_fields.num_groundtruth_boxes] = max_gt_boxes return output_dict
def batch_multiclass_non_max_suppression(boxes, scores, score_thresh, iou_thresh, max_size_per_class, max_total_size=0, clip_window=None, change_coordinate_frame=False, num_valid_boxes=None, masks=None, additional_fields=None, scope=None, parallel_iterations=32): """Multi-class version of non maximum suppression that operates on a batch. This op is similar to `multiclass_non_max_suppression` but operates on a batch of boxes and scores. See documentation for `multiclass_non_max_suppression` for details. Args: boxes: A [batch_size, num_anchors, q, 4] float32 tensor containing detections. If `q` is 1 then same boxes are used for all classes otherwise, if `q` is equal to number of classes, class-specific boxes are used. scores: A [batch_size, num_anchors, num_classes] float32 tensor containing the scores for each of the `num_anchors` detections. score_thresh: scalar threshold for score (low scoring boxes are removed). iou_thresh: scalar threshold for IOU (new boxes that have high IOU overlap with previously selected boxes are removed). max_size_per_class: maximum number of retained boxes per class. max_total_size: maximum number of boxes retained over all classes. By default returns all boxes retained after capping boxes per class. clip_window: A float32 tensor of shape [batch_size, 4] where each entry is of the form [y_min, x_min, y_max, x_max] representing the window to clip boxes to before performing non-max suppression. This argument can also be a tensor of shape [4] in which case, the same clip window is applied to all images in the batch. If clip_widow is None, all boxes are used to perform non-max suppression. change_coordinate_frame: Whether to normalize coordinates after clipping relative to clip_window (this can only be set to True if a clip_window is provided) num_valid_boxes: (optional) a Tensor of type `int32`. A 1-D tensor of shape [batch_size] representing the number of valid boxes to be considered for each image in the batch. This parameter allows for ignoring zero paddings. masks: (optional) a [batch_size, num_anchors, q, mask_height, mask_width] float32 tensor containing box masks. `q` can be either number of classes or 1 depending on whether a separate mask is predicted per class. additional_fields: (optional) If not None, a dictionary that maps keys to tensors whose dimensions are [batch_size, num_anchors, ...]. scope: tf scope name. parallel_iterations: (optional) number of batch items to process in parallel. Returns: 'nmsed_boxes': A [batch_size, max_detections, 4] float32 tensor containing the non-max suppressed boxes. 'nmsed_scores': A [batch_size, max_detections] float32 tensor containing the scores for the boxes. 'nmsed_classes': A [batch_size, max_detections] float32 tensor containing the class for boxes. 'nmsed_masks': (optional) a [batch_size, max_detections, mask_height, mask_width] float32 tensor containing masks for each selected box. This is set to None if input `masks` is None. 'nmsed_additional_fields': (optional) a dictionary of [batch_size, max_detections, ...] float32 tensors corresponding to the tensors specified in the input `additional_fields`. This is not returned if input `additional_fields` is None. 'num_detections': A [batch_size] int32 tensor indicating the number of valid detections per batch item. Only the top num_detections[i] entries in nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the entries are zero paddings. Raises: ValueError: if `q` in boxes.shape is not 1 or not equal to number of classes as inferred from scores.shape. """ q = boxes.shape[2].value num_classes = scores.shape[2].value if q != 1 and q != num_classes: raise ValueError('third dimension of boxes must be either 1 or equal ' 'to the third dimension of scores') if change_coordinate_frame and clip_window is None: raise ValueError('if change_coordinate_frame is True, then a clip_window' 'must be specified.') original_masks = masks original_additional_fields = additional_fields with tf.name_scope(scope, 'BatchMultiClassNonMaxSuppression'): boxes_shape = boxes.shape batch_size = boxes_shape[0].value num_anchors = boxes_shape[1].value if batch_size is None: batch_size = tf.shape(boxes)[0] if num_anchors is None: num_anchors = tf.shape(boxes)[1] # If num valid boxes aren't provided, create one and mark all boxes as # valid. if num_valid_boxes is None: num_valid_boxes = tf.ones([batch_size], dtype=tf.int32) * num_anchors # If masks aren't provided, create dummy masks so we can only have one copy # of _single_image_nms_fn and discard the dummy masks after map_fn. if masks is None: masks_shape = tf.stack([batch_size, num_anchors, 1, 0, 0]) masks = tf.zeros(masks_shape) if clip_window is None: clip_window = tf.stack([ tf.reduce_min(boxes[:, :, :, 0]), tf.reduce_min(boxes[:, :, :, 1]), tf.reduce_max(boxes[:, :, :, 2]), tf.reduce_max(boxes[:, :, :, 3]) ]) if clip_window.shape.ndims == 1: clip_window = tf.tile(tf.expand_dims(clip_window, 0), [batch_size, 1]) if additional_fields is None: additional_fields = {} def _single_image_nms_fn(args): """Runs NMS on a single image and returns padded output. Args: args: A list of tensors consisting of the following: per_image_boxes - A [num_anchors, q, 4] float32 tensor containing detections. If `q` is 1 then same boxes are used for all classes otherwise, if `q` is equal to number of classes, class-specific boxes are used. per_image_scores - A [num_anchors, num_classes] float32 tensor containing the scores for each of the `num_anchors` detections. per_image_masks - A [num_anchors, q, mask_height, mask_width] float32 tensor containing box masks. `q` can be either number of classes or 1 depending on whether a separate mask is predicted per class. per_image_clip_window - A 1D float32 tensor of the form [ymin, xmin, ymax, xmax] representing the window to clip the boxes to. per_image_additional_fields - (optional) A variable number of float32 tensors each with size [num_anchors, ...]. per_image_num_valid_boxes - A tensor of type `int32`. A 1-D tensor of shape [batch_size] representing the number of valid boxes to be considered for each image in the batch. This parameter allows for ignoring zero paddings. Returns: 'nmsed_boxes': A [max_detections, 4] float32 tensor containing the non-max suppressed boxes. 'nmsed_scores': A [max_detections] float32 tensor containing the scores for the boxes. 'nmsed_classes': A [max_detections] float32 tensor containing the class for boxes. 'nmsed_masks': (optional) a [max_detections, mask_height, mask_width] float32 tensor containing masks for each selected box. This is set to None if input `masks` is None. 'nmsed_additional_fields': (optional) A variable number of float32 tensors each with size [max_detections, ...] corresponding to the input `per_image_additional_fields`. 'num_detections': A [batch_size] int32 tensor indicating the number of valid detections per batch item. Only the top num_detections[i] entries in nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the entries are zero paddings. """ per_image_boxes = args[0] per_image_scores = args[1] per_image_masks = args[2] per_image_clip_window = args[3] per_image_additional_fields = { key: value for key, value in zip(additional_fields, args[4:-1]) } per_image_num_valid_boxes = args[-1] per_image_boxes = tf.reshape( tf.slice(per_image_boxes, 3 * [0], tf.stack([per_image_num_valid_boxes, -1, -1])), [-1, q, 4]) per_image_scores = tf.reshape( tf.slice(per_image_scores, [0, 0], tf.stack([per_image_num_valid_boxes, -1])), [-1, num_classes]) per_image_masks = tf.reshape( tf.slice(per_image_masks, 4 * [0], tf.stack([per_image_num_valid_boxes, -1, -1, -1])), [-1, q, per_image_masks.shape[2].value, per_image_masks.shape[3].value]) if per_image_additional_fields is not None: for key, tensor in per_image_additional_fields.items(): additional_field_shape = tensor.get_shape() additional_field_dim = len(additional_field_shape) per_image_additional_fields[key] = tf.reshape( tf.slice(per_image_additional_fields[key], additional_field_dim * [0], tf.stack([per_image_num_valid_boxes] + (additional_field_dim - 1) * [-1])), [-1] + [dim.value for dim in additional_field_shape[1:]]) nmsed_boxlist = multiclass_non_max_suppression( per_image_boxes, per_image_scores, score_thresh, iou_thresh, max_size_per_class, max_total_size, clip_window=per_image_clip_window, change_coordinate_frame=change_coordinate_frame, masks=per_image_masks, additional_fields=per_image_additional_fields) padded_boxlist = box_list_ops.pad_or_clip_box_list(nmsed_boxlist, max_total_size) num_detections = nmsed_boxlist.num_boxes() nmsed_boxes = padded_boxlist.get() nmsed_scores = padded_boxlist.get_field(fields.BoxListFields.scores) nmsed_classes = padded_boxlist.get_field(fields.BoxListFields.classes) nmsed_masks = padded_boxlist.get_field(fields.BoxListFields.masks) nmsed_additional_fields = [ padded_boxlist.get_field(key) for key in per_image_additional_fields ] return ([nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks] + nmsed_additional_fields + [num_detections]) num_additional_fields = 0 if additional_fields is not None: num_additional_fields = len(additional_fields) num_nmsed_outputs = 4 + num_additional_fields batch_outputs = shape_utils.static_or_dynamic_map_fn( _single_image_nms_fn, elems=([boxes, scores, masks, clip_window] + list(additional_fields.values()) + [num_valid_boxes]), dtype=(num_nmsed_outputs * [tf.float32] + [tf.int32]), parallel_iterations=parallel_iterations) batch_nmsed_boxes = batch_outputs[0] batch_nmsed_scores = batch_outputs[1] batch_nmsed_classes = batch_outputs[2] batch_nmsed_masks = batch_outputs[3] batch_nmsed_additional_fields = { key: value for key, value in zip(additional_fields, batch_outputs[4:-1]) } batch_num_detections = batch_outputs[-1] if original_masks is None: batch_nmsed_masks = None if original_additional_fields is None: batch_nmsed_additional_fields = None return (batch_nmsed_boxes, batch_nmsed_scores, batch_nmsed_classes, batch_nmsed_masks, batch_nmsed_additional_fields, batch_num_detections)
def batch_multiclass_non_max_suppression(boxes, scores, score_thresh, iou_thresh, max_size_per_class, max_total_size=0, clip_window=None, change_coordinate_frame=False, num_valid_boxes=None, masks=None, additional_fields=None, scope=None, parallel_iterations=32): """Multi-class version of non maximum suppression that operates on a batch. This op is similar to `multiclass_non_max_suppression` but operates on a batch of boxes and scores. See documentation for `multiclass_non_max_suppression` for details. Args: boxes: A [batch_size, num_anchors, q, 4] float32 tensor containing detections. If `q` is 1 then same boxes are used for all classes otherwise, if `q` is equal to number of classes, class-specific boxes are used. scores: A [batch_size, num_anchors, num_classes] float32 tensor containing the scores for each of the `num_anchors` detections. score_thresh: scalar threshold for score (low scoring boxes are removed). iou_thresh: scalar threshold for IOU (new boxes that have high IOU overlap with previously selected boxes are removed). max_size_per_class: maximum number of retained boxes per class. max_total_size: maximum number of boxes retained over all classes. By default returns all boxes retained after capping boxes per class. clip_window: A float32 tensor of shape [batch_size, 4] where each entry is of the form [y_min, x_min, y_max, x_max] representing the window to clip boxes to before performing non-max suppression. This argument can also be a tensor of shape [4] in which case, the same clip window is applied to all images in the batch. If clip_widow is None, all boxes are used to perform non-max suppression. change_coordinate_frame: Whether to normalize coordinates after clipping relative to clip_window (this can only be set to True if a clip_window is provided) num_valid_boxes: (optional) a Tensor of type `int32`. A 1-D tensor of shape [batch_size] representing the number of valid boxes to be considered for each image in the batch. This parameter allows for ignoring zero paddings. masks: (optional) a [batch_size, num_anchors, q, mask_height, mask_width] float32 tensor containing box masks. `q` can be either number of classes or 1 depending on whether a separate mask is predicted per class. additional_fields: (optional) If not None, a dictionary that maps keys to tensors whose dimensions are [batch_size, num_anchors, ...]. scope: tf scope name. parallel_iterations: (optional) number of batch items to process in parallel. Returns: 'nmsed_boxes': A [batch_size, max_detections, 4] float32 tensor containing the non-max suppressed boxes. 'nmsed_scores': A [batch_size, max_detections] float32 tensor containing the scores for the boxes. 'nmsed_classes': A [batch_size, max_detections] float32 tensor containing the class for boxes. 'nmsed_masks': (optional) a [batch_size, max_detections, mask_height, mask_width] float32 tensor containing masks for each selected box. This is set to None if input `masks` is None. 'nmsed_additional_fields': (optional) a dictionary of [batch_size, max_detections, ...] float32 tensors corresponding to the tensors specified in the input `additional_fields`. This is not returned if input `additional_fields` is None. 'num_detections': A [batch_size] int32 tensor indicating the number of valid detections per batch item. Only the top num_detections[i] entries in nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the entries are zero paddings. Raises: ValueError: if `q` in boxes.shape is not 1 or not equal to number of classes as inferred from scores.shape. """ q = boxes.shape[2].value num_classes = scores.shape[2].value if q != 1 and q != num_classes: raise ValueError('third dimension of boxes must be either 1 or equal ' 'to the third dimension of scores') if change_coordinate_frame and clip_window is None: raise ValueError('if change_coordinate_frame is True, then a clip_window' 'must be specified.') original_masks = masks original_additional_fields = additional_fields with tf.name_scope(scope, 'BatchMultiClassNonMaxSuppression'): boxes_shape = boxes.shape batch_size = boxes_shape[0].value num_anchors = boxes_shape[1].value if batch_size is None: batch_size = tf.shape(boxes)[0] if num_anchors is None: num_anchors = tf.shape(boxes)[1] # If num valid boxes aren't provided, create one and mark all boxes as # valid. if num_valid_boxes is None: num_valid_boxes = tf.ones([batch_size], dtype=tf.int32) * num_anchors # If masks aren't provided, create dummy masks so we can only have one copy # of _single_image_nms_fn and discard the dummy masks after map_fn. if masks is None: masks_shape = tf.stack([batch_size, num_anchors, 1, 0, 0]) masks = tf.zeros(masks_shape) if clip_window is None: clip_window = tf.stack([ tf.reduce_min(boxes[:, :, :, 0]), tf.reduce_min(boxes[:, :, :, 1]), tf.reduce_max(boxes[:, :, :, 2]), tf.reduce_max(boxes[:, :, :, 3]) ]) if clip_window.shape.ndims == 1: clip_window = tf.tile(tf.expand_dims(clip_window, 0), [batch_size, 1]) if additional_fields is None: additional_fields = {} def _single_image_nms_fn(args): """Runs NMS on a single image and returns padded output. Args: args: A list of tensors consisting of the following: per_image_boxes - A [num_anchors, q, 4] float32 tensor containing detections. If `q` is 1 then same boxes are used for all classes otherwise, if `q` is equal to number of classes, class-specific boxes are used. per_image_scores - A [num_anchors, num_classes] float32 tensor containing the scores for each of the `num_anchors` detections. per_image_masks - A [num_anchors, q, mask_height, mask_width] float32 tensor containing box masks. `q` can be either number of classes or 1 depending on whether a separate mask is predicted per class. per_image_clip_window - A 1D float32 tensor of the form [ymin, xmin, ymax, xmax] representing the window to clip the boxes to. per_image_additional_fields - (optional) A variable number of float32 tensors each with size [num_anchors, ...]. per_image_num_valid_boxes - A tensor of type `int32`. A 1-D tensor of shape [batch_size] representing the number of valid boxes to be considered for each image in the batch. This parameter allows for ignoring zero paddings. Returns: 'nmsed_boxes': A [max_detections, 4] float32 tensor containing the non-max suppressed boxes. 'nmsed_scores': A [max_detections] float32 tensor containing the scores for the boxes. 'nmsed_classes': A [max_detections] float32 tensor containing the class for boxes. 'nmsed_masks': (optional) a [max_detections, mask_height, mask_width] float32 tensor containing masks for each selected box. This is set to None if input `masks` is None. 'nmsed_additional_fields': (optional) A variable number of float32 tensors each with size [max_detections, ...] corresponding to the input `per_image_additional_fields`. 'num_detections': A [batch_size] int32 tensor indicating the number of valid detections per batch item. Only the top num_detections[i] entries in nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the entries are zero paddings. """ per_image_boxes = args[0] per_image_scores = args[1] per_image_masks = args[2] per_image_clip_window = args[3] per_image_additional_fields = { key: value for key, value in zip(additional_fields, args[4:-1]) } per_image_num_valid_boxes = args[-1] per_image_boxes = tf.reshape( tf.slice(per_image_boxes, 3 * [0], tf.stack([per_image_num_valid_boxes, -1, -1])), [-1, q, 4]) per_image_scores = tf.reshape( tf.slice(per_image_scores, [0, 0], tf.stack([per_image_num_valid_boxes, -1])), [-1, num_classes]) per_image_masks = tf.reshape( tf.slice(per_image_masks, 4 * [0], tf.stack([per_image_num_valid_boxes, -1, -1, -1])), [-1, q, per_image_masks.shape[2].value, per_image_masks.shape[3].value]) if per_image_additional_fields is not None: for key, tensor in per_image_additional_fields.items(): additional_field_shape = tensor.get_shape() additional_field_dim = len(additional_field_shape) per_image_additional_fields[key] = tf.reshape( tf.slice(per_image_additional_fields[key], additional_field_dim * [0], tf.stack([per_image_num_valid_boxes] + (additional_field_dim - 1) * [-1])), [-1] + [dim.value for dim in additional_field_shape[1:]]) nmsed_boxlist = multiclass_non_max_suppression( per_image_boxes, per_image_scores, #score_thresh, iou_thresh, max_size_per_class, max_total_size, clip_window=per_image_clip_window, change_coordinate_frame=change_coordinate_frame, masks=per_image_masks, additional_fields=per_image_additional_fields) padded_boxlist = box_list_ops.pad_or_clip_box_list(nmsed_boxlist, max_total_size) num_detections = nmsed_boxlist.num_boxes() nmsed_boxes = padded_boxlist.get() nmsed_scores = padded_boxlist.get_field(fields.BoxListFields.scores) nmsed_classes = padded_boxlist.get_field(fields.BoxListFields.classes) nmsed_masks = padded_boxlist.get_field(fields.BoxListFields.masks) nmsed_additional_fields = [ padded_boxlist.get_field(key) for key in per_image_additional_fields ] return ([nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks] + nmsed_additional_fields + [num_detections]) num_additional_fields = 0 if additional_fields is not None: num_additional_fields = len(additional_fields) num_nmsed_outputs = 4 + num_additional_fields batch_outputs = shape_utils.static_or_dynamic_map_fn( _single_image_nms_fn, elems=([boxes, scores, masks, clip_window] + list(additional_fields.values()) + [num_valid_boxes]), dtype=(num_nmsed_outputs * [tf.float32] + [tf.int32]), parallel_iterations=parallel_iterations) batch_nmsed_boxes = batch_outputs[0] batch_nmsed_scores = batch_outputs[1] batch_nmsed_classes = batch_outputs[2] batch_nmsed_masks = batch_outputs[3] batch_nmsed_additional_fields = { key: value for key, value in zip(additional_fields, batch_outputs[4:-1]) } batch_num_detections = batch_outputs[-1] if original_masks is None: batch_nmsed_masks = None if original_additional_fields is None: batch_nmsed_additional_fields = None return (batch_nmsed_boxes, batch_nmsed_scores, batch_nmsed_classes, batch_nmsed_masks, batch_nmsed_additional_fields, batch_num_detections)
def filter_bbox(proposal_boxes, filter_boxes, proposal_scores, proposal_classes=None, filter_threshold=0.5, min_number=1, max_number=None): """Returns proposal_boxes_result and validation. Args: proposal_boxes: A float tensor with shape [batch_size, num_proposals, 4] representing the (potentially zero padded) proposal boxes for all images in the batch. the format is such as [x1,y2,x2,y2] proposal_scores: A float tensor with shape [batch_size, num_proposals, num_class] representing the (potentially zero added) proposal boxes for all images in the batch. filter_boxes: A float tensor with shape [batch_size, num_filter, 4] representing the filter proposal boxes (potentially zero padded) proposal boxes in the batch. filter_threshold: A float number for threshold in iou' Returns: proposal_boxes_result: A float tensor with shape [batch_size, num_proposals, 4] representing the (potentially zero padded) proposal boxes for all images in the batch. validation: A float tensor with shape [batch_size,] representing the vaildative proposal boxes for the batch. """ proposal_boxes_shape = shape_utils.combined_static_and_dynamic_shape(proposal_boxes) proposal_areas = tf.multiply(tf.maximum(0.0, proposal_boxes[..., 2] - proposal_boxes[..., 0]), tf.maximum(0.0, proposal_boxes[..., 3] - proposal_boxes[..., 1])) proposal_boxes = tf.expand_dims(proposal_boxes, axis=2) filter_boxes = tf.expand_dims(filter_boxes, axis=1) x1 = tf.maximum(proposal_boxes[..., 0], filter_boxes[..., 0]) y1 = tf.maximum(proposal_boxes[..., 1], filter_boxes[..., 1]) x2 = tf.minimum(proposal_boxes[..., 2], filter_boxes[..., 2]) y2 = tf.minimum(proposal_boxes[..., 3], filter_boxes[..., 3]) proposal_boxes = tf.squeeze(proposal_boxes, axis=2) filter_boxes = tf.squeeze(filter_boxes, axis=1) proposal_bg_areas = tf.multiply(tf.maximum(0.0, x2 - x1), tf.maximum(0.0, y2 - y1)) proposal_bg_ious = tf.reduce_sum(proposal_bg_areas, axis=2) proposal_bg_areas_iou = tf.div(proposal_bg_ious, tf.where(tf.equal(proposal_areas, 0), tf.ones_like(proposal_areas), proposal_areas)[ 0]) keeps = tf.greater_equal(proposal_bg_areas_iou, filter_threshold) validation = tf.reduce_sum(tf.to_int32(keeps), axis=1) max_numbers = tf.reduce_max(validation) max_numbers = tf.maximum(max_numbers, min_number) def _per_batch_gather_padding(args): """Returns proposal_boxes_result, proposal_score_result and validation. Args: args[0]: boxes. A float tensor with shape [num_proposals, 4] representing the (potentially zero padded) proposal boxes for all images in the batch. the format is such as [x1,y2,x2,y2] args[1]: scores. A float tensor with shape [num_proposals, num_class] representing the (potentially zero added) proposal boxes for all images in the batch. args[2]: keep. A bool tensor with shape [num_proposals, 1] representing the coordination needing to keep with true. args[3]: max_number. A int tensor with shape [,] representing the kept proposal's number with padding zeros Returns: proposal_boxes_result: A float tensor with shape [max_number, 4] representing the (potentially zero padded) proposal boxes for all images in the batch. proposal_score_result: A float tensor with shape [max_number, num_class] representing the (potentially zero padded) proposal boxes for all images in the batch. """ boxes = args[0] scores = args[1] keep = args[2] if proposal_classes is not None: classes = args[3] result_indice = tf.where(keep) result_indice = tf.squeeze(result_indice, axis=-1) boxes_result = tf.gather(boxes, result_indice) boxes_result = shape_utils.pad_or_clip_tensor( boxes_result, max_numbers) score_result = tf.gather(scores, result_indice) score_result = shape_utils.pad_or_clip_tensor( score_result, max_numbers) result = [boxes_result, score_result] if proposal_classes is not None: class_result = tf.gather(classes, result_indice) class_result = shape_utils.pad_or_clip_tensor( class_result, max_numbers) result.append(class_result) return result if proposal_classes is not None: proposal_boxes_result, proposal_score_result, proposal_class_result = shape_utils.static_or_dynamic_map_fn( _per_batch_gather_padding, [proposal_boxes, proposal_scores, keeps, proposal_classes], dtype=[tf.float32, tf.float32, tf.float32]) # test using # return proposal_boxes_result, vaildation, proposal_bg_areas,proposal_bg_ious,proposal_bg_areas_iou return proposal_boxes_result, proposal_score_result, proposal_class_result, validation, max_numbers # change ??? else: proposal_boxes_result, proposal_score_result = shape_utils.static_or_dynamic_map_fn(_per_batch_gather_padding, [proposal_boxes, proposal_scores, keeps]) # test using # return proposal_boxes_result, vaildation, proposal_bg_areas,proposal_bg_ious,proposal_bg_areas_iou return proposal_boxes_result, proposal_score_result, validation, max_numbers # change ???
def graph_fn(input_tensor): return shape_utils.static_or_dynamic_map_fn(fn, input_tensor)
def prune_outside_window(dp_num_points, dp_part_ids, dp_surface_coords, window, scope=None): """Prunes DensePose points that fall outside a given window. This function replaces points that fall outside the given window with zeros. See also clip_to_window which clips any DensePose points that fall outside the given window. Note that this operation uses dynamic shapes, and therefore is not currently suitable for TPU. Args: dp_num_points: a tensor of shape [num_instances] that indicates how many (non-padded) DensePose points there are per instance. dp_part_ids: a tensor of shape [num_instances, num_points] with DensePose part ids. These part_ids are 0-indexed, where the first non-background part has index 0. dp_surface_coords: a tensor of shape [num_instances, num_points, 4] with DensePose surface coordinates in (y, x, v, u) format. window: a tensor of shape [4] representing the [y_min, x_min, y_max, x_max] window outside of which the op should prune the points. scope: name scope. Returns: new_dp_num_points: a tensor of shape [num_instances] that indicates how many (non-padded) DensePose points there are per instance after pruning. new_dp_part_ids: a tensor of shape [num_instances, num_points] with DensePose part ids. These part_ids are 0-indexed, where the first non-background part has index 0. new_dp_surface_coords: a tensor of shape [num_instances, num_points, 4] with DensePose surface coordinates after pruning. """ with tf.name_scope(scope, 'DensePosePruneOutsideWindow'): y, x, _, _ = tf.unstack(dp_surface_coords, axis=-1) win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window) num_instances, num_points = shape_utils.combined_static_and_dynamic_shape( dp_part_ids) dp_num_points_tiled = tf.tile(dp_num_points[:, tf.newaxis], multiples=[1, num_points]) range_tiled = tf.tile(tf.range(num_points)[tf.newaxis, :], multiples=[num_instances, 1]) valid_initial = range_tiled < dp_num_points_tiled valid_in_window = tf.logical_and( tf.logical_and(y >= win_y_min, y <= win_y_max), tf.logical_and(x >= win_x_min, x <= win_x_max)) valid_indices = tf.logical_and(valid_initial, valid_in_window) new_dp_num_points = tf.math.reduce_sum(tf.cast(valid_indices, tf.int32), axis=1) max_num_points = tf.math.reduce_max(new_dp_num_points) def gather_and_reshuffle(elems): dp_part_ids, dp_surface_coords, valid_indices = elems locs = tf.where(valid_indices)[:, 0] valid_part_ids = tf.gather(dp_part_ids, locs, axis=0) valid_part_ids_padded = shape_utils.pad_or_clip_nd( valid_part_ids, output_shape=[max_num_points]) valid_surface_coords = tf.gather(dp_surface_coords, locs, axis=0) valid_surface_coords_padded = shape_utils.pad_or_clip_nd( valid_surface_coords, output_shape=[max_num_points, 4]) return [valid_part_ids_padded, valid_surface_coords_padded] new_dp_part_ids, new_dp_surface_coords = ( shape_utils.static_or_dynamic_map_fn( gather_and_reshuffle, elems=[dp_part_ids, dp_surface_coords, valid_indices], dtype=[tf.int32, tf.float32], back_prop=False)) return new_dp_num_points, new_dp_part_ids, new_dp_surface_coords
def result_dict_for_batched_example(images, keys, detections, groundtruth=None, class_agnostic=False, scale_to_absolute=False, original_image_spatial_shapes=None, true_image_shapes=None, max_gt_boxes=None): """Merges all detection and groundtruth information for a single example. Note that evaluation tools require classes that are 1-indexed, and so this function performs the offset. If `class_agnostic` is True, all output classes have label 1. Args: images: A single 4D uint8 image tensor of shape [batch_size, H, W, C]. keys: A [batch_size] string tensor with image identifier. detections: A dictionary of detections, returned from DetectionModel.postprocess(). groundtruth: (Optional) Dictionary of groundtruth items, with fields: 'groundtruth_boxes': [batch_size, max_number_of_boxes, 4] float32 tensor of boxes, in normalized coordinates. 'groundtruth_classes': [batch_size, max_number_of_boxes] int64 tensor of 1-indexed classes. 'groundtruth_area': [batch_size, max_number_of_boxes] float32 tensor of bbox area. (Optional) 'groundtruth_is_crowd':[batch_size, max_number_of_boxes] int64 tensor. (Optional) 'groundtruth_difficult': [batch_size, max_number_of_boxes] int64 tensor. (Optional) 'groundtruth_group_of': [batch_size, max_number_of_boxes] int64 tensor. (Optional) 'groundtruth_instance_masks': 4D int64 tensor of instance masks (Optional). class_agnostic: Boolean indicating whether the detections are class-agnostic (i.e. binary). Default False. scale_to_absolute: Boolean indicating whether boxes and keypoints should be scaled to absolute coordinates. Note that for IoU based evaluations, it does not matter whether boxes are expressed in absolute or relative coordinates. Default False. original_image_spatial_shapes: A 2D int32 tensor of shape [batch_size, 2] used to resize the image. When set to None, the image size is retained. true_image_shapes: A 2D int32 tensor of shape [batch_size, 3] containing the size of the unpadded original_image. max_gt_boxes: [batch_size] tensor representing the maximum number of groundtruth boxes to pad. Returns: A dictionary with: 'original_image': A [batch_size, H, W, C] uint8 image tensor. 'original_image_spatial_shape': A [batch_size, 2] tensor containing the original image sizes. 'true_image_shape': A [batch_size, 3] tensor containing the size of the unpadded original_image. 'key': A [batch_size] string tensor with image identifier. 'detection_boxes': [batch_size, max_detections, 4] float32 tensor of boxes, in normalized or absolute coordinates, depending on the value of `scale_to_absolute`. 'detection_scores': [batch_size, max_detections] float32 tensor of scores. 'detection_classes': [batch_size, max_detections] int64 tensor of 1-indexed classes. 'detection_masks': [batch_size, max_detections, H, W] float32 tensor of binarized masks, reframed to full image masks. 'num_detections': [batch_size] int64 tensor containing number of valid detections. 'groundtruth_boxes': [batch_size, num_boxes, 4] float32 tensor of boxes, in normalized or absolute coordinates, depending on the value of `scale_to_absolute`. (Optional) 'groundtruth_classes': [batch_size, num_boxes] int64 tensor of 1-indexed classes. (Optional) 'groundtruth_area': [batch_size, num_boxes] float32 tensor of bbox area. (Optional) 'groundtruth_is_crowd': [batch_size, num_boxes] int64 tensor. (Optional) 'groundtruth_difficult': [batch_size, num_boxes] int64 tensor. (Optional) 'groundtruth_group_of': [batch_size, num_boxes] int64 tensor. (Optional) 'groundtruth_instance_masks': 4D int64 tensor of instance masks (Optional). 'num_groundtruth_boxes': [batch_size] tensor containing the maximum number of groundtruth boxes per image. Raises: ValueError: if original_image_spatial_shape is not 2D int32 tensor of shape [2]. ValueError: if true_image_shapes is not 2D int32 tensor of shape [3]. """ label_id_offset = 1 # Applying label id offset (b/63711816) input_data_fields = fields.InputDataFields if original_image_spatial_shapes is None: original_image_spatial_shapes = tf.tile( tf.expand_dims(tf.shape(images)[1:3], axis=0), multiples=[tf.shape(images)[0], 1]) else: if (len(original_image_spatial_shapes.shape) != 2 and original_image_spatial_shapes.shape[1] != 2): raise ValueError( '`original_image_spatial_shape` should be a 2D tensor of shape ' '[batch_size, 2].') if true_image_shapes is None: true_image_shapes = tf.tile( tf.expand_dims(tf.shape(images)[1:4], axis=0), multiples=[tf.shape(images)[0], 1]) else: if (len(true_image_shapes.shape) != 2 and true_image_shapes.shape[1] != 3): raise ValueError('`true_image_shapes` should be a 2D tensor of ' 'shape [batch_size, 3].') output_dict = { input_data_fields.original_image: images, input_data_fields.key: keys, input_data_fields.original_image_spatial_shape: ( original_image_spatial_shapes), input_data_fields.true_image_shape: true_image_shapes } detection_fields = fields.DetectionResultFields detection_boxes = detections[detection_fields.detection_boxes] detection_scores = detections[detection_fields.detection_scores] num_detections = tf.to_int32(detections[detection_fields.num_detections]) if class_agnostic: detection_classes = tf.ones_like(detection_scores, dtype=tf.int64) else: detection_classes = ( tf.to_int64(detections[detection_fields.detection_classes]) + label_id_offset) if scale_to_absolute: output_dict[detection_fields.detection_boxes] = ( shape_utils.static_or_dynamic_map_fn( _scale_box_to_absolute, elems=[detection_boxes, original_image_spatial_shapes], dtype=tf.float32)) else: output_dict[detection_fields.detection_boxes] = detection_boxes output_dict[detection_fields.detection_classes] = detection_classes output_dict[detection_fields.detection_scores] = detection_scores output_dict[detection_fields.num_detections] = num_detections if detection_fields.detection_masks in detections: detection_masks = detections[detection_fields.detection_masks] # TODO(rathodv): This should be done in model's postprocess # function ideally. output_dict[detection_fields.detection_masks] = ( shape_utils.static_or_dynamic_map_fn( _resize_detection_masks, elems=[detection_boxes, detection_masks, original_image_spatial_shapes], dtype=tf.uint8)) if detection_fields.detection_keypoints in detections: detection_keypoints = detections[detection_fields.detection_keypoints] output_dict[detection_fields.detection_keypoints] = detection_keypoints if scale_to_absolute: output_dict[detection_fields.detection_keypoints] = ( shape_utils.static_or_dynamic_map_fn( _scale_keypoint_to_absolute, elems=[detection_keypoints, original_image_spatial_shapes], dtype=tf.float32)) if groundtruth: if max_gt_boxes is None: if input_data_fields.num_groundtruth_boxes in groundtruth: max_gt_boxes = groundtruth[input_data_fields.num_groundtruth_boxes] else: raise ValueError( 'max_gt_boxes must be provided when processing batched examples.') if input_data_fields.groundtruth_instance_masks in groundtruth: masks = groundtruth[input_data_fields.groundtruth_instance_masks] groundtruth[input_data_fields.groundtruth_instance_masks] = ( shape_utils.static_or_dynamic_map_fn( _resize_groundtruth_masks, elems=[masks, original_image_spatial_shapes], dtype=tf.uint8)) output_dict.update(groundtruth) if scale_to_absolute: groundtruth_boxes = groundtruth[input_data_fields.groundtruth_boxes] output_dict[input_data_fields.groundtruth_boxes] = ( shape_utils.static_or_dynamic_map_fn( _scale_box_to_absolute, elems=[groundtruth_boxes, original_image_spatial_shapes], dtype=tf.float32)) # For class-agnostic models, groundtruth classes all become 1. if class_agnostic: groundtruth_classes = groundtruth[input_data_fields.groundtruth_classes] groundtruth_classes = tf.ones_like(groundtruth_classes, dtype=tf.int64) output_dict[input_data_fields.groundtruth_classes] = groundtruth_classes output_dict[input_data_fields.num_groundtruth_boxes] = max_gt_boxes return output_dict
def loss(self, prediction_dict, true_image_shapes, scope=None): """Compute scalar loss tensors with respect to provided groundtruth. Calling this function requires that groundtruth tensors have been provided via the provide_groundtruth function. Args: prediction_dict: a dictionary holding prediction tensors with 1) box_encodings: 3-D float tensor of shape [batch_size, num_anchors, box_code_dimension] containing predicted boxes. 2) class_predictions_with_background: 3-D float tensor of shape [batch_size, num_anchors, num_classes+1] containing class predictions (logits) for each of the anchors. Note that this tensor *includes* background class predictions. true_image_shapes: int32 tensor of shape [batch, 3] where each row is of the form [height, width, channels] indicating the shapes of true images in the resized images, as resized images can be padded with zeros. scope: Optional scope name. Returns: a dictionary mapping loss keys (`localization_loss` and `classification_loss`) to scalar tensors representing corresponding loss values. """ with tf.name_scope(scope, 'Loss', prediction_dict.values()): keypoints = None if self.groundtruth_has_field(fields.BoxListFields.keypoints): keypoints = self.groundtruth_lists(fields.BoxListFields.keypoints) weights = None if self.groundtruth_has_field(fields.BoxListFields.weights): weights = self.groundtruth_lists(fields.BoxListFields.weights) (batch_cls_targets, batch_cls_weights, batch_reg_targets, batch_reg_weights, match_list) = self._assign_targets( self.groundtruth_lists(fields.BoxListFields.boxes), self.groundtruth_lists(fields.BoxListFields.classes), keypoints, weights) if self._add_summaries: self._summarize_target_assignment( self.groundtruth_lists(fields.BoxListFields.boxes), match_list) if self._random_example_sampler: batch_sampled_indicator = tf.to_float( shape_utils.static_or_dynamic_map_fn( self._minibatch_subsample_fn, [batch_cls_targets, batch_cls_weights], dtype=tf.bool, parallel_iterations=self._parallel_iterations, back_prop=True)) batch_reg_weights = tf.multiply(batch_sampled_indicator, batch_reg_weights) batch_cls_weights = tf.multiply(batch_sampled_indicator, batch_cls_weights) location_losses = self._localization_loss( prediction_dict['box_encodings'], batch_reg_targets, ignore_nan_targets=True, weights=batch_reg_weights) cls_losses = ops.reduce_sum_trailing_dimensions( self._classification_loss( prediction_dict['class_predictions_with_background'], batch_cls_targets, weights=batch_cls_weights), ndims=2) if self._hard_example_miner: (localization_loss, classification_loss) = self._apply_hard_mining( location_losses, cls_losses, prediction_dict, match_list) if self._add_summaries: self._hard_example_miner.summarize() else: if self._add_summaries: class_ids = tf.argmax(batch_cls_targets, axis=2) flattened_class_ids = tf.reshape(class_ids, [-1]) flattened_classification_losses = tf.reshape(cls_losses, [-1]) self._summarize_anchor_classification_loss( flattened_class_ids, flattened_classification_losses) localization_loss = tf.reduce_sum(location_losses) classification_loss = tf.reduce_sum(cls_losses) # Optionally normalize by number of positive matches normalizer = tf.constant(1.0, dtype=tf.float32) if self._normalize_loss_by_num_matches: normalizer = tf.maximum(tf.to_float(tf.reduce_sum(batch_reg_weights)), 1.0) localization_loss_normalizer = normalizer if self._normalize_loc_loss_by_codesize: localization_loss_normalizer *= self._box_coder.code_size localization_loss = tf.multiply((self._localization_loss_weight / localization_loss_normalizer), localization_loss, name='localization_loss') classification_loss = tf.multiply((self._classification_loss_weight / normalizer), classification_loss, name='classification_loss') loss_dict = { str(localization_loss.op.name): localization_loss, str(classification_loss.op.name): classification_loss } return loss_dict
def graph_fn(input_tensor, scalar_index_tensor): map_fn_output = shape_utils.static_or_dynamic_map_fn( fn, [input_tensor, scalar_index_tensor], dtype=tf.float32) return map_fn_output
def loss(self, prediction_dict, true_image_shapes, scope=None): """Compute scalar loss tensors with respect to provided groundtruth. Calling this function requires that groundtruth tensors have been provided via the provide_groundtruth function. Args: prediction_dict: a dictionary holding prediction tensors with 1) box_encodings: 3-D float tensor of shape [batch_size, num_anchors, box_code_dimension] containing predicted boxes. 2) class_predictions_with_background: 3-D float tensor of shape [batch_size, num_anchors, num_classes+1] containing class predictions (logits) for each of the anchors. Note that this tensor *includes* background class predictions. true_image_shapes: int32 tensor of shape [batch, 3] where each row is of the form [height, width, channels] indicating the shapes of true images in the resized images, as resized images can be padded with zeros. scope: Optional scope name. Returns: a dictionary mapping loss keys (`localization_loss` and `classification_loss`) to scalar tensors representing corresponding loss values. """ with tf.name_scope(scope, 'Loss', prediction_dict.values()): keypoints = None if self.groundtruth_has_field(fields.BoxListFields.keypoints): keypoints = self.groundtruth_lists(fields.BoxListFields.keypoints) weights = None if self.groundtruth_has_field(fields.BoxListFields.weights): weights = self.groundtruth_lists(fields.BoxListFields.weights) (batch_cls_targets, batch_cls_weights, batch_reg_targets, batch_reg_weights, match_list) = self._assign_targets( self.groundtruth_lists(fields.BoxListFields.boxes), self.groundtruth_lists(fields.BoxListFields.classes), keypoints, weights) if self._add_summaries: self._summarize_target_assignment( self.groundtruth_lists(fields.BoxListFields.boxes), match_list) if self._bbox_ignore_background_mask: # Filter using a single mask provided in ground truth. batch_size, num_anchors, box_code_dimension = \ prediction_dict['box_encodings'].get_shape() batch_ignore_masks = self.groundtruth_lists(fields.BoxListFields.masks) batch_sampled_indicator = [] print(batch_ignore_masks) for i in range(batch_size): # The single provided groundtruth mask is the ignore mask. ignore_mask = tf.cast(batch_ignore_masks[i][0], tf.float32) height, width = ignore_mask.get_shape().as_list() ignore_mask = tf.reshape(ignore_mask, [1, height, width, 1]) bboxes = prediction_dict['box_encodings'][i] bboxes = tf.minimum(tf.maximum(bboxes, 0.), 1.) # mask = tf.tile([[height, width, height, width]], [num_anchors, 1]) bboxes = tf.cast(bboxes * tf.cast([height, width, height, width], tf.float32), tf.int32) bboxes = tf.map_fn(lambda x: tf.reduce_mean(ignore_mask[x[0]:x[2], x[1]:x[3]]), bboxes, dtype=tf.float32) print(bboxes) sampled_indicator = bboxes < self._bbox_ignore_background_mask.overlap_threshold # bbox_idx = [0] * num_anchors # ignore_intersections = tf.image.crop_and_resize( # ignore_mask, bboxes, bbox_idx, (100, 100), method='nearest') # sampled_indicator = \ # tf.reduce_mean(ignore_intersections, axis=[1, 2, 3]) < \ # self._bbox_ignore_background_mask.overlap_threshold sampled_indicator = tf.cast(sampled_indicator, tf.float32) batch_sampled_indicator.append(sampled_indicator) batch_sampled_indicator = tf.stack(batch_sampled_indicator, axis=0) batch_reg_weights = tf.multiply(batch_sampled_indicator, batch_reg_weights) batch_cls_weights = tf.multiply(batch_sampled_indicator, batch_cls_weights) if self._random_example_sampler: batch_sampled_indicator = tf.to_float( shape_utils.static_or_dynamic_map_fn( self._minibatch_subsample_fn, [batch_cls_targets, batch_cls_weights], dtype=tf.bool, parallel_iterations=self._parallel_iterations, back_prop=True)) batch_reg_weights = tf.multiply(batch_sampled_indicator, batch_reg_weights) batch_cls_weights = tf.multiply(batch_sampled_indicator, batch_cls_weights) losses_mask = None if self.groundtruth_has_field(fields.InputDataFields.is_annotated): losses_mask = tf.stack(self.groundtruth_lists( fields.InputDataFields.is_annotated)) location_losses = self._localization_loss( prediction_dict['box_encodings'], batch_reg_targets, ignore_nan_targets=True, weights=batch_reg_weights, losses_mask=losses_mask) cls_losses = self._classification_loss( prediction_dict['class_predictions_with_background'], batch_cls_targets, weights=batch_cls_weights, losses_mask=losses_mask) if self._expected_classification_loss_under_sampling: if cls_losses.get_shape().ndims == 3: batch_size, num_anchors, num_classes = cls_losses.get_shape() cls_losses = tf.reshape(cls_losses, [batch_size, -1]) batch_cls_targets = tf.reshape( batch_cls_targets, [batch_size, num_anchors * num_classes, -1]) batch_cls_targets = tf.concat( [1 - batch_cls_targets, batch_cls_targets], axis=-1) cls_losses = self._expected_classification_loss_under_sampling( batch_cls_targets, cls_losses) classification_loss = tf.reduce_sum(cls_losses) localization_loss = tf.reduce_sum(location_losses) elif self._hard_example_miner: cls_losses = ops.reduce_sum_trailing_dimensions(cls_losses, ndims=2) (localization_loss, classification_loss) = self._apply_hard_mining( location_losses, cls_losses, prediction_dict, match_list) if self._add_summaries: self._hard_example_miner.summarize() else: cls_losses = ops.reduce_sum_trailing_dimensions(cls_losses, ndims=2) localization_loss = tf.reduce_sum(location_losses) classification_loss = tf.reduce_sum(cls_losses) # Optionally normalize by number of positive matches normalizer = tf.constant(1.0, dtype=tf.float32) if self._normalize_loss_by_num_matches: normalizer = tf.maximum(tf.to_float(tf.reduce_sum(batch_reg_weights)), 1.0) localization_loss_normalizer = normalizer if self._normalize_loc_loss_by_codesize: localization_loss_normalizer *= self._box_coder.code_size localization_loss = tf.multiply((self._localization_loss_weight / localization_loss_normalizer), localization_loss, name='localization_loss') classification_loss = tf.multiply((self._classification_loss_weight / normalizer), classification_loss, name='classification_loss') loss_dict = { str(localization_loss.op.name): localization_loss, str(classification_loss.op.name): classification_loss } return loss_dict