def _clip_anchors_and_predictions_to_window(
    self, anchors_boxlist, rpn_box_encodings,
    rpn_objectness_predictions_with_background, clip_window):
  """Clips anchors to clip_window while keeping RPN predictions aligned.

  The per-anchor prediction tensors are attached to the BoxList as extra
  fields, transposed to put the anchor dimension first (a BoxList field must
  share its first dimension with the boxes), so that clip_to_window clips
  and, when static shapes are not required, filters anchors and predictions
  in lockstep.
  """
  anchors_and_predictions_boxlist = box_list.BoxList(anchors_boxlist.get())
  anchors_and_predictions_boxlist.add_field(
      "rpn_box_encodings", tf.transpose(rpn_box_encodings, [1, 0, 2]))
  anchors_and_predictions_boxlist.add_field(
      "rpn_objectness_predictions_with_background",
      tf.transpose(rpn_objectness_predictions_with_background, [1, 0, 2]))
  anchors_and_predictions_boxlist_clipped = box_list_ops.clip_to_window(
      anchors_and_predictions_boxlist,
      clip_window,
      filter_nonoverlapping=not self._use_static_shapes)
  anchors_boxlist_clipped = box_list.BoxList(
      anchors_and_predictions_boxlist_clipped.get())
  rpn_box_encodings_clipped = tf.transpose(
      anchors_and_predictions_boxlist_clipped.get_field("rpn_box_encodings"),
      [1, 0, 2])
  rpn_objectness_predictions_with_background_clipped = tf.transpose(
      anchors_and_predictions_boxlist_clipped.get_field(
          "rpn_objectness_predictions_with_background"), [1, 0, 2])
  return (anchors_boxlist_clipped, rpn_box_encodings_clipped,
          rpn_objectness_predictions_with_background_clipped)

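# A minimal standalone sketch of the field-packing trick above (not from the
# original source), assuming a TF1.x runtime and that the Object Detection API
# is importable as `object_detection`: tensors attached as BoxList fields are
# clipped/filtered in lockstep with the boxes they belong to.
import tensorflow as tf
from object_detection.core import box_list
from object_detection.core import box_list_ops

anchors = box_list.BoxList(
    tf.constant([[0., 0., 10., 10.],      # inside the window
                 [-5., -5., -1., -1.]]))  # entirely outside
# Hypothetical per-anchor payload, e.g. box encodings of shape [num_anchors, 4].
anchors.add_field('payload', tf.constant([[1., 2., 3., 4.], [5., 6., 7., 8.]]))
clipped = box_list_ops.clip_to_window(
    anchors, tf.constant([0., 0., 10., 10.]), filter_nonoverlapping=True)
with tf.Session() as sess:
  boxes_out, payload_out = sess.run(
      [clipped.get(), clipped.get_field('payload')])
  print(boxes_out)    # only the first anchor survives
  print(payload_out)  # its payload stays aligned: [[1. 2. 3. 4.]]
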
def _normalize_boxlist(args):
  """Scales boxes from feature-map units to pixels, normalizes, then clips.

  Note: `stride` is captured from the enclosing scope (this is a nested
  helper), which is why it does not appear in the argument list.
  """
  boxes, height, width = args
  boxes = box_list_ops.scale(boxes, stride, stride)
  boxes = box_list_ops.to_normalized_coordinates(boxes, height, width)
  boxes = box_list_ops.clip_to_window(boxes, [0., 0., 1., 1.],
                                      filter_nonoverlapping=False)
  return boxes

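# The same pipeline with concrete numbers (a sketch, not from the original
# source), assuming the object_detection box_list modules and a TF1.x runtime;
# `stride` plays the role of the captured closure variable.
import tensorflow as tf
from object_detection.core import box_list
from object_detection.core import box_list_ops

stride = 4
boxes = box_list.BoxList(tf.constant([[5., 10., 20., 60.]]))  # feature-map units
boxes = box_list_ops.scale(boxes, stride, stride)             # pixels: [20, 40, 80, 240]
boxes = box_list_ops.to_normalized_coordinates(boxes, 100, 200)  # [.2, .2, .8, 1.2]
boxes = box_list_ops.clip_to_window(
    boxes, [0., 0., 1., 1.], filter_nonoverlapping=False)     # [.2, .2, .8, 1.]
with tf.Session() as sess:
  print(sess.run(boxes.get()))
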
def graph_fn():
  window = tf.constant([0, 0, 9, 14], tf.float32)
  corners = tf.constant([[5.0, 5.0, 6.0, 6.0], [-1.0, -2.0, 4.0, 5.0],
                         [2.0, 3.0, 5.0, 9.0], [0.0, 0.0, 9.0, 14.0],
                         [-100.0, -100.0, 300.0, 600.0],
                         [-10.0, -10.0, -9.0, -9.0]])
  boxes = box_list.BoxList(corners)
  boxes.add_field('extra_data', tf.constant([[1], [2], [3], [4], [5], [6]]))
  pruned = box_list_ops.clip_to_window(
      boxes, window, filter_nonoverlapping=True)
  return pruned.get(), pruned.get_field('extra_data')

def test_clip_to_window_filter_boxes_which_fall_outside_the_window(self):
  window = tf.constant([0, 0, 9, 14], tf.float32)
  corners = tf.constant([[5.0, 5.0, 6.0, 6.0], [-1.0, -2.0, 4.0, 5.0],
                         [2.0, 3.0, 5.0, 9.0], [0.0, 0.0, 9.0, 14.0],
                         [-100.0, -100.0, 300.0, 600.0],
                         [-10.0, -10.0, -9.0, -9.0]])
  boxes = box_list.BoxList(corners)
  boxes.add_field('extra_data', tf.constant([[1], [2], [3], [4], [5], [6]]))
  exp_output = [[5.0, 5.0, 6.0, 6.0], [0.0, 0.0, 4.0, 5.0],
                [2.0, 3.0, 5.0, 9.0], [0.0, 0.0, 9.0, 14.0],
                [0.0, 0.0, 9.0, 14.0]]
  pruned = box_list_ops.clip_to_window(
      boxes, window, filter_nonoverlapping=True)
  with self.test_session() as sess:
    pruned_output = sess.run(pruned.get())
    self.assertAllClose(pruned_output, exp_output)
    extra_data_out = sess.run(pruned.get_field('extra_data'))
    self.assertAllEqual(extra_data_out, [[1], [2], [3], [4], [5]])

def test_clip_to_window_without_filtering_boxes_which_fall_outside_the_window(
    self):
  window = tf.constant([0, 0, 9, 14], tf.float32)
  corners = tf.constant([[5.0, 5.0, 6.0, 6.0], [-1.0, -2.0, 4.0, 5.0],
                         [2.0, 3.0, 5.0, 9.0], [0.0, 0.0, 9.0, 14.0],
                         [-100.0, -100.0, 300.0, 600.0],
                         [-10.0, -10.0, -9.0, -9.0]])
  boxes = box_list.BoxList(corners)
  boxes.add_field('extra_data', tf.constant([[1], [2], [3], [4], [5], [6]]))
  exp_output = [[5.0, 5.0, 6.0, 6.0], [0.0, 0.0, 4.0, 5.0],
                [2.0, 3.0, 5.0, 9.0], [0.0, 0.0, 9.0, 14.0],
                [0.0, 0.0, 9.0, 14.0], [0.0, 0.0, 0.0, 0.0]]
  pruned = box_list_ops.clip_to_window(
      boxes, window, filter_nonoverlapping=False)
  with self.test_session() as sess:
    pruned_output = sess.run(pruned.get())
    self.assertAllClose(pruned_output, exp_output)
    extra_data_out = sess.run(pruned.get_field('extra_data'))
    self.assertAllEqual(extra_data_out, [[1], [2], [3], [4], [5], [6]])

def _clip_window_prune_boxes(sorted_boxes, clip_window, pad_to_max_output_size,
                             change_coordinate_frame):
  """Clips boxes to clip_window and prunes those left with zero area.

  Args:
    sorted_boxes: A BoxList containing k detections.
    clip_window: A float32 tensor of the form [y_min, x_min, y_max, x_max]
      representing the window to clip and normalize boxes to before performing
      non-max suppression.
    pad_to_max_output_size: flag indicating whether to pad to max output size
      or not.
    change_coordinate_frame: Whether to normalize coordinates after clipping
      relative to clip_window (this can only be set to True if a clip_window
      is provided).

  Returns:
    sorted_boxes: A BoxList containing k detections after pruning.
    num_valid_nms_boxes_cumulative: Number of valid NMS boxes.
  """
  sorted_boxes = box_list_ops.clip_to_window(
      sorted_boxes,
      clip_window,
      filter_nonoverlapping=not pad_to_max_output_size)
  # Set the scores of boxes with zero area to -1 to keep the default
  # behaviour of pruning out zero area boxes.
  sorted_boxes_size = tf.shape(sorted_boxes.get())[0]
  non_zero_box_area = tf.cast(box_list_ops.area(sorted_boxes), tf.bool)
  sorted_boxes_scores = tf.where(
      non_zero_box_area,
      sorted_boxes.get_field(fields.BoxListFields.scores),
      -1 * tf.ones(sorted_boxes_size))
  sorted_boxes.add_field(fields.BoxListFields.scores, sorted_boxes_scores)
  num_valid_nms_boxes_cumulative = tf.reduce_sum(
      tf.cast(tf.greater_equal(sorted_boxes_scores, 0), tf.int32))
  sorted_boxes = box_list_ops.sort_by_field(sorted_boxes,
                                            fields.BoxListFields.scores)
  if change_coordinate_frame:
    sorted_boxes = box_list_ops.change_coordinate_frame(sorted_boxes,
                                                        clip_window)
  return sorted_boxes, num_valid_nms_boxes_cumulative

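# A toy illustration of the zero-area masking step above (a sketch, plain
# TF1.x, no Object Detection API needed): boxes clipped down to nothing have
# zero area, and their scores are forced to -1 so a later `score >= 0` count
# treats them as invalid.
import tensorflow as tf

areas = tf.constant([4.0, 0.0, 2.5])   # per-box areas after clipping
scores = tf.constant([0.9, 0.8, 0.7])
non_zero = tf.cast(areas, tf.bool)      # zero area -> False
masked = tf.where(non_zero, scores, -1 * tf.ones_like(scores))
num_valid = tf.reduce_sum(tf.cast(tf.greater_equal(masked, 0), tf.int32))
with tf.Session() as sess:
  print(sess.run([masked, num_valid]))  # [0.9, -1.0, 0.7], 2
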
def multiclass_non_max_suppression(boxes,
                                   scores,
                                   score_thresh,
                                   iou_thresh,
                                   max_size_per_class,
                                   max_total_size=0,
                                   clip_window=None,
                                   change_coordinate_frame=False,
                                   masks=None,
                                   additional_fields=None,
                                   scope=None):
  """Multi-class version of non maximum suppression.

  This op greedily selects a subset of detection bounding boxes, pruning
  away boxes that have high IOU (intersection over union) overlap (> thresh)
  with already selected boxes.  It operates independently for each class for
  which scores are provided (via the scores field of the input box_list),
  pruning boxes with score less than a provided threshold prior to
  applying NMS.

  Please note that this operation is performed on *all* classes, therefore any
  background classes should be removed prior to calling this function.

  Args:
    boxes: A [k, q, 4] float32 tensor containing k detections. `q` can be
      either number of classes or 1 depending on whether a separate box is
      predicted per class.
    scores: A [k, num_classes] float32 tensor containing the scores for each
      of the k detections.
    score_thresh: scalar threshold for score (low scoring boxes are removed).
    iou_thresh: scalar threshold for IOU (new boxes that have high IOU overlap
      with previously selected boxes are removed).
    max_size_per_class: maximum number of retained boxes per class.
    max_total_size: maximum number of boxes retained over all classes. By
      default returns all boxes retained after capping boxes per class.
    clip_window: A float32 tensor of the form [y_min, x_min, y_max, x_max]
      representing the window to clip and normalize boxes to before performing
      non-max suppression.
    change_coordinate_frame: Whether to normalize coordinates after clipping
      relative to clip_window (this can only be set to True if a clip_window
      is provided).
    masks: (optional) a [k, q, mask_height, mask_width] float32 tensor
      containing box masks. `q` can be either number of classes or 1 depending
      on whether a separate mask is predicted per class.
    additional_fields: (optional) If not None, a dictionary that maps keys to
      tensors whose first dimensions are all of size `k`. After non-maximum
      suppression, all tensors corresponding to the selected boxes will be
      added to the resulting BoxList.
    scope: name scope.

  Returns:
    a BoxList holding M boxes with a rank-1 scores field representing
      corresponding scores for each box with scores sorted in decreasing order
      and a rank-1 classes field representing a class label for each box. If
      masks, keypoints, keypoint_heatmaps is not None, the boxlist will
      contain masks, keypoints, keypoint_heatmaps corresponding to boxes.

  Raises:
    ValueError: if iou_thresh is not in [0, 1] or if input boxlist does not
      have a valid scores field.
  """
  if not 0 <= iou_thresh <= 1.0:
    raise ValueError('iou_thresh must be between 0 and 1')
  if scores.shape.ndims != 2:
    raise ValueError('scores field must be of rank 2')
  if scores.shape[1].value is None:
    raise ValueError('scores must have statically defined second dimension')
  if boxes.shape.ndims != 3:
    raise ValueError('boxes must be of rank 3.')
  if not (boxes.shape[1].value == scores.shape[1].value or
          boxes.shape[1].value == 1):
    raise ValueError('second dimension of boxes must be either 1 or equal '
                     'to the second dimension of scores')
  if boxes.shape[2].value != 4:
    raise ValueError('last dimension of boxes must be of size 4.')
  if change_coordinate_frame and clip_window is None:
    raise ValueError('if change_coordinate_frame is True, then a clip_window '
                     'must be specified.')

  with tf.name_scope(scope, 'MultiClassNonMaxSuppression'):
    num_boxes = tf.shape(boxes)[0]
    num_scores = tf.shape(scores)[0]
    num_classes = scores.get_shape()[1]

    length_assert = tf.Assert(
        tf.equal(num_boxes, num_scores),
        ['Incorrect scores field length: actual vs expected.',
         num_scores, num_boxes])

    selected_boxes_list = []
    per_class_boxes_list = tf.unstack(boxes, axis=1)
    if masks is not None:
      per_class_masks_list = tf.unstack(masks, axis=1)
    boxes_ids = (range(num_classes) if len(per_class_boxes_list) > 1
                 else [0] * num_classes)
    for class_idx, boxes_idx in zip(range(num_classes), boxes_ids):
      per_class_boxes = per_class_boxes_list[boxes_idx]
      boxlist_and_class_scores = box_list.BoxList(per_class_boxes)
      with tf.control_dependencies([length_assert]):
        class_scores = tf.reshape(
            tf.slice(scores, [0, class_idx], tf.stack([num_scores, 1])), [-1])
      boxlist_and_class_scores.add_field(fields.BoxListFields.scores,
                                         class_scores)
      if masks is not None:
        per_class_masks = per_class_masks_list[boxes_idx]
        boxlist_and_class_scores.add_field(fields.BoxListFields.masks,
                                           per_class_masks)
      if additional_fields is not None:
        for key, tensor in additional_fields.items():
          boxlist_and_class_scores.add_field(key, tensor)
      boxlist_filtered = box_list_ops.filter_greater_than(
          boxlist_and_class_scores, score_thresh)
      if clip_window is not None:
        boxlist_filtered = box_list_ops.clip_to_window(
            boxlist_filtered, clip_window)
        if change_coordinate_frame:
          boxlist_filtered = box_list_ops.change_coordinate_frame(
              boxlist_filtered, clip_window)
      max_selection_size = tf.minimum(max_size_per_class,
                                      boxlist_filtered.num_boxes())
      selected_indices = tf.image.non_max_suppression(
          boxlist_filtered.get(),
          boxlist_filtered.get_field(fields.BoxListFields.scores),
          max_selection_size,
          iou_threshold=iou_thresh)
      nms_result = box_list_ops.gather(boxlist_filtered, selected_indices)
      nms_result.add_field(
          fields.BoxListFields.classes,
          (tf.zeros_like(nms_result.get_field(fields.BoxListFields.scores)) +
           class_idx))
      selected_boxes_list.append(nms_result)
    selected_boxes = box_list_ops.concatenate(selected_boxes_list)
    sorted_boxes = box_list_ops.sort_by_field(selected_boxes,
                                              fields.BoxListFields.scores)
    if max_total_size:
      max_total_size = tf.minimum(max_total_size, sorted_boxes.num_boxes())
      sorted_boxes = box_list_ops.gather(sorted_boxes,
                                         tf.range(max_total_size))
    return sorted_boxes

def predict(self, preprocessed_inputs):
  """Predicts unpostprocessed tensors from input tensor.

  This function takes an input batch of images and runs it through the
  forward pass of the network to yield "raw" un-postprocessed predictions.
  If `first_stage_only` is True, this function only returns first stage
  RPN predictions (un-postprocessed).  Otherwise it returns both first stage
  RPN predictions as well as second stage box classifier predictions.

  Other remarks:
  + Anchor pruning vs. clipping: following the recommendation of the Faster
    R-CNN paper, we prune anchors that venture outside the image window at
    training time and clip anchors to the image window at inference time.
  + Proposal padding: as described at the top of the file, proposals are
    padded to self._max_num_proposals and flattened so that proposals from
    all images within the input batch are arranged along the same batch
    dimension.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    prediction_dict: a dictionary holding "raw" prediction tensors:
      1) rpn_box_predictor_features: A 4-D float32 tensor with shape
        [batch_size, height, width, depth] to be used for predicting proposal
        boxes and corresponding objectness scores.
      2) rpn_features_to_crop: A 4-D float32 tensor with shape
        [batch_size, height, width, depth] representing image features to
        crop using the proposal boxes predicted by the RPN.
      3) image_shape: a 1-D tensor of shape [4] representing the input
        image shape.
      4) rpn_box_encodings: 3-D float tensor of shape
        [batch_size, num_anchors, self._box_coder.code_size] containing
        predicted boxes.
      5) rpn_objectness_predictions_with_background: 3-D float tensor of
        shape [batch_size, num_anchors, 2] containing class predictions
        (logits) for each of the anchors.  Note that this tensor *includes*
        background class predictions (at class index 0).
      6) anchors: A 2-D tensor of shape [num_anchors, 4] representing anchors
        for the first stage RPN (in absolute coordinates).  Note that
        `num_anchors` can differ depending on whether the model is created in
        training or inference mode.

      (and if first_stage_only=False):
      7) refined_box_encodings: a 3-D tensor with shape
        [total_num_proposals, num_classes, 4] representing predicted (final)
        refined box encodings, where
        total_num_proposals=batch_size*self._max_num_proposals
      8) class_predictions_with_background: a 3-D tensor with shape
        [total_num_proposals, num_classes + 1] containing class predictions
        (logits) for each of the anchors, where
        total_num_proposals=batch_size*self._max_num_proposals.  Note that
        this tensor *includes* background class predictions (at class
        index 0).
      9) num_proposals: An int32 tensor of shape [batch_size] representing
        the number of proposals generated by the RPN.  `num_proposals` allows
        us to keep track of which entries are to be treated as zero paddings
        and which are not since we always pad the number of proposals to be
        `self.max_num_proposals` for each image.
      10) proposal_boxes: A float32 tensor of shape
        [batch_size, self.max_num_proposals, 4] representing decoded proposal
        bounding boxes (in absolute coordinates).
      11) mask_predictions: (optional) a 4-D tensor with shape
        [total_num_padded_proposals, num_classes, mask_height, mask_width]
        containing instance mask predictions.
  """
  (rpn_box_predictor_features, rpn_features_to_crop, anchors_boxlist,
   image_shape) = self._extract_rpn_feature_maps(preprocessed_inputs)
  (rpn_box_encodings, rpn_objectness_predictions_with_background
  ) = self._predict_rpn_proposals(rpn_box_predictor_features)

  # The Faster R-CNN paper recommends pruning anchors that venture outside
  # the image window at training time and clipping at inference time.
  clip_window = tf.to_float(tf.stack([0, 0, image_shape[1], image_shape[2]]))
  if self._is_training:
    (rpn_box_encodings, rpn_objectness_predictions_with_background,
     anchors_boxlist) = self._remove_invalid_anchors_and_predictions(
         rpn_box_encodings, rpn_objectness_predictions_with_background,
         anchors_boxlist, clip_window)
  else:
    anchors_boxlist = box_list_ops.clip_to_window(anchors_boxlist,
                                                  clip_window)

  anchors = anchors_boxlist.get()
  prediction_dict = {
      'rpn_box_predictor_features': rpn_box_predictor_features,
      'rpn_features_to_crop': rpn_features_to_crop,
      'image_shape': image_shape,
      'rpn_box_encodings': rpn_box_encodings,
      'rpn_objectness_predictions_with_background':
          rpn_objectness_predictions_with_background,
      'anchors': anchors
  }
  if not self._first_stage_only:
    prediction_dict.update(self._predict_second_stage(
        rpn_box_encodings,
        rpn_objectness_predictions_with_background,
        rpn_features_to_crop,
        anchors, image_shape))
  return prediction_dict

def _generate(self,
              feature_map_shape_list,
              im_height=1,
              im_width=1,
              anchor_strides=None,
              anchor_offsets=None):
  """Generates a collection of bounding boxes to be used as anchors.

  The number of anchors generated for a single grid with shape MxM where we
  place k boxes over each grid center is k*M^2 and thus the total number of
  anchors is the sum over all grids. In our box_specs_list example
  (see the constructor docstring), we would place two boxes over each grid
  point on an 8x8 grid and three boxes over each grid point on a 4x4 grid and
  thus end up with 2*8^2 + 3*4^2 = 176 anchors in total. The layout of the
  output anchors follows the order of how the grid sizes and box_specs are
  specified (with box_spec index varying the fastest, followed by width
  index, then height index, then grid index).

  Args:
    feature_map_shape_list: list of pairs of convnet layer resolutions in the
      format [(height_0, width_0), (height_1, width_1), ...]. For example,
      setting feature_map_shape_list=[(8, 8), (7, 7)] asks for anchors that
      correspond to an 8x8 layer followed by a 7x7 layer.
    im_height: the height of the image to generate the grid for. If both
      im_height and im_width are 1, the generated anchors default to
      normalized coordinates, otherwise absolute coordinates are used for the
      grid.
    im_width: the width of the image to generate the grid for. If both
      im_height and im_width are 1, the generated anchors default to
      normalized coordinates, otherwise absolute coordinates are used for the
      grid.
    anchor_strides: list of pairs of strides (in y and x directions
      respectively). For example, setting
      anchor_strides=[(.25, .25), (.5, .5)] means that we want the anchors
      corresponding to the first layer to be strided by .25 and those in the
      second layer to be strided by .5 in both y and x directions. By
      default, if anchor_strides=None, then they are set to be the reciprocal
      of the corresponding grid sizes. The pairs can also be specified as
      dynamic tf.int or tf.float numbers, e.g. for variable shape input
      images.
    anchor_offsets: list of pairs of offsets (in y and x directions
      respectively). The offset specifies where we want the center of the
      (0, 0)-th anchor to lie for each layer. For example, setting
      anchor_offsets=[(.125, .125), (.25, .25)]) means that we want the
      (0, 0)-th anchor of the first layer to lie at (.125, .125) in image
      space and likewise that we want the (0, 0)-th anchor of the second
      layer to lie at (.25, .25) in image space. By default, if
      anchor_offsets=None, then they are set to be half of the corresponding
      anchor stride. The pairs can also be specified as dynamic tf.int or
      tf.float numbers, e.g. for variable shape input images.

  Returns:
    boxes: a BoxList holding a collection of N anchor boxes.

  Raises:
    ValueError: if feature_map_shape_list, box_specs_list do not have the
      same length.
    ValueError: if feature_map_shape_list does not consist of pairs of
      integers.
  """
  if not (isinstance(feature_map_shape_list, list)
          and len(feature_map_shape_list) == len(self._box_specs)):
    raise ValueError('feature_map_shape_list must be a list with the same '
                     'length as self._box_specs')
  if not all([isinstance(list_item, tuple) and len(list_item) == 2
              for list_item in feature_map_shape_list]):
    raise ValueError('feature_map_shape_list must be a list of pairs.')
  if not anchor_strides:
    anchor_strides = [(tf.to_float(im_height) / tf.to_float(pair[0]),
                       tf.to_float(im_width) / tf.to_float(pair[1]))
                      for pair in feature_map_shape_list]
  if not anchor_offsets:
    anchor_offsets = [(0.5 * stride[0], 0.5 * stride[1])
                      for stride in anchor_strides]
  for arg, arg_name in zip([anchor_strides, anchor_offsets],
                           ['anchor_strides', 'anchor_offsets']):
    if not (isinstance(arg, list) and len(arg) == len(self._box_specs)):
      raise ValueError('%s must be a list with the same length '
                       'as self._box_specs' % arg_name)
    if not all([isinstance(list_item, tuple) and len(list_item) == 2
                for list_item in arg]):
      raise ValueError('%s must be a list of pairs.' % arg_name)

  anchor_grid_list = []
  min_im_shape = tf.to_float(tf.minimum(im_height, im_width))
  base_anchor_size = min_im_shape * self._base_anchor_size
  for grid_size, scales, aspect_ratios, stride, offset in zip(
      feature_map_shape_list, self._scales, self._aspect_ratios,
      anchor_strides, anchor_offsets):
    anchor_grid_list.append(
        grid_anchor_generator.tile_anchors(
            grid_height=grid_size[0],
            grid_width=grid_size[1],
            scales=scales,
            aspect_ratios=aspect_ratios,
            base_anchor_size=base_anchor_size,
            anchor_stride=stride,
            anchor_offset=offset))
  concatenated_anchors = box_list_ops.concatenate(anchor_grid_list)
  num_anchors = concatenated_anchors.num_boxes_static()
  if num_anchors is None:
    num_anchors = concatenated_anchors.num_boxes()
  if self._clip_window is not None:
    clip_window = tf.multiply(
        tf.to_float([im_height, im_width, im_height, im_width]),
        self._clip_window)
    concatenated_anchors = box_list_ops.clip_to_window(
        concatenated_anchors, clip_window, filter_nonoverlapping=False)
    # TODO: make reshape an option for the clip_to_window op
    concatenated_anchors.set(
        tf.reshape(concatenated_anchors.get(), [num_anchors, 4]))

  stddevs_tensor = 0.01 * tf.ones(
      [num_anchors, 4], dtype=tf.float32, name='stddevs')
  concatenated_anchors.add_field('stddev', stddevs_tensor)
  return concatenated_anchors

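# The default stride/offset arithmetic above, worked through in plain Python
# (a sketch with toy values): for an 8x8 feature map over a 1x1 (normalized)
# image, anchors are strided by 1/8 and the (0, 0)-th anchor center sits at
# half a stride.
im_height, im_width = 1.0, 1.0
feature_map_shape_list = [(8, 8), (4, 4)]
anchor_strides = [(im_height / h, im_width / w)
                  for h, w in feature_map_shape_list]
anchor_offsets = [(0.5 * sy, 0.5 * sx) for sy, sx in anchor_strides]
print(anchor_strides)  # [(0.125, 0.125), (0.25, 0.25)]
print(anchor_offsets)  # [(0.0625, 0.0625), (0.125, 0.125)]
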
def random_crop_to_aspect_ratio(image,
                                boxes,
                                labels,
                                difficult=None,
                                aspect_ratio=21. / 9.,
                                overlap_thresh=0.3):
  """Randomly crops the image to the given aspect ratio and adjusts boxes.

  The dimension that already satisfies the target aspect ratio is kept in
  full; a window is randomly slid along the other dimension. Boxes whose
  overlap with the crop is below overlap_thresh are dropped; the remainder
  are re-expressed in the crop's coordinate frame and clipped to it.
  """
  with tf.name_scope('RandomCropToAspectRatio', values=[image]):
    image_shape = tf.shape(image)
    orig_height = image_shape[0]
    orig_width = image_shape[1]
    orig_aspect_ratio = tf.to_float(orig_width) / tf.to_float(orig_height)
    target_aspect_ratio = tf.constant(aspect_ratio, dtype=tf.float32)

    def target_height_fn():
      return tf.to_int32(
          tf.round(tf.to_float(orig_width) / target_aspect_ratio))

    target_height = tf.cond(orig_aspect_ratio >= target_aspect_ratio,
                            lambda: orig_height, target_height_fn)

    def target_width_fn():
      return tf.to_int32(
          tf.round(tf.to_float(orig_height) * target_aspect_ratio))

    target_width = tf.cond(orig_aspect_ratio <= target_aspect_ratio,
                           lambda: orig_width, target_width_fn)

    offset_height = tf.random_uniform(
        [], minval=0, maxval=orig_height - target_height + 1, dtype=tf.int32)
    offset_width = tf.random_uniform(
        [], minval=0, maxval=orig_width - target_width + 1, dtype=tf.int32)

    new_image = tf.image.crop_to_bounding_box(
        image, offset_height, offset_width, target_height, target_width)

    im_box = tf.stack([
        tf.to_float(offset_height) / tf.to_float(orig_height),
        tf.to_float(offset_width) / tf.to_float(orig_width),
        tf.to_float(offset_height + target_height) / tf.to_float(orig_height),
        tf.to_float(offset_width + target_width) / tf.to_float(orig_width)
    ])

    boxlist = box_list.BoxList(boxes)
    boxlist.add_field('labels', labels)
    if difficult is not None:
      boxlist.add_field('difficult', difficult)
    im_boxlist = box_list.BoxList(tf.expand_dims(im_box, axis=0))

    # Remove boxes whose overlap with the image is less than overlap_thresh.
    overlapping_boxlist, keep_ids = box_list_ops.prune_non_overlapping_boxes(
        boxlist, im_boxlist, overlap_thresh)

    # Change the coordinate frame of the remaining boxes.
    new_labels = overlapping_boxlist.get_field('labels')
    new_boxlist = box_list_ops.change_coordinate_frame(overlapping_boxlist,
                                                       im_box)
    new_boxlist = box_list_ops.clip_to_window(
        new_boxlist, tf.constant([0.0, 0.0, 1.0, 1.0], tf.float32))
    new_boxes = new_boxlist.get()

    result = [new_image, new_boxes, new_labels]
    if difficult is not None:
      new_difficult = new_boxlist.get_field('difficult')
      result.append(new_difficult)
    return tuple(result)

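# The crop geometry above, checked with concrete numbers in plain Python (a
# sketch, no TF needed): for a 480x640 image and the default 21:9 target the
# image is not wide enough, so the width is kept and the height shrinks until
# the ratio matches.
orig_height, orig_width = 480, 640
aspect_ratio = 21. / 9.
orig_aspect_ratio = orig_width / orig_height              # ~1.333
if orig_aspect_ratio >= aspect_ratio:                     # already wide enough
  target_height = orig_height
else:
  target_height = int(round(orig_width / aspect_ratio))   # 274
if orig_aspect_ratio <= aspect_ratio:                     # already tall enough
  target_width = orig_width
else:
  target_width = int(round(orig_height * aspect_ratio))
print(target_height, target_width)                        # 274 640
print(target_width / target_height)                       # ~2.336, i.e. ~21/9
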
def _generate(self, feature_map_shape_list, im_height=1, im_width=1):
  """Generates a collection of bounding boxes to be used as anchors.

  The number of anchors generated for a single grid with shape MxM where we
  place k boxes over each grid center is k*M^2 and thus the total number of
  anchors is the sum over all grids. In our box_specs_list example
  (see the constructor docstring), we would place two boxes over each grid
  point on an 8x8 grid and three boxes over each grid point on a 4x4 grid and
  thus end up with 2*8^2 + 3*4^2 = 176 anchors in total. The layout of the
  output anchors follows the order of how the grid sizes and box_specs are
  specified (with box_spec index varying the fastest, followed by width
  index, then height index, then grid index).

  Args:
    feature_map_shape_list: list of pairs of convnet layer resolutions in the
      format [(height_0, width_0), (height_1, width_1), ...]. For example,
      setting feature_map_shape_list=[(8, 8), (7, 7)] asks for anchors that
      correspond to an 8x8 layer followed by a 7x7 layer.
    im_height: the height of the image to generate the grid for. If both
      im_height and im_width are 1, the generated anchors default to absolute
      coordinates, otherwise normalized coordinates are produced.
    im_width: the width of the image to generate the grid for. If both
      im_height and im_width are 1, the generated anchors default to absolute
      coordinates, otherwise normalized coordinates are produced.

  Returns:
    boxes_list: a list of BoxLists each holding anchor boxes corresponding to
      the input feature map shapes.

  Raises:
    ValueError: if feature_map_shape_list, box_specs_list do not have the
      same length.
    ValueError: if feature_map_shape_list does not consist of pairs of
      integers.
  """
  if not (isinstance(feature_map_shape_list, list)
          and len(feature_map_shape_list) == len(self._box_specs)):
    raise ValueError('feature_map_shape_list must be a list with the same '
                     'length as self._box_specs')
  if not all([isinstance(list_item, tuple) and len(list_item) == 2
              for list_item in feature_map_shape_list]):
    raise ValueError('feature_map_shape_list must be a list of pairs.')
  im_height = tf.to_float(im_height)
  im_width = tf.to_float(im_width)

  if not self._anchor_strides:
    anchor_strides = [(1.0 / tf.to_float(pair[0]),
                       1.0 / tf.to_float(pair[1]))
                      for pair in feature_map_shape_list]
  else:
    anchor_strides = [(tf.to_float(stride[0]) / im_height,
                       tf.to_float(stride[1]) / im_width)
                      for stride in self._anchor_strides]
  if not self._anchor_offsets:
    anchor_offsets = [(0.5 * stride[0], 0.5 * stride[1])
                      for stride in anchor_strides]
  else:
    anchor_offsets = [(tf.to_float(offset[0]) / im_height,
                       tf.to_float(offset[1]) / im_width)
                      for offset in self._anchor_offsets]

  for arg, arg_name in zip([anchor_strides, anchor_offsets],
                           ['anchor_strides', 'anchor_offsets']):
    if not (isinstance(arg, list) and len(arg) == len(self._box_specs)):
      raise ValueError('%s must be a list with the same length '
                       'as self._box_specs' % arg_name)
    if not all([isinstance(list_item, tuple) and len(list_item) == 2
                for list_item in arg]):
      raise ValueError('%s must be a list of pairs.' % arg_name)

  anchor_grid_list = []
  min_im_shape = tf.minimum(im_height, im_width)
  scale_height = min_im_shape / im_height
  scale_width = min_im_shape / im_width
  base_anchor_size = [
      scale_height * self._base_anchor_size[0],
      scale_width * self._base_anchor_size[1]
  ]
  for feature_map_index, (grid_size, scales, aspect_ratios, stride,
                          offset) in enumerate(
                              zip(feature_map_shape_list, self._scales,
                                  self._aspect_ratios, anchor_strides,
                                  anchor_offsets)):
    tiled_anchors = grid_anchor_generator.tile_anchors(
        grid_height=grid_size[0],
        grid_width=grid_size[1],
        scales=scales,
        aspect_ratios=aspect_ratios,
        base_anchor_size=base_anchor_size,
        anchor_stride=stride,
        anchor_offset=offset)
    if self._clip_window is not None:
      tiled_anchors = box_list_ops.clip_to_window(
          tiled_anchors, self._clip_window, filter_nonoverlapping=False)
    num_anchors_in_layer = tiled_anchors.num_boxes_static()
    if num_anchors_in_layer is None:
      num_anchors_in_layer = tiled_anchors.num_boxes()
    anchor_indices = feature_map_index * tf.ones([num_anchors_in_layer])
    tiled_anchors.add_field('feature_map_index', anchor_indices)
    anchor_grid_list.append(tiled_anchors)

  return anchor_grid_list

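# The min-side normalization above, in plain numbers (a sketch): for a 300x600
# image the base anchor size is expressed relative to the short side, so a
# unit base anchor is square in pixel space even though the image is not.
im_height, im_width = 300.0, 600.0
min_im_shape = min(im_height, im_width)
scale_height = min_im_shape / im_height   # 1.0
scale_width = min_im_shape / im_width     # 0.5
base_anchor_size = [scale_height * 1.0, scale_width * 1.0]
print(base_anchor_size)  # [1.0, 0.5] -> both sides map back to 300 pixels
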
def multiclass_non_max_suppression(boxes,
                                   scores,
                                   score_thresh,
                                   iou_thresh,
                                   max_size_per_class,
                                   max_total_size=0,
                                   clip_window=None,
                                   change_coordinate_frame=False,
                                   masks=None,
                                   boundaries=None,
                                   pad_to_max_output_size=False,
                                   additional_fields=None,
                                   scope=None):
  """Multi-class version of non maximum suppression.

  This op greedily selects a subset of detection bounding boxes, pruning
  away boxes that have high IOU (intersection over union) overlap (> thresh)
  with already selected boxes.  It operates independently for each class for
  which scores are provided (via the scores field of the input box_list),
  pruning boxes with score less than a provided threshold prior to
  applying NMS.

  Please note that this operation is performed on *all* classes, therefore
  any background classes should be removed prior to calling this function.

  Selected boxes are guaranteed to be sorted in decreasing order by score
  (but the sort is not guaranteed to be stable).

  Args:
    boxes: A [k, q, 4] float32 tensor containing k detections. `q` can be
      either number of classes or 1 depending on whether a separate box is
      predicted per class.
    scores: A [k, num_classes] float32 tensor containing the scores for each
      of the k detections. The scores have to be non-negative when
      pad_to_max_output_size is True.
    score_thresh: scalar threshold for score (low scoring boxes are removed).
    iou_thresh: scalar threshold for IOU (new boxes that have high IOU
      overlap with previously selected boxes are removed).
    max_size_per_class: maximum number of retained boxes per class.
    max_total_size: maximum number of boxes retained over all classes. By
      default returns all boxes retained after capping boxes per class.
    clip_window: A float32 tensor of the form [y_min, x_min, y_max, x_max]
      representing the window to clip and normalize boxes to before
      performing non-max suppression.
    change_coordinate_frame: Whether to normalize coordinates after clipping
      relative to clip_window (this can only be set to True if a clip_window
      is provided).
    masks: (optional) a [k, q, mask_height, mask_width] float32 tensor
      containing box masks. `q` can be either number of classes or 1
      depending on whether a separate mask is predicted per class.
    boundaries: (optional) a [k, q, boundary_height, boundary_width] float32
      tensor containing box boundaries. `q` can be either number of classes
      or 1 depending on whether a separate boundary is predicted per class.
    pad_to_max_output_size: If true, the output nmsed boxes are padded to be
      of length `max_size_per_class`. Defaults to false.
    additional_fields: (optional) If not None, a dictionary that maps keys to
      tensors whose first dimensions are all of size `k`. After non-maximum
      suppression, all tensors corresponding to the selected boxes will be
      added to the resulting BoxList.
    scope: name scope.

  Returns:
    A tuple of sorted_boxes and num_valid_nms_boxes. The sorted_boxes is a
      BoxList holding M boxes with a rank-1 scores field representing
      corresponding scores for each box with scores sorted in decreasing
      order and a rank-1 classes field representing a class label for each
      box. The num_valid_nms_boxes is a 0-D integer tensor representing the
      number of valid elements in `BoxList`, with the valid elements
      appearing first.

  Raises:
    ValueError: if iou_thresh is not in [0, 1] or if input boxlist does not
      have a valid scores field.
  """
  if not 0 <= iou_thresh <= 1.0:
    raise ValueError('iou_thresh must be between 0 and 1')
  if scores.shape.ndims != 2:
    raise ValueError('scores field must be of rank 2')
  if scores.shape[1].value is None:
    raise ValueError('scores must have statically defined second dimension')
  if boxes.shape.ndims != 3:
    raise ValueError('boxes must be of rank 3.')
  if not (boxes.shape[1].value == scores.shape[1].value or
          boxes.shape[1].value == 1):
    raise ValueError('second dimension of boxes must be either 1 or equal '
                     'to the second dimension of scores')
  if boxes.shape[2].value != 4:
    raise ValueError('last dimension of boxes must be of size 4.')
  if change_coordinate_frame and clip_window is None:
    raise ValueError('if change_coordinate_frame is True, then a clip_window '
                     'must be specified.')

  with tf.name_scope(scope, 'MultiClassNonMaxSuppression'):
    num_scores = tf.shape(scores)[0]
    num_classes = scores.get_shape()[1]

    selected_boxes_list = []
    num_valid_nms_boxes_cumulative = tf.constant(0)
    per_class_boxes_list = tf.unstack(boxes, axis=1)
    if masks is not None:
      per_class_masks_list = tf.unstack(masks, axis=1)
    if boundaries is not None:
      per_class_boundaries_list = tf.unstack(boundaries, axis=1)
    boxes_ids = (range(num_classes) if len(per_class_boxes_list) > 1
                 else [0] * num_classes.value)
    for class_idx, boxes_idx in zip(range(num_classes), boxes_ids):
      per_class_boxes = per_class_boxes_list[boxes_idx]
      boxlist_and_class_scores = box_list.BoxList(per_class_boxes)
      class_scores = tf.reshape(
          tf.slice(scores, [0, class_idx], tf.stack([num_scores, 1])), [-1])
      boxlist_and_class_scores.add_field(fields.BoxListFields.scores,
                                         class_scores)
      if masks is not None:
        per_class_masks = per_class_masks_list[boxes_idx]
        boxlist_and_class_scores.add_field(fields.BoxListFields.masks,
                                           per_class_masks)
      if boundaries is not None:
        per_class_boundaries = per_class_boundaries_list[boxes_idx]
        boxlist_and_class_scores.add_field(fields.BoxListFields.boundaries,
                                           per_class_boundaries)
      if additional_fields is not None:
        for key, tensor in additional_fields.items():
          boxlist_and_class_scores.add_field(key, tensor)

      if pad_to_max_output_size:
        max_selection_size = max_size_per_class
        selected_indices, num_valid_nms_boxes = (
            tf.image.non_max_suppression_padded(
                boxlist_and_class_scores.get(),
                boxlist_and_class_scores.get_field(
                    fields.BoxListFields.scores),
                max_selection_size,
                iou_threshold=iou_thresh,
                score_threshold=score_thresh,
                pad_to_max_output_size=True))
      else:
        max_selection_size = tf.minimum(max_size_per_class,
                                        boxlist_and_class_scores.num_boxes())
        selected_indices = tf.image.non_max_suppression(
            boxlist_and_class_scores.get(),
            boxlist_and_class_scores.get_field(fields.BoxListFields.scores),
            max_selection_size,
            iou_threshold=iou_thresh,
            score_threshold=score_thresh)
        num_valid_nms_boxes = tf.shape(selected_indices)[0]
        selected_indices = tf.concat(
            [selected_indices,
             tf.zeros(max_selection_size - num_valid_nms_boxes, tf.int32)], 0)
      nms_result = box_list_ops.gather(boxlist_and_class_scores,
                                       selected_indices)
      # Make the scores -1 for invalid boxes.
      valid_nms_boxes_indx = tf.less(
          tf.range(max_selection_size), num_valid_nms_boxes)
      nms_scores = nms_result.get_field(fields.BoxListFields.scores)
      nms_result.add_field(
          fields.BoxListFields.scores,
          tf.where(valid_nms_boxes_indx, nms_scores,
                   -1 * tf.ones(max_selection_size)))
      num_valid_nms_boxes_cumulative += num_valid_nms_boxes

      nms_result.add_field(
          fields.BoxListFields.classes,
          (tf.zeros_like(nms_result.get_field(fields.BoxListFields.scores)) +
           class_idx))
      selected_boxes_list.append(nms_result)
    selected_boxes = box_list_ops.concatenate(selected_boxes_list)
    sorted_boxes = box_list_ops.sort_by_field(selected_boxes,
                                              fields.BoxListFields.scores)
    if clip_window is not None:
      # When pad_to_max_output_size is False, it prunes the boxes with zero
      # area.
      sorted_boxes = box_list_ops.clip_to_window(
          sorted_boxes,
          clip_window,
          filter_nonoverlapping=not pad_to_max_output_size)
      # Set the scores of boxes with zero area to -1 to keep the default
      # behaviour of pruning out zero area boxes.
      sorted_boxes_size = tf.shape(sorted_boxes.get())[0]
      non_zero_box_area = tf.cast(box_list_ops.area(sorted_boxes), tf.bool)
      sorted_boxes_scores = tf.where(
          non_zero_box_area,
          sorted_boxes.get_field(fields.BoxListFields.scores),
          -1 * tf.ones(sorted_boxes_size))
      sorted_boxes.add_field(fields.BoxListFields.scores, sorted_boxes_scores)
      num_valid_nms_boxes_cumulative = tf.reduce_sum(
          tf.cast(tf.greater_equal(sorted_boxes_scores, 0), tf.int32))
      sorted_boxes = box_list_ops.sort_by_field(sorted_boxes,
                                                fields.BoxListFields.scores)
      if change_coordinate_frame:
        sorted_boxes = box_list_ops.change_coordinate_frame(
            sorted_boxes, clip_window)

    if max_total_size:
      max_total_size = tf.minimum(max_total_size, sorted_boxes.num_boxes())
      sorted_boxes = box_list_ops.gather(sorted_boxes,
                                         tf.range(max_total_size))
      num_valid_nms_boxes_cumulative = tf.where(
          max_total_size > num_valid_nms_boxes_cumulative,
          num_valid_nms_boxes_cumulative, max_total_size)
    # Select only the valid boxes if pad_to_max_output_size is False.
    if not pad_to_max_output_size:
      sorted_boxes = box_list_ops.gather(
          sorted_boxes, tf.range(num_valid_nms_boxes_cumulative))

    return sorted_boxes, num_valid_nms_boxes_cumulative

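# A hedged usage sketch for the padded variant above, assuming it is the
# multiclass_non_max_suppression exported by object_detection.core
# post_processing at this revision and a TF version that provides
# tf.image.non_max_suppression_padded. With pad_to_max_output_size=True the
# returned BoxList has fixed length max_size_per_class, and num_valid says
# how many leading entries are real detections.
import tensorflow as tf
from object_detection.core import post_processing

boxes = tf.constant([[[0., 0., 1., 1.]],
                     [[0., 0.05, 1., 1.05]]])  # [k=2, q=1, 4]
scores = tf.constant([[0.9], [0.8]])           # non-negative, as required
nmsed, num_valid = post_processing.multiclass_non_max_suppression(
    boxes, scores, score_thresh=0.1, iou_thresh=0.5,
    max_size_per_class=4, pad_to_max_output_size=True)
with tf.Session() as sess:
  out_boxes, n = sess.run([nmsed.get(), num_valid])
  print(out_boxes[:n])  # only the surviving box(es); the rest is padding
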
def _generate(self, feature_map_shape_list, im_height=1, im_width=1):
  """Generates a collection of bounding boxes to be used as anchors.

  The number of anchors generated for a single grid with shape MxM where we
  place k boxes over each grid center is k*M^2 and thus the total number of
  anchors is the sum over all grids. In our box_specs_list example
  (see the constructor docstring), we would place two boxes over each grid
  point on an 8x8 grid and three boxes over each grid point on a 4x4 grid and
  thus end up with 2*8^2 + 3*4^2 = 176 anchors in total. The layout of the
  output anchors follows the order of how the grid sizes and box_specs are
  specified (with box_spec index varying the fastest, followed by width
  index, then height index, then grid index).

  Args:
    feature_map_shape_list: list of pairs of convnet layer resolutions in the
      format [(height_0, width_0), (height_1, width_1), ...]. For example,
      setting feature_map_shape_list=[(8, 8), (7, 7)] asks for anchors that
      correspond to an 8x8 layer followed by a 7x7 layer.
    im_height: the height of the image to generate the grid for. If both
      im_height and im_width are 1, the generated anchors default to absolute
      coordinates, otherwise normalized coordinates are produced.
    im_width: the width of the image to generate the grid for. If both
      im_height and im_width are 1, the generated anchors default to absolute
      coordinates, otherwise normalized coordinates are produced.

  Returns:
    boxes_list: a list of BoxLists each holding anchor boxes corresponding to
      the input feature map shapes.

  Raises:
    ValueError: if feature_map_shape_list, box_specs_list do not have the
      same length.
    ValueError: if feature_map_shape_list does not consist of pairs of
      integers.
  """
  if not (isinstance(feature_map_shape_list, list)
          and len(feature_map_shape_list) == len(self._box_specs)):
    raise ValueError('feature_map_shape_list must be a list with the same '
                     'length as self._box_specs')
  if not all([isinstance(list_item, tuple) and len(list_item) == 2
              for list_item in feature_map_shape_list]):
    raise ValueError('feature_map_shape_list must be a list of pairs.')
  im_height = tf.cast(im_height, dtype=tf.float32)
  im_width = tf.cast(im_width, dtype=tf.float32)

  if not self._anchor_strides:
    anchor_strides = [(1.0 / tf.cast(pair[0], dtype=tf.float32),
                       1.0 / tf.cast(pair[1], dtype=tf.float32))
                      for pair in feature_map_shape_list]
  else:
    anchor_strides = [(tf.cast(stride[0], dtype=tf.float32) / im_height,
                       tf.cast(stride[1], dtype=tf.float32) / im_width)
                      for stride in self._anchor_strides]
  if not self._anchor_offsets:
    anchor_offsets = [(0.5 * stride[0], 0.5 * stride[1])
                      for stride in anchor_strides]
  else:
    anchor_offsets = [(tf.cast(offset[0], dtype=tf.float32) / im_height,
                       tf.cast(offset[1], dtype=tf.float32) / im_width)
                      for offset in self._anchor_offsets]

  for arg, arg_name in zip([anchor_strides, anchor_offsets],
                           ['anchor_strides', 'anchor_offsets']):
    if not (isinstance(arg, list) and len(arg) == len(self._box_specs)):
      raise ValueError('%s must be a list with the same length '
                       'as self._box_specs' % arg_name)
    if not all([isinstance(list_item, tuple) and len(list_item) == 2
                for list_item in arg]):
      raise ValueError('%s must be a list of pairs.' % arg_name)

  anchor_grid_list = []
  min_im_shape = tf.minimum(im_height, im_width)
  scale_height = min_im_shape / im_height
  scale_width = min_im_shape / im_width
  if not contrib_framework.is_tensor(self._base_anchor_size):
    base_anchor_size = [
        scale_height * tf.constant(self._base_anchor_size[0],
                                   dtype=tf.float32),
        scale_width * tf.constant(self._base_anchor_size[1],
                                  dtype=tf.float32)
    ]
  else:
    base_anchor_size = [
        scale_height * self._base_anchor_size[0],
        scale_width * self._base_anchor_size[1]
    ]
  for feature_map_index, (grid_size, scales, aspect_ratios, stride,
                          offset) in enumerate(
                              zip(feature_map_shape_list, self._scales,
                                  self._aspect_ratios, anchor_strides,
                                  anchor_offsets)):
    tiled_anchors = grid_anchor_generator.tile_anchors(
        grid_height=grid_size[0],
        grid_width=grid_size[1],
        scales=scales,
        aspect_ratios=aspect_ratios,
        base_anchor_size=base_anchor_size,
        anchor_stride=stride,
        anchor_offset=offset)
    if self._clip_window is not None:
      tiled_anchors = box_list_ops.clip_to_window(
          tiled_anchors, self._clip_window, filter_nonoverlapping=False)
    num_anchors_in_layer = tiled_anchors.num_boxes_static()
    if num_anchors_in_layer is None:
      num_anchors_in_layer = tiled_anchors.num_boxes()
    anchor_indices = feature_map_index * tf.ones([num_anchors_in_layer])
    tiled_anchors.add_field('feature_map_index', anchor_indices)
    anchor_grid_list.append(tiled_anchors)

  return anchor_grid_list

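# A sketch of consuming the 'feature_map_index' field that _generate attaches
# (not from the original source), assuming box_list_ops.boolean_mask from the
# same API: pick out only the anchors that came from one feature map.
import tensorflow as tf
from object_detection.core import box_list
from object_detection.core import box_list_ops

anchors = box_list.BoxList(tf.constant([[0., 0., .5, .5],
                                        [.5, .5, 1., 1.],
                                        [0., 0., 1., 1.]]))
anchors.add_field('feature_map_index', tf.constant([0., 0., 1.]))
layer1_only = box_list_ops.boolean_mask(
    anchors, tf.equal(anchors.get_field('feature_map_index'), 1.))
with tf.Session() as sess:
  print(sess.run(layer1_only.get()))  # [[0. 0. 1. 1.]]
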