def resize_and_crop_boxes(boxes, image_scale, output_size, offset):
  """Resizes boxes to output size with scale and offset.

  Args:
    boxes: `Tensor` of shape [N, 4] representing ground truth boxes.
    image_scale: 2D float `Tensor` representing scale factors that apply to
      [height, width] of input image.
    output_size: 2D `Tensor` or `int` representing [height, width] of target
      output image size.
    offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
      boxes.

  Returns:
    boxes: `Tensor` of shape [N, 4] representing the scaled boxes.
  """
  with tf.name_scope('resize_and_crop_boxes'):
    # Adjusts box coordinates based on image_scale and offset.
    boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
    boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
    # Clips the boxes.
    boxes = box_ops.clip_boxes(boxes, output_size)
    return boxes
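# A minimal usage sketch for `resize_and_crop_boxes`, assuming boxes in
# absolute [ymin, xmin, ymax, xmax] coordinates and that `image_scale` and
# `offset` come from the matching image preprocessing step. All values below
# are illustrative, not taken from a real pipeline.
def _example_resize_and_crop_boxes():
  boxes = tf.constant([[100., 200., 300., 400.]])  # One box, [y1, x1, y2, x2].
  image_scale = tf.constant([0.5, 0.5])            # Scale for [height, width].
  offset = tf.constant([10., 20.])                 # Crop origin [y0, x0].
  output_size = tf.constant([256., 256.])          # Target [height, width].
  # After scaling: [50, 100, 150, 200]; after offset: [40, 80, 140, 180];
  # clipping to 256x256 leaves the box unchanged.
  return resize_and_crop_boxes(boxes, image_scale, output_size, offset)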
def undo_info(boxes: tf.Tensor,
              num_detections: int,
              info: tf.Tensor,
              expand: bool = True) -> tf.Tensor:
  """Clips and normalizes boxes for serving."""
  mask = tf.sequence_mask(num_detections, maxlen=tf.shape(boxes)[1])
  boxes = tf.cast(tf.expand_dims(mask, axis=-1), boxes.dtype) * boxes

  if expand:
    info = tf.cast(tf.expand_dims(info, axis=0), boxes.dtype)
  inshape = tf.expand_dims(info[:, 1, :], axis=1)
  ogshape = tf.expand_dims(info[:, 0, :], axis=1)
  scale = tf.expand_dims(info[:, 2, :], axis=1)
  offset = tf.expand_dims(info[:, 3, :], axis=1)

  boxes = box_ops.denormalize_boxes(boxes, inshape)
  boxes += tf.tile(offset, [1, 1, 2])
  boxes /= tf.tile(scale, [1, 1, 2])
  boxes = box_ops.clip_boxes(boxes, ogshape)
  boxes = box_ops.normalize_boxes(boxes, ogshape)
  return boxes
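# Hedged usage sketch for `undo_info`. The row layout of `info`
# ([original_hw, input_hw, scale_hw, offset_yx]) is inferred from the slicing
# above; the concrete numbers are hypothetical.
def _example_undo_info():
  boxes = tf.constant([[[0.25, 0.25, 0.75, 0.75]]])  # Normalized on the input.
  num_detections = tf.constant([1])
  info = tf.constant([[480., 640.],   # Row 0: original image [height, width].
                      [640., 640.],   # Row 1: network input [height, width].
                      [1., 1.],       # Row 2: preprocessing scale [h, w].
                      [0., 0.]])      # Row 3: crop offset [y0, x0].
  # Returns boxes re-normalized w.r.t. the original 480x640 image.
  return undo_info(boxes, num_detections, info, expand=True)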
def _decode_multilevel_outputs(
    self,
    raw_boxes: Mapping[str, tf.Tensor],
    raw_scores: Mapping[str, tf.Tensor],
    anchor_boxes: Mapping[str, tf.Tensor],
    image_shape: tf.Tensor,
    raw_attributes: Optional[Mapping[str, tf.Tensor]] = None):
  """Collects dict of multilevel boxes, scores, attributes into lists."""
  boxes = []
  scores = []
  if raw_attributes:
    attributes = {att_name: [] for att_name in raw_attributes.keys()}
  else:
    attributes = {}

  levels = list(raw_boxes.keys())
  min_level = int(min(levels))
  max_level = int(max(levels))
  for i in range(min_level, max_level + 1):
    raw_boxes_i = raw_boxes[str(i)]
    raw_scores_i = raw_scores[str(i)]
    batch_size = tf.shape(raw_boxes_i)[0]
    (_, feature_h_i, feature_w_i,
     num_anchors_per_locations_times_4) = raw_boxes_i.get_shape().as_list()
    num_locations = feature_h_i * feature_w_i
    num_anchors_per_locations = num_anchors_per_locations_times_4 // 4
    num_classes = raw_scores_i.get_shape().as_list(
    )[-1] // num_anchors_per_locations

    # Applies score transformation and removes the implicit background class.
    scores_i = tf.sigmoid(
        tf.reshape(raw_scores_i, [
            batch_size, num_locations * num_anchors_per_locations, num_classes
        ]))
    scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])

    # Box decoding.
    # The anchor boxes are shared for all data in a batch.
    # One-stage detectors only support class-agnostic box regression.
    anchor_boxes_i = tf.reshape(
        anchor_boxes[str(i)],
        [batch_size, num_locations * num_anchors_per_locations, 4])
    raw_boxes_i = tf.reshape(
        raw_boxes_i,
        [batch_size, num_locations * num_anchors_per_locations, 4])
    boxes_i = box_ops.decode_boxes(raw_boxes_i, anchor_boxes_i)

    # Box clipping.
    boxes_i = box_ops.clip_boxes(boxes_i, tf.expand_dims(image_shape, axis=1))

    boxes.append(boxes_i)
    scores.append(scores_i)

    if raw_attributes:
      for att_name, raw_att in raw_attributes.items():
        attribute_size = raw_att[str(i)].get_shape().as_list(
        )[-1] // num_anchors_per_locations
        att_i = tf.reshape(raw_att[str(i)], [
            batch_size, num_locations * num_anchors_per_locations,
            attribute_size
        ])
        attributes[att_name].append(att_i)

  boxes = tf.concat(boxes, axis=1)
  boxes = tf.expand_dims(boxes, axis=2)
  scores = tf.concat(scores, axis=1)

  if raw_attributes:
    for att_name in raw_attributes.keys():
      attributes[att_name] = tf.concat(attributes[att_name], axis=1)
      attributes[att_name] = tf.expand_dims(attributes[att_name], axis=2)

  return boxes, scores, attributes
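# Shape sketch for the multilevel inputs consumed above. The level keys,
# feature sizes, and counts are hypothetical; only the tensor ranks and the
# `num_anchors * 4` / `num_anchors * num_classes` channel layouts are implied
# by the reshapes in `_decode_multilevel_outputs`.
def _example_multilevel_inputs(batch_size=2,
                               num_classes=91,
                               num_anchors_per_location=9):
  raw_boxes, raw_scores, anchor_boxes = {}, {}, {}
  for level, (h, w) in zip(['3', '4', '5'], [(80, 80), (40, 40), (20, 20)]):
    raw_boxes[level] = tf.zeros(
        [batch_size, h, w, num_anchors_per_location * 4])
    raw_scores[level] = tf.zeros(
        [batch_size, h, w, num_anchors_per_location * num_classes])
    anchor_boxes[level] = tf.zeros(
        [batch_size, h * w * num_anchors_per_location, 4])
  return raw_boxes, raw_scores, anchor_boxes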
def __call__(self,
             raw_boxes: tf.Tensor,
             raw_scores: tf.Tensor,
             anchor_boxes: tf.Tensor,
             image_shape: tf.Tensor,
             regression_weights: Optional[List[float]] = None,
             bbox_per_class: bool = True):
  """Generates final detections.

  Args:
    raw_boxes: A `tf.Tensor` of shape of `[batch_size, K, num_classes * 4]`
      representing the class-specific box coordinates relative to anchors.
    raw_scores: A `tf.Tensor` of shape of `[batch_size, K, num_classes]`
      representing the class logits before applying score activation.
    anchor_boxes: A `tf.Tensor` of shape of `[batch_size, K, 4]` representing
      the corresponding anchor boxes w.r.t `box_outputs`.
    image_shape: A `tf.Tensor` of shape of `[batch_size, 2]` storing the image
      height and width w.r.t. the scaled image, i.e. the same image space as
      `box_outputs` and `anchor_boxes`.
    regression_weights: A list of four float numbers to scale coordinates.
    bbox_per_class: A `bool`. If True, perform per-class box regression.

  Returns:
    If `apply_nms` = True, the return is a dictionary with keys:
      `detection_boxes`: A `float` tf.Tensor of shape
        [batch, max_num_detections, 4] representing top detected boxes in
        [y1, x1, y2, x2].
      `detection_scores`: A `float` `tf.Tensor` of shape
        [batch, max_num_detections] representing sorted confidence scores for
        detected boxes. The values are between [0, 1].
      `detection_classes`: An `int` tf.Tensor of shape
        [batch, max_num_detections] representing classes for detected boxes.
      `num_detections`: An `int` tf.Tensor of shape [batch]; only the first
        `num_detections` boxes are valid detections.
    If `apply_nms` = False, the return is a dictionary with keys:
      `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
        representing all the decoded boxes.
      `decoded_box_scores`: A `float` tf.Tensor of shape
        [batch, num_raw_boxes] representing scores of all the decoded boxes.
  """
  box_scores = tf.nn.softmax(raw_scores, axis=-1)

  # Removes the background class.
  box_scores_shape = tf.shape(box_scores)
  box_scores_shape_list = box_scores.get_shape().as_list()
  batch_size = box_scores_shape[0]
  num_locations = box_scores_shape_list[1]
  num_classes = box_scores_shape_list[-1]

  box_scores = tf.slice(box_scores, [0, 0, 1], [-1, -1, -1])

  if bbox_per_class:
    num_detections = num_locations * (num_classes - 1)
    raw_boxes = tf.reshape(raw_boxes,
                           [batch_size, num_locations, num_classes, 4])
    raw_boxes = tf.slice(raw_boxes, [0, 0, 1, 0], [-1, -1, -1, -1])
    anchor_boxes = tf.tile(
        tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
    raw_boxes = tf.reshape(raw_boxes, [batch_size, num_detections, 4])
    anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])

  # Box decoding.
  decoded_boxes = box_ops.decode_boxes(
      raw_boxes, anchor_boxes, weights=regression_weights)

  # Box clipping.
  decoded_boxes = box_ops.clip_boxes(
      decoded_boxes, tf.expand_dims(image_shape, axis=1))

  if bbox_per_class:
    decoded_boxes = tf.reshape(
        decoded_boxes, [batch_size, num_locations, num_classes - 1, 4])
  else:
    decoded_boxes = tf.expand_dims(decoded_boxes, axis=2)

  if not self._config_dict['apply_nms']:
    return {
        'decoded_boxes': decoded_boxes,
        'decoded_box_scores': box_scores,
    }

  # Optionally force the NMS to be run on CPU.
  if self._config_dict['use_cpu_nms']:
    nms_context = tf.device('cpu:0')
  else:
    nms_context = contextlib.nullcontext()

  with nms_context:
    if self._config_dict['nms_version'] == 'batched':
      (nmsed_boxes, nmsed_scores, nmsed_classes,
       valid_detections) = _generate_detections_batched(
           decoded_boxes, box_scores,
           self._config_dict['pre_nms_score_threshold'],
           self._config_dict['nms_iou_threshold'],
           self._config_dict['max_num_detections'])
    elif self._config_dict['nms_version'] == 'v1':
      (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections,
       _) = _generate_detections_v1(
           decoded_boxes,
           box_scores,
           pre_nms_top_k=self._config_dict['pre_nms_top_k'],
           pre_nms_score_threshold=(
               self._config_dict['pre_nms_score_threshold']),
           nms_iou_threshold=self._config_dict['nms_iou_threshold'],
           max_num_detections=self._config_dict['max_num_detections'],
           soft_nms_sigma=self._config_dict['soft_nms_sigma'])
    elif self._config_dict['nms_version'] == 'v2':
      (nmsed_boxes, nmsed_scores, nmsed_classes,
       valid_detections) = _generate_detections_v2(
           decoded_boxes,
           box_scores,
           pre_nms_top_k=self._config_dict['pre_nms_top_k'],
           pre_nms_score_threshold=(
               self._config_dict['pre_nms_score_threshold']),
           nms_iou_threshold=self._config_dict['nms_iou_threshold'],
           max_num_detections=self._config_dict['max_num_detections'])
    else:
      raise ValueError('NMS version {} not supported.'.format(
          self._config_dict['nms_version']))

  # Adds 1 to offset the background class which has index 0.
  nmsed_classes += 1

  return {
      'num_detections': valid_detections,
      'detection_boxes': nmsed_boxes,
      'detection_classes': nmsed_classes,
      'detection_scores': nmsed_scores,
  }
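# Hedged usage sketch for the generator above. The class name and constructor
# arguments are assumptions about how `self._config_dict` is populated; only
# the `__call__` signature and config keys are taken from this file, and
# `regression_weights=[10., 10., 5., 5.]` is the common Faster R-CNN
# convention rather than a value read from this code.
#
#   generator = DetectionGenerator(
#       apply_nms=True,
#       pre_nms_top_k=5000,
#       pre_nms_score_threshold=0.05,
#       nms_iou_threshold=0.5,
#       max_num_detections=100,
#       nms_version='v2',
#       use_cpu_nms=False)
#   outputs = generator(raw_boxes, raw_scores, anchor_boxes, image_shape,
#                       regression_weights=[10., 10., 5., 5.],
#                       bbox_per_class=True)
#   # With `apply_nms=True`, `outputs` has keys 'detection_boxes',
#   # 'detection_scores', 'detection_classes', and 'num_detections'.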
def _multilevel_propose_rois(raw_boxes: Mapping[str, tf.Tensor],
                             raw_scores: Mapping[str, tf.Tensor],
                             anchor_boxes: Mapping[str, tf.Tensor],
                             image_shape: tf.Tensor,
                             pre_nms_top_k: int = 2000,
                             pre_nms_score_threshold: float = 0.0,
                             pre_nms_min_size_threshold: float = 0.0,
                             nms_iou_threshold: float = 0.7,
                             num_proposals: int = 1000,
                             use_batched_nms: bool = False,
                             decode_boxes: bool = True,
                             clip_boxes: bool = True,
                             apply_sigmoid_to_score: bool = True):
  """Proposes RoIs given a group of candidates from different FPN levels.

  The following describes the steps:
    1. For each individual level:
      a. Apply sigmoid transform if specified.
      b. Decode boxes if specified.
      c. Clip boxes if specified.
      d. Filter small boxes and those that fall outside the image if
         specified.
      e. Apply pre-NMS filtering including pre-NMS top k and score
         thresholding.
      f. Apply NMS.
    2. Aggregate post-NMS boxes from each level.
    3. Apply an overall top k to generate the final selected RoIs.

  Args:
    raw_boxes: A `dict` with keys representing FPN levels and values
      representing box tensors of shape
      [batch_size, feature_h, feature_w, num_anchors * 4].
    raw_scores: A `dict` with keys representing FPN levels and values
      representing logit tensors of shape
      [batch_size, feature_h, feature_w, num_anchors].
    anchor_boxes: A `dict` with keys representing FPN levels and values
      representing anchor box tensors of shape
      [batch_size, feature_h * feature_w * num_anchors, 4].
    image_shape: A `tf.Tensor` of shape [batch_size, 2] where the last
      dimension is [height, width] of the scaled image.
    pre_nms_top_k: An `int` of top scoring RPN proposals *per level* to keep
      before applying NMS. Default: 2000.
    pre_nms_score_threshold: A `float` between 0 and 1 representing the
      minimal box score to keep before applying NMS. This is often used as a
      pre-filtering step for better performance. Default: 0, no filtering is
      applied.
    pre_nms_min_size_threshold: A `float` representing the minimal box size in
      each side (w.r.t. the scaled image) to keep before applying NMS. This is
      often used as a pre-filtering step for better performance. Default: 0,
      no filtering is applied.
    nms_iou_threshold: A `float` between 0 and 1 representing the IoU
      threshold used for NMS. If 0.0, no NMS is applied. Default: 0.7.
    num_proposals: An `int` of top scoring RPN proposals *in total* to keep
      after applying NMS. Default: 1000.
    use_batched_nms: A `bool` indicating whether NMS is applied in batch using
      `tf.image.combined_non_max_suppression`. Currently only available on
      CPU/GPU. Default is False.
    decode_boxes: A `bool` indicating whether `raw_boxes` needs to be decoded
      using `anchor_boxes`. If False, use `raw_boxes` directly and ignore
      `anchor_boxes`. Default is True.
    clip_boxes: A `bool` indicating whether boxes are first clipped to the
      scaled image size before applying NMS. If False, no clipping is applied
      and `image_shape` is ignored. Default is True.
    apply_sigmoid_to_score: A `bool` indicating whether to apply sigmoid to
      `raw_scores` before applying NMS. Default is True.

  Returns:
    selected_rois: A `tf.Tensor` of shape [batch_size, num_proposals, 4],
      representing the box coordinates of the selected proposals w.r.t. the
      scaled image.
    selected_roi_scores: A `tf.Tensor` of shape [batch_size, num_proposals,
      1], representing the scores of the selected proposals.
  """
  with tf.name_scope('multilevel_propose_rois'):
    rois = []
    roi_scores = []
    image_shape = tf.expand_dims(image_shape, axis=1)
    for level in sorted(raw_scores.keys()):
      with tf.name_scope('level_%s' % level):
        _, feature_h, feature_w, num_anchors_per_location = (
            raw_scores[level].get_shape().as_list())

        num_boxes = feature_h * feature_w * num_anchors_per_location
        this_level_scores = tf.reshape(raw_scores[level], [-1, num_boxes])
        this_level_boxes = tf.reshape(raw_boxes[level], [-1, num_boxes, 4])
        this_level_anchors = tf.cast(
            tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
            dtype=this_level_scores.dtype)

        if apply_sigmoid_to_score:
          this_level_scores = tf.sigmoid(this_level_scores)

        if decode_boxes:
          this_level_boxes = box_ops.decode_boxes(this_level_boxes,
                                                  this_level_anchors)
        if clip_boxes:
          this_level_boxes = box_ops.clip_boxes(this_level_boxes, image_shape)

        if pre_nms_min_size_threshold > 0.0:
          this_level_boxes, this_level_scores = box_ops.filter_boxes(
              this_level_boxes, this_level_scores, image_shape,
              pre_nms_min_size_threshold)

        this_level_pre_nms_top_k = min(num_boxes, pre_nms_top_k)
        this_level_post_nms_top_k = min(num_boxes, num_proposals)
        if nms_iou_threshold > 0.0:
          if use_batched_nms:
            this_level_rois, this_level_roi_scores, _, _ = (
                tf.image.combined_non_max_suppression(
                    tf.expand_dims(this_level_boxes, axis=2),
                    tf.expand_dims(this_level_scores, axis=-1),
                    max_output_size_per_class=this_level_pre_nms_top_k,
                    max_total_size=this_level_post_nms_top_k,
                    iou_threshold=nms_iou_threshold,
                    score_threshold=pre_nms_score_threshold,
                    pad_per_class=False,
                    clip_boxes=False))
          else:
            if pre_nms_score_threshold > 0.0:
              this_level_boxes, this_level_scores = (
                  box_ops.filter_boxes_by_scores(this_level_boxes,
                                                 this_level_scores,
                                                 pre_nms_score_threshold))
            this_level_boxes, this_level_scores = box_ops.top_k_boxes(
                this_level_boxes,
                this_level_scores,
                k=this_level_pre_nms_top_k)
            this_level_roi_scores, this_level_rois = (
                nms.sorted_non_max_suppression_padded(
                    this_level_scores,
                    this_level_boxes,
                    max_output_size=this_level_post_nms_top_k,
                    iou_threshold=nms_iou_threshold))
        else:
          this_level_rois, this_level_roi_scores = box_ops.top_k_boxes(
              this_level_boxes,
              this_level_scores,
              k=this_level_post_nms_top_k)

        rois.append(this_level_rois)
        roi_scores.append(this_level_roi_scores)

    all_rois = tf.concat(rois, axis=1)
    all_roi_scores = tf.concat(roi_scores, axis=1)

    with tf.name_scope('top_k_rois'):
      _, num_valid_rois = all_roi_scores.get_shape().as_list()
      overall_top_k = min(num_valid_rois, num_proposals)

      selected_rois, selected_roi_scores = box_ops.top_k_boxes(
          all_rois, all_roi_scores, k=overall_top_k)

  return selected_rois, selected_roi_scores
def _call_box_outputs(
    self,
    images: tf.Tensor,
    image_shape: tf.Tensor,
    anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
    gt_boxes: Optional[tf.Tensor] = None,
    gt_classes: Optional[tf.Tensor] = None,
    training: Optional[bool] = None
) -> Tuple[Mapping[str, tf.Tensor], Mapping[str, tf.Tensor]]:
  """Implementation of the Faster R-CNN logic for boxes."""
  model_outputs = {}

  # Feature extraction.
  (backbone_features,
   decoder_features) = self._get_backbone_and_decoder_features(images)

  # Region proposal network.
  rpn_scores, rpn_boxes = self.rpn_head(decoder_features)

  model_outputs.update({
      'backbone_features': backbone_features,
      'decoder_features': decoder_features,
      'rpn_boxes': rpn_boxes,
      'rpn_scores': rpn_scores
  })

  # Generate anchor boxes for this batch if not provided.
  if anchor_boxes is None:
    _, image_height, image_width, _ = images.get_shape().as_list()
    anchor_boxes = anchor.Anchor(
        min_level=self._config_dict['min_level'],
        max_level=self._config_dict['max_level'],
        num_scales=self._config_dict['num_scales'],
        aspect_ratios=self._config_dict['aspect_ratios'],
        anchor_size=self._config_dict['anchor_size'],
        image_size=(image_height, image_width)).multilevel_boxes
    for l in anchor_boxes:
      anchor_boxes[l] = tf.tile(
          tf.expand_dims(anchor_boxes[l], axis=0),
          [tf.shape(images)[0], 1, 1, 1])

  # Generate RoIs.
  current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores, anchor_boxes,
                                       image_shape, training)

  next_rois = current_rois
  all_class_outputs = []
  for cascade_num in range(len(self.roi_sampler)):
    # In Cascade R-CNN, the later heads use different regression weights
    # because the predicted deltas become smaller and smaller.
    regression_weights = self._cascade_layer_to_weights[cascade_num]
    current_rois = next_rois

    (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
     matched_gt_classes, matched_gt_indices,
     current_rois) = self._run_frcnn_head(
         features=decoder_features,
         rois=current_rois,
         gt_boxes=gt_boxes,
         gt_classes=gt_classes,
         training=training,
         model_outputs=model_outputs,
         cascade_num=cascade_num,
         regression_weights=regression_weights)
    all_class_outputs.append(class_outputs)

    # Generate RoIs for the next cascade head if there is any.
    if cascade_num < len(self.roi_sampler) - 1:
      next_rois = box_ops.decode_boxes(
          tf.cast(box_outputs, tf.float32),
          current_rois,
          weights=regression_weights)
      next_rois = box_ops.clip_boxes(next_rois,
                                     tf.expand_dims(image_shape, axis=1))

  if not training:
    if self._config_dict['cascade_class_ensemble']:
      class_outputs = tf.add_n(all_class_outputs) / len(all_class_outputs)

    detections = self.detection_generator(
        box_outputs,
        class_outputs,
        current_rois,
        image_shape,
        regression_weights,
        bbox_per_class=(not self._config_dict['class_agnostic_bbox_pred']))
    model_outputs.update({
        'cls_outputs': class_outputs,
        'box_outputs': box_outputs,
    })
    if self.detection_generator.get_config()['apply_nms']:
      model_outputs.update({
          'detection_boxes': detections['detection_boxes'],
          'detection_scores': detections['detection_scores'],
          'detection_classes': detections['detection_classes'],
          'num_detections': detections['num_detections']
      })
    else:
      model_outputs.update({
          'decoded_boxes': detections['decoded_boxes'],
          'decoded_box_scores': detections['decoded_box_scores']
      })

  intermediate_outputs = {
      'matched_gt_boxes': matched_gt_boxes,
      'matched_gt_indices': matched_gt_indices,
      'matched_gt_classes': matched_gt_classes,
      'current_rois': current_rois,
  }
  return (model_outputs, intermediate_outputs)
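# Hedged sketch of the per-cascade regression weights consumed above. The
# actual values of `self._cascade_layer_to_weights` are defined elsewhere in
# the class; the ones below follow the common Cascade R-CNN convention and
# are shown only to illustrate why later heads use larger weights (i.e.,
# smaller expected deltas).
#
#   _cascade_layer_to_weights = [
#       [10.0, 10.0, 5.0, 5.0],    # Head 0: standard Faster R-CNN weights.
#       [20.0, 20.0, 10.0, 10.0],  # Head 1: deltas roughly halve.
#       [30.0, 30.0, 15.0, 15.0],  # Head 2: deltas shrink further.
#   ]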
def affine_warp_boxes(affine, boxes, output_size, box_history):
  """Applies random rotation, perspective change, translation and scaling.

  Args:
    affine: A `Tensor` for the augmenting matrix for the boxes.
    boxes: A `Tensor` for the boxes.
    output_size: A `list` of two integers, a two-element vector or a tensor
      such that all but the last dimensions are `broadcastable` to `boxes`.
      The last dimension is 2, which represents [height, width].
    box_history: A `Tensor` for the boxes history, which are the boxes that
      undergo the same augmentations as `boxes`, but with no clipping applied.
      We can keep track of how much the boxes have changed by keeping track of
      this tensor.

  Returns:
    clipped_boxes: A `Tensor` representing the augmented boxes.
    box_history: A `Tensor` representing the augmented box_history.
  """

  def _get_corners(box):
    """Gets the corners of each box as a tuple of (x, y) coordinates."""
    ymi, xmi, yma, xma = tf.split(box, 4, axis=-1)
    tl = tf.concat([xmi, ymi], axis=-1)
    bl = tf.concat([xmi, yma], axis=-1)
    tr = tf.concat([xma, ymi], axis=-1)
    br = tf.concat([xma, yma], axis=-1)
    return tf.concat([tl, bl, tr, br], axis=-1)

  def _corners_to_boxes(corner):
    """Converts (x, y) corners back into boxes [ymin, xmin, ymax, xmax]."""
    corner = tf.reshape(corner, [-1, 4, 2])
    y = corner[..., 1]
    x = corner[..., 0]
    y_min = tf.reduce_min(y, axis=-1)
    x_min = tf.reduce_min(x, axis=-1)
    y_max = tf.reduce_max(y, axis=-1)
    x_max = tf.reduce_max(x, axis=-1)
    return tf.stack([y_min, x_min, y_max, x_max], axis=-1)

  def _aug_boxes(affine_matrix, box):
    """Applies an affine transformation matrix to the corners of each box."""
    corners = _get_corners(box)
    corners = tf.reshape(corners, [-1, 4, 2])
    # Appends a homogeneous coordinate so the 3x3 affine can be applied.
    z = tf.expand_dims(tf.ones_like(corners[..., 1]), axis=-1)
    corners = tf.concat([corners, z], axis=-1)

    corners = tf.transpose(
        tf.matmul(affine_matrix, corners, transpose_b=True), perm=(0, 2, 1))

    corners, p = tf.split(corners, [2, 1], axis=-1)
    corners /= p
    corners = tf.reshape(corners, [-1, 8])
    box = _corners_to_boxes(corners)
    return box

  boxes = _aug_boxes(affine, boxes)
  box_history = _aug_boxes(affine, box_history)
  clipped_boxes = bbox_ops.clip_boxes(boxes, output_size)
  return clipped_boxes, box_history
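# A minimal sketch for `affine_warp_boxes`, assuming `affine` is a 3x3
# homogeneous matrix as implied by the corner math above. Here it encodes a
# pure translation of (dx=10, dy=20) pixels; the box values and canvas size
# are hypothetical.
def _example_affine_warp_boxes():
  affine = tf.constant([[1., 0., 10.],   # x' = x + 10
                        [0., 1., 20.],   # y' = y + 20
                        [0., 0., 1.]])   # Homogeneous row.
  boxes = tf.constant([[30., 40., 60., 80.]])  # [ymin, xmin, ymax, xmax].
  output_size = tf.constant([256., 256.])      # Clipping canvas [h, w].
  clipped, history = affine_warp_boxes(affine, boxes, output_size,
                                       box_history=boxes)
  # clipped == [[50., 50., 80., 90.]]; history holds the unclipped result.
  return clipped, history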