def _propose_rois_gpu(scores,
                      boxes,
                      anchor_boxes,
                      height,
                      width,
                      scale,
                      rpn_pre_nms_topn,
                      rpn_post_nms_topn,
                      rpn_nms_threshold,
                      rpn_min_size,
                      bbox_reg_weights):
    """Proposes RoIs given a group of candidates (GPU version).

    The candidates are decoded against their anchors, clipped to the image,
    optionally filtered by a minimum size, and finally reduced either by NMS
    (when `rpn_nms_threshold > 0`) or by a plain top-k on the scores.

    Args:
      scores: a tensor with a shape of [batch_size, num_boxes]. Both
        dimensions must be statically known because they are used to build
        reshape targets.
      boxes: a tensor with a shape of [batch_size, num_boxes, 4], in the
        encoded form.
      anchor_boxes: an Anchors object that contains the anchors with a shape
        of [batch_size, num_boxes, 4].
      height: a tensor of shape [batch_size, 1, 1] representing the image
        height.
      width: a tensor of shape [batch_size, 1, 1] representing the image
        width.
      scale: a tensor of shape [batch_size, 1, 1] representing the image
        scale. Only forwarded to `box_utils.filter_boxes`.
      rpn_pre_nms_topn: an integer number of top scoring RPN proposals to
        keep before applying NMS. This is *per FPN level* (not total).
      rpn_post_nms_topn: an integer number of top scoring RPN proposals to
        keep after applying NMS. This is the total number of RPN proposals
        produced.
      rpn_nms_threshold: a float number between 0 and 1 as the NMS threshold
        used on RPN proposals. When it is not greater than 0, NMS is skipped
        and the top-scoring boxes are returned instead.
      rpn_min_size: an integer number as the minimum proposal height and
        width; both need to be greater than this number. Note that this
        number is at the original image scale, not the scale used during
        training or inference. The size filter only runs when this is
        greater than 0.
      bbox_reg_weights: None or a list of four integers specifying the
        weights used when decoding the box.

    Returns:
      scores: the scores of the selected proposals. On the NMS path this is
        the `nmsed_scores` output of
        `tf.image.combined_non_max_suppression`, whose second dimension is
        min(num_boxes, rpn_pre_nms_topn, rpn_post_nms_topn).
      boxes: the selected proposal boxes in [ymin, xmin, ymax, xmax] form.
        On the NMS path they are passed through
        `box_utils.to_absolute_coordinates` before being returned, i.e.
        they are presumably back in absolute image coordinates (confirm
        against `box_utils`).
    """
    batch_size, num_boxes = scores.get_shape().as_list()
    pre_nms_limit = min(num_boxes, rpn_pre_nms_topn)

    decoded_boxes = box_utils.decode_boxes(boxes, anchor_boxes,
                                           bbox_reg_weights)
    boxes = box_utils.clip_boxes(decoded_boxes, height, width)

    if rpn_min_size > 0.0:
        # filter_boxes is called with a trailing singleton score axis, which
        # is removed again right after.
        boxes, filtered_scores = box_utils.filter_boxes(
            boxes, tf.expand_dims(scores, axis=-1), rpn_min_size, height,
            width, scale)
        scores = tf.squeeze(filtered_scores, axis=-1)

    # Never ask for more proposals after NMS than were kept before it.
    post_nms_limit = min(rpn_post_nms_topn, pre_nms_limit)

    if rpn_nms_threshold > 0:
        # Normalize coordinates as combined_non_max_suppression currently
        # only supports normalized coordinates.
        normalized_boxes = box_utils.to_normalized_coordinates(
            boxes, height, width)
        # Single pseudo-class: [batch, num_boxes, q=1, 4] boxes and
        # [batch, num_boxes, num_classes=1] scores.
        nms_input_boxes = tf.reshape(normalized_boxes,
                                     [batch_size, num_boxes, 1, 4])
        nms_input_scores = tf.reshape(scores, [batch_size, num_boxes, 1])
        nms_boxes, nms_scores, _, _ = tf.image.combined_non_max_suppression(
            nms_input_boxes,
            nms_input_scores,
            max_output_size_per_class=pre_nms_limit,
            max_total_size=post_nms_limit,
            iou_threshold=rpn_nms_threshold,
            score_threshold=0.0,
            pad_per_class=False)
        return nms_scores, box_utils.to_absolute_coordinates(
            nms_boxes, height, width)

    # NMS disabled: keep the highest-scoring boxes only.
    top_scores, top_boxes_list = box_utils.top_k(
        scores, k=post_nms_limit, boxes_list=[boxes])
    return top_scores, top_boxes_list[0]
def generate_detections_gpu(class_outputs,
                            box_outputs,
                            anchor_boxes,
                            image_info,
                            pre_nms_num_detections=1000,
                            post_nms_num_detections=100,
                            nms_threshold=0.3,
                            bbox_reg_weights=(10., 10., 5., 5.)):
    """Generate the final detections given the model outputs (GPU version).

    Args:
      class_outputs: a tensor with shape [batch_size, N, num_classes], which
        stacks class logit outputs on all feature levels. The N is the number
        of total anchors on all levels. The num_classes is the number of
        classes predicted by the model. Note that the class_outputs here is
        the raw score. All three dimensions must be statically known, as
        they are used to build reshape targets.
      box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
        stacks box regression outputs on all feature levels. The N is the
        number of total anchors on all levels.
      anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks
        anchors on all feature levels. The N is the number of total anchors
        on all levels.
      image_info: a tensor of shape [batch_size, 5] which encodes each
        image's [height, width, scale, original_height, original_width].
        Only the height and width columns are read here.
      pre_nms_num_detections: an integer that specifies the number of
        candidates before NMS (used as the per-class NMS output limit).
      post_nms_num_detections: an integer that specifies the number of
        candidates after NMS (used as the total NMS output limit).
      nms_threshold: a float number to specify the IOU threshold of NMS.
      bbox_reg_weights: a list of 4 float scalars, which are default weights
        on (dx, dy, dw, dh) for normalizing bbox regression targets.

    Returns:
      A tuple `(num_valid_boxes, boxes, classes, scores)` built from the
      outputs of `tf.image.combined_non_max_suppression`, batched along the
      first dimension:
        num_valid_boxes: the number of valid detections per image.
        boxes: the detected boxes after
          `box_utils.to_absolute_coordinates`.
        classes: the class id of each detection as float32, shifted by one
          so that index 0 keeps referring to the removed background class.
        scores: the score of each detection.
    """
    # NOTE(review): a later definition in this file appears to reuse the name
    # `generate_detections_gpu` with an extra `image_id` parameter; if both
    # live at module level the later one rebinds the name. Confirm which
    # variant callers expect.
    with tf.name_scope('generate_detections'):
        batch_size, num_boxes, num_classes = (
            class_outputs.get_shape().as_list())
        num_fg_classes = num_classes - 1
        softmax_class_outputs = tf.nn.softmax(class_outputs)

        # Remove background (class index 0) from scores and box regressions.
        scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
        boxes = tf.slice(
            tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
            [0, 0, 1, 0], [-1, -1, -1, -1])

        # Replicate every anchor once per foreground class. tf.tile copies
        # the values as-is, so it does not force a float32 multiply.
        anchor_boxes = tf.tile(
            tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_fg_classes, 1])

        # Flatten (anchor, class) pairs into a single detection axis.
        num_detections = num_boxes * num_fg_classes
        boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
        scores = tf.reshape(scores, [batch_size, num_detections, 1])
        anchor_boxes = tf.reshape(anchor_boxes,
                                  [batch_size, num_detections, 4])

        # Decode
        boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)

        # Clip boxes. height/width become [batch_size, 1, 1].
        height = tf.expand_dims(image_info[:, 0:1], axis=-1)
        width = tf.expand_dims(image_info[:, 1:2], axis=-1)
        boxes = box_utils.clip_boxes(boxes, height, width)

        # NMS. Boxes are normalized first and converted back afterwards.
        pre_nms_boxes = box_utils.to_normalized_coordinates(
            boxes, height, width)
        pre_nms_boxes = tf.reshape(
            pre_nms_boxes, [batch_size, num_boxes, num_fg_classes, 4])
        pre_nms_scores = tf.reshape(
            scores, [batch_size, num_boxes, num_fg_classes])
        (post_nms_boxes, post_nms_scores, post_nms_classes,
         post_nms_num_valid_boxes) = tf.image.combined_non_max_suppression(
             pre_nms_boxes,
             pre_nms_scores,
             max_output_size_per_class=pre_nms_num_detections,
             max_total_size=post_nms_num_detections,
             iou_threshold=nms_threshold,
             score_threshold=0.0,
             pad_per_class=False)

        # Undo the background removal: class 0 of the NMS input is class 1
        # of the model output.
        post_nms_classes = post_nms_classes + 1
        post_nms_boxes = box_utils.to_absolute_coordinates(
            post_nms_boxes, height, width)

        # tf.cast replaces the deprecated tf.to_float.
        return (post_nms_num_valid_boxes, post_nms_boxes,
                tf.cast(post_nms_classes, tf.float32), post_nms_scores)
def generate_detections_gpu(class_outputs,
                            box_outputs,
                            anchor_boxes,
                            image_id,
                            image_info,
                            pre_nms_num_detections=1000,
                            post_nms_num_detections=100,
                            nms_threshold=0.3,
                            bbox_reg_weights=(10., 10., 5., 5.)):
    """Generate the final detections given the model outputs (GPU version).

    Args:
      class_outputs: a tensor with shape [batch_size, N, num_classes], which
        stacks class logit outputs on all feature levels. The N is the number
        of total anchors on all levels. The num_classes is the number of
        classes predicted by the model. Note that the class_outputs here is
        the raw score. All three dimensions must be statically known, as
        they are used to build reshape targets.
      box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
        stacks box regression outputs on all feature levels. The N is the
        number of total anchors on all levels.
      anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks
        anchors on all feature levels. The N is the number of total anchors
        on all levels.
      image_id: a tensor with shape [batch_size] which specifies the image id
        of each image in the batch.
      image_info: a tensor of shape [batch_size, 5] which encodes each
        image's [height, width, scale, original_height, original_width].
        Only the height and width columns are read here.
      pre_nms_num_detections: an integer that specifies the number of
        candidates before NMS (used as the per-class NMS output limit).
      post_nms_num_detections: an integer that specifies the number of
        candidates after NMS (used as the total NMS output limit).
      nms_threshold: a float number to specify the IOU threshold of NMS.
      bbox_reg_weights: a list of 4 float scalars, which are default weights
        on (dx, dy, dw, dh) for normalizing bbox regression targets.

    Returns:
      detections: a float tensor of
        [batch_size, post_nms_num_detections, 7], which stacks
        `post_nms_num_detections` number of detection results for each image
        in the batch. Each detection is stored in the format of
        [image_id, ymin, xmin, ymax, xmax, score, class] in the last
        dimension. Class ids are shifted by one so that index 0 keeps
        referring to the removed background class.
    """
    batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
    num_fg_classes = num_classes - 1
    softmax_class_outputs = tf.nn.softmax(class_outputs)

    # Remove background (class index 0) from scores and box regressions.
    scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
    boxes = tf.slice(
        tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
        [0, 0, 1, 0], [-1, -1, -1, -1])

    # Replicate every anchor once per foreground class.
    anchor_boxes = tf.tile(
        tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_fg_classes, 1])

    # Flatten (anchor, class) pairs into a single detection axis.
    num_detections = num_boxes * num_fg_classes
    boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
    scores = tf.reshape(scores, [batch_size, num_detections, 1])
    anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])

    # Decode
    boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)

    # Clip boxes. height/width become [batch_size, 1, 1]; the scale column of
    # image_info is not needed for post-processing.
    height = tf.expand_dims(image_info[:, 0:1], axis=-1)
    width = tf.expand_dims(image_info[:, 1:2], axis=-1)
    boxes = box_utils.clip_boxes(boxes, height, width)

    # NMS. Normalize while the boxes are still rank 3 (the same rank
    # clip_boxes was just given) so the rank-3 height/width line up with the
    # batch axis; only afterwards split the detection axis into
    # (anchor, class) for combined_non_max_suppression.
    pre_nms_boxes = box_utils.to_normalized_coordinates(boxes, height, width)
    pre_nms_boxes = tf.reshape(
        pre_nms_boxes, [batch_size, num_boxes, num_fg_classes, 4])
    pre_nms_scores = tf.reshape(
        scores, [batch_size, num_boxes, num_fg_classes])
    post_nms_boxes, post_nms_scores, post_nms_classes, _ = (
        tf.image.combined_non_max_suppression(
            pre_nms_boxes,
            pre_nms_scores,
            max_output_size_per_class=pre_nms_num_detections,
            max_total_size=post_nms_num_detections,
            iou_threshold=nms_threshold,
            score_threshold=0.0,
            pad_per_class=False))

    # Undo the background removal: class 0 of the NMS input is class 1 of the
    # model output.
    post_nms_classes = post_nms_classes + 1
    post_nms_boxes = box_utils.to_absolute_coordinates(
        post_nms_boxes, height, width)

    # Assemble [image_id, ymin, xmin, ymax, xmax, score, class] for the whole
    # batch at once. The explicit reshapes pin the static output shape and
    # still rely on the static batch size read from class_outputs.
    num_outputs = post_nms_num_detections
    image_id_column = tf.tile(
        tf.reshape(tf.cast(image_id, tf.float32), [batch_size, 1, 1]),
        [1, num_outputs, 1])
    detections = tf.concat([
        image_id_column,
        tf.reshape(post_nms_boxes, [batch_size, num_outputs, 4]),
        tf.reshape(post_nms_scores, [batch_size, num_outputs, 1]),
        tf.cast(
            tf.reshape(post_nms_classes, [batch_size, num_outputs, 1]),
            tf.float32),
    ], axis=-1)
    return detections