def generate_detections_per_image_tpu(cls_outputs,
                                      box_outputs,
                                      anchor_boxes,
                                      image_info,
                                      pre_nms_num_detections=1000,
                                      post_nms_num_detections=100,
                                      nms_threshold=0.3,
                                      bbox_reg_weights=(10., 10., 5., 5.)):
  """Generate the final detections per image given the model outputs.

  The `pre_nms_num_detections` best (anchor, class) candidates are selected by
  score, decoded against their anchors and clipped to the image. NMS is then
  run once per foreground class, the per-class survivors are concatenated, and
  the `post_nms_num_detections` highest scoring entries are returned.

  Args:
    cls_outputs: a tensor with shape [N, num_classes], which stacks class
      logit outputs on all feature levels. The N is the number of total
      anchors on all levels. The num_classes is the number of classes
      predicted by the model. Note that the cls_outputs should be the output
      of softmax().
    box_outputs: a tensor with shape [N, num_classes*4], which stacks box
      regression outputs on all feature levels. The N is the number of total
      anchors on all levels.
    anchor_boxes: a tensor with shape [N, 4], which stacks anchors on all
      feature levels. The N is the number of total anchors on all levels.
    image_info: a tensor of shape [5] which encodes the input image's [height,
      width, scale, original_height, original_width].
    pre_nms_num_detections: an integer that specifies the number of candidates
      before NMS.
    post_nms_num_detections: an integer that specifies the number of
      candidates after NMS.
    nms_threshold: a float number to specify the IOU threshold of NMS.
    bbox_reg_weights: a list of 4 float scalars, which are default weights on
      (dx, dy, dw, dh) for normalizing bbox regression targets.

  Returns:
    A tuple `(num_valid_boxes, post_nms_boxes, box_classes, post_nms_scores)`:
      num_valid_boxes: float count of returned detections whose score is
        greater than 0.
      post_nms_boxes: boxes of the returned detections, ordered by descending
        score.
      box_classes: float class ids of the returned detections. Ids start at 1
        because the background column (index 0) is removed before selection.
      post_nms_scores: float scores of the returned detections in descending
        order; padding entries have score 0.
  """
  num_boxes, num_classes = cls_outputs.get_shape().as_list()
  # Column 0 is the background class; everything below works on the rest.
  num_foreground_classes = num_classes - 1

  # Remove background class scores.
  cls_outputs = cls_outputs[:, 1:num_classes]
  top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
      tf.reshape(cls_outputs, [-1]),
      k=pre_nms_num_detections,
      sorted=False)
  # The scores were flattened row-major from [N, num_foreground_classes], so
  # a flat index splits into (anchor index, foreground class index).
  classes = tf.mod(top_k_indices_with_classes, num_foreground_classes)
  top_k_indices = tf.floordiv(top_k_indices_with_classes,
                              num_foreground_classes)

  anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
  box_outputs = tf.reshape(
      box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
  # Pick, for every candidate, the regression output of its own class.
  box_outputs = tf.gather_nd(
      box_outputs, tf.stack([top_k_indices, classes], axis=1))

  # Apply bounding box regression to anchors.
  boxes = box_utils.decode_boxes(box_outputs, anchor_boxes, bbox_reg_weights)
  boxes = box_utils.clip_boxes(boxes, image_info[0], image_info[1])

  list_of_all_boxes = []
  list_of_all_scores = []
  list_of_all_classes = []
  # `classes` only takes values in [0, num_foreground_classes), so iterating
  # any further would only add an NMS op that runs on all-zero scores.
  for class_i in range(num_foreground_classes):
    # Compute bitmask for the given class.
    class_i_bitmask = tf.cast(tf.equal(classes, class_i), top_k_scores.dtype)
    # Zero the scores of candidates of other classes. This works because
    # score is in [0, 1].
    class_i_scores = top_k_scores * class_i_bitmask
    # The TPU and CPU have different behaviors for
    # tf.image.non_max_suppression_padded (b/116754376).
    (class_i_post_nms_indices,
     class_i_nms_num_valid) = tf.image.non_max_suppression_padded(
         tf.to_float(boxes),
         tf.to_float(class_i_scores),
         post_nms_num_detections,
         iou_threshold=nms_threshold,
         score_threshold=0.05,
         pad_to_max_output_size=True,
         name='nms_detections_' + str(class_i))
    class_i_post_nms_boxes = tf.gather(boxes, class_i_post_nms_indices)
    class_i_post_nms_scores = tf.gather(class_i_scores,
                                        class_i_post_nms_indices)
    # Entries past the valid count are padding; force their score to 0 so
    # they sort last and are not counted as valid.
    mask = tf.less(
        tf.range(post_nms_num_detections), [class_i_nms_num_valid])
    class_i_post_nms_scores = tf.where(
        mask, class_i_post_nms_scores,
        tf.zeros_like(class_i_post_nms_scores))
    # Shift by one to restore the original class id (background is 0).
    class_i_classes = tf.fill(tf.shape(class_i_post_nms_scores), class_i + 1)
    list_of_all_boxes.append(class_i_post_nms_boxes)
    list_of_all_scores.append(class_i_post_nms_scores)
    list_of_all_classes.append(class_i_classes)

  post_nms_boxes = tf.concat(list_of_all_boxes, axis=0)
  post_nms_scores = tf.concat(list_of_all_scores, axis=0)
  post_nms_classes = tf.concat(list_of_all_classes, axis=0)

  # Sort all results.
  post_nms_scores, sorted_indices = tf.nn.top_k(
      tf.to_float(post_nms_scores),
      k=post_nms_num_detections,
      sorted=True)
  post_nms_boxes = tf.gather(post_nms_boxes, sorted_indices)
  post_nms_classes = tf.gather(post_nms_classes, sorted_indices)

  valid_mask = tf.where(
      tf.greater(post_nms_scores, 0), tf.ones_like(post_nms_scores),
      tf.zeros_like(post_nms_scores))
  num_valid_boxes = tf.reduce_sum(valid_mask, axis=-1)
  box_classes = tf.to_float(post_nms_classes)
  return num_valid_boxes, post_nms_boxes, box_classes, post_nms_scores
def generate_detections_gpu(class_outputs,
                            box_outputs,
                            anchor_boxes,
                            image_info,
                            pre_nms_num_detections=1000,
                            post_nms_num_detections=100,
                            nms_threshold=0.3,
                            bbox_reg_weights=(10., 10., 5., 5.)):
  """Generate the final detections given the model outputs (GPU version).

  NOTE(review): another function with this same name (taking an additional
  `image_id` argument) is defined later in this source; if both live in the
  same module the later definition is the one that stays bound - confirm
  which one callers expect.

  Args:
    class_outputs: a tensor with shape [batch_size, N, num_classes], which
      stacks class logit outputs on all feature levels. The N is the number of
      total anchors on all levels. The num_classes is the number of classes
      predicted by the model. Note that the class_outputs here is the raw
      score; softmax is applied inside this function.
    box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
      stacks box regression outputs on all feature levels. The N is the number
      of total anchors on all levels.
    anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks anchors
      on all feature levels. The N is the number of total anchors on all
      levels.
    image_info: a tensor of shape [batch_size, 5] which encodes each image's
      [height, width, scale, original_height, original_width].
    pre_nms_num_detections: an integer that specifies the number of candidates
      before NMS. It is passed as the per-class output limit of the NMS op.
    post_nms_num_detections: an integer that specifies the number of
      candidates after NMS. It is passed as the total output limit of the NMS
      op.
    nms_threshold: a float number to specify the IOU threshold of NMS.
    bbox_reg_weights: a list of 4 float scalars, which are default weights on
      (dx, dy, dw, dh) for normalizing bbox regression targets.

  Returns:
    a tuple of tensors corresponding to number of valid boxes, box
    coordinates, object categories for each boxes, and box scores stacked
    in batch_size. Categories are returned as floats and start at 1, since
    the background class (index 0) is removed before NMS.
  """
  with tf.name_scope('generate_detections'):
    batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
    softmax_class_outputs = tf.nn.softmax(class_outputs)

    # Remove background. `scores` is [batch_size, num_boxes, num_classes - 1].
    scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
    boxes = tf.slice(
        tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
        [0, 0, 1, 0], [-1, -1, -1, -1])

    # Replicate every anchor once per foreground class. tf.tile copies the
    # values as-is, so it does not depend on the anchors being float32 the
    # way multiplying by a tf.ones tensor does.
    anchor_boxes = tf.tile(
        tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])

    num_detections = num_boxes * (num_classes - 1)

    boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
    anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])

    # Decode
    boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)

    # Clip boxes. height / width are [batch_size, 1, 1].
    height = tf.expand_dims(image_info[:, 0:1], axis=-1)
    width = tf.expand_dims(image_info[:, 1:2], axis=-1)
    boxes = box_utils.clip_boxes(boxes, height, width)

    # NMS. Boxes are normalized before being handed to
    # combined_non_max_suppression and converted back afterwards.
    pre_nms_boxes = box_utils.to_normalized_coordinates(boxes, height, width)
    pre_nms_boxes = tf.reshape(
        pre_nms_boxes, [batch_size, num_boxes, num_classes - 1, 4])
    pre_nms_scores = tf.reshape(
        scores, [batch_size, num_boxes, num_classes - 1])
    (post_nms_boxes, post_nms_scores, post_nms_classes,
     post_nms_num_valid_boxes) = tf.image.combined_non_max_suppression(
         pre_nms_boxes,
         pre_nms_scores,
         max_output_size_per_class=pre_nms_num_detections,
         max_total_size=post_nms_num_detections,
         iou_threshold=nms_threshold,
         score_threshold=0.0,
         pad_per_class=False)
    # Shift by one to restore the original class id (background is 0).
    post_nms_classes = post_nms_classes + 1
    post_nms_boxes = box_utils.to_absolute_coordinates(
        post_nms_boxes, height, width)

  return (post_nms_num_valid_boxes, post_nms_boxes,
          tf.to_float(post_nms_classes), post_nms_scores)
def _propose_rois_gpu(scores,
                      boxes,
                      anchor_boxes,
                      height,
                      width,
                      scale,
                      rpn_pre_nms_topn,
                      rpn_post_nms_topn,
                      rpn_nms_threshold,
                      rpn_min_size,
                      bbox_reg_weights):
  """Proposes RoIs given a group of candidates (GPU version).

  All candidates are decoded and clipped, optionally filtered by minimum size,
  and then reduced either with `tf.image.combined_non_max_suppression` (when
  `rpn_nms_threshold > 0`) or with a plain top-k selection.

  Args:
    scores: a tensor with a shape of [batch_size, num_boxes].
    boxes: a tensor with a shape of [batch_size, num_boxes, 4], in the encoded
      form.
    anchor_boxes: an Anchors object that contains the anchors with a shape of
      [batch_size, num_boxes, 4].
    height: a tensor of shape [batch_size, 1, 1] representing the image
      height.
    width: a tensor of shape [batch_size, 1, 1] representing the image width.
    scale: a tensor of shape [batch_size, 1, 1] representing the image scale.
    rpn_pre_nms_topn: an integer number of top scoring RPN proposals to keep
      before applying NMS. This is *per FPN level* (not total).
    rpn_post_nms_topn: an integer number of top scoring RPN proposals to keep
      after applying NMS. This is the total number of RPN proposals produced.
    rpn_nms_threshold: a float number between 0 and 1 as the NMS threshold
      used on RPN proposals. A value that is not greater than 0 disables NMS.
    rpn_min_size: an integer number as the minimum proposal height and width;
      both need to be greater than this number. Note that this number is at
      the original image scale, not the scale used during training or
      inference. Filtering is skipped unless this is greater than 0.
    bbox_reg_weights: None or a list of four integers specifying the weights
      used when decoding the box.

  Returns:
    scores: a tensor holding the scores of the kept proposals, at most
      min(num_boxes, rpn_pre_nms_topn, rpn_post_nms_topn) per image.
      (Earlier documentation gave the shape as
      [batch_size, rpn_post_nms_topn, 1] - TODO confirm against callers.)
    boxes: a tensor holding the kept proposals that correspond to `scores`.
      After NMS the boxes are converted back with
      `box_utils.to_absolute_coordinates`.
  """
  static_batch, static_num_boxes = scores.get_shape().as_list()

  # Candidate budget handed to NMS and the final number of proposals to keep.
  if rpn_pre_nms_topn < static_num_boxes:
    pre_nms_budget = rpn_pre_nms_topn
  else:
    pre_nms_budget = static_num_boxes
  post_nms_budget = min(rpn_post_nms_topn, pre_nms_budget)

  # Decode w.r.t. the anchors, then clip to the image boundary.
  proposals = box_utils.clip_boxes(
      box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights),
      height, width)

  if rpn_min_size > 0.0:
    # filter_boxes takes scores with a trailing unit dimension.
    proposals, filtered_scores = box_utils.filter_boxes(
        proposals,
        tf.expand_dims(scores, axis=-1),
        rpn_min_size,
        height,
        width,
        scale)
    scores = tf.squeeze(filtered_scores, axis=-1)

  if not rpn_nms_threshold > 0:
    # NMS disabled: just keep the best scoring proposals.
    top_scores, gathered = box_utils.top_k(
        scores, k=post_nms_budget, boxes_list=[proposals])
    return top_scores, gathered[0]

  # Normalize coordinates as combined_non_max_suppression currently only
  # supports normalized coordinates. A single "class" axis is inserted.
  nms_input_boxes = box_utils.to_normalized_coordinates(
      proposals, height, width)
  nms_input_boxes = tf.reshape(
      nms_input_boxes, [static_batch, static_num_boxes, 1, 4])
  nms_input_scores = tf.reshape(scores, [static_batch, static_num_boxes, 1])
  kept_boxes, kept_scores, _, _ = tf.image.combined_non_max_suppression(
      nms_input_boxes,
      nms_input_scores,
      max_output_size_per_class=pre_nms_budget,
      max_total_size=post_nms_budget,
      iou_threshold=rpn_nms_threshold,
      score_threshold=0.0,
      pad_per_class=False)
  kept_boxes = box_utils.to_absolute_coordinates(kept_boxes, height, width)
  return kept_scores, kept_boxes
def _propose_rois_tpu(scores,
                      boxes,
                      anchor_boxes,
                      height,
                      width,
                      scale,
                      rpn_pre_nms_topn,
                      rpn_post_nms_topn,
                      rpn_nms_threshold,
                      rpn_min_size,
                      bbox_reg_weights):
  """Proposes RoIs given a group of candidates (TPU version).

  The best scoring candidates are selected first, then decoded, clipped and
  filtered by minimum size. If `rpn_nms_threshold > 0` they go through
  `box_utils.sorted_non_max_suppression_padded`; a final top-k picks the
  proposals that are returned.

  Args:
    scores: a tensor with a shape of [batch_size, num_boxes].
    boxes: a tensor with a shape of [batch_size, num_boxes, 4], in the encoded
      form.
    anchor_boxes: an Anchors object that contains the anchors with a shape of
      [batch_size, num_boxes, 4].
    height: a tensor of shape [batch_size, 1, 1] representing the image
      height.
    width: a tensor of shape [batch_size, 1, 1] representing the image width.
    scale: a tensor of shape [batch_size, 1, 1] representing the image scale.
    rpn_pre_nms_topn: an integer number of top scoring RPN proposals to keep
      before applying NMS. This is *per FPN level* (not total).
    rpn_post_nms_topn: an integer number of top scoring RPN proposals to keep
      after applying NMS. This is the total number of RPN proposals produced.
    rpn_nms_threshold: a float number between 0 and 1 as the NMS threshold
      used on RPN proposals. A value that is not greater than 0 disables NMS.
    rpn_min_size: an integer number as the minimum proposal height and width;
      both need to be greater than this number. Note that this number is at
      the original image scale, not the scale used during training or
      inference.
    bbox_reg_weights: None or a list of four integers specifying the weights
      used when decoding the box.

  Returns:
    scores: a tensor holding the scores of the kept proposals,
      min(num_boxes, rpn_pre_nms_topn, rpn_post_nms_topn) per image.
      (Earlier documentation gave the shape as
      [batch_size, rpn_post_nms_topn, 1] - TODO confirm against callers.)
    boxes: a tensor holding the kept proposals that correspond to `scores`.
      No coordinate normalization is applied in this function.
  """
  _, static_num_boxes = scores.get_shape().as_list()

  # Number of candidates considered before NMS and kept after it.
  pre_nms_budget = min(rpn_pre_nms_topn, static_num_boxes)
  post_nms_budget = min(rpn_post_nms_topn, pre_nms_budget)

  # Keep only the best scoring candidates together with their anchors.
  top_scores, gathered = box_utils.top_k(
      scores, k=pre_nms_budget, boxes_list=[boxes, anchor_boxes])
  encoded_boxes = gathered[0]
  selected_anchors = gathered[1]

  # Decode boxes w.r.t. anchors and transform to the absolute coordinates.
  proposals = box_utils.decode_boxes(
      encoded_boxes, selected_anchors, bbox_reg_weights)

  # Clip boxes that exceed the boundary.
  proposals = box_utils.clip_boxes(proposals, height, width)

  # Filter boxes that have one side less than the rpn_min_size threshold.
  # filter_boxes takes scores with a trailing unit dimension.
  proposals, filtered_scores = box_utils.filter_boxes(
      proposals,
      tf.expand_dims(top_scores, axis=-1),
      rpn_min_size,
      height,
      width,
      scale)
  kept_scores = tf.squeeze(filtered_scores, axis=-1)

  # NMS.
  if rpn_nms_threshold > 0:
    kept_scores, proposals = box_utils.sorted_non_max_suppression_padded(
        kept_scores,
        proposals,
        max_output_size=post_nms_budget,
        iou_threshold=rpn_nms_threshold)

  # Pick top-K post NMS'ed boxes.
  final_scores, final_boxes_list = box_utils.top_k(
      kept_scores, k=post_nms_budget, boxes_list=[proposals])
  return final_scores, final_boxes_list[0]
def generate_detections_gpu(class_outputs,
                            box_outputs,
                            anchor_boxes,
                            image_id,
                            image_info,
                            pre_nms_num_detections=1000,
                            post_nms_num_detections=100,
                            nms_threshold=0.3,
                            bbox_reg_weights=(10., 10., 5., 5.)):
  """Generate the final detections given the model outputs (GPU version).

  Args:
    class_outputs: a tensor with shape [batch_size, N, num_classes], which
      stacks class logit outputs on all feature levels. The N is the number of
      total anchors on all levels. The num_classes is the number of classes
      predicted by the model. Note that the class_outputs here is the raw
      score; softmax is applied inside this function.
    box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
      stacks box regression outputs on all feature levels. The N is the number
      of total anchors on all levels.
    anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks anchors
      on all feature levels. The N is the number of total anchors on all
      levels.
    image_id: a tensor with shape [batch_size] which specifies the image id of
      each image in the batch.
    image_info: a tensor of shape [batch_size, 5] which encodes each image's
      [height, width, scale, original_height, original_width].
    pre_nms_num_detections: an integer that specifies the number of candidates
      before NMS. It is passed as the per-class output limit of the NMS op.
    post_nms_num_detections: an integer that specifies the number of
      candidates after NMS. It is passed as the total output limit of the NMS
      op.
    nms_threshold: a float number to specify the IOU threshold of NMS.
    bbox_reg_weights: a list of 4 float scalars, which are default weights on
      (dx, dy, dw, dh) for normalizing bbox regression targets.

  Returns:
    detections: a tensor of [batch_size, post_nms_num_detections, 7], which
      stacks `post_nms_num_detections` number of detection results for each
      image in the batch. Each detection is stored in the format of
      [image_id, ymin, xmin, ymax, xmax, score, class] in the last dimension.
      Class ids start at 1, since the background class (index 0) is removed
      before NMS.
  """
  # The reshapes below use the static shape, so batch_size must be known.
  batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
  softmax_class_outputs = tf.nn.softmax(class_outputs)

  # Remove background. `scores` is [batch_size, num_boxes, num_classes - 1].
  scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
  boxes = tf.slice(
      tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
      [0, 0, 1, 0], [-1, -1, -1, -1])

  # Replicate every anchor once per foreground class.
  anchor_boxes = tf.tile(
      tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])

  num_detections = num_boxes * (num_classes - 1)

  boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
  anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])

  # Decode
  boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)

  # Clip boxes. height / width are [batch_size, 1, 1].
  height = tf.expand_dims(image_info[:, 0:1], axis=-1)
  width = tf.expand_dims(image_info[:, 1:2], axis=-1)
  boxes = box_utils.clip_boxes(boxes, height, width)

  # NMS. Normalize while the boxes are still [batch_size, num_detections, 4],
  # the same layout clip_boxes was just called with for these height / width
  # tensors, and only then split out the per-class axis.
  pre_nms_boxes = box_utils.to_normalized_coordinates(boxes, height, width)
  pre_nms_boxes = tf.reshape(
      pre_nms_boxes, [batch_size, num_boxes, num_classes - 1, 4])
  pre_nms_scores = tf.reshape(
      scores, [batch_size, num_boxes, num_classes - 1])
  post_nms_boxes, post_nms_scores, post_nms_classes, _ = (
      tf.image.combined_non_max_suppression(
          pre_nms_boxes,
          pre_nms_scores,
          max_output_size_per_class=pre_nms_num_detections,
          max_total_size=post_nms_num_detections,
          iou_threshold=nms_threshold,
          score_threshold=0.0,
          pad_per_class=False))
  # Shift by one to restore the original class id (background is 0).
  post_nms_classes = post_nms_classes + 1
  post_nms_boxes = box_utils.to_absolute_coordinates(
      post_nms_boxes, height, width)

  # Assemble [image_id, ymin, xmin, ymax, xmax, score, class] rows for the
  # whole batch at once. Each image id is repeated for all of its detections.
  image_ids = tf.tile(
      tf.reshape(tf.to_float(image_id), [batch_size, 1, 1]),
      [1, post_nms_num_detections, 1])
  detections = tf.concat([
      image_ids,
      tf.reshape(post_nms_boxes, [batch_size, post_nms_num_detections, 4]),
      tf.reshape(post_nms_scores, [batch_size, post_nms_num_detections, 1]),
      tf.reshape(
          tf.to_float(post_nms_classes),
          [batch_size, post_nms_num_detections, 1]),
  ], axis=-1)
  return detections