Ejemplo n.º 1
0
def _propose_rois_gpu(scores, boxes, anchor_boxes, height, width, scale,
                      rpn_pre_nms_topn, rpn_post_nms_topn, rpn_nms_threshold,
                      rpn_min_size, bbox_reg_weights):
    """Select RoI proposals from scored, encoded candidate boxes (GPU path).

    The candidates are decoded against their anchors, clipped to the image,
    optionally filtered by a minimum size, and then reduced either with
    batched NMS or, when NMS is disabled, with a plain top-k on the scores.

    Args:
      scores: a tensor of shape [batch_size, num_boxes]. The shape must be
        statically known because it is unpacked into Python integers.
      boxes: a tensor of shape [batch_size, num_boxes, 4] holding the boxes in
        encoded form.
      anchor_boxes: the anchors used for decoding, with shape
        [batch_size, num_boxes, 4].
      height: a tensor of shape [batch_size, 1, 1] with the image height.
      width: a tensor of shape [batch_size, 1, 1] with the image width.
      scale: a tensor of shape [batch_size, 1, 1] with the image scale. Only
        forwarded to `box_utils.filter_boxes`.
      rpn_pre_nms_topn: an integer cap on the number of candidates considered
        by NMS (per FPN level, not in total).
      rpn_post_nms_topn: an integer cap on the number of proposals kept after
        NMS.
      rpn_nms_threshold: a float IoU threshold for NMS. A value that is not
        greater than 0 disables NMS and selects proposals by top-k instead.
      rpn_min_size: the minimum proposal height/width, expressed at the
        original image scale. Size filtering is skipped unless this is
        greater than 0.
      bbox_reg_weights: None or a list of four weights used when decoding the
        boxes.

    Returns:
      A `(scores, boxes)` tuple describing the retained proposals, with at most
      min(num_boxes, rpn_pre_nms_topn, rpn_post_nms_topn) entries per image.
      With NMS enabled these are the score and box outputs of
      `tf.image.combined_non_max_suppression`, the boxes having been passed
      through `box_utils.to_absolute_coordinates`. With NMS disabled they are
      the outputs of `box_utils.top_k`.
    """
    batch_size, num_boxes = scores.get_shape().as_list()

    # Neither budget can exceed the number of candidates actually available.
    pre_nms_limit = min(num_boxes, rpn_pre_nms_topn)
    post_nms_limit = min(rpn_post_nms_topn, pre_nms_limit)

    decoded = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)
    proposals = box_utils.clip_boxes(decoded, height, width)
    proposal_scores = scores

    if rpn_min_size > 0.0:
        # filter_boxes is given scores with a trailing singleton axis, which
        # is removed again afterwards.
        proposals, proposal_scores = box_utils.filter_boxes(
            proposals, tf.expand_dims(proposal_scores, axis=-1),
            rpn_min_size, height, width, scale)
        proposal_scores = tf.squeeze(proposal_scores, axis=-1)

    if rpn_nms_threshold > 0:
        # combined_non_max_suppression currently only supports normalized
        # coordinates, so convert before the call and back afterwards. The
        # size-1 axis is the "class" axis: all proposals share one class.
        normalized = box_utils.to_normalized_coordinates(
            proposals, height, width)
        nms_boxes_in = tf.reshape(normalized, [batch_size, num_boxes, 1, 4])
        nms_scores_in = tf.reshape(proposal_scores,
                                   [batch_size, num_boxes, 1])
        kept_boxes, kept_scores, _, _ = (
            tf.image.combined_non_max_suppression(
                nms_boxes_in,
                nms_scores_in,
                max_output_size_per_class=pre_nms_limit,
                max_total_size=post_nms_limit,
                iou_threshold=rpn_nms_threshold,
                score_threshold=0.0,
                pad_per_class=False))
        kept_boxes = box_utils.to_absolute_coordinates(
            kept_boxes, height, width)
        return kept_scores, kept_boxes

    # NMS disabled: keep the highest-scoring proposals as they are.
    kept_scores, kept_boxes_list = box_utils.top_k(proposal_scores,
                                                   k=post_nms_limit,
                                                   boxes_list=[proposals])
    return kept_scores, kept_boxes_list[0]
Ejemplo n.º 2
0
def generate_detections_gpu(class_outputs,
                            box_outputs,
                            anchor_boxes,
                            image_info,
                            pre_nms_num_detections=1000,
                            post_nms_num_detections=100,
                            nms_threshold=0.3,
                            bbox_reg_weights=(10., 10., 5., 5.)):
    """Generate the final detections given the model outputs (GPU version).

    Args:
      class_outputs: a tensor with shape [batch_size, N, num_classes], which
        stacks class logit outputs on all feature levels. N is the total
        number of anchors on all levels and num_classes is the number of
        classes predicted by the model (class 0 is treated as background).
        These are raw scores; softmax is applied here. The shape must be
        statically known.
      box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
        stacks box regression outputs on all feature levels.
      anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks
        anchors on all feature levels.
      image_info: a tensor of shape [batch_size, 5] which encodes each image's
        [height, width, scale, original_height, original_width]. Only height
        and width are read.
      pre_nms_num_detections: an integer that specifies the maximum number of
        candidates kept per class by NMS.
      post_nms_num_detections: an integer that specifies the maximum number of
        detections kept per image after NMS.
      nms_threshold: a float number to specify the IOU threshold of NMS.
      bbox_reg_weights: a list of 4 float scalars, which are default weights
        on (dx, dy, dw, dh) for normalizing bbox regression targets.

    Returns:
      A tuple `(num_valid_boxes, boxes, classes, scores)` taken from
      `tf.image.combined_non_max_suppression`, batched over images. The boxes
      are passed through `box_utils.to_absolute_coordinates`, and the classes
      are shifted by one (so that 0 stays reserved for background) and cast
      to float32.
    """
    with tf.name_scope('generate_detections'):
        batch_size, num_boxes, num_classes = (
            class_outputs.get_shape().as_list())
        num_fg_classes = num_classes - 1
        softmax_class_outputs = tf.nn.softmax(class_outputs)

        # Remove background
        scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
        boxes = tf.slice(
            tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
            [0, 0, 1, 0], [-1, -1, -1, -1])

        # Repeat every anchor once per foreground class. tf.tile copies the
        # values directly rather than multiplying them by a tensor of ones.
        anchor_boxes = tf.tile(tf.expand_dims(anchor_boxes, axis=2),
                               [1, 1, num_fg_classes, 1])

        # Flatten the (anchor, class) axes so decoding and clipping see a
        # plain list of boxes per image.
        num_detections = num_boxes * num_fg_classes
        boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
        anchor_boxes = tf.reshape(anchor_boxes,
                                  [batch_size, num_detections, 4])

        # Decode
        boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)

        # Clip boxes
        height = tf.expand_dims(image_info[:, 0:1], axis=-1)
        width = tf.expand_dims(image_info[:, 1:2], axis=-1)
        boxes = box_utils.clip_boxes(boxes, height, width)

        # NMS. combined_non_max_suppression wants per-class boxes and scores,
        # so restore the (anchor, class) axes on the normalized boxes.
        pre_nms_boxes = box_utils.to_normalized_coordinates(
            boxes, height, width)
        pre_nms_boxes = tf.reshape(
            pre_nms_boxes, [batch_size, num_boxes, num_fg_classes, 4])
        pre_nms_scores = tf.reshape(scores,
                                    [batch_size, num_boxes, num_fg_classes])
        (post_nms_boxes, post_nms_scores, post_nms_classes,
         post_nms_num_valid_boxes) = tf.image.combined_non_max_suppression(
             pre_nms_boxes,
             pre_nms_scores,
             max_output_size_per_class=pre_nms_num_detections,
             max_total_size=post_nms_num_detections,
             iou_threshold=nms_threshold,
             score_threshold=0.0,
             pad_per_class=False)

        # NMS numbers the foreground classes from 0; shift them so that 0
        # remains the background id.
        post_nms_classes = post_nms_classes + 1
        post_nms_boxes = box_utils.to_absolute_coordinates(
            post_nms_boxes, height, width)
        # tf.cast replaces the deprecated tf.to_float.
        return (post_nms_num_valid_boxes, post_nms_boxes,
                tf.cast(post_nms_classes, tf.float32), post_nms_scores)
Ejemplo n.º 3
0
def generate_detections_gpu(class_outputs,
                            box_outputs,
                            anchor_boxes,
                            image_id,
                            image_info,
                            pre_nms_num_detections=1000,
                            post_nms_num_detections=100,
                            nms_threshold=0.3,
                            bbox_reg_weights=(10., 10., 5., 5.)):
    """Generate the final detections given the model outputs (GPU version).

    Args:
      class_outputs: a tensor with shape [batch_size, N, num_classes], which
        stacks class logit outputs on all feature levels. N is the total
        number of anchors on all levels and num_classes is the number of
        classes predicted by the model (class 0 is treated as background).
        These are raw scores; softmax is applied here. The shape must be
        statically known.
      box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
        stacks box regression outputs on all feature levels.
      anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks
        anchors on all feature levels.
      image_id: a tensor with shape [batch_size] which specifies the image id
        of each image in the batch.
      image_info: a tensor of shape [batch_size, 5] which encodes each image's
        [height, width, scale, original_height, original_width]. Only height
        and width are read.
      pre_nms_num_detections: an integer that specifies the maximum number of
        candidates kept per class by NMS.
      post_nms_num_detections: an integer that specifies the maximum number of
        detections kept per image after NMS.
      nms_threshold: a float number to specify the IOU threshold of NMS.
      bbox_reg_weights: a list of 4 float scalars, which are default weights
        on (dx, dy, dw, dh) for normalizing bbox regression targets.

    Returns:
      detections: a float32 tensor of [batch_size, post_nms_num_detections, 7],
        which stacks `post_nms_num_detections` detection results for each
        image in the batch. Each detection is stored in the format of
        [image_id, ymin, xmin, ymax, xmax, score, class] in the last
        dimension.
    """
    batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
    num_fg_classes = num_classes - 1
    softmax_class_outputs = tf.nn.softmax(class_outputs)

    # Remove background
    scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
    boxes = tf.slice(
        tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
        [0, 0, 1, 0], [-1, -1, -1, -1])

    # Repeat every anchor once per foreground class.
    anchor_boxes = tf.tile(tf.expand_dims(anchor_boxes, axis=2),
                           [1, 1, num_fg_classes, 1])

    # Flatten the (anchor, class) axes so decoding and clipping see a plain
    # list of boxes per image.
    num_detections = num_boxes * num_fg_classes
    boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
    anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])

    # Decode
    boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)

    # Clip boxes. Height and width become [batch_size, 1, 1] so they line up
    # with the [batch_size, num_detections, 4] boxes.
    height = tf.expand_dims(image_info[:, 0:1], axis=-1)
    width = tf.expand_dims(image_info[:, 1:2], axis=-1)
    boxes = box_utils.clip_boxes(boxes, height, width)

    pre_nms_boxes = tf.reshape(boxes,
                               [batch_size, num_boxes, num_fg_classes, 4])
    pre_nms_scores = tf.reshape(scores,
                                [batch_size, num_boxes, num_fg_classes])

    # NMS
    pre_nms_boxes = box_utils.to_normalized_coordinates(
        pre_nms_boxes, height, width)
    post_nms_boxes, post_nms_scores, post_nms_classes, _ = (
        tf.image.combined_non_max_suppression(
            pre_nms_boxes,
            pre_nms_scores,
            max_output_size_per_class=pre_nms_num_detections,
            max_total_size=post_nms_num_detections,
            iou_threshold=nms_threshold,
            score_threshold=0.0,
            pad_per_class=False))

    # NMS numbers the foreground classes from 0; shift them so that 0 remains
    # the background id.
    post_nms_classes = post_nms_classes + 1
    post_nms_boxes = box_utils.to_absolute_coordinates(post_nms_boxes, height,
                                                       width)

    # Assemble [image_id, ymin, xmin, ymax, xmax, score, class] rows for the
    # whole batch with one concat instead of a per-image Python loop. The
    # reshapes pin the static output shape and still require a static batch
    # size.
    # NOTE: image ids are stored as float32, so very large ids cannot be
    # represented exactly.
    image_ids = tf.tile(
        tf.reshape(tf.cast(image_id, tf.float32), [batch_size, 1, 1]),
        [1, post_nms_num_detections, 1])
    detections = tf.concat([
        image_ids,
        tf.reshape(post_nms_boxes,
                   [batch_size, post_nms_num_detections, 4]),
        tf.reshape(post_nms_scores,
                   [batch_size, post_nms_num_detections, 1]),
        tf.cast(
            tf.reshape(post_nms_classes,
                       [batch_size, post_nms_num_detections, 1]),
            tf.float32),
    ],
                           axis=-1)
    return detections