コード例 #1
0
def generate_detections_per_image_tpu(cls_outputs,
                                      box_outputs,
                                      anchor_boxes,
                                      image_info,
                                      pre_nms_num_detections=1000,
                                      post_nms_num_detections=100,
                                      nms_threshold=0.3,
                                      bbox_reg_weights=(10., 10., 5., 5.)):
    """Generate the final detections for one image from the model outputs.

    The background column is dropped, the `pre_nms_num_detections` best
    (anchor, class) pairs are kept, their boxes are decoded and clipped, and
    NMS is then run once per foreground class. The per-class results are
    merged and the `post_nms_num_detections` highest scores are returned.

    Args:
      cls_outputs: a tensor with shape [N, num_classes], which stacks class
        outputs on all feature levels. N is the total number of anchors on
        all levels and num_classes is the number of classes predicted by the
        model (column 0 is treated as background). Note that cls_outputs
        should be the output of softmax().
      box_outputs: a tensor with shape [N, num_classes*4], which stacks box
        regression outputs on all feature levels.
      anchor_boxes: a tensor with shape [N, 4], which stacks anchors on all
        feature levels.
      image_info: a tensor of shape [5] which encodes the input image's
        [height, width, scale, original_height, original_width]. Only the
        first two entries are read here (for clipping).
      pre_nms_num_detections: an integer that specifies the number of
        candidates before NMS.
      post_nms_num_detections: an integer that specifies the number of
        candidates after NMS.
      nms_threshold: a float number to specify the IOU threshold of NMS.
      bbox_reg_weights: a list of 4 float scalars, which are default weights
        on (dx, dy, dw, dh) for normalizing bbox regression targets.

    Returns:
      A tuple `(num_valid_boxes, boxes, classes, scores)`:
        num_valid_boxes: float scalar, the count of returned scores that are
          strictly greater than 0.
        boxes: the boxes of the top `post_nms_num_detections` detections.
        classes: float class ids of those detections; foreground class `i`
          (0-based, background removed) is reported as `i + 1`.
        scores: the scores of those detections, sorted in descending order.
    """
    num_boxes, num_classes = cls_outputs.get_shape().as_list()
    num_foreground_classes = num_classes - 1

    # Remove background class scores.
    cls_outputs = cls_outputs[:, 1:num_classes]
    top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
        tf.reshape(cls_outputs, [-1]), k=pre_nms_num_detections, sorted=False)
    # The flattened index interleaves anchors and foreground classes, so the
    # remainder is the class id and the quotient is the anchor id.
    classes = tf.mod(top_k_indices_with_classes, num_foreground_classes)
    top_k_indices = tf.floordiv(top_k_indices_with_classes,
                                num_foreground_classes)

    anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
    box_outputs = tf.reshape(box_outputs,
                             [num_boxes, num_classes, 4])[:, 1:num_classes, :]
    box_outputs = tf.gather_nd(
        box_outputs, tf.stack([top_k_indices, classes], axis=1))

    # apply bounding box regression to anchors
    boxes = box_utils.decode_boxes(box_outputs, anchor_boxes, bbox_reg_weights)
    boxes = box_utils.clip_boxes(boxes, image_info[0], image_info[1])

    list_of_all_boxes = []
    list_of_all_scores = []
    list_of_all_classes = []
    # `classes` is the result of a modulo by `num_classes - 1`, so only the
    # ids 0 .. num_classes - 2 can occur. Iterating over `num_classes` would
    # build one extra NMS op whose bitmask is always all zeros.
    for class_i in range(num_foreground_classes):
        # Compute bitmask for the given classes.
        class_i_bitmask = tf.cast(tf.equal(classes, class_i),
                                  top_k_scores.dtype)
        # This works because score is in [0, 1].
        class_i_scores = top_k_scores * class_i_bitmask
        # The TPU and CPU have different behaviors for
        # tf.image.non_max_suppression_padded (b/116754376).
        (class_i_post_nms_indices,
         class_i_nms_num_valid) = tf.image.non_max_suppression_padded(
             tf.to_float(boxes),
             tf.to_float(class_i_scores),
             post_nms_num_detections,
             iou_threshold=nms_threshold,
             score_threshold=0.05,
             pad_to_max_output_size=True,
             name='nms_detections_' + str(class_i))
        class_i_post_nms_boxes = tf.gather(boxes, class_i_post_nms_indices)
        class_i_post_nms_scores = tf.gather(class_i_scores,
                                            class_i_post_nms_indices)
        # Zero the scores of the padded slots past the number of valid
        # outputs so they cannot be mistaken for detections.
        mask = tf.less(tf.range(post_nms_num_detections),
                       [class_i_nms_num_valid])
        class_i_post_nms_scores = tf.where(
            mask, class_i_post_nms_scores,
            tf.zeros_like(class_i_post_nms_scores))
        # Shift by one to account for the removed background column.
        class_i_classes = tf.fill(tf.shape(class_i_post_nms_scores),
                                  class_i + 1)
        list_of_all_boxes.append(class_i_post_nms_boxes)
        list_of_all_scores.append(class_i_post_nms_scores)
        list_of_all_classes.append(class_i_classes)

    post_nms_boxes = tf.concat(list_of_all_boxes, axis=0)
    post_nms_scores = tf.concat(list_of_all_scores, axis=0)
    post_nms_classes = tf.concat(list_of_all_classes, axis=0)

    # sort all results.
    post_nms_scores, sorted_indices = tf.nn.top_k(tf.to_float(post_nms_scores),
                                                  k=post_nms_num_detections,
                                                  sorted=True)
    post_nms_boxes = tf.gather(post_nms_boxes, sorted_indices)
    post_nms_classes = tf.gather(post_nms_classes, sorted_indices)

    # A detection counts as valid when its score survived the masking above.
    valid_mask = tf.where(tf.greater(post_nms_scores, 0),
                          tf.ones_like(post_nms_scores),
                          tf.zeros_like(post_nms_scores))
    num_valid_boxes = tf.reduce_sum(valid_mask, axis=-1)
    box_classes = tf.to_float(post_nms_classes)
    return num_valid_boxes, post_nms_boxes, box_classes, post_nms_scores
コード例 #2
0
def generate_detections_gpu(class_outputs,
                            box_outputs,
                            anchor_boxes,
                            image_info,
                            pre_nms_num_detections=1000,
                            post_nms_num_detections=100,
                            nms_threshold=0.3,
                            bbox_reg_weights=(10., 10., 5., 5.)):
    """Generate the final detections given the model outputs (GPU version).

    Applies softmax to the class outputs, drops the background class, decodes
    and clips the per-class boxes, and runs
    `tf.image.combined_non_max_suppression` over the whole batch.

    Args:
      class_outputs: a tensor with shape [batch_size, N, num_classes], which
        stacks class logit outputs on all feature levels. N is the total
        number of anchors on all levels and num_classes is the number of
        classes predicted by the model. Note that class_outputs here is the
        raw score; softmax is applied inside this function.
      box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
        stacks box regression outputs on all feature levels.
      anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks
        anchors on all feature levels.
      image_info: a tensor of shape [batch_size, 5] which encodes each image's
        [height, width, scale, original_height, original_width]. Only height
        and width are read here.
      pre_nms_num_detections: an integer that specifies the number of
        candidates before NMS (passed as `max_output_size_per_class`).
      post_nms_num_detections: an integer that specifies the number of
        candidates after NMS (passed as `max_total_size`).
      nms_threshold: a float number to specify the IOU threshold of NMS.
      bbox_reg_weights: a list of 4 float scalars, which are default weights
        on (dx, dy, dw, dh) for normalizing bbox regression targets.

    Returns:
      A tuple of tensors corresponding to number of valid boxes, box
      coordinates (converted back via `box_utils.to_absolute_coordinates`),
      object categories for each box (as floats, shifted by one to account
      for the removed background class), and box scores, stacked in
      batch_size.
    """
    with tf.name_scope('generate_detections'):
        batch_size, num_boxes, num_classes = (
            class_outputs.get_shape().as_list())
        num_foreground_classes = num_classes - 1
        softmax_class_outputs = tf.nn.softmax(class_outputs)

        # Remove background
        scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
        boxes = tf.slice(
            tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
            [0, 0, 1, 0], [-1, -1, -1, -1])

        # Replicate every anchor once per foreground class. tf.tile copies the
        # values directly instead of multiplying by a float tensor of ones,
        # so it does not depend on the dtype of anchor_boxes.
        anchor_boxes = tf.tile(tf.expand_dims(anchor_boxes, axis=2),
                               [1, 1, num_foreground_classes, 1])

        num_detections = num_boxes * num_foreground_classes

        # Flatten the (anchor, class) axes for decoding.
        boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
        anchor_boxes = tf.reshape(anchor_boxes,
                                  [batch_size, num_detections, 4])

        # Decode
        boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)

        # Clip boxes
        height = tf.expand_dims(image_info[:, 0:1], axis=-1)
        width = tf.expand_dims(image_info[:, 1:2], axis=-1)
        boxes = box_utils.clip_boxes(boxes, height, width)

        # NMS
        pre_nms_boxes = box_utils.to_normalized_coordinates(
            boxes, height, width)
        pre_nms_boxes = tf.reshape(
            pre_nms_boxes, [batch_size, num_boxes, num_foreground_classes, 4])
        pre_nms_scores = tf.reshape(
            scores, [batch_size, num_boxes, num_foreground_classes])
        (post_nms_boxes, post_nms_scores, post_nms_classes,
         post_nms_num_valid_boxes) = (tf.image.combined_non_max_suppression(
             pre_nms_boxes,
             pre_nms_scores,
             max_output_size_per_class=pre_nms_num_detections,
             max_total_size=post_nms_num_detections,
             iou_threshold=nms_threshold,
             score_threshold=0.0,
             pad_per_class=False))
        # Shift class ids by one to account for the removed background class.
        post_nms_classes = post_nms_classes + 1
        post_nms_boxes = box_utils.to_absolute_coordinates(
            post_nms_boxes, height, width)
        return (post_nms_num_valid_boxes, post_nms_boxes,
                tf.to_float(post_nms_classes), post_nms_scores)
コード例 #3
0
ファイル: roi_ops.py プロジェクト: zzm422/tpu
def _propose_rois_gpu(scores, boxes, anchor_boxes, height, width, scale,
                      rpn_pre_nms_topn, rpn_post_nms_topn, rpn_nms_threshold,
                      rpn_min_size, bbox_reg_weights):
    """Proposes RoIs given a group of candidates (GPU version).

    Decodes and clips all candidate boxes, optionally filters out small
    boxes, then either runs `tf.image.combined_non_max_suppression` (when
    `rpn_nms_threshold > 0`) or simply keeps the top-scoring boxes.

    Args:
      scores: a tensor with a shape of [batch_size, num_boxes].
      boxes: a tensor with a shape of [batch_size, num_boxes, 4],
        in the encoded form.
      anchor_boxes: the anchors with a shape of [batch_size, num_boxes, 4].
      height: a tensor of shape [batch_size, 1, 1] representing the image
        height.
      width: a tensor of shape [batch_size, 1, 1] representing the image
        width.
      scale: a tensor of shape [batch_size, 1, 1] representing the image
        scale.
      rpn_pre_nms_topn: an integer number of top scoring RPN proposals to
        keep before applying NMS. This is *per FPN level* (not total).
      rpn_post_nms_topn: an integer number of top scoring RPN proposals to
        keep after applying NMS. This is the total number of RPN proposals
        produced.
      rpn_nms_threshold: a float number between 0 and 1 as the NMS threshold
        used on RPN proposals. NMS is skipped unless it is greater than 0.
      rpn_min_size: a number as the minimum proposal height and width, as
        both need to be greater than this number. Note that this number is
        at original image scale (not the scale used during training or
        inference). Filtering is skipped unless it is greater than 0.
      bbox_reg_weights: None or a list of four numbers specifying the weights
        used when decoding the box.

    Returns:
      scores: a tensor holding the scores of the kept proposals.
        NOTE(review): presumably [batch_size, k] with
        k = min(num_boxes, rpn_pre_nms_topn, rpn_post_nms_topn) - confirm
        against callers.
      boxes: a tensor holding the kept proposal boxes, presumably
        [batch_size, k, 4]. On the NMS path the boxes are converted back
        with `box_utils.to_absolute_coordinates` before being returned.
    """
    batch_size, num_boxes = scores.get_shape().as_list()

    # Neither limit can exceed the number of available candidates.
    pre_nms_limit = min(num_boxes, rpn_pre_nms_topn)
    post_nms_limit = min(rpn_post_nms_topn, pre_nms_limit)

    decoded_boxes = box_utils.decode_boxes(boxes, anchor_boxes,
                                           bbox_reg_weights)
    boxes = box_utils.clip_boxes(decoded_boxes, height, width)

    if rpn_min_size > 0.0:
        # filter_boxes is called with scores carrying a trailing unit axis.
        expanded_scores = tf.expand_dims(scores, axis=-1)
        boxes, expanded_scores = box_utils.filter_boxes(
            boxes, expanded_scores, rpn_min_size, height, width, scale)
        scores = tf.squeeze(expanded_scores, axis=-1)

    if not rpn_nms_threshold > 0:
        # NMS disabled: just keep the best-scoring candidates.
        scores, kept_boxes = box_utils.top_k(scores,
                                             k=post_nms_limit,
                                             boxes_list=[boxes])
        return scores, kept_boxes[0]

    # Normalize coordinates as combined_non_max_suppression currently
    # only support normalized coordinates.
    normalized_boxes = box_utils.to_normalized_coordinates(
        boxes, height, width)
    # All proposals are treated as a single class for the combined NMS op.
    nms_input_boxes = tf.reshape(normalized_boxes,
                                 [batch_size, num_boxes, 1, 4])
    nms_input_scores = tf.reshape(scores, [batch_size, num_boxes, 1])
    nms_boxes, nms_scores, _, _ = tf.image.combined_non_max_suppression(
        nms_input_boxes,
        nms_input_scores,
        max_output_size_per_class=pre_nms_limit,
        max_total_size=post_nms_limit,
        iou_threshold=rpn_nms_threshold,
        score_threshold=0.0,
        pad_per_class=False)
    absolute_boxes = box_utils.to_absolute_coordinates(
        nms_boxes, height, width)
    return nms_scores, absolute_boxes
コード例 #4
0
ファイル: roi_ops.py プロジェクト: zzm422/tpu
def _propose_rois_tpu(scores, boxes, anchor_boxes, height, width, scale,
                      rpn_pre_nms_topn, rpn_post_nms_topn, rpn_nms_threshold,
                      rpn_min_size, bbox_reg_weights):
    """Proposes RoIs given a group of candidates (TPU version).

    Keeps the top-scoring candidates first, then decodes, clips and filters
    them, optionally runs `box_utils.sorted_non_max_suppression_padded`, and
    finally keeps the top-scoring boxes once more.

    Args:
      scores: a tensor with a shape of [batch_size, num_boxes].
      boxes: a tensor with a shape of [batch_size, num_boxes, 4],
        in the encoded form.
      anchor_boxes: the anchors with a shape of [batch_size, num_boxes, 4].
      height: a tensor of shape [batch_size, 1, 1] representing the image
        height.
      width: a tensor of shape [batch_size, 1, 1] representing the image
        width.
      scale: a tensor of shape [batch_size, 1, 1] representing the image
        scale.
      rpn_pre_nms_topn: an integer number of top scoring RPN proposals to
        keep before applying NMS. This is *per FPN level* (not total).
      rpn_post_nms_topn: an integer number of top scoring RPN proposals to
        keep after applying NMS. This is the total number of RPN proposals
        produced.
      rpn_nms_threshold: a float number between 0 and 1 as the NMS threshold
        used on RPN proposals. NMS is skipped unless it is greater than 0.
      rpn_min_size: a number as the minimum proposal height and width, as
        both need to be greater than this number. Note that this number is
        at original image scale (not the scale used during training or
        inference).
      bbox_reg_weights: None or a list of four numbers specifying the weights
        used when decoding the box.

    Returns:
      scores: a tensor holding the scores of the kept proposals.
        NOTE(review): presumably [batch_size, k] with
        k = min(num_boxes, rpn_pre_nms_topn, rpn_post_nms_topn) - confirm
        against `box_utils.top_k`.
      boxes: a tensor holding the kept proposal boxes, presumably
        [batch_size, k, 4], in the coordinates produced by
        `box_utils.decode_boxes` / `box_utils.clip_boxes`.
    """
    num_boxes = scores.get_shape().as_list()[1]

    # Neither limit can exceed the number of available candidates.
    pre_nms_limit = min(rpn_pre_nms_topn, num_boxes)
    post_nms_limit = min(rpn_post_nms_topn, pre_nms_limit)

    # Select the best candidates, gathering encoded boxes and anchors alike.
    scores, selected = box_utils.top_k(scores,
                                       k=pre_nms_limit,
                                       boxes_list=[boxes, anchor_boxes])
    encoded_boxes = selected[0]
    selected_anchors = selected[1]

    # Decode boxes w.r.t. anchors, then clip boxes that exceed the boundary.
    decoded_boxes = box_utils.decode_boxes(encoded_boxes, selected_anchors,
                                           bbox_reg_weights)
    boxes = box_utils.clip_boxes(decoded_boxes, height, width)

    # Filter boxes that one side is less than rpn_min_size threshold.
    # filter_boxes is called with scores carrying a trailing unit axis.
    expanded_scores = tf.expand_dims(scores, axis=-1)
    boxes, expanded_scores = box_utils.filter_boxes(
        boxes, expanded_scores, rpn_min_size, height, width, scale)
    scores = tf.squeeze(expanded_scores, axis=-1)

    # NMS.
    if rpn_nms_threshold > 0:
        scores, boxes = box_utils.sorted_non_max_suppression_padded(
            scores,
            boxes,
            max_output_size=post_nms_limit,
            iou_threshold=rpn_nms_threshold)

    # Pick top-K post NMS'ed boxes.
    scores, final_boxes = box_utils.top_k(scores,
                                          k=post_nms_limit,
                                          boxes_list=[boxes])
    return scores, final_boxes[0]
コード例 #5
0
ファイル: postprocess_ops.py プロジェクト: yangfan255/tpu
def generate_detections_gpu(class_outputs,
                            box_outputs,
                            anchor_boxes,
                            image_id,
                            image_info,
                            pre_nms_num_detections=1000,
                            post_nms_num_detections=100,
                            nms_threshold=0.3,
                            bbox_reg_weights=(10., 10., 5., 5.)):
    """Generate the final detections given the model outputs (GPU version).

    Applies softmax to the class outputs, drops the background class, decodes
    and clips the per-class boxes, runs
    `tf.image.combined_non_max_suppression` over the whole batch and packs
    the result into one dense tensor per image.

    Args:
      class_outputs: a tensor with shape [batch_size, N, num_classes], which
        stacks class logit outputs on all feature levels. N is the total
        number of anchors on all levels and num_classes is the number of
        classes predicted by the model. Note that class_outputs here is the
        raw score; softmax is applied inside this function.
      box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
        stacks box regression outputs on all feature levels.
      anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks
        anchors on all feature levels.
      image_id: a tensor with shape [batch_size] which specifies the image id
        of each image in the batch.
      image_info: a tensor of shape [batch_size, 5] which encodes each image's
        [height, width, scale, original_height, original_width]. Only height
        and width are read here.
      pre_nms_num_detections: an integer that specifies the number of
        candidates before NMS (passed as `max_output_size_per_class`).
      post_nms_num_detections: an integer that specifies the number of
        candidates after NMS (passed as `max_total_size`).
      nms_threshold: a float number to specify the IOU threshold of NMS.
      bbox_reg_weights: a list of 4 float scalars, which are default weights
        on (dx, dy, dw, dh) for normalizing bbox regression targets.

    Returns:
      detections: a tensor of [batch_size, post_nms_num_detections, 7], which
        stacks `post_nms_num_detections` number of detection results for each
        image in the batch. Each detection is stored in the format of
        [image_id, ymin, xmin, ymax, xmax, score, class] in the last
        dimension. The class id is shifted by one to account for the removed
        background class.
    """
    batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
    num_foreground_classes = num_classes - 1
    softmax_class_outputs = tf.nn.softmax(class_outputs)

    # Remove background
    scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
    boxes = tf.slice(
        tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
        [0, 0, 1, 0], [-1, -1, -1, -1])

    # Replicate every anchor once per foreground class.
    anchor_boxes = tf.tile(tf.expand_dims(anchor_boxes, axis=2),
                           [1, 1, num_foreground_classes, 1])

    num_detections = num_boxes * num_foreground_classes

    # Flatten the (anchor, class) axes for decoding.
    boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
    anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])

    # Decode
    boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)

    # Clip boxes. Only height and width are needed; the scale entry of
    # image_info is not used by this function.
    height, width = tf.split(image_info[:, :2],
                             num_or_size_splits=2,
                             axis=-1)
    height = tf.expand_dims(height, axis=-1)
    width = tf.expand_dims(width, axis=-1)
    boxes = box_utils.clip_boxes(boxes, height, width)

    pre_nms_boxes = tf.reshape(
        boxes, [batch_size, num_boxes, num_foreground_classes, 4])
    pre_nms_scores = tf.reshape(
        scores, [batch_size, num_boxes, num_foreground_classes])

    # NMS
    pre_nms_boxes = box_utils.to_normalized_coordinates(
        pre_nms_boxes, height, width)
    # The fourth output (number of valid detections) is not needed because
    # the result is returned as a fixed-size tensor.
    post_nms_boxes, post_nms_scores, post_nms_classes, _ = (
        tf.image.combined_non_max_suppression(
            pre_nms_boxes,
            pre_nms_scores,
            max_output_size_per_class=pre_nms_num_detections,
            max_total_size=post_nms_num_detections,
            iou_threshold=nms_threshold,
            score_threshold=0.0,
            pad_per_class=False))
    # Shift class ids by one to account for the removed background class.
    post_nms_classes = post_nms_classes + 1
    post_nms_boxes = box_utils.to_absolute_coordinates(post_nms_boxes, height,
                                                       width)

    # Only works with static batch size.
    # Unroll batch dimension.
    post_boxes_list = tf.unstack(post_nms_boxes)
    post_scores_list = tf.unstack(post_nms_scores)
    post_classes_list = tf.unstack(post_nms_classes)
    image_id_list = tf.unstack(image_id)

    detections = []
    for boxes_i, scores_i, classes_i, image_id_i in zip(
            post_boxes_list, post_scores_list, post_classes_list,
            image_id_list):
        post_nms_top_k_scores = tf.reshape(scores_i, [post_nms_num_detections])
        post_nms_top_k_boxes = tf.reshape(boxes_i,
                                          [post_nms_num_detections, 4])
        post_nms_top_k_classes = tf.reshape(classes_i,
                                            [post_nms_num_detections])

        # One row per detection:
        # [image_id, box[0], box[1], box[2], box[3], score, class].
        this_batch_detections = tf.stack(
            [
                tf.to_float(
                    tf.fill(tf.shape(post_nms_top_k_scores), image_id_i)),
                post_nms_top_k_boxes[:, 0],
                post_nms_top_k_boxes[:, 1],
                post_nms_top_k_boxes[:, 2],
                post_nms_top_k_boxes[:, 3],
                post_nms_top_k_scores,
                tf.to_float(post_nms_top_k_classes),
            ],
            axis=1)
        detections.append(this_batch_detections)
    detections = tf.stack(detections, axis=0)
    return detections