import math

import tensorflow.compat.v1 as tf  # these snippets use TF1-style APIs

# NOTE: `box_utils` (and, in later snippets, `non_max_suppression`,
# `easydict` and `torch`) are dependencies of the original projects and are
# assumed to be importable from the surrounding codebase.


def resize_crop_pad(image,
                    desired_output_size,
                    stride,
                    aug_scale_min=1.0,
                    aug_scale_max=1.0,
                    boxes=None,
                    classes=None,
                    masks=None,
                    crop_mask_size=112):
  """Resize, crop and pad images, boxes and masks (RetinaNet style).

  Resize, crop and pad images (and optionally boxes and masks) given the
  desired output size of the image and the stride size.

  Here are the preprocessing steps.
  1. For a given image, keep its aspect ratio and rescale the image so that
     it is the largest rectangle that fits inside the rectangle specified by
     `desired_output_size`.
  2. Pad the rescaled image so that its height and width become the smallest
     multiples of the stride that are larger than or equal to the desired
     output dimensions.
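
  For example, with `desired_output_size=(640, 640)` and `stride=32`, a
  500x375 (height x width) input is rescaled by min(640/500, 640/375) = 1.28
  to 640x480, then padded to 640x640.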

  Args:
    image: an image tensor of shape [original_height, original_width, 3].
    desired_output_size: a tuple of two integers indicating the desired output
      image size. Note that the actual output size could be different from this.
    stride: the stride of the backbone network. Each of the output image sides
      must be the multiple of this.
    aug_scale_min: a `float` in (0, 1.0] representing the minimum random scale
      applied to `desired_output_size` for training scale jittering.
    aug_scale_max: a `float` in [1.0, inf) representing the maximum random
      scale applied to `desired_output_size` for training scale jittering.
    boxes: (Optional) a tensor of shape [num_boxes, 4] representing the box
      corners in normalized coordinates.
    classes: (Optional) a tensor of shape [num_boxes] representing the box
      classes.
    masks: (Optional) a tensor of shape [num_boxes, image_height, image_width]
      representing the instance masks which have the same shape as the input
      image.
    crop_mask_size: an integer indicating the size of the cropped mask.

  Returns:
    image: the processed image tensor after being resized and padded.
    image_info: a tensor of shape [5] which encodes [scaled_height,
      scaled_width, 1.0 / scale, original_height, original_width].
    boxes: None or the processed box tensor after being resized and padded.
      After the processing, boxes will be in the absolute coordinates w.r.t.
      the scaled image.
    classes: None or the processed class tensor after boxes being resized and
      filtered.
    masks: None or the processed mask tensor after being resized.
  """
  if boxes is not None:
    assert classes is not None

  input_shape = tf.shape(image)
  input_height = tf.cast(input_shape[0], dtype=tf.float32)
  input_width = tf.cast(input_shape[1], dtype=tf.float32)
  desired_height, desired_width = desired_output_size

  # Find the scale factor such that the scaled image is surrounded by the
  # rectangle of shape of desired_output_size.
  scale_if_resize_height = desired_height / input_height
  scale_if_resize_width = desired_width / input_width
  scale = tf.minimum(scale_if_resize_height, scale_if_resize_width)
  desired_scaled_height = scale * input_height
  desired_scaled_width = scale * input_width
  desired_scaled_size = tf.stack(
      [desired_scaled_height, desired_scaled_width], axis=0)

  random_jittering = aug_scale_min != 1.0 or aug_scale_max != 1.0

  if random_jittering:
    random_scale = tf.random_uniform([], aug_scale_min, aug_scale_max)
    scale = random_scale * scale
    scaled_size = tf.round(random_scale * desired_scaled_size)
  else:
    scaled_size = desired_scaled_size
  scaled_size_int = tf.cast(scaled_size, dtype=tf.int32)
  desired_scaled_size_int = tf.cast(desired_scaled_size, dtype=tf.int32)

  image = tf.image.resize_images(
      image,
      scaled_size_int,
      method=tf.image.ResizeMethod.BILINEAR)

  if boxes is not None:
    normalized_boxes = boxes
    # Convert the normalized coordinates to the coordinates w.r.t.
    # the scaled image.
    boxes = boxes * tf.tile(tf.expand_dims(scaled_size, axis=0), [1, 2])

    if masks is not None and not random_jittering:
      num_instances = tf.shape(boxes)[0]
      masks = tf.image.crop_and_resize(
          image=tf.expand_dims(masks, axis=-1),
          boxes=normalized_boxes,
          box_indices=tf.range(num_instances, dtype=tf.int32),
          crop_size=[crop_mask_size, crop_mask_size],
          method='bilinear')
      masks = tf.squeeze(masks, axis=-1)

  if random_jittering:
    max_offset = scaled_size - desired_scaled_size
    max_offset = tf.where(
        tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
    offset = tf.cast(
        max_offset * tf.random_uniform((2,), 0, 1), dtype=tf.int32)

    image = image[
        offset[0]:offset[0] + desired_scaled_size_int[0],
        offset[1]:offset[1] + desired_scaled_size_int[1],
        :]

    if boxes is not None:
      box_offsets = tf.cast(
          tf.tile(tf.expand_dims(offset, axis=0), [1, 2]),
          dtype=tf.float32)
      boxes -= box_offsets
      boxes = box_utils.clip_boxes(
          boxes, desired_scaled_size_int[0], desired_scaled_size_int[1])
      indices = tf.where(tf.logical_and(
          tf.greater(boxes[:, 2] - boxes[:, 0], 0),
          tf.greater(boxes[:, 3] - boxes[:, 1], 0)))[:, 0]
      boxes = tf.gather(boxes, indices)
      classes = tf.gather(classes, indices)
      if masks is not None:
        masks = tf.gather(masks, indices)

        # Convert the processed boxes back to the normalized coordinates w.r.t.
        # the original image in order to crop and resize the instance masks.
        cropped_boxes = boxes + box_offsets
        cropped_boxes /= tf.tile(tf.expand_dims(scaled_size, axis=0), [1, 2])

        num_instances = tf.shape(boxes)[0]
        masks = tf.image.crop_and_resize(
            image=tf.expand_dims(masks, axis=-1),
            boxes=cropped_boxes,
            box_indices=tf.range(num_instances, dtype=tf.int32),
            crop_size=[crop_mask_size, crop_mask_size],
            method='bilinear')
        masks = tf.squeeze(masks, axis=-1)

  # Pad the image so that its height and width are the smallest multiples of
  # `stride` that are greater than or equal to the desired output size.
  padded_height = int(math.ceil(desired_height * 1.0 / stride) * stride)
  padded_width = int(math.ceil(desired_width * 1.0 / stride) * stride)
  image = tf.image.pad_to_bounding_box(
      image, 0, 0, padded_height, padded_width)
  image.set_shape([padded_height, padded_width, 3])

  # desired_scaled_size is the actual image size. Pixels beyond this are from
  # padding.
  image_info = tf.stack([
      desired_scaled_size[0],
      desired_scaled_size[1],
      1.0 / scale,
      input_height,
      input_width])

  return image, image_info, boxes, classes, masks
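
# A minimal usage sketch for resize_crop_pad (hypothetical tensor names;
# assumes an HWC image tensor and normalized [ymin, xmin, ymax, xmax] boxes):
#
#   image, image_info, boxes, classes, masks = resize_crop_pad(
#       image,
#       desired_output_size=(1024, 1024),
#       stride=32,
#       aug_scale_min=0.8,   # any value != 1.0 enables scale jittering
#       aug_scale_max=1.25,
#       boxes=boxes,
#       classes=classes,
#       masks=masks)
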
def generate_detections_gpu(class_outputs,
                            box_outputs,
                            anchor_boxes,
                            image_info,
                            pre_nms_num_detections=1000,
                            post_nms_num_detections=100,
                            nms_threshold=0.3,
                            bbox_reg_weights=(10., 10., 5., 5.)):
    """Generate the final detections given the model outputs (GPU version).

  Args:
    class_outputs: a tensor with shape [batch_size, N, num_classes], which
      stacks class logit outputs on all feature levels. The N is the number of
      total anchors on all levels. The num_classes is the number of classes
      predicted by the model. Note that the class_outputs here is the raw score.
    box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
      stacks box regression outputs on all feature levels. The N is the number
      of total anchors on all levels.
    anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks anchors
      on all feature levels. The N is the number of total anchors on all levels.
    image_info: a tensor of shape [batch_size, 5] which encodes each image's
      [height, width, scale, original_height, original_width].
    pre_nms_num_detections: an integer that specifies the number of candidates
      before NMS.
    post_nms_num_detections: an integer that specifies the number of candidates
      after NMS.
    nms_threshold: a float number to specify the IOU threshold of NMS.
    bbox_reg_weights: a list of 4 float scalars, which are default weights on
      (dx, dy, dw, dh) for normalizing bbox regression targets.

  Returns:
    a tuple of tensors corresponding to the number of valid boxes, box
    coordinates, object categories for each box, and box scores, each stacked
    along the batch dimension.
  """
    with tf.name_scope('generate_detections'):
        batch_size, num_boxes, num_classes = (
            class_outputs.get_shape().as_list())
        softmax_class_outputs = tf.nn.softmax(class_outputs)

        # Remove background
        scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
        boxes = tf.slice(
            tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
            [0, 0, 1, 0], [-1, -1, -1, -1])

        anchor_boxes = (tf.expand_dims(anchor_boxes, axis=2) *
                        tf.ones([1, 1, num_classes - 1, 1]))

        num_detections = num_boxes * (num_classes - 1)

        boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
        scores = tf.reshape(scores, [batch_size, num_detections, 1])
        anchor_boxes = tf.reshape(anchor_boxes,
                                  [batch_size, num_detections, 4])

        # Decode
        boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)

        # Clip boxes
        height = tf.expand_dims(image_info[:, 0:1], axis=-1)
        width = tf.expand_dims(image_info[:, 1:2], axis=-1)
        boxes = box_utils.clip_boxes(boxes, height, width)

        # NMS
        pre_nms_boxes = box_utils.to_normalized_coordinates(
            boxes, height, width)
        pre_nms_boxes = tf.reshape(pre_nms_boxes,
                                   [batch_size, num_boxes, num_classes - 1, 4])
        pre_nms_scores = tf.reshape(scores,
                                    [batch_size, num_boxes, num_classes - 1])
        (post_nms_boxes, post_nms_scores, post_nms_classes,
         post_nms_num_valid_boxes) = (tf.image.combined_non_max_suppression(
             pre_nms_boxes,
             pre_nms_scores,
             max_output_size_per_class=pre_nms_num_detections,
             max_total_size=post_nms_num_detections,
             iou_threshold=nms_threshold,
             score_threshold=0.0,
             pad_per_class=False))
        post_nms_classes = post_nms_classes + 1
        post_nms_boxes = box_utils.to_absolute_coordinates(
            post_nms_boxes, height, width)
        return (post_nms_num_valid_boxes, post_nms_boxes,
                tf.to_float(post_nms_classes), post_nms_scores)
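
# `box_utils.decode_boxes` is project-internal; for reference, a sketch of
# the standard Faster R-CNN decoding it most likely implements (an
# assumption, not the verbatim library code). t* are regression outputs,
# *_a are anchor center/size parameters, boxes are [ymin, xmin, ymax, xmax]:
#
#   wy, wx, wh, ww = bbox_reg_weights          # e.g. (10., 10., 5., 5.)
#   dy, dx, dh, dw = ty / wy, tx / wx, th / wh, tw / ww
#   y = dy * h_a + y_a;   x = dx * w_a + x_a    # shift the anchor center
#   h = tf.exp(dh) * h_a; w = tf.exp(dw) * w_a  # scale the anchor size
#   decoded = [y - 0.5 * h, x - 0.5 * w, y + 0.5 * h, x + 0.5 * w]
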
def generate_detections_per_image_tpu(cls_outputs,
                                      box_outputs,
                                      anchor_boxes,
                                      image_info,
                                      pre_nms_num_detections=1000,
                                      post_nms_num_detections=100,
                                      nms_threshold=0.3,
                                      bbox_reg_weights=(10., 10., 5., 5.)):
    """Generate the final detections per image given the model outputs.

  Args:
    cls_outputs: a tensor with shape [N, num_classes], which stacks class
      score outputs on all feature levels. The N is the number of total
      anchors on all levels. The num_classes is the number of classes
      predicted by the model. Note that cls_outputs should be the output of
      softmax(), not raw logits.
    box_outputs: a tensor with shape [N, num_classes*4], which stacks box
      regression outputs on all feature levels. The N is the number of total
      anchors on all levels.
    anchor_boxes: a tensor with shape [N, 4], which stacks anchors on all
      feature levels. The N is the number of total anchors on all levels.
    image_info: a tensor of shape [5] which encodes the input image's [height,
      width, scale, original_height, original_width]
    pre_nms_num_detections: an integer that specifies the number of candidates
      before NMS.
    post_nms_num_detections: an integer that specifies the number of candidates
      after NMS.
    nms_threshold: a float number to specify the IOU threshold of NMS.
    bbox_reg_weights: a list of 4 float scalars, which are default weights on
      (dx, dy, dw, dh) for normalizing bbox regression targets.

  Returns:
    detections: a tuple of tensors corresponding to the number of valid
    boxes, box coordinates, object categories for each box, and box scores,
    respectively.
  """
    num_boxes, num_classes = cls_outputs.get_shape().as_list()

    # Remove background class scores.
    cls_outputs = cls_outputs[:, 1:num_classes]
    top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
        tf.reshape(cls_outputs, [-1]), k=pre_nms_num_detections, sorted=False)
    classes = tf.mod(top_k_indices_with_classes, num_classes - 1)
    top_k_indices = tf.floordiv(top_k_indices_with_classes, num_classes - 1)

    anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
    box_outputs = tf.reshape(box_outputs,
                             [num_boxes, num_classes, 4])[:, 1:num_classes, :]
    class_indices = classes
    box_outputs = tf.gather_nd(
        box_outputs, tf.stack([top_k_indices, class_indices], axis=1))

    # apply bounding box regression to anchors
    boxes = box_utils.decode_boxes(box_outputs, anchor_boxes, bbox_reg_weights)
    boxes = box_utils.clip_boxes(boxes, image_info[0], image_info[1])

    list_of_all_boxes = []
    list_of_all_scores = []
    list_of_all_classes = []
    # Loop over foreground classes only; background scores were removed
    # above, and the original class labels are restored below via class_i + 1.
    for class_i in range(num_classes - 1):
        # Compute bitmask for the given classes.
        class_i_bitmask = tf.cast(tf.equal(classes, class_i),
                                  top_k_scores.dtype)
        # This works because score is in [0, 1].
        class_i_scores = top_k_scores * class_i_bitmask
        # The TPU and CPU have different behaviors for
        # tf.image.non_max_suppression_padded (b/116754376).
        (class_i_post_nms_indices,
         class_i_nms_num_valid) = tf.image.non_max_suppression_padded(
             tf.to_float(boxes),
             tf.to_float(class_i_scores),
             post_nms_num_detections,
             iou_threshold=nms_threshold,
             score_threshold=0.05,
             pad_to_max_output_size=True,
             name='nms_detections_' + str(class_i))
        class_i_post_nms_boxes = tf.gather(boxes, class_i_post_nms_indices)
        class_i_post_nms_scores = tf.gather(class_i_scores,
                                            class_i_post_nms_indices)
        mask = tf.less(tf.range(post_nms_num_detections),
                       [class_i_nms_num_valid])
        class_i_post_nms_scores = tf.where(
            mask, class_i_post_nms_scores,
            tf.zeros_like(class_i_post_nms_scores))
        class_i_classes = tf.fill(tf.shape(class_i_post_nms_scores),
                                  class_i + 1)
        list_of_all_boxes.append(class_i_post_nms_boxes)
        list_of_all_scores.append(class_i_post_nms_scores)
        list_of_all_classes.append(class_i_classes)

    post_nms_boxes = tf.concat(list_of_all_boxes, axis=0)
    post_nms_scores = tf.concat(list_of_all_scores, axis=0)
    post_nms_classes = tf.concat(list_of_all_classes, axis=0)

    # sort all results.
    post_nms_scores, sorted_indices = tf.nn.top_k(tf.to_float(post_nms_scores),
                                                  k=post_nms_num_detections,
                                                  sorted=True)
    post_nms_boxes = tf.gather(post_nms_boxes, sorted_indices)
    post_nms_classes = tf.gather(post_nms_classes, sorted_indices)

    valid_mask = tf.where(tf.greater(post_nms_scores, 0),
                          tf.ones_like(post_nms_scores),
                          tf.zeros_like(post_nms_scores))
    num_valid_boxes = tf.reduce_sum(valid_mask, axis=-1)
    box_classes = tf.to_float(post_nms_classes)
    return num_valid_boxes, post_nms_boxes, box_classes, post_nms_scores
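
# generate_detections_per_image_tpu operates on a single image; a hedged
# sketch of batching it with tf.map_fn (the batched_* tensors are
# hypothetical, with a leading batch dimension added to each argument):
#
#   num_valid, boxes, classes, scores = tf.map_fn(
#       lambda args: generate_detections_per_image_tpu(*args),
#       (batched_cls_outputs, batched_box_outputs,
#        batched_anchor_boxes, batched_image_info),
#       dtype=(tf.float32, tf.float32, tf.float32, tf.float32))
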
def _propose_rois_gpu(scores, boxes, anchor_boxes, height, width, scale,
                      rpn_pre_nms_topn, rpn_post_nms_topn, rpn_nms_threshold,
                      rpn_min_size, bbox_reg_weights):
    """Proposes RoIs giva group of candidates (GPU version).

  Args:
    scores: a tensor with a shape of [batch_size, num_boxes].
    boxes: a tensor with a shape of [batch_size, num_boxes, 4],
      in the encoded form.
    anchor_boxes: a tensor with a shape of [batch_size, num_boxes, 4]
      containing the anchor boxes.
    height: a tensor of shape [batch_size, 1, 1] representing the image height.
    width: a tensor of shape [batch_size, 1, 1] representing the image width.
    scale: a tensor of shape [batch_size, 1, 1] representing the image scale.
    rpn_pre_nms_topn: an integer number of top scoring RPN proposals to keep
      before applying NMS. This is *per FPN level* (not total).
    rpn_post_nms_topn: an integer number of top scoring RPN proposals to keep
      after applying NMS. This is the total number of RPN proposals produced.
    rpn_nms_threshold: a float number between 0 and 1 as the NMS threshold
      used on RPN proposals.
    rpn_min_size: an integer number as the minimum proposal height and width;
      both need to be greater than this number. Note that this number is at
      the original image scale, not the scale used during training or
      inference.
    bbox_reg_weights: None or a list of four scalars specifying the weights
      used when decoding the boxes.

  Returns:
    scores: a tensor with a shape of [batch_size, rpn_post_nms_topn, 1]
      representing the scores of the proposals. It has the same dtype as the
      input scores.
    boxes: a tensor with a shape of [batch_size, rpn_post_nms_topn, 4]
      representing the boxes of the proposals. The boxes are in normalized
      coordinates in the form [ymin, xmin, ymax, xmax]. It has the same dtype
      as the input boxes.
  """
    batch_size, num_boxes = scores.get_shape().as_list()

    topk_limit = min(num_boxes, rpn_pre_nms_topn)
    boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)
    boxes = box_utils.clip_boxes(boxes, height, width)

    if rpn_min_size > 0.0:
        boxes, scores = box_utils.filter_boxes(boxes,
                                               tf.expand_dims(scores, axis=-1),
                                               rpn_min_size, height, width,
                                               scale)
        scores = tf.squeeze(scores, axis=-1)

    post_nms_topk_limit = (topk_limit if topk_limit < rpn_post_nms_topn else
                           rpn_post_nms_topn)
    if rpn_nms_threshold > 0:
        # Normalize coordinates, as combined_non_max_suppression currently
        # only supports normalized coordinates.
        pre_nms_boxes = box_utils.to_normalized_coordinates(
            boxes, height, width)
        pre_nms_boxes = tf.reshape(pre_nms_boxes,
                                   [batch_size, num_boxes, 1, 4])
        pre_nms_scores = tf.reshape(scores, [batch_size, num_boxes, 1])
        boxes, scores, _, _ = tf.image.combined_non_max_suppression(
            pre_nms_boxes,
            pre_nms_scores,
            max_output_size_per_class=topk_limit,
            max_total_size=post_nms_topk_limit,
            iou_threshold=rpn_nms_threshold,
            score_threshold=0.0,
            pad_per_class=False)
        boxes = box_utils.to_absolute_coordinates(boxes, height, width)
    else:
        scores, boxes = box_utils.top_k(scores,
                                        k=post_nms_topk_limit,
                                        boxes_list=[boxes])
        boxes = boxes[0]

    return scores, boxes
def _propose_rois_tpu(scores, boxes, anchor_boxes, height, width, scale,
                      rpn_pre_nms_topn, rpn_post_nms_topn, rpn_nms_threshold,
                      rpn_min_size, bbox_reg_weights):
    """Proposes RoIs giva group of candidates (TPU version).

  Args:
    scores: a tensor with a shape of [batch_size, num_boxes].
    boxes: a tensor with a shape of [batch_size, num_boxes, 4],
      in the encoded form.
    anchor_boxes: a tensor with a shape of [batch_size, num_boxes, 4]
      containing the anchor boxes.
    height: a tensor of shape [batch_size, 1, 1] representing the image height.
    width: a tensor of shape [batch_size, 1, 1] representing the image width.
    scale: a tensor of shape [batch_size, 1, 1] representing the image scale.
    rpn_pre_nms_topn: an integer number of top scoring RPN proposals to keep
      before applying NMS. This is *per FPN level* (not total).
    rpn_post_nms_topn: an integer number of top scoring RPN proposals to keep
      after applying NMS. This is the total number of RPN proposals produced.
    rpn_nms_threshold: a float number between 0 and 1 as the NMS threshold
      used on RPN proposals.
    rpn_min_size: an integer number as the minimum proposal height and width;
      both need to be greater than this number. Note that this number is at
      the original image scale, not the scale used during training or
      inference.
    bbox_reg_weights: None or a list of four scalars specifying the weights
      used when decoding the boxes.

  Returns:
    scores: a tensor with a shape of [batch_size, rpn_post_nms_topn, 1]
      representing the scores of the proposals. It has the same dtype as the
      input scores.
    boxes: a tensor with a shape of [batch_size, rpn_post_nms_topn, 4]
      representing the boxes of the proposals. The boxes are in normalized
      coordinates in the form [ymin, xmin, ymax, xmax]. It has the same dtype
      as the input boxes.

  """
    _, num_boxes = scores.get_shape().as_list()

    topk_limit = (num_boxes
                  if num_boxes < rpn_pre_nms_topn else rpn_pre_nms_topn)
    scores, boxes_list = box_utils.top_k(scores,
                                         k=topk_limit,
                                         boxes_list=[boxes, anchor_boxes])
    boxes = boxes_list[0]
    anchor_boxes = boxes_list[1]

    # Decode boxes w.r.t. anchors and transform to the absolute coordinates.
    boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)

    # Clip boxes that exceed the boundary.
    boxes = box_utils.clip_boxes(boxes, height, width)

    # Filter out boxes whose height or width is less than rpn_min_size.
    boxes, scores = box_utils.filter_boxes(boxes,
                                           tf.expand_dims(scores, axis=-1),
                                           rpn_min_size, height, width, scale)
    scores = tf.squeeze(scores, axis=-1)

    post_nms_topk_limit = (topk_limit if topk_limit < rpn_post_nms_topn else
                           rpn_post_nms_topn)
    # NMS.
    if rpn_nms_threshold > 0:
        scores, boxes = box_utils.sorted_non_max_suppression_padded(
            scores,
            boxes,
            max_output_size=post_nms_topk_limit,
            iou_threshold=rpn_nms_threshold)

    # Pick top-K post NMS'ed boxes.
    scores, boxes = box_utils.top_k(scores,
                                    k=post_nms_topk_limit,
                                    boxes_list=[boxes])
    boxes = boxes[0]
    return scores, boxes
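
# Both proposal functions share one signature, so a caller can dispatch on
# the target device. A hedged sketch (the `use_tpu` flag is hypothetical):
#
#   propose_rois_fn = _propose_rois_tpu if use_tpu else _propose_rois_gpu
#   scores, rois = propose_rois_fn(
#       scores, boxes, anchor_boxes, height, width, scale,
#       rpn_pre_nms_topn, rpn_post_nms_topn, rpn_nms_threshold,
#       rpn_min_size, bbox_reg_weights)
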
def generate_detections_per_image_op(cls_outputs,
                                     box_outputs,
                                     anchor_boxes,
                                     image_id,
                                     image_info,
                                     num_detections=100,
                                     pre_nms_num_detections=1000,
                                     nms_threshold=0.3,
                                     bbox_reg_weights=(10., 10., 5., 5.)):
    """Generates detections with model outputs and anchors.

  Args:
    cls_outputs: a Tensor with shape [N, num_classes], which stacks class
      score outputs on all feature levels. The N is the number of total anchors
      on all levels. The num_classes is the number of classes predicted by the
      model. Note that the cls_outputs should be the output of softmax(), not
      raw logits.
    box_outputs: a Tensor with shape [N, 4] or [N, num_classes*4], which stacks
      box regression outputs on all feature levels. The N is the number of total
      anchors on all levels. The tensor shape is [N, num_classes*4] when class
      specific box regression is used.
    anchor_boxes: a Tensor with shape [N, 4], which stacks anchors on all
      feature levels. The N is the number of total anchors on all levels.
    image_id: an integer number to specify the image id.
    image_info: a tensor of shape [5] which encodes the input image's [height,
      width, scale, original_height, original_width]
    num_detections: Number of detections after NMS.
    pre_nms_num_detections: Number of candidates before NMS.
    nms_threshold: a float number to specify the threshold of NMS.
    bbox_reg_weights: a list of 4 float scalars, which are default weights on
      (dx, dy, dw, dh) for normalizing bbox regression targets.
  Returns:
    detections: detection results in a tensor with each row representing
      [image_id, ymin, xmin, ymax, xmax, score, class]
  """
    num_boxes, num_classes = cls_outputs.get_shape().as_list()
    _, num_box_predictions = box_outputs.get_shape().as_list()
    use_class_specific_box_regression = (
        num_classes == num_box_predictions // 4)

    # Remove background class scores.
    cls_outputs = cls_outputs[:, 1:num_classes]
    top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
        tf.reshape(cls_outputs, [-1]), k=pre_nms_num_detections, sorted=False)
    classes = tf.mod(top_k_indices_with_classes, num_classes - 1)
    top_k_indices = tf.floordiv(top_k_indices_with_classes, num_classes - 1)

    anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
    if use_class_specific_box_regression:
        box_outputs = tf.reshape(box_outputs,
                                 [num_boxes, num_classes, 4])[:,
                                                              1:num_classes, :]
        class_indices = classes
    else:
        box_outputs = tf.reshape(box_outputs, [num_boxes, 1, 4])
        class_indices = tf.zeros_like(top_k_indices)
    box_outputs = tf.gather_nd(
        box_outputs, tf.stack([top_k_indices, class_indices], axis=1))
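
    # At this point `box_outputs` has shape [pre_nms_num_detections, 4]: in
    # the class-specific case gather_nd selected each candidate's own-class
    # row; in the class-agnostic case every candidate reuses the single box
    # prediction at its anchor (class_indices is all zeros).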

    # apply bounding box regression to anchors
    boxes = box_utils.batch_decode_box_outputs_op(
        tf.expand_dims(anchor_boxes, axis=0),
        tf.expand_dims(box_outputs, axis=0), bbox_reg_weights)[0]
    boxes = box_utils.clip_boxes(tf.expand_dims(boxes, axis=0),
                                 tf.expand_dims(image_info[:2], axis=0))[0]

    list_of_all_boxes = []
    list_of_all_scores = []
    list_of_all_classes = []
    # Loop over foreground classes only; background scores were removed
    # above, and the original class labels are restored below via class_i + 1.
    for class_i in range(num_classes - 1):
        # Compute bitmask for the given classes.
        class_i_bitmask = tf.cast(tf.equal(classes, class_i),
                                  top_k_scores.dtype)
        # This works because score is in [0, 1].
        class_i_scores = top_k_scores * class_i_bitmask
        # The TPU and CPU have different behaviors for
        # tf.image.non_max_suppression_padded (b/116754376).
        (class_i_post_nms_indices,
         class_i_nms_num_valid) = tf.image.non_max_suppression_padded(
             tf.to_float(boxes),
             tf.to_float(class_i_scores),
             num_detections,
             iou_threshold=nms_threshold,
             score_threshold=0.05,
             pad_to_max_output_size=True,
             name='nms_detections_' + str(class_i))
        class_i_post_nms_boxes = tf.gather(boxes, class_i_post_nms_indices)
        class_i_post_nms_scores = tf.gather(class_i_scores,
                                            class_i_post_nms_indices)
        mask = tf.less(tf.range(num_detections), [class_i_nms_num_valid])
        class_i_post_nms_scores = tf.where(
            mask, class_i_post_nms_scores,
            tf.zeros_like(class_i_post_nms_scores))
        class_i_classes = tf.fill(tf.shape(class_i_post_nms_scores),
                                  class_i + 1)
        list_of_all_boxes.append(class_i_post_nms_boxes)
        list_of_all_scores.append(class_i_post_nms_scores)
        list_of_all_classes.append(class_i_classes)

    post_nms_boxes = tf.concat(list_of_all_boxes, axis=0)
    post_nms_scores = tf.concat(list_of_all_scores, axis=0)
    post_nms_classes = tf.concat(list_of_all_classes, axis=0)

    # sort all results.
    post_nms_scores, sorted_indices = tf.nn.top_k(tf.to_float(post_nms_scores),
                                                  k=num_detections,
                                                  sorted=True)

    post_nms_boxes = tf.gather(post_nms_boxes, sorted_indices)
    post_nms_classes = tf.gather(post_nms_classes, sorted_indices)

    if isinstance(image_id, int):
        image_id = tf.constant(image_id)
    image_id = tf.reshape(image_id, [])
    detections_result = tf.stack([
        tf.to_float(tf.fill(tf.shape(post_nms_scores), image_id)),
        post_nms_boxes[:, 0],
        post_nms_boxes[:, 1],
        post_nms_boxes[:, 2],
        post_nms_boxes[:, 3],
        post_nms_scores,
        tf.to_float(post_nms_classes),
    ],
                                 axis=1)
    return detections_result
def _proposal_op_per_level(scores, boxes, anchor_boxes, image_info,
                           rpn_pre_nms_topn, rpn_post_nms_topn,
                           rpn_nms_threshold, rpn_min_size, level):
    """Proposes RoIs for the second stage nets.

  This proposal op performs the following operations.
    1. for each location i in a (H, W) grid:
         generate A anchor boxes centered on cell i
         apply predicted bbox deltas to each of the A anchors at cell i
    2. clip predicted boxes to image
    3. remove predicted boxes with either height or width < threshold
    4. sort all (proposal, score) pairs by score from highest to lowest
    5. take the top rpn_pre_nms_topn proposals before NMS
    6. apply NMS with a loose threshold (0.7) to the remaining proposals
    7. take after_nms_topN proposals after NMS
    8. return the top proposals
  Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/ops/generate_proposals.py  # pylint: disable=line-too-long

  Args:
    scores: a tensor with a shape of
      [batch_size, height, width, num_anchors].
    boxes: a tensor with a shape of
      [batch_size, height, width, num_anchors * 4], in the encoded form.
    anchor_boxes: a tensor with a shape of
      [batch_size, height, width, num_anchors * 4] containing the anchors.
    image_info: a tensor of shape [batch_size, 5] whose columns encode the
      input image's [height, width, scale, original_height, original_width].
      Height and width are for the input to the network, not the original
      image; scale is the scale factor used to map the network input size to
      the original image size. See dataloader.DetectionInputProcessor for
      details.
    rpn_pre_nms_topn: an integer number of top scoring RPN proposals to keep
      before applying NMS. This is *per FPN level* (not total).
    rpn_post_nms_topn: an integer number of top scoring RPN proposals to keep
      after applying NMS. This is the total number of RPN proposals produced.
    rpn_nms_threshold: a float number between 0 and 1 as the NMS threshold
      used on RPN proposals.
    rpn_min_size: an integer number as the minimum proposal height and width;
      both need to be greater than this number. Note that this number is at
      the original image scale, not the scale used during training or
      inference.
    level: an integer number for the FPN level that the function operates on.
  Returns:
    scores: a tensor with a shape of [batch_size, rpn_post_nms_topn, 1]
      representing the scores of the proposals. It has the same dtype as the
      input scores.
    boxes: a tensor with a shape of [batch_size, rpn_post_nms_topn, 4]
      representing the boxes of the proposals. The boxes are in normalized
      coordinates in the form [ymin, xmin, ymax, xmax]. It has the same dtype
      as the input boxes.

  """
    with tf.name_scope('proposal-l%d' % level):
        # 4. sort all (proposal, score) pairs by score from highest to lowest
        # 5. take the top rpn_pre_nms_topn proposals before NMS
        batch_size, h, w, num_anchors = scores.get_shape().as_list()
        scores = tf.reshape(scores, [batch_size, -1])
        boxes = tf.reshape(boxes, [batch_size, -1, 4])
        # Map scores to [0, 1] for the convenience of setting a min score.
        scores = tf.sigmoid(scores)

        topk_limit = (h * w *
                      num_anchors if h * w * num_anchors < rpn_pre_nms_topn
                      else rpn_pre_nms_topn)
        anchor_boxes = tf.reshape(anchor_boxes, [batch_size, -1, 4])
        scores, boxes_list = box_utils.top_k(scores,
                                             k=topk_limit,
                                             boxes_list=[boxes, anchor_boxes])
        boxes = boxes_list[0]
        anchor_boxes = boxes_list[1]

        # Transforms anchors into proposals via bbox transformations.
        boxes = box_utils.batch_decode_box_outputs_op(anchor_boxes, boxes)

        # 2. clip proposals to image (may result in proposals with zero area
        # that will be removed in the next step)
        boxes = box_utils.clip_boxes(boxes, image_info[:, :2])

        # 3. remove predicted boxes with either height or width < min_size
        scores, boxes = box_utils.filter_boxes(scores, boxes, rpn_min_size,
                                               image_info)

        # 6. apply loose nms (e.g. threshold = 0.7)
        # 7. take after_nms_topN (e.g. 300)
        # 8. return the top proposals (-> RoIs top)
        post_nms_topk_limit = (topk_limit if topk_limit < rpn_post_nms_topn
                               else rpn_post_nms_topn)
        if rpn_nms_threshold > 0:
            scores, boxes = box_utils.sorted_non_max_suppression_padded(
                scores,
                boxes,
                max_output_size=post_nms_topk_limit,
                iou_threshold=rpn_nms_threshold)

        scores, boxes = box_utils.top_k(scores,
                                        k=post_nms_topk_limit,
                                        boxes_list=[boxes])
        boxes = boxes[0]
        return scores, boxes
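
# A hedged sketch of how a multilevel proposal op might drive this per-level
# helper (the level-keyed dicts and min_level/max_level are hypothetical;
# the cross-level merge follows the Detectron reference linked above):
#
#   scores_list, rois_list = [], []
#   for level in range(min_level, max_level + 1):
#       level_scores, level_rois = _proposal_op_per_level(
#           scores_outputs[level], box_outputs[level], anchor_boxes[level],
#           image_info, rpn_pre_nms_topn, rpn_post_nms_topn,
#           rpn_nms_threshold, rpn_min_size, level)
#       scores_list.append(level_scores)
#       rois_list.append(level_rois)
#   scores = tf.concat(scores_list, axis=1)
#   rois = tf.concat(rois_list, axis=1)
#   # Keep the overall top rpn_post_nms_topn proposals across all levels.
#   scores, rois_list = box_utils.top_k(
#       scores, k=rpn_post_nms_topn, boxes_list=[rois])
#   rois = rois_list[0]
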
    def _forward_test(self, input):
        cnn_features = input
        arg = easydict.EasyDict({
            'clip_boxes': self.test_clip_boxes,
            'nms_thresh': self.test_nms_thresh,
            'max_proposals': self.test_max_proposals
        })

        # Make sure that setImageSize has been called
        assert self.image_height and self.image_width and not self._called_forward_size, \
         'Must call setImageSize before each forward pass'
        self._called_forward_size = True

        rpn_out, act_reg = self.rpn.forward(cnn_features)
        rpn_boxes, rpn_anchors, rpn_trans, rpn_scores = rpn_out
        num_boxes = rpn_boxes.size(1)

        # Maybe clip boxes to image boundary
        if arg.clip_boxes:
            bounds = {
                'x_min': 1,
                'y_min': 1,
                'x_max': self.image_width,
                'y_max': self.image_height
            }
            rpn_boxes, valid = box_utils.clip_boxes(rpn_boxes, bounds,
                                                    'xcycwh')

            # print('%d/%d boxes are predicted valid'
            #       % (torch.sum(valid), valid.numel()))

            # Clamp parallel arrays to only the valid boxes (those not out of
            # bounds of the image).
            rpn_boxes = self.clamp_data(rpn_boxes, valid)
            rpn_anchors = self.clamp_data(rpn_anchors, valid)
            rpn_trans = self.clamp_data(rpn_trans, valid)
            rpn_scores = self.clamp_data(rpn_scores, valid)
            num_boxes = rpn_boxes.size(1)

        # Convert rpn boxes from (xc, yc, w, h) format to (x1, y1, x2, y2)
        rpn_boxes_x1y1x2y2 = box_utils.xcycwh_to_x1y1x2y2(rpn_boxes[0])

        # Convert objectness positive / negative scores to probabilities
        rpn_scores_exp = torch.exp(rpn_scores)
        pos_exp = rpn_scores_exp[0, :, 0]
        neg_exp = rpn_scores_exp[0, :, 1]
        scores = (pos_exp + neg_exp).pow(-1) * pos_exp
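        # Note: this equals a softmax over the (pos, neg) scores:
        # scores = exp(pos) / (exp(pos) + exp(neg)).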

        verbose = False
        if verbose:
            print('in LocalizationLayer forward_test')
            print('Before NMS there are %d boxes' % num_boxes)
            print('Using NMS threshold %f' % arg.nms_thresh)

        # Run NMS and sort by objectness score.
        boxes_scores = torch.cat((rpn_boxes_x1y1x2y2, scores.view(-1, 1)),
                                 dim=1)

        if arg.max_proposals == -1:
            idx = box_utils.nms(boxes_scores.data, arg.nms_thresh)
        else:
            idx = box_utils.nms(boxes_scores.data, arg.nms_thresh,
                                arg.max_proposals)

        rpn_boxes_nms = torch.squeeze(rpn_boxes)[idx]

        if verbose:
            print('After NMS there are %d boxes' % rpn_boxes_nms.size(0))

        output = rpn_boxes_nms
        return output
def generate_detections_gpu(class_outputs,
                            box_outputs,
                            anchor_boxes,
                            image_id,
                            image_info,
                            pre_nms_num_detections=1000,
                            post_nms_num_detections=100,
                            nms_threshold=0.3,
                            bbox_reg_weights=(10., 10., 5., 5.)):
    """Generate the final detections given the model outputs (GPU version).

  Args:
    class_outputs: a tensor with shape [batch_size, N, num_classes], which
      stacks class logit outputs on all feature levels. The N is the number of
      total anchors on all levels. The num_classes is the number of classes
      predicted by the model. Note that the class_outputs here is the raw score.
    box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
      stacks box regression outputs on all feature levels. The N is the number
      of total anchors on all levels.
    anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks anchors
      on all feature levels. The N is the number of total anchors on all levels.
    image_id: a tensor with shape [batch_size] which specifies the image id of
      each image in the batch.
    image_info: a tensor of shape [batch_size, 5] which encodes each image's
      [height, width, scale, original_height, original_width].
    pre_nms_num_detections: an integer that specifies the number of candidates
      before NMS.
    post_nms_num_detections: an integer that specifies the number of candidates
      after NMS.
    nms_threshold: a float number to specify the IOU threshold of NMS.
    bbox_reg_weights: a list of 4 float scalars, which are default weights on
      (dx, dy, dw, dh) for normalizing bbox regression targets.

  Returns:
    detections: a tensor of [batch_size, post_nms_num_detections, 7], which
      stacks `post_nms_num_detections` number of detection results for each
      image in the batch. Each detection is stored in the format of
      [image_id, ymin, xmin, ymax, xmax, score, class] in the last dimension.
  """
    batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
    softmax_class_outputs = tf.nn.softmax(class_outputs)

    # Remove background
    scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
    boxes = tf.slice(
        tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
        [0, 0, 1, 0], [-1, -1, -1, -1])

    anchor_boxes = tf.tile(tf.expand_dims(anchor_boxes, axis=2),
                           [1, 1, num_classes - 1, 1])

    num_detections = num_boxes * (num_classes - 1)

    boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
    scores = tf.reshape(scores, [batch_size, num_detections, 1])
    anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])

    # Decode
    boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)

    # Clip boxes
    height, width, scale = tf.split(image_info[:, :3],
                                    num_or_size_splits=3,
                                    axis=-1)
    height = tf.expand_dims(height, axis=-1)
    width = tf.expand_dims(width, axis=-1)
    scale = tf.expand_dims(scale, axis=-1)
    boxes = box_utils.clip_boxes(boxes, height, width)

    pre_nms_boxes = tf.reshape(boxes,
                               [batch_size, num_boxes, num_classes - 1, 4])
    pre_nms_scores = tf.reshape(scores,
                                [batch_size, num_boxes, num_classes - 1])

    # NMS
    pre_nms_boxes = box_utils.to_normalized_coordinates(
        pre_nms_boxes, height, width)
    post_nms_boxes, post_nms_scores, post_nms_classes, valid_boxes = (
        tf.image.combined_non_max_suppression(
            pre_nms_boxes,
            pre_nms_scores,
            max_output_size_per_class=pre_nms_num_detections,
            max_total_size=post_nms_num_detections,
            iou_threshold=nms_threshold,
            score_threshold=0.0,
            pad_per_class=False))
    post_nms_classes = post_nms_classes + 1
    post_nms_boxes = box_utils.to_absolute_coordinates(post_nms_boxes, height,
                                                       width)

    # Only works with static batch size.
    # Unroll batch dimension.
    post_boxes_list = tf.unstack(post_nms_boxes)
    post_scores_list = tf.unstack(post_nms_scores)
    post_classes_list = tf.unstack(post_nms_classes)
    valid_boxes_list = tf.unstack(valid_boxes)
    image_id_list = tf.unstack(image_id)

    detections = []
    for boxes_i, scores_i, classes_i, _, image_id_i in (zip(
            post_boxes_list, post_scores_list, post_classes_list,
            valid_boxes_list, image_id_list)):
        post_nms_top_k_scores = tf.reshape(scores_i, [post_nms_num_detections])
        post_nms_top_k_boxes = tf.reshape(boxes_i,
                                          [post_nms_num_detections, 4])
        post_nms_top_k_classes = tf.reshape(classes_i,
                                            [post_nms_num_detections])

        this_batch_detections = tf.stack([
            tf.to_float(tf.fill(tf.shape(post_nms_top_k_scores), image_id_i)),
            post_nms_top_k_boxes[:, 0],
            post_nms_top_k_boxes[:, 1],
            post_nms_top_k_boxes[:, 2],
            post_nms_top_k_boxes[:, 3],
            post_nms_top_k_scores,
            tf.to_float(post_nms_top_k_classes),
        ],
                                         axis=1)
        detections.append(this_batch_detections)
    detections = tf.stack(detections, axis=0)
    return detections
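
# A minimal usage sketch (hypothetical tensor names; assumes per-anchor model
# outputs already flattened across FPN levels):
#
#   detections = generate_detections_gpu(
#       class_outputs,   # [batch_size, N, num_classes] raw logits
#       box_outputs,     # [batch_size, N, num_classes * 4]
#       anchor_boxes,    # [batch_size, N, 4]
#       image_id,        # [batch_size]
#       image_info)      # [batch_size, 5]
#   # detections: [batch_size, post_nms_num_detections, 7], each row being
#   # [image_id, ymin, xmin, ymax, xmax, score, class].
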
def generate_detections_per_image_op(
    cls_outputs, box_outputs, anchor_boxes, image_id, image_info,
    num_detections=100, pre_nms_num_detections=1000, nms_threshold=0.3,
    bbox_reg_weights=(10., 10., 5., 5.)):
  """Generates detections with model outputs and anchors.

  Args:
    cls_outputs: a Tensor with shape [N, num_classes], which stacks class
      score outputs on all feature levels. The N is the number of total anchors
      on all levels. The num_classes is the number of classes predicted by the
      model. Note that the cls_outputs should be the output of softmax(), not
      raw logits.
    box_outputs: a Tensor with shape [N, num_classes*4], which stacks
      box regression outputs on all feature levels. The N is the number of total
      anchors on all levels.
    anchor_boxes: a Tensor with shape [N, 4], which stacks anchors on all
      feature levels. The N is the number of total anchors on all levels.
    image_id: an integer number to specify the image id.
    image_info: a tensor of shape [5] which encodes the input image's [height,
      width, scale, original_height, original_width]
    num_detections: Number of detections after NMS.
    pre_nms_num_detections: Number of candidates before NMS.
    nms_threshold: a float number to specify the threshold of NMS.
    bbox_reg_weights: a list of 4 float scalars, which are default weights on
      (dx, dy, dw, dh) for normalizing bbox regression targets.
  Returns:
    detections: detection results in a tensor with each row representing
      [image_id, ymin, xmin, ymax, xmax, score, class]
  """
  num_boxes, num_classes = cls_outputs.get_shape().as_list()

  # Removes background class scores.
  cls_outputs = cls_outputs[:, 1:num_classes]
  top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
      tf.reshape(cls_outputs, [-1]),
      k=pre_nms_num_detections,
      sorted=True)
  classes = tf.mod(top_k_indices_with_classes, num_classes - 1)
  top_k_indices = tf.floordiv(top_k_indices_with_classes, num_classes - 1)

  anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
  box_outputs = tf.reshape(
      box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
  box_outputs = tf.gather_nd(box_outputs,
                             tf.stack([top_k_indices, classes], axis=1))

  # Applies bounding box regression to anchors.
  boxes = box_utils.batch_decode_box_outputs_op(
      tf.expand_dims(anchor_boxes, axis=0),
      tf.expand_dims(box_outputs, axis=0),
      bbox_reg_weights)[0]
  boxes = box_utils.clip_boxes(
      tf.expand_dims(boxes, axis=0), tf.expand_dims(image_info[:2], axis=0))[0]

  classes = tf.tile(tf.reshape(classes, [1, pre_nms_num_detections]),
                    [num_classes - 1, 1])
  scores = tf.tile(tf.reshape(top_k_scores, [1, pre_nms_num_detections]),
                   [num_classes - 1, 1])
  boxes = tf.tile(tf.reshape(boxes, [1, pre_nms_num_detections, 4]),
                  [num_classes - 1, 1, 1])

  class_bitmask = tf.tile(
      tf.reshape(tf.range(num_classes-1), [num_classes - 1, 1]),
      [1, pre_nms_num_detections])
  scores = tf.where(tf.equal(classes, class_bitmask), scores,
                    tf.zeros_like(scores))
  scores = tf.where(tf.greater(scores, 0.05), scores, tf.zeros_like(scores))
  # Reshape classes to be compatible with the top_k function.
  classes = tf.reshape(classes, [num_classes - 1, pre_nms_num_detections, 1])
  scores, sorted_tensors = box_utils.top_k(
      scores, k=pre_nms_num_detections, tensors=[boxes, classes])
  boxes = sorted_tensors[0]
  classes = tf.reshape(sorted_tensors[1],
                       [num_classes - 1, pre_nms_num_detections])

  (post_nms_scores,
   post_nms_boxes, idx) = non_max_suppression.non_max_suppression_padded(
       scores, boxes, max_output_size=num_detections,
       iou_threshold=nms_threshold, level=0)

  # Sorts all results.
  sorted_scores, sorted_indices = tf.nn.top_k(
      tf.to_float(tf.reshape(post_nms_scores, [-1])),
      k=num_detections,
      sorted=True)
  post_nms_boxes = tf.gather(tf.reshape(post_nms_boxes, [-1, 4]),
                             sorted_indices)
  classes = tf.batch_gather(classes, idx)
  post_nms_classes = tf.gather(tf.reshape(classes, [-1]), sorted_indices) + 1

  if isinstance(image_id, int):
    image_id = tf.constant(image_id)
  image_id = tf.reshape(image_id, [])
  detections_result = tf.stack(
      [
          tf.to_float(tf.fill(tf.shape(sorted_scores), image_id)),
          post_nms_boxes[:, 0],
          post_nms_boxes[:, 1],
          post_nms_boxes[:, 2],
          post_nms_boxes[:, 3],
          sorted_scores,
          tf.to_float(post_nms_classes),
      ],
      axis=1)
  return detections_result