Example 1
def resize_and_crop_boxes(boxes,
                          image_scale,
                          output_size,
                          offset):
  """Resizes boxes to output size with scale and offset.

  Args:
    boxes: `Tensor` of shape [N, 4] representing ground truth boxes.
    image_scale: 2D float `Tensor` representing scale factors that apply to
      [height, width] of input image.
    output_size: 2D `Tensor` or `int` representing [height, width] of target
      output image size.
    offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
      boxes.

  Returns:
    boxes: `Tensor` of shape [N, 4] representing the scaled boxes.
  """
  with tf.name_scope('resize_and_crop_boxes'):
    # Adjusts box coordinates based on image_scale and offset.
    boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
    boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
    # Clips the boxes.
    boxes = box_ops.clip_boxes(boxes, output_size)
    return boxes
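A minimal usage sketch (assuming `box_ops` comes from the same codebase; the numbers are illustrative):

import tensorflow as tf

# Boxes in absolute [ymin, xmin, ymax, xmax] pixel coordinates.
boxes = tf.constant([[100., 200., 300., 400.]])  # [N, 4]
image_scale = tf.constant([0.5, 0.5])            # [height_scale, width_scale]
offset = tf.constant([10., 20.])                 # crop origin [y0, x0]
output_size = [320, 240]                         # target [height, width]

scaled = resize_and_crop_boxes(boxes, image_scale, output_size, offset)
# Scaling gives [50., 100., 150., 200.]; the offset shift gives
# [40., 80., 140., 180.]; clipping is a no-op here since the box already
# lies inside the 320x240 output.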
Example 2
def resize_and_crop_boxes(boxes, image_scale, output_size, offset,
                          box_history):
    """Resizes and crops the boxes.

  Args:
    boxes: A `Tensor` for the boxes.
    image_scale: A `Tensor` for the scaling factor of the image.
    output_size: A `list` of two integers, a two-element vector or a tensor such
      that all but the last dimensions are `broadcastable` to `boxes`. The last
      dimension is 2, which represents [height, width].
    offset: A `Tensor` for how much translation was applied to the image.
    box_history: A `Tensor` for the box history: boxes that undergo the same
      augmentations as `boxes` but are never clipped, so downstream code can
      measure how much the boxes have changed.

  Returns:
    clipped_boxes: A `Tensor` representing the augmented boxes.
    box_history: A `Tensor` representing the augmented box_history.
  """

    # Shift and scale the input boxes.
    boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
    boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])

    # Apply the same shift and scale to the box history.
    box_history *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
    box_history -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])

    # Clip the shifted and scaled boxes.
    clipped_boxes = bbox_ops.clip_boxes(boxes, output_size)
    return clipped_boxes, box_history
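The unclipped `box_history` makes it possible to measure how much of each box survived augmentation, e.g. to drop boxes that were mostly clipped away. A hedged sketch (the helper and the 10% threshold are illustrative, not from the source):

import tensorflow as tf

def fraction_kept(clipped_boxes, box_history):
  # Area of the clipped box relative to its unclipped history.
  def area(b):
    ymin, xmin, ymax, xmax = tf.split(b, 4, axis=-1)
    return tf.maximum(ymax - ymin, 0.) * tf.maximum(xmax - xmin, 0.)
  return tf.squeeze(area(clipped_boxes) / tf.maximum(area(box_history), 1e-8),
                    axis=-1)

# Keep boxes that retain at least 10% of their pre-clipping area.
keep = fraction_kept(clipped_boxes, box_history) > 0.1
boxes_kept = tf.boolean_mask(clipped_boxes, keep)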
Example 3
    def _mosaic_crop_image(self, image, boxes, classes, is_crowd, area):
        """Process a patched image in preperation for final output."""
        if self._mosaic_crop_mode != 'crop':
            shape = tf.cast(preprocessing_ops.get_image_shape(image),
                            tf.float32)
            center = shape * self._mosaic_center

            # shift the center of the image by applying a translation to the whole
            # image
            ch = tf.math.round(
                preprocessing_ops.random_uniform_strong(-center[0],
                                                        center[0],
                                                        seed=self._seed))
            cw = tf.math.round(
                preprocessing_ops.random_uniform_strong(-center[1],
                                                        center[1],
                                                        seed=self._seed))

            # translate the image, shift the boxes to match, and keep only
            # the boxes that remain within the image
            image = tfa.image.translate(image, [cw, ch],
                                        fill_value=self._pad_value)
            boxes = box_ops.denormalize_boxes(boxes, shape[:2])
            boxes = boxes + tf.cast([ch, cw, ch, cw], boxes.dtype)
            boxes = box_ops.clip_boxes(boxes, shape[:2])
            inds = box_ops.get_non_empty_box_indices(boxes)

            boxes = box_ops.normalize_boxes(boxes, shape[:2])
            boxes, classes, is_crowd, area = self._select_ind(
                inds,
                boxes,
                classes,  # pylint:disable=unbalanced-tuple-unpacking
                is_crowd,
                area)

        # warp and scale the fully stitched sample
        image, _, affine = preprocessing_ops.affine_warp_image(
            image, [self._output_size[0], self._output_size[1]],
            scale_min=self._aug_scale_min,
            scale_max=self._aug_scale_max,
            translate=self._aug_rand_translate,
            degrees=self._aug_rand_angle,
            perspective=self._aug_rand_perspective,
            random_pad=self._random_pad,
            seed=self._seed)
        height, width = self._output_size[0], self._output_size[1]
        image = tf.image.resize(image, (height, width))

        # clip and clean boxes
        boxes, inds = preprocessing_ops.transform_and_clip_boxes(
            boxes,
            None,
            affine=affine,
            area_thresh=self._area_thresh,
            seed=self._seed)
        classes, is_crowd, area = self._select_ind(inds, classes, is_crowd,
                                                   area)  # pylint:disable=unbalanced-tuple-unpacking
        return image, boxes, classes, is_crowd, area, area
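The translate-then-clip bookkeeping above can be checked in isolation: shifting the image by (cw, ch) and adding the same offsets to the pixel-space boxes keeps them aligned. A minimal sketch with plain TensorFlow, using tf.clip_by_value in place of box_ops.clip_boxes (numbers are illustrative):

import tensorflow as tf

boxes = tf.constant([[50., 60., 150., 160.]])  # pixel [ymin, xmin, ymax, xmax]
ch, cw = 30., -40.                             # vertical / horizontal shift
height, width = 200., 200.

shifted = boxes + tf.constant([ch, cw, ch, cw])
# Clip each coordinate to [0, h] or [0, w] as appropriate: [h, w, h, w].
clipped = tf.clip_by_value(shifted, 0., tf.constant([height, width] * 2))
# shifted -> [[80., 20., 180., 120.]]; already inside the image, so the
# clip is a no-op here.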
Example 4
  def _reorg_boxes(self, boxes, info, num_detections):
    """Scale and Clean boxes prior to Evaluation."""
    mask = tf.sequence_mask(num_detections, maxlen=tf.shape(boxes)[1])
    mask = tf.cast(tf.expand_dims(mask, axis=-1), boxes.dtype)

    # Denormalize the boxes by the shape of the image
    inshape = tf.expand_dims(info[:, 1, :], axis=1)
    ogshape = tf.expand_dims(info[:, 0, :], axis=1)
    scale = tf.expand_dims(info[:, 2, :], axis=1)
    offset = tf.expand_dims(info[:, 3, :], axis=1)

    boxes = box_ops.denormalize_boxes(boxes, inshape)
    boxes = box_ops.clip_boxes(boxes, inshape)
    boxes += tf.tile(offset, [1, 1, 2])
    boxes /= tf.tile(scale, [1, 1, 2])
    boxes = box_ops.clip_boxes(boxes, ogshape)

    # Zero out padded boxes, then shift the padding to -1 so padded slots
    # are distinguishable from real boxes downstream.
    boxes *= mask
    boxes += (mask - 1)
    return boxes
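A sketch of the `info` layout this method appears to assume, inferred from the indexing above (this layout is an assumption, not documented in the source):

import tensorflow as tf

# One image: 400x600 original, resized to 200x300 (scale 0.5), no crop offset.
info = tf.constant([[[400., 600.],   # info[:, 0, :] original [height, width]
                     [200., 300.],   # info[:, 1, :] network input [height, width]
                     [0.5, 0.5],     # info[:, 2, :] scale factors
                     [0., 0.]]])     # info[:, 3, :] crop offset [y0, x0]

# Boxes are denormalized and clipped in input space, mapped back to the
# original image via the offset and scale, then clipped again.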
Example 5
def affine_warp_boxes(affine, boxes, output_size, box_history):
    """Applies random rotation, random perspective change and random translation.

  and random scaling to the boxes.

  Args:
    affine: A `Tensor` for the augmenting matrix for the boxes.
    boxes: A `Tensor` for the boxes.
    output_size: A `list` of two integers, a two-element vector or a tensor such
      that all but the last dimensions are `broadcastable` to `boxes`. The last
      dimension is 2, which represents [height, width].
    box_history: A `Tensor` for the box history: boxes that undergo the same
      augmentations as `boxes` but are never clipped, so downstream code can
      measure how much the boxes have changed.

  Returns:
    clipped_boxes: A `Tensor` representing the augmented boxes.
    box_history: A `Tensor` representing the augmented box_history.
  """
    def _get_corners(box):
        """Get the corner of each box as a tuple of (x, y) coordinates."""
        ymi, xmi, yma, xma = tf.split(box, 4, axis=-1)
        tl = tf.concat([xmi, ymi], axis=-1)
        bl = tf.concat([xmi, yma], axis=-1)
        tr = tf.concat([xma, ymi], axis=-1)
        br = tf.concat([xma, yma], axis=-1)
        return tf.concat([tl, bl, tr, br], axis=-1)

    def _corners_to_boxes(corner):
        """Convert (x, y) corners back into boxes [ymin, xmin, ymax, xmax]."""
        corner = tf.reshape(corner, [-1, 4, 2])
        y = corner[..., 1]
        x = corner[..., 0]
        y_min = tf.reduce_min(y, axis=-1)
        x_min = tf.reduce_min(x, axis=-1)
        y_max = tf.reduce_max(y, axis=-1)
        x_max = tf.reduce_max(x, axis=-1)
        return tf.stack([y_min, x_min, y_max, x_max], axis=-1)

    def _aug_boxes(affine_matrix, box):
        """Apply an affine transformation matrix M to the boxes augmente boxes."""
        corners = _get_corners(box)
        corners = tf.reshape(corners, [-1, 4, 2])
        z = tf.expand_dims(tf.ones_like(corners[..., 1]), axis=-1)
        corners = tf.concat([corners, z], axis=-1)

        corners = tf.transpose(tf.matmul(affine_matrix,
                                         corners,
                                         transpose_b=True),
                               perm=(0, 2, 1))

        corners, p = tf.split(corners, [2, 1], axis=-1)
        corners /= p
        corners = tf.reshape(corners, [-1, 8])
        box = _corners_to_boxes(corners)
        return box

    boxes = _aug_boxes(affine, boxes)
    box_history = _aug_boxes(affine, box_history)

    clipped_boxes = bbox_ops.clip_boxes(boxes, output_size)
    return clipped_boxes, box_history
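A usage sketch applying a pure translation (the 3x3 projective-matrix convention, with corners as (x, y, 1) column vectors, is inferred from `_aug_boxes` above; `bbox_ops` from the same codebase is assumed importable):

import tensorflow as tf

# Translate every box by (dx, dy) = (10, 5) pixels.
affine = tf.constant([[[1., 0., 10.],
                       [0., 1., 5.],
                       [0., 0., 1.]]])       # [1, 3, 3], broadcast over boxes
boxes = tf.constant([[20., 30., 60., 80.]])  # [N, 4] [ymin, xmin, ymax, xmax]
output_size = [100., 100.]

clipped, history = affine_warp_boxes(affine, boxes, output_size,
                                     box_history=boxes)
# Corners are (x, y), so x shifts by 10 and y by 5:
# clipped -> [[25., 40., 65., 90.]]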
Example 6
    def _call_box_outputs(
        self,
        images: tf.Tensor,
        image_shape: tf.Tensor,
        anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
        gt_boxes: Optional[tf.Tensor] = None,
        gt_classes: Optional[tf.Tensor] = None,
        training: Optional[bool] = None
    ) -> Tuple[Mapping[str, tf.Tensor], Mapping[str, tf.Tensor]]:
        """Implementation of the Faster-RCNN logic for boxes."""
        model_outputs = {}

        # Feature extraction.
        (backbone_features,
         decoder_features) = self._get_backbone_and_decoder_features(images)

        # Region proposal network.
        rpn_scores, rpn_boxes = self.rpn_head(decoder_features)

        model_outputs.update({
            'backbone_features': backbone_features,
            'decoder_features': decoder_features,
            'rpn_boxes': rpn_boxes,
            'rpn_scores': rpn_scores
        })

        # Generate anchor boxes for this batch if not provided.
        if anchor_boxes is None:
            _, image_height, image_width, _ = images.get_shape().as_list()
            anchor_boxes = anchor.Anchor(
                min_level=self._config_dict['min_level'],
                max_level=self._config_dict['max_level'],
                num_scales=self._config_dict['num_scales'],
                aspect_ratios=self._config_dict['aspect_ratios'],
                anchor_size=self._config_dict['anchor_size'],
                image_size=(image_height, image_width)).multilevel_boxes
            for l in anchor_boxes:
                anchor_boxes[l] = tf.tile(
                    tf.expand_dims(anchor_boxes[l], axis=0),
                    [tf.shape(images)[0], 1, 1, 1])

        # Generate RoIs.
        current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores,
                                             anchor_boxes, image_shape,
                                             training)

        next_rois = current_rois
        all_class_outputs = []
        for cascade_num in range(len(self.roi_sampler)):
            # In cascade RCNN we want the higher layers to have different regression
            # weights as the predicted deltas become smaller and smaller.
            regression_weights = self._cascade_layer_to_weights[cascade_num]
            current_rois = next_rois

            (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
             matched_gt_classes, matched_gt_indices,
             current_rois) = self._run_frcnn_head(
                 features=decoder_features,
                 rois=current_rois,
                 gt_boxes=gt_boxes,
                 gt_classes=gt_classes,
                 training=training,
                 model_outputs=model_outputs,
                 cascade_num=cascade_num,
                 regression_weights=regression_weights)
            all_class_outputs.append(class_outputs)

            # Generate ROIs for the next cascade head if there is any.
            if cascade_num < len(self.roi_sampler) - 1:
                next_rois = box_ops.decode_boxes(tf.cast(
                    box_outputs, tf.float32),
                                                 current_rois,
                                                 weights=regression_weights)
                next_rois = box_ops.clip_boxes(
                    next_rois, tf.expand_dims(image_shape, axis=1))

        if not training:
            if self._config_dict['cascade_class_ensemble']:
                class_outputs = tf.add_n(all_class_outputs) / len(
                    all_class_outputs)

            detections = self.detection_generator(
                box_outputs,
                class_outputs,
                current_rois,
                image_shape,
                regression_weights,
                bbox_per_class=(
                    not self._config_dict['class_agnostic_bbox_pred']))
            model_outputs.update({
                'cls_outputs': class_outputs,
                'box_outputs': box_outputs,
            })
            if self.detection_generator.get_config()['apply_nms']:
                model_outputs.update({
                    'detection_boxes':
                    detections['detection_boxes'],
                    'detection_scores':
                    detections['detection_scores'],
                    'detection_classes':
                    detections['detection_classes'],
                    'num_detections':
                    detections['num_detections']
                })
            else:
                model_outputs.update({
                    'decoded_boxes':
                    detections['decoded_boxes'],
                    'decoded_box_scores':
                    detections['decoded_box_scores']
                })

        intermediate_outputs = {
            'matched_gt_boxes': matched_gt_boxes,
            'matched_gt_indices': matched_gt_indices,
            'matched_gt_classes': matched_gt_classes,
            'current_rois': current_rois,
        }
        return (model_outputs, intermediate_outputs)
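For reference, `self._cascade_layer_to_weights` referenced above typically holds one `[dy, dx, dh, dw]` weight list per cascade stage, growing for later stages as the predicted deltas shrink. A sketch using the conventional Cascade R-CNN values (an assumption about this model's configuration, not taken from the source):

# One weight list per cascade stage; larger weights compensate for the
# smaller deltas predicted by later stages.
cascade_layer_to_weights = [
    [10.0, 10.0, 5.0, 5.0],    # stage 0: standard Faster R-CNN weights
    [20.0, 20.0, 10.0, 10.0],  # stage 1
    [30.0, 30.0, 15.0, 15.0],  # stage 2
]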
Example 7
  def __call__(self,
               raw_boxes: Mapping[str, tf.Tensor],
               raw_scores: Mapping[str, tf.Tensor],
               anchor_boxes: tf.Tensor,
               image_shape: tf.Tensor,
               raw_attributes: Optional[Mapping[str, tf.Tensor]] = None):
    """Generates final detections.

    Args:
      raw_boxes: A `dict` with keys representing FPN levels and values
        representing box tensors of shape `[batch, feature_h, feature_w,
        num_anchors * 4]`.
      raw_scores: A `dict` with keys representing FPN levels and values
        representing logit tensors of shape `[batch, feature_h, feature_w,
        num_anchors]`.
      anchor_boxes: A `tf.Tensor` of shape of [batch_size, K, 4] representing
        the corresponding anchor boxes w.r.t `box_outputs`.
      image_shape: A `tf.Tensor` of shape of [batch_size, 2] storing the image
        height and width w.r.t. the scaled image, i.e. the same image space as
        `box_outputs` and `anchor_boxes`.
      raw_attributes: If not None, a `dict` of (attribute_name,
        attribute_prediction) pairs. `attribute_prediction` is a dict that
        contains keys representing FPN levels and values representing tensors
        of shape `[batch, feature_h, feature_w, num_anchors * attribute_size]`.

    Returns:
      If `apply_nms` = True, the return is a dictionary with keys:
        `detection_boxes`: A `float` tf.Tensor of shape
          [batch, max_num_detections, 4] representing top detected boxes in
          [y1, x1, y2, x2].
        `detection_scores`: A `float` tf.Tensor of shape
          [batch, max_num_detections] representing sorted confidence scores for
          detected boxes. The values are between [0, 1].
        `detection_classes`: An `int` tf.Tensor of shape
          [batch, max_num_detections] representing classes for detected boxes.
        `num_detections`: An `int` tf.Tensor of shape [batch]; only the first
          `num_detections` boxes are valid detections.
        `detection_attributes`: A dict. Each value is a `float`
          tf.Tensor of shape [batch, max_num_detections, attribute_size]
          representing attribute predictions for detected boxes.
      If `apply_nms` = False, the return is a dictionary with keys:
        `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
          representing all the decoded boxes.
        `decoded_box_scores`: A `float` tf.Tensor of shape
          [batch, num_raw_boxes] representing scores of all the decoded boxes.
        `decoded_box_attributes`: A dict. Each value is a
          `float` tf.Tensor of shape [batch, num_raw_boxes, attribute_size]
          representing attribute predictions of all the decoded boxes.
    """
    # Collects outputs from all levels into a list.
    boxes = []
    scores = []
    if raw_attributes:
      attributes = {att_name: [] for att_name in raw_attributes.keys()}
    else:
      attributes = {}

    levels = list(raw_boxes.keys())
    min_level = int(min(levels))
    max_level = int(max(levels))
    for i in range(min_level, max_level + 1):
      raw_boxes_i_shape = tf.shape(raw_boxes[str(i)])
      batch_size = raw_boxes_i_shape[0]
      num_anchors_per_locations = raw_boxes_i_shape[-1] // 4
      num_classes = tf.shape(
          raw_scores[str(i)])[-1] // num_anchors_per_locations

      # Applies score transformation and removes the implicit background class.
      scores_i = tf.sigmoid(
          tf.reshape(raw_scores[str(i)], [batch_size, -1, num_classes]))
      scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])

      # Box decoding.
      # The anchor boxes are shared for all data in a batch.
      # One stage detector only supports class agnostic box regression.
      anchor_boxes_i = tf.reshape(anchor_boxes[str(i)], [batch_size, -1, 4])
      raw_boxes_i = tf.reshape(raw_boxes[str(i)], [batch_size, -1, 4])
      boxes_i = box_ops.decode_boxes(raw_boxes_i, anchor_boxes_i)

      # Box clipping.
      boxes_i = box_ops.clip_boxes(
          boxes_i, tf.expand_dims(image_shape, axis=1))

      boxes.append(boxes_i)
      scores.append(scores_i)

      if raw_attributes:
        for att_name, raw_att in raw_attributes.items():
          attribute_size = tf.shape(
              raw_att[str(i)])[-1] // num_anchors_per_locations
          att_i = tf.reshape(raw_att[str(i)], [batch_size, -1, attribute_size])
          attributes[att_name].append(att_i)

    boxes = tf.concat(boxes, axis=1)
    boxes = tf.expand_dims(boxes, axis=2)
    scores = tf.concat(scores, axis=1)

    if raw_attributes:
      for att_name in raw_attributes.keys():
        attributes[att_name] = tf.concat(attributes[att_name], axis=1)
        attributes[att_name] = tf.expand_dims(attributes[att_name], axis=2)

    if not self._config_dict['apply_nms']:
      return {
          'decoded_boxes': boxes,
          'decoded_box_scores': scores,
          'decoded_box_attributes': attributes,
      }

    if self._config_dict['use_batched_nms']:
      if raw_attributes:
        raise ValueError('Attribute learning is not supported for batched NMS.')

      nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
          _generate_detections_batched(
              boxes,
              scores,
              self._config_dict['pre_nms_score_threshold'],
              self._config_dict['nms_iou_threshold'],
              self._config_dict['max_num_detections']))
      # Attribute learning is not supported with batched NMS, so return an
      # empty attributes dict.
      nmsed_attributes = {}
    else:
      if raw_attributes:
        nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, nmsed_attributes = (
            _generate_detections_v1(
                boxes,
                scores,
                attributes=attributes if raw_attributes else None,
                pre_nms_top_k=self._config_dict['pre_nms_top_k'],
                pre_nms_score_threshold=(
                    self._config_dict['pre_nms_score_threshold']),
                nms_iou_threshold=self._config_dict['nms_iou_threshold'],
                max_num_detections=self._config_dict['max_num_detections']))
      else:
        nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
            _generate_detections_v2(
                boxes, scores, self._config_dict['pre_nms_top_k'],
                self._config_dict['pre_nms_score_threshold'],
                self._config_dict['nms_iou_threshold'],
                self._config_dict['max_num_detections']))
        nmsed_attributes = {}
    # Adds 1 to offset the background class which has index 0.
    nmsed_classes += 1

    return {
        'num_detections': valid_detections,
        'detection_boxes': nmsed_boxes,
        'detection_classes': nmsed_classes,
        'detection_scores': nmsed_scores,
        'detection_attributes': nmsed_attributes,
    }
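`box_ops.clip_boxes` is called above with `tf.expand_dims(image_shape, axis=1)` so each image's [height, width] broadcasts across its boxes. A minimal self-contained re-implementation of that clipping behavior, for illustration only (not the library's actual code):

import tensorflow as tf

def clip_boxes_sketch(boxes, image_shape):
  # boxes: [..., 4] as [ymin, xmin, ymax, xmax];
  # image_shape: [..., 2] as [height, width], broadcastable against boxes.
  height, width = tf.split(tf.cast(image_shape, boxes.dtype), 2, axis=-1)
  ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=-1)
  ymin = tf.maximum(tf.minimum(ymin, height), 0.)
  xmin = tf.maximum(tf.minimum(xmin, width), 0.)
  ymax = tf.maximum(tf.minimum(ymax, height), 0.)
  xmax = tf.maximum(tf.minimum(xmax, width), 0.)
  return tf.concat([ymin, xmin, ymax, xmax], axis=-1)

boxes = tf.constant([[[-5., 10., 120., 90.]]])  # [batch, num_boxes, 4]
image_shape = tf.constant([[100., 100.]])       # [batch, 2]
clipped = clip_boxes_sketch(boxes, tf.expand_dims(image_shape, axis=1))
# clipped -> [[[0., 10., 100., 90.]]]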
Example 8
  def __call__(self, raw_boxes: tf.Tensor, raw_scores: tf.Tensor,
               anchor_boxes: tf.Tensor, image_shape: tf.Tensor):
    """Generates final detections.

    Args:
      raw_boxes: A `tf.Tensor` of shape of `[batch_size, K, num_classes * 4]`
        representing the class-specific box coordinates relative to anchors.
      raw_scores: A `tf.Tensor` of shape of `[batch_size, K, num_classes]`
        representing the class logits before applying score activation.
      anchor_boxes: A `tf.Tensor` of shape of `[batch_size, K, 4]` representing
        the corresponding anchor boxes w.r.t `box_outputs`.
      image_shape: A `tf.Tensor` of shape of `[batch_size, 2]` storing the image
        height and width w.r.t. the scaled image, i.e. the same image space as
        `box_outputs` and `anchor_boxes`.

    Returns:
      If `apply_nms` = True, the return is a dictionary with keys:
        `detection_boxes`: A `float` tf.Tensor of shape
          [batch, max_num_detections, 4] representing top detected boxes in
          [y1, x1, y2, x2].
        `detection_scores`: A `float` `tf.Tensor` of shape
          [batch, max_num_detections] representing sorted confidence scores for
          detected boxes. The values are between [0, 1].
        `detection_classes`: An `int` tf.Tensor of shape
          [batch, max_num_detections] representing classes for detected boxes.
        `num_detections`: An `int` tf.Tensor of shape [batch]; only the first
          `num_detections` boxes are valid detections.
      If `apply_nms` = False, the return is a dictionary with keys:
        `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
          representing all the decoded boxes.
        `decoded_box_scores`: A `float` tf.Tensor of shape
          [batch, num_raw_boxes] representing scores of all the decoded boxes.
    """
    box_scores = tf.nn.softmax(raw_scores, axis=-1)

    # Removes the background class.
    box_scores_shape = tf.shape(box_scores)
    box_scores_shape_list = box_scores.get_shape().as_list()
    batch_size = box_scores_shape[0]
    num_locations = box_scores_shape_list[1]
    num_classes = box_scores_shape_list[-1]
    num_detections = num_locations * (num_classes - 1)

    box_scores = tf.slice(box_scores, [0, 0, 1], [-1, -1, -1])
    raw_boxes = tf.reshape(raw_boxes,
                           [batch_size, num_locations, num_classes, 4])
    raw_boxes = tf.slice(raw_boxes, [0, 0, 1, 0], [-1, -1, -1, -1])
    anchor_boxes = tf.tile(
        tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
    raw_boxes = tf.reshape(raw_boxes, [batch_size, num_detections, 4])
    anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])

    # Box decoding.
    decoded_boxes = box_ops.decode_boxes(
        raw_boxes, anchor_boxes, weights=[10.0, 10.0, 5.0, 5.0])

    # Box clipping
    decoded_boxes = box_ops.clip_boxes(
        decoded_boxes, tf.expand_dims(image_shape, axis=1))

    decoded_boxes = tf.reshape(decoded_boxes,
                               [batch_size, num_locations, num_classes - 1, 4])

    if not self._config_dict['apply_nms']:
      return {
          'decoded_boxes': decoded_boxes,
          'decoded_box_scores': box_scores,
      }

    if self._config_dict['use_batched_nms']:
      nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
          _generate_detections_batched(
              decoded_boxes,
              box_scores,
              self._config_dict['pre_nms_score_threshold'],
              self._config_dict['nms_iou_threshold'],
              self._config_dict['max_num_detections']))
    else:
      nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
          _generate_detections_v2(
              decoded_boxes,
              box_scores,
              self._config_dict['pre_nms_top_k'],
              self._config_dict['pre_nms_score_threshold'],
              self._config_dict['nms_iou_threshold'],
              self._config_dict['max_num_detections']))

    # Adds 1 to offset the background class which has index 0.
    nmsed_classes += 1

    return {
        'num_detections': valid_detections,
        'detection_boxes': nmsed_boxes,
        'detection_classes': nmsed_classes,
        'detection_scores': nmsed_scores,
    }
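The background-removal and per-class anchor tiling above can be traced on toy shapes; a sketch with 2 locations and 3 classes (background included):

import tensorflow as tf

batch_size, num_locations, num_classes = 1, 2, 3
raw_boxes = tf.zeros([batch_size, num_locations, num_classes * 4])
anchor_boxes = tf.zeros([batch_size, num_locations, 4])

boxes = tf.reshape(raw_boxes, [batch_size, num_locations, num_classes, 4])
boxes = tf.slice(boxes, [0, 0, 1, 0], [-1, -1, -1, -1])  # drop background
anchors = tf.tile(tf.expand_dims(anchor_boxes, axis=2),
                  [1, 1, num_classes - 1, 1])

num_detections = num_locations * (num_classes - 1)  # 2 * 2 = 4
print(tf.reshape(boxes, [batch_size, num_detections, 4]).shape)    # (1, 4, 4)
print(tf.reshape(anchors, [batch_size, num_detections, 4]).shape)  # (1, 4, 4)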
Example 9
    def call(self,
             images: tf.Tensor,
             image_shape: tf.Tensor,
             anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
             gt_boxes: Optional[tf.Tensor] = None,
             gt_classes: Optional[tf.Tensor] = None,
             gt_masks: Optional[tf.Tensor] = None,
             training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
        model_outputs = {}

        # Feature extraction.
        features = self.backbone(images)
        if self.decoder:
            features = self.decoder(features)

        # Region proposal network.
        rpn_scores, rpn_boxes = self.rpn_head(features)

        model_outputs.update({
            'rpn_boxes': rpn_boxes,
            'rpn_scores': rpn_scores
        })

        # Generate anchor boxes for this batch if not provided.
        if anchor_boxes is None:
            _, image_height, image_width, _ = images.get_shape().as_list()
            anchor_boxes = anchor.Anchor(
                min_level=self._config_dict['min_level'],
                max_level=self._config_dict['max_level'],
                num_scales=self._config_dict['num_scales'],
                aspect_ratios=self._config_dict['aspect_ratios'],
                anchor_size=self._config_dict['anchor_size'],
                image_size=(image_height, image_width)).multilevel_boxes
            for l in anchor_boxes:
                anchor_boxes[l] = tf.tile(
                    tf.expand_dims(anchor_boxes[l], axis=0),
                    [tf.shape(images)[0], 1, 1, 1])

        # Generate RoIs.
        current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores,
                                             anchor_boxes, image_shape,
                                             training)

        next_rois = current_rois
        all_class_outputs = []
        for cascade_num in range(len(self.roi_sampler)):
            # In cascade RCNN we want the higher layers to have different regression
            # weights as the predicted deltas become smaller and smaller.
            regression_weights = self._cascade_layer_to_weights[cascade_num]
            current_rois = next_rois

            (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
             matched_gt_classes, matched_gt_indices,
             current_rois) = self._run_frcnn_head(
                 features=features,
                 rois=current_rois,
                 gt_boxes=gt_boxes,
                 gt_classes=gt_classes,
                 training=training,
                 model_outputs=model_outputs,
                 layer_num=cascade_num,
                 regression_weights=regression_weights)
            all_class_outputs.append(class_outputs)

            # Generate ROIs for the next cascade head if there is any.
            if cascade_num < len(self.roi_sampler) - 1:
                next_rois = box_ops.decode_boxes(tf.cast(
                    box_outputs, tf.float32),
                                                 current_rois,
                                                 weights=regression_weights)
                next_rois = box_ops.clip_boxes(
                    next_rois, tf.expand_dims(image_shape, axis=1))

        if not training:
            if self._config_dict['cascade_class_ensemble']:
                class_outputs = tf.add_n(all_class_outputs) / len(
                    all_class_outputs)

            detections = self.detection_generator(
                box_outputs,
                class_outputs,
                current_rois,
                image_shape,
                regression_weights,
                bbox_per_class=(
                    not self._config_dict['class_agnostic_bbox_pred']))
            model_outputs.update({
                'detection_boxes':
                detections['detection_boxes'],
                'detection_scores':
                detections['detection_scores'],
                'detection_classes':
                detections['detection_classes'],
                'num_detections':
                detections['num_detections'],
            })

        if not self._include_mask:
            return model_outputs

        if training:
            current_rois, roi_classes, roi_masks = self.mask_sampler(
                current_rois, matched_gt_boxes, matched_gt_classes,
                matched_gt_indices, gt_masks)
            roi_masks = tf.stop_gradient(roi_masks)

            model_outputs.update({
                'mask_class_targets': roi_classes,
                'mask_targets': roi_masks,
            })
        else:
            current_rois = model_outputs['detection_boxes']
            roi_classes = model_outputs['detection_classes']

        # Mask RoI align.
        mask_roi_features = self.mask_roi_aligner(features, current_rois)

        # Mask head.
        raw_masks = self.mask_head([mask_roi_features, roi_classes])

        if training:
            model_outputs.update({
                'mask_outputs': raw_masks,
            })
        else:
            model_outputs.update({
                'detection_masks': tf.math.sigmoid(raw_masks),
            })
        return model_outputs
Example 10
def _multilevel_propose_rois(raw_boxes,
                             raw_scores,
                             anchor_boxes,
                             image_shape,
                             pre_nms_top_k=2000,
                             pre_nms_score_threshold=0.0,
                             pre_nms_min_size_threshold=0.0,
                             nms_iou_threshold=0.7,
                             num_proposals=1000,
                             use_batched_nms=False,
                             decode_boxes=True,
                             clip_boxes=True,
                             apply_sigmoid_to_score=True):
    """Proposes RoIs given a group of candidates from different FPN levels.

  The following describes the steps:
    1. For each individual level:
      a. Apply sigmoid transform if specified.
      b. Decode boxes if specified.
      c. Clip boxes if specified.
      d. Filter out small boxes and those that fall outside the image, if specified.
      e. Apply pre-NMS filtering including pre-NMS top k and score thresholding.
      f. Apply NMS.
    2. Aggregate post-NMS boxes from each level.
    3. Apply an overall top k to generate the final selected RoIs.

  Args:
    raw_boxes: A `dict` with keys representing FPN levels and values
      representing box tensors of shape
      [batch_size, feature_h, feature_w, num_anchors * 4].
    raw_scores: A `dict` with keys representing FPN levels and values
      representing logit tensors of shape
      [batch_size, feature_h, feature_w, num_anchors].
    anchor_boxes: A `dict` with keys representing FPN levels and values
      representing anchor box tensors of shape
      [batch_size, feature_h * feature_w * num_anchors, 4].
    image_shape: A `tf.Tensor` of shape [batch_size, 2] where the last dimension
      are [height, width] of the scaled image.
    pre_nms_top_k: An `int` of top scoring RPN proposals *per level* to keep
      before applying NMS. Default: 2000.
    pre_nms_score_threshold: A `float` between 0 and 1 representing the minimal
      box score to keep before applying NMS. This is often used as a
      pre-filtering step for better performance. Default: 0, no filtering is
      applied.
    pre_nms_min_size_threshold: A `float` representing the minimal box size in
      each side (w.r.t. the scaled image) to keep before applying NMS. This is
      often used as a pre-filtering step for better performance. Default: 0, no
      filtering is applied.
    nms_iou_threshold: A `float` between 0 and 1 representing the IoU threshold
      used for NMS. If 0.0, no NMS is applied. Default: 0.7.
    num_proposals: An `int` of top scoring RPN proposals *in total* to keep
      after applying NMS. Default: 1000.
    use_batched_nms: A `bool` indicating whether NMS is applied in batch using
      `tf.image.combined_non_max_suppression`. Currently only available in
      CPU/GPU. Default is False.
    decode_boxes: A `bool` indicating whether `raw_boxes` needs to be decoded
      using `anchor_boxes`. If False, use `raw_boxes` directly and ignore
      `anchor_boxes`. Default is True.
    clip_boxes: A `bool` indicating whether boxes are first clipped to the
      scaled image size before applying NMS. If False, no clipping is applied
      and `image_shape` is ignored. Default is True.
    apply_sigmoid_to_score: A `bool` indicating whether to apply sigmoid to
      `raw_scores` before applying NMS. Default is True.

  Returns:
    selected_rois: A `tf.Tensor` of shape [batch_size, num_proposals, 4],
      representing the box coordinates of the selected proposals w.r.t. the
      scaled image.
    selected_roi_scores: A `tf.Tensor` of shape [batch_size, num_proposals, 1],
      representing the scores of the selected proposals.
  """
    with tf.name_scope('multilevel_propose_rois'):
        rois = []
        roi_scores = []
        image_shape = tf.expand_dims(image_shape, axis=1)
        for level in sorted(raw_scores.keys()):
            with tf.name_scope('level_%s' % level):
                _, feature_h, feature_w, num_anchors_per_location = (
                    raw_scores[level].get_shape().as_list())

                num_boxes = feature_h * feature_w * num_anchors_per_location
                this_level_scores = tf.reshape(raw_scores[level],
                                               [-1, num_boxes])
                this_level_boxes = tf.reshape(raw_boxes[level],
                                              [-1, num_boxes, 4])
                this_level_anchors = tf.cast(tf.reshape(
                    anchor_boxes[level], [-1, num_boxes, 4]),
                                             dtype=this_level_scores.dtype)

                if apply_sigmoid_to_score:
                    this_level_scores = tf.sigmoid(this_level_scores)

                if decode_boxes:
                    this_level_boxes = box_ops.decode_boxes(
                        this_level_boxes, this_level_anchors)
                if clip_boxes:
                    this_level_boxes = box_ops.clip_boxes(
                        this_level_boxes, image_shape)

                if pre_nms_min_size_threshold > 0.0:
                    this_level_boxes, this_level_scores = box_ops.filter_boxes(
                        this_level_boxes, this_level_scores, image_shape,
                        pre_nms_min_size_threshold)

                this_level_pre_nms_top_k = min(num_boxes, pre_nms_top_k)
                this_level_post_nms_top_k = min(num_boxes, num_proposals)
                if nms_iou_threshold > 0.0:
                    if use_batched_nms:
                        this_level_rois, this_level_roi_scores, _, _ = (
                            tf.image.combined_non_max_suppression(
                                tf.expand_dims(this_level_boxes, axis=2),
                                tf.expand_dims(this_level_scores, axis=-1),
                                max_output_size_per_class=this_level_pre_nms_top_k,
                                max_total_size=this_level_post_nms_top_k,
                                iou_threshold=nms_iou_threshold,
                                score_threshold=pre_nms_score_threshold,
                                pad_per_class=False,
                                clip_boxes=False))
                    else:
                        if pre_nms_score_threshold > 0.0:
                            this_level_boxes, this_level_scores = (
                                box_ops.filter_boxes_by_scores(
                                    this_level_boxes, this_level_scores,
                                    pre_nms_score_threshold))
                        this_level_boxes, this_level_scores = box_ops.top_k_boxes(
                            this_level_boxes,
                            this_level_scores,
                            k=this_level_pre_nms_top_k)
                        this_level_roi_scores, this_level_rois = (
                            nms.sorted_non_max_suppression_padded(
                                this_level_scores,
                                this_level_boxes,
                                max_output_size=this_level_post_nms_top_k,
                                iou_threshold=nms_iou_threshold))
                else:
                    this_level_rois, this_level_roi_scores = box_ops.top_k_boxes(
                        this_level_boxes,
                        this_level_scores,
                        k=this_level_post_nms_top_k)

                rois.append(this_level_rois)
                roi_scores.append(this_level_roi_scores)

        all_rois = tf.concat(rois, axis=1)
        all_roi_scores = tf.concat(roi_scores, axis=1)

        with tf.name_scope('top_k_rois'):
            _, num_valid_rois = all_roi_scores.get_shape().as_list()
            overall_top_k = min(num_valid_rois, num_proposals)

            selected_rois, selected_roi_scores = box_ops.top_k_boxes(
                all_rois, all_roi_scores, k=overall_top_k)

        return selected_rois, selected_roi_scores
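A single-level usage sketch (shapes follow the Args section; the random values are illustrative, and `box_ops`/`nms` from the same codebase are assumed importable):

import tensorflow as tf

batch, h, w, anchors_per_loc = 2, 8, 8, 3
raw_boxes = {'3': tf.random.normal([batch, h, w, anchors_per_loc * 4])}
raw_scores = {'3': tf.random.normal([batch, h, w, anchors_per_loc])}
# Valid toy anchors: [y, x, y + 16, x + 16].
yx = tf.random.uniform([batch, h * w * anchors_per_loc, 2], maxval=240.)
anchor_boxes = {'3': tf.concat([yx, yx + 16.], axis=-1)}
image_shape = tf.constant([[256., 256.], [256., 256.]])

rois, roi_scores = _multilevel_propose_rois(
    raw_boxes, raw_scores, anchor_boxes, image_shape,
    pre_nms_top_k=100, num_proposals=50)
# rois: [2, 50, 4] proposals w.r.t. the scaled image.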
Example 11
  def _decode_multilevel_outputs(
      self,
      raw_boxes: Mapping[str, tf.Tensor],
      raw_scores: Mapping[str, tf.Tensor],
      anchor_boxes: tf.Tensor,
      image_shape: tf.Tensor,
      raw_attributes: Optional[Mapping[str, tf.Tensor]] = None):
    """Collects dict of multilevel boxes, scores, attributes into lists."""
    boxes = []
    scores = []
    if raw_attributes:
      attributes = {att_name: [] for att_name in raw_attributes.keys()}
    else:
      attributes = {}

    levels = list(raw_boxes.keys())
    min_level = int(min(levels))
    max_level = int(max(levels))
    for i in range(min_level, max_level + 1):
      raw_boxes_i = raw_boxes[str(i)]
      raw_scores_i = raw_scores[str(i)]
      batch_size = tf.shape(raw_boxes_i)[0]
      (_, feature_h_i, feature_w_i,
       num_anchors_per_locations_times_4) = raw_boxes_i.get_shape().as_list()
      num_locations = feature_h_i * feature_w_i
      num_anchors_per_locations = num_anchors_per_locations_times_4 // 4
      num_classes = (
          raw_scores_i.get_shape().as_list()[-1] // num_anchors_per_locations)

      # Applies score transformation and removes the implicit background class.
      scores_i = tf.sigmoid(
          tf.reshape(raw_scores_i, [
              batch_size, num_locations * num_anchors_per_locations, num_classes
          ]))
      scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])

      # Box decoding.
      # The anchor boxes are shared for all data in a batch.
      # One stage detector only supports class agnostic box regression.
      anchor_boxes_i = tf.reshape(
          anchor_boxes[str(i)],
          [batch_size, num_locations * num_anchors_per_locations, 4])
      raw_boxes_i = tf.reshape(
          raw_boxes_i,
          [batch_size, num_locations * num_anchors_per_locations, 4])
      boxes_i = box_ops.decode_boxes(raw_boxes_i, anchor_boxes_i)

      # Box clipping.
      boxes_i = box_ops.clip_boxes(
          boxes_i, tf.expand_dims(image_shape, axis=1))

      boxes.append(boxes_i)
      scores.append(scores_i)

      if raw_attributes:
        for att_name, raw_att in raw_attributes.items():
          attribute_size = (
              raw_att[str(i)].get_shape().as_list()[-1] //
              num_anchors_per_locations)
          att_i = tf.reshape(raw_att[str(i)], [
              batch_size, num_locations * num_anchors_per_locations,
              attribute_size
          ])
          attributes[att_name].append(att_i)

    boxes = tf.concat(boxes, axis=1)
    boxes = tf.expand_dims(boxes, axis=2)
    scores = tf.concat(scores, axis=1)

    if raw_attributes:
      for att_name in raw_attributes.keys():
        attributes[att_name] = tf.concat(attributes[att_name], axis=1)
        attributes[att_name] = tf.expand_dims(attributes[att_name], axis=2)

    return boxes, scores, attributes
Example 12
  def __call__(self,
               raw_boxes: tf.Tensor,
               raw_scores: tf.Tensor,
               anchor_boxes: tf.Tensor,
               image_shape: tf.Tensor,
               regression_weights: Optional[List[float]] = None,
               bbox_per_class: bool = True):
    """Generates final detections.

    Args:
      raw_boxes: A `tf.Tensor` of shape of `[batch_size, K, num_classes * 4]`
        representing the class-specific box coordinates relative to anchors.
      raw_scores: A `tf.Tensor` of shape of `[batch_size, K, num_classes]`
        representing the class logits before applying score activation.
      anchor_boxes: A `tf.Tensor` of shape of `[batch_size, K, 4]` representing
        the corresponding anchor boxes w.r.t `box_outputs`.
      image_shape: A `tf.Tensor` of shape of `[batch_size, 2]` storing the image
        height and width w.r.t. the scaled image, i.e. the same image space as
        `box_outputs` and `anchor_boxes`.
      regression_weights: A list of four float numbers to scale coordinates.
      bbox_per_class: A `bool`. If True, perform per-class box regression.

    Returns:
      If `apply_nms` = True, the return is a dictionary with keys:
        `detection_boxes`: A `float` tf.Tensor of shape
          [batch, max_num_detections, 4] representing top detected boxes in
          [y1, x1, y2, x2].
        `detection_scores`: A `float` `tf.Tensor` of shape
          [batch, max_num_detections] representing sorted confidence scores for
          detected boxes. The values are between [0, 1].
        `detection_classes`: An `int` tf.Tensor of shape
          [batch, max_num_detections] representing classes for detected boxes.
        `num_detections`: An `int` tf.Tensor of shape [batch]; only the first
          `num_detections` boxes are valid detections.
      If `apply_nms` = False, the return is a dictionary with keys:
        `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
          representing all the decoded boxes.
        `decoded_box_scores`: A `float` tf.Tensor of shape
          [batch, num_raw_boxes] representing scores of all the decoded boxes.
    """
    box_scores = tf.nn.softmax(raw_scores, axis=-1)

    # Removes the background class.
    box_scores_shape = tf.shape(box_scores)
    box_scores_shape_list = box_scores.get_shape().as_list()
    batch_size = box_scores_shape[0]
    num_locations = box_scores_shape_list[1]
    num_classes = box_scores_shape_list[-1]

    box_scores = tf.slice(box_scores, [0, 0, 1], [-1, -1, -1])

    if bbox_per_class:
      num_detections = num_locations * (num_classes - 1)
      raw_boxes = tf.reshape(raw_boxes,
                             [batch_size, num_locations, num_classes, 4])
      raw_boxes = tf.slice(raw_boxes, [0, 0, 1, 0], [-1, -1, -1, -1])
      anchor_boxes = tf.tile(
          tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
      raw_boxes = tf.reshape(raw_boxes, [batch_size, num_detections, 4])
      anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])

    # Box decoding.
    decoded_boxes = box_ops.decode_boxes(
        raw_boxes, anchor_boxes, weights=regression_weights)

    # Box clipping
    decoded_boxes = box_ops.clip_boxes(
        decoded_boxes, tf.expand_dims(image_shape, axis=1))

    if bbox_per_class:
      decoded_boxes = tf.reshape(
          decoded_boxes, [batch_size, num_locations, num_classes - 1, 4])
    else:
      decoded_boxes = tf.expand_dims(decoded_boxes, axis=2)

    if not self._config_dict['apply_nms']:
      return {
          'decoded_boxes': decoded_boxes,
          'decoded_box_scores': box_scores,
      }

    # Optionally force the NMS be run on CPU.
    if self._config_dict['use_cpu_nms']:
      nms_context = tf.device('cpu:0')
    else:
      nms_context = contextlib.nullcontext()

    with nms_context:
      if self._config_dict['nms_version'] == 'batched':
        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
            _generate_detections_batched(
                decoded_boxes, box_scores,
                self._config_dict['pre_nms_score_threshold'],
                self._config_dict['nms_iou_threshold'],
                self._config_dict['max_num_detections']))
      elif self._config_dict['nms_version'] == 'v1':
        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, _) = (
            _generate_detections_v1(
                decoded_boxes,
                box_scores,
                pre_nms_top_k=self._config_dict['pre_nms_top_k'],
                pre_nms_score_threshold=(
                    self._config_dict['pre_nms_score_threshold']),
                nms_iou_threshold=self._config_dict['nms_iou_threshold'],
                max_num_detections=self._config_dict['max_num_detections'],
                soft_nms_sigma=self._config_dict['soft_nms_sigma']))
      elif self._config_dict['nms_version'] == 'v2':
        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
            _generate_detections_v2(
                decoded_boxes,
                box_scores,
                pre_nms_top_k=self._config_dict['pre_nms_top_k'],
                pre_nms_score_threshold=(
                    self._config_dict['pre_nms_score_threshold']),
                nms_iou_threshold=self._config_dict['nms_iou_threshold'],
                max_num_detections=self._config_dict['max_num_detections']))
      else:
        raise ValueError('NMS version {} not supported.'.format(
            self._config_dict['nms_version']))

    # Adds 1 to offset the background class which has index 0.
    nmsed_classes += 1

    return {
        'num_detections': valid_detections,
        'detection_boxes': nmsed_boxes,
        'detection_classes': nmsed_classes,
        'detection_scores': nmsed_scores,
    }
Example 13
    def __call__(self, raw_boxes, raw_scores, anchor_boxes, image_shape):
        """Generate final detections.

    Args:
      raw_boxes: a dict with keys representing FPN levels and values
        representing box tensors of shape
        [batch, feature_h, feature_w, num_anchors * 4].
      raw_scores: a dict with keys representing FPN levels and values
        representing logit tensors of shape
        [batch, feature_h, feature_w, num_anchors].
      anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
        corresponding anchor boxes w.r.t `box_outputs`.
      image_shape: a tensor of shape of [batch_size, 2] storing the image height
        and width w.r.t. the scaled image, i.e. the same image space as
        `box_outputs` and `anchor_boxes`.

    Returns:
      If `apply_nms` = True, the return is a dictionary with keys:
        `detection_boxes`: float Tensor of shape [batch, max_num_detections, 4]
          representing top detected boxes in [y1, x1, y2, x2].
        `detection_scores`: float Tensor of shape [batch, max_num_detections]
          representing sorted confidence scores for detected boxes. The values
          are between [0, 1].
        `detection_classes`: int Tensor of shape [batch, max_num_detections]
          representing classes for detected boxes.
        `num_detections`: int Tensor of shape [batch]; only the first
          `num_detections` boxes are valid detections.
      If `apply_nms` = False, the return is a dictionary with keys:
        `decoded_boxes`: float Tensor of shape [batch, num_raw_boxes, 4]
          representing all the decoded boxes.
        `decoded_box_scores`: float Tensor of shape [batch, num_raw_boxes]
          representing scores of all the decoded boxes.
    """
        # Collects outputs from all levels into a list.
        boxes = []
        scores = []
        levels = list(raw_boxes.keys())
        min_level = min(levels)
        max_level = max(levels)
        for i in range(min_level, max_level + 1):
            raw_boxes_i_shape = tf.shape(raw_boxes[i])
            batch_size = raw_boxes_i_shape[0]
            num_anchors_per_locations = raw_boxes_i_shape[-1] // 4
            num_classes = tf.shape(
                raw_scores[i])[-1] // num_anchors_per_locations

            # Applies score transformation and removes the implicit
            # background class.
            scores_i = tf.sigmoid(
                tf.reshape(raw_scores[i], [batch_size, -1, num_classes]))
            scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])

            # Box decoding.
            # The anchor boxes are shared for all data in a batch.
            # One stage detector only supports class agnostic box regression.
            anchor_boxes_i = tf.reshape(anchor_boxes[i], [batch_size, -1, 4])
            raw_boxes_i = tf.reshape(raw_boxes[i], [batch_size, -1, 4])
            boxes_i = box_ops.decode_boxes(raw_boxes_i, anchor_boxes_i)

            # Box clipping.
            boxes_i = box_ops.clip_boxes(boxes_i,
                                         tf.expand_dims(image_shape, axis=1))

            boxes.append(boxes_i)
            scores.append(scores_i)
        boxes = tf.concat(boxes, axis=1)
        boxes = tf.expand_dims(boxes, axis=2)
        scores = tf.concat(scores, axis=1)

        if not self._config_dict['apply_nms']:
            return {
                'decoded_boxes': boxes,
                'decoded_box_scores': scores,
            }

        if self._config_dict['use_batched_nms']:
            nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
                _generate_detections_batched(
                    boxes, scores,
                    self._config_dict['pre_nms_score_threshold'],
                    self._config_dict['nms_iou_threshold'],
                    self._config_dict['max_num_detections']))
        else:
            nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
                _generate_detections_v2(
                    boxes, scores, self._config_dict['pre_nms_top_k'],
                    self._config_dict['pre_nms_score_threshold'],
                    self._config_dict['nms_iou_threshold'],
                    self._config_dict['max_num_detections']))

        # Adds 1 to offset the background class which has index 0.
        nmsed_classes += 1

        return {
            'num_detections': valid_detections,
            'detection_boxes': nmsed_boxes,
            'detection_classes': nmsed_classes,
            'detection_scores': nmsed_scores,
        }