def call(self, inputs):
    """Turns per-level head outputs into post-NMS detections.

    Args:
      inputs: a 4-element sequence unpacked as `(box_outputs, class_outputs,
        anchor_boxes, image_shape)`. The first three are indexed with every
        integer level in `[self._min_level, self._max_level]`; `image_shape`
        is forwarded unchanged to `box_utils.clip_boxes`.

    Returns:
      The tuple `(nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections)`
      returned by `self._generate_detections`, with `nmsed_classes` shifted up
      by one.
    """
    box_outputs, class_outputs, anchor_boxes, image_shape = inputs

    # One entry per pyramid level; merged along axis 1 after the loop.
    per_level_boxes = []
    per_level_scores = []
    for level in range(self._min_level, self._max_level + 1):
        level_logits = class_outputs[level]
        num_images = tf.shape(input=level_logits)[0]

        # Score activation (per the original note, this also removes the
        # implicit background class).
        level_scores = _apply_score_activation(
            level_logits, self._num_classes, self._score_activation)

        # Anchors are shared across the batch and regression is class
        # agnostic for one stage detectors, so both tensors are flattened to
        # rows of four values before decoding.
        flat_anchors = tf.reshape(anchor_boxes[level], [num_images, -1, 4])
        flat_deltas = tf.reshape(box_outputs[level], [num_images, -1, 4])
        decoded = box_utils.decode_boxes(flat_deltas, flat_anchors)

        # Clip the decoded boxes against `image_shape`.
        clipped = box_utils.clip_boxes(decoded, image_shape)

        per_level_boxes.append(clipped)
        per_level_scores.append(level_scores)

    merged_boxes = tf.concat(per_level_boxes, axis=1)
    merged_scores = tf.concat(per_level_scores, axis=1)
    # Insert a length-1 axis at position 2 before handing the boxes to the
    # detection generator.
    merged_boxes = tf.expand_dims(merged_boxes, axis=2)

    detections = self._generate_detections(
        tf.cast(merged_boxes, tf.float32), tf.cast(merged_scores, tf.float32))
    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = detections

    # Index 0 is the background class, so shift the class ids up by one.
    nmsed_classes += 1
    return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
    def call(self, box_outputs, class_outputs, anchor_boxes, image_shape):
        """Decodes multilevel outputs and generates post-NMS detections.

        Args:
          box_outputs: per-level box regression outputs, indexed with every
            integer level in `[self._min_level, self._max_level]`. The last
            dimension is read as `num_anchors_per_location * 4`.
          class_outputs: per-level class logits, indexed the same way. The
            last dimension is divided by the number of anchors per location
            to obtain the number of classes.
          anchor_boxes: per-level anchor boxes, indexed the same way.
          image_shape: passed through to `box_utils.clip_boxes`.

        Returns:
          The tuple `(nmsed_boxes, nmsed_scores, nmsed_classes,
          valid_detections)` returned by `self._generate_detections`, with
          `nmsed_classes` shifted up by one.
        """
        # One entry per pyramid level; merged along axis 1 after the loop.
        per_level_boxes = []
        per_level_scores = []
        for level in range(self._min_level, self._max_level + 1):
            raw_deltas = box_outputs[level]
            raw_logits = class_outputs[level]

            deltas_shape = tf.shape(raw_deltas)
            num_images = deltas_shape[0]
            anchors_per_location = deltas_shape[-1] // 4
            classes_per_anchor = (
                tf.shape(raw_logits)[-1] // anchors_per_location)

            # Sigmoid scores, then drop channel 0 (the implicit background
            # class) along the last axis.
            probabilities = tf.sigmoid(
                tf.reshape(raw_logits, [num_images, -1, classes_per_anchor]))
            foreground_scores = tf.slice(
                probabilities, [0, 0, 1], [-1, -1, -1])

            # Anchors are shared across the batch and regression is class
            # agnostic for one stage detectors, so both tensors are flattened
            # to rows of four values before decoding.
            flat_anchors = tf.reshape(anchor_boxes[level], [num_images, -1, 4])
            flat_deltas = tf.reshape(raw_deltas, [num_images, -1, 4])
            decoded = box_utils.decode_boxes(flat_deltas, flat_anchors)

            # Clip the decoded boxes against `image_shape`.
            clipped = box_utils.clip_boxes(decoded, image_shape)

            per_level_boxes.append(clipped)
            per_level_scores.append(foreground_scores)

        merged_boxes = tf.concat(per_level_boxes, axis=1)
        merged_scores = tf.concat(per_level_scores, axis=1)

        # A length-1 axis is inserted at position 2 of the boxes before they
        # are handed to the detection generator.
        detections = self._generate_detections(
            tf.expand_dims(merged_boxes, axis=2), merged_scores)
        nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = detections

        # Index 0 is the background class, so shift the class ids up by one.
        nmsed_classes += 1
        return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
    def __call__(self,
                 box_outputs,
                 class_outputs,
                 anchor_boxes,
                 image_shape,
                 is_single_fg_score=False,
                 keep_nms=True):
        """Generate final detections for Object Localization Network (OLN).

        Args:
          box_outputs: a tensor of shape of [batch_size, K, num_classes * 4]
            representing the class-specific box coordinates relative to
            anchors.
          class_outputs: a tensor of shape of [batch_size, K, num_classes]
            representing the class logits before applying score activation
            (when `is_single_fg_score` is False).
          anchor_boxes: a tensor of shape of [batch_size, K, 4] representing
            the corresponding anchor boxes w.r.t `box_outputs`.
          image_shape: a tensor of shape of [batch_size, 2] storing the image
            height and width w.r.t. the scaled image, i.e. the same image
            space as `box_outputs` and `anchor_boxes`.
          is_single_fg_score: a Bool. When True, `class_outputs` is treated as
            foreground-only scores: an all-zero dummy background score is
            stacked in front of it along a new last axis and no softmax is
            applied (presumably `class_outputs` is then [batch_size, K] --
            TODO confirm against callers). When False (default), softmax is
            applied over the last axis of `class_outputs`.
          keep_nms: a Bool indicator of whether to perform NMS or not.

        Returns:
          nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
            representing top detected boxes in [y1, x1, y2, x2].
          nms_scores: `float` Tensor of shape [batch_size, max_total_size]
            representing sorted confidence scores for detected boxes. The
            values are between [0, 1].
          nms_classes: `int` Tensor of shape [batch_size, max_total_size]
            representing classes for detected boxes.
          valid_detections: `int` Tensor of shape [batch_size] only the top
            `valid_detections` boxes are valid detections.
          When `keep_nms` is False, the first foreground class's decoded boxes
          and scores are returned un-suppressed instead, every class id is 1
          and `valid_detections` counts all entries per image.
        """
        if not is_single_fg_score:
            class_outputs = tf.nn.softmax(class_outputs, axis=-1)
        else:
            # Concatenates dummy background scores.
            class_outputs = tf.stack(
                [tf.zeros_like(class_outputs), class_outputs], -1)

        # Dynamic dimensions, read after the optional background stacking.
        scores_shape = tf.shape(class_outputs)
        batch_size = scores_shape[0]
        num_locations = scores_shape[1]
        num_classes = scores_shape[-1]
        num_fg_classes = num_classes - 1
        num_detections = num_locations * num_fg_classes

        per_class_shape = tf.stack(
            [batch_size, num_locations, num_classes, 4], axis=-1)
        flat_shape = tf.stack([batch_size, num_detections, 4], axis=-1)
        fg_per_class_shape = tf.stack(
            [batch_size, num_locations, num_fg_classes, 4], axis=-1)

        # Removes the background class (index 0) from scores and boxes.
        class_outputs = tf.slice(class_outputs, [0, 0, 1], [-1, -1, -1])
        per_class_deltas = tf.reshape(box_outputs, per_class_shape)
        fg_deltas = tf.slice(per_class_deltas, [0, 0, 1, 0], [-1, -1, -1, -1])

        # Repeat every anchor once per foreground class so that anchors and
        # regression outputs line up row by row after flattening.
        tiled_anchors = tf.tile(
            tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_fg_classes, 1])
        flat_deltas = tf.reshape(fg_deltas, flat_shape)
        flat_anchors = tf.reshape(tiled_anchors, flat_shape)

        # Box decoding. For RPN outputs, box_outputs are all zeros.
        flat_boxes = box_utils.decode_boxes(
            flat_deltas, flat_anchors, weights=[10.0, 10.0, 5.0, 5.0])

        # Box clipping, then restore the per-class layout.
        flat_boxes = box_utils.clip_boxes(flat_boxes, image_shape)
        decoded_boxes = tf.reshape(flat_boxes, fg_per_class_shape)

        if not keep_nms:
            # No suppression: expose the first foreground class directly.
            nmsed_boxes = decoded_boxes[:, :, 0, :]
            nmsed_scores = class_outputs[:, :, 0]
            nmsed_classes = tf.cast(tf.ones_like(nmsed_scores), tf.int32)
            valid_detections = tf.cast(
                tf.reduce_sum(tf.ones_like(nmsed_scores), axis=-1), tf.int32)
            return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections

        detections = self._generate_detections(decoded_boxes, class_outputs)
        nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = detections
        # Adds 1 to offset the background class which has index 0.
        nmsed_classes += 1
        return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
    def call(self, box_outputs, class_outputs, anchor_boxes, image_shape):
        """Generate final detections.

        Args:
          box_outputs: a tensor of shape of [batch_size, K, num_classes * 4]
            representing the class-specific box coordinates relative to
            anchors.
          class_outputs: a tensor of shape of [batch_size, K, num_classes]
            representing the class logits before applying score activation.
          anchor_boxes: a tensor of shape of [batch_size, K, 4] representing
            the corresponding anchor boxes w.r.t `box_outputs`.
          image_shape: a tensor of shape of [batch_size, 2] storing the image
            height and width w.r.t. the scaled image, i.e. the same image
            space as `box_outputs` and `anchor_boxes`.

        Returns:
          nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
            representing top detected boxes in [y1, x1, y2, x2].
          nms_scores: `float` Tensor of shape [batch_size, max_total_size]
            representing sorted confidence scores for detected boxes. The
            values are between [0, 1].
          nms_classes: `int` Tensor of shape [batch_size, max_total_size]
            representing classes for detected boxes.
          valid_detections: `int` Tensor of shape [batch_size] only the top
            `valid_detections` boxes are valid detections.
        """
        class_outputs = tf.nn.softmax(class_outputs, axis=-1)

        # Dynamic dimensions of the score tensor.
        scores_shape = tf.shape(class_outputs)
        batch_size = scores_shape[0]
        num_locations = scores_shape[1]
        num_classes = scores_shape[-1]
        num_fg_classes = num_classes - 1
        num_detections = num_locations * num_fg_classes

        per_class_shape = tf.stack(
            [batch_size, num_locations, num_classes, 4], axis=-1)
        flat_shape = tf.stack([batch_size, num_detections, 4], axis=-1)
        fg_per_class_shape = tf.stack(
            [batch_size, num_locations, num_fg_classes, 4], axis=-1)

        # Removes the background class (index 0) from scores and boxes.
        class_outputs = tf.slice(class_outputs, [0, 0, 1], [-1, -1, -1])
        per_class_deltas = tf.reshape(box_outputs, per_class_shape)
        fg_deltas = tf.slice(per_class_deltas, [0, 0, 1, 0], [-1, -1, -1, -1])

        # Repeat every anchor once per foreground class so that anchors and
        # regression outputs line up row by row after flattening.
        tiled_anchors = tf.tile(
            tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_fg_classes, 1])
        flat_deltas = tf.reshape(fg_deltas, flat_shape)
        flat_anchors = tf.reshape(tiled_anchors, flat_shape)

        # Box decoding.
        flat_boxes = box_utils.decode_boxes(
            flat_deltas, flat_anchors, weights=[10.0, 10.0, 5.0, 5.0])

        # Box clipping, then restore the per-class layout.
        flat_boxes = box_utils.clip_boxes(flat_boxes, image_shape)
        decoded_boxes = tf.reshape(flat_boxes, fg_per_class_shape)

        detections = self._generate_detections(decoded_boxes, class_outputs)
        nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = detections

        # Adds 1 to offset the background class which has index 0.
        nmsed_classes += 1

        return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
    def oln_multilevel_propose_rois(
        self,
        rpn_boxes,
        rpn_scores,
        anchor_boxes,
        image_shape,
        rpn_pre_nms_top_k=2000,
        rpn_post_nms_top_k=1000,
        rpn_nms_threshold=0.7,
        rpn_score_threshold=0.0,
        rpn_min_size_threshold=0.0,
        decode_boxes=True,
        clip_boxes=True,
        use_batched_nms=False,
        apply_sigmoid_to_score=True,
        is_box_lrtb=False,
        rpn_object_scores=None,
    ):
        """Proposes RoIs given a group of candidates from different FPN levels.

        The following describes the steps:
          1. For each individual level:
            a. Adjust scores for each level if specified by rpn_object_scores.
            b. Apply sigmoid transform if specified.
            c. Decode boxes (either of xyhw or left-right-top-bottom format)
               if specified.
            d. Clip boxes if specified.
            e. Filter small boxes and those fall outside image if specified.
            f. Apply pre-NMS filtering including pre-NMS top k and score
               thresholding.
            g. Apply NMS.
          2. Aggregate post-NMS boxes from each level.
          3. Apply an overall top k to generate the final selected RoIs.

        When `rpn_nms_threshold` is not positive, steps f and g are replaced
        by a plain per-level top-k over the (decoded/clipped/filtered) boxes;
        `rpn_score_threshold` and `rpn_pre_nms_top_k` are not applied on that
        path.

        Args:
          rpn_boxes: a dict with keys representing FPN levels and values
            representing box tensors of shape [batch_size, feature_h,
            feature_w, num_anchors * 4].
          rpn_scores: a dict with keys representing FPN levels and values
            representing logit tensors of shape [batch_size, feature_h,
            feature_w, num_anchors]. The last three dimensions must be
            statically known because they are read with `get_shape()`.
          anchor_boxes: a dict with keys representing FPN levels and values
            representing anchor box tensors of shape [batch_size, feature_h,
            feature_w, num_anchors * 4].
          image_shape: a tensor of shape [batch_size, 2] where the last
            dimension are [height, width] of the scaled image.
          rpn_pre_nms_top_k: an integer of top scoring RPN proposals *per
            level* to keep before applying NMS. Default: 2000.
          rpn_post_nms_top_k: an integer of top scoring RPN proposals *in
            total* to keep after applying NMS. Default: 1000.
          rpn_nms_threshold: a float between 0 and 1 representing the IoU
            threshold used for NMS. If 0.0, no NMS is applied. Default: 0.7.
          rpn_score_threshold: a float between 0 and 1 representing the
            minimal box score to keep before applying NMS. This is often used
            as a pre-filtering step for better performance. If 0, no filtering
            is applied. Default: 0.
          rpn_min_size_threshold: a float representing the minimal box size in
            each side (w.r.t. the scaled image) to keep before applying NMS.
            This is often used as a pre-filtering step for better performance.
            If 0, no filtering is applied. Default: 0.
          decode_boxes: a boolean indicating whether `rpn_boxes` needs to be
            decoded using `anchor_boxes`. If False, use `rpn_boxes` directly
            and `anchor_boxes` is not used for decoding. Default: True.
          clip_boxes: a boolean indicating whether boxes are first clipped to
            the scaled image size before applying NMS. If False, no clipping
            is applied. Default: True.
          use_batched_nms: a boolean indicating whether NMS is applied in
            batch using `tf.image.combined_non_max_suppression`. Currently
            only available in CPU/GPU. Default: False.
          apply_sigmoid_to_score: a boolean indicating whether apply sigmoid
            to the scores before applying NMS. Default: True.
          is_box_lrtb: a bool indicating whether boxes are in lrtb
            (=left,right,top,bottom) format.
          rpn_object_scores: a predicted objectness score (e.g., centerness).
            In OLN, we use object_scores=centerness as a replacement of the
            scores at each level. A dict with keys representing FPN levels and
            values representing logit tensors of shape [batch_size, feature_h,
            feature_w, num_anchors]. Ignored when None or empty.

        Returns:
          selected_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
            representing the box coordinates of the selected proposals w.r.t.
            the scaled image.
          selected_roi_scores: a tensor holding the scores of the selected
            proposals. NOTE(review): the original documentation gives shape
            [batch_size, rpn_post_nms_top_k, 1], but the aggregated scores are
            unpacked as rank 2 right before the final top-k -- confirm against
            `box_utils.top_k_boxes`.
        """
        with tf.name_scope('multilevel_propose_rois'):
            rois = []
            roi_scores = []
            # Adds an axis at position 1 before `image_shape` is handed to the
            # box_utils helpers below.
            image_shape = tf.expand_dims(image_shape, axis=1)
            for level in sorted(rpn_scores.keys()):
                with tf.name_scope('level_%d' % level):
                    # Static shape lookup: these become Python ints used in
                    # reshape targets and `min()` below.
                    _, feature_h, feature_w, num_anchors_per_location = (
                        rpn_scores[level].get_shape().as_list())

                    num_boxes = feature_h * feature_w * num_anchors_per_location
                    this_level_scores = tf.reshape(rpn_scores[level],
                                                   [-1, num_boxes])
                    this_level_boxes = tf.reshape(rpn_boxes[level],
                                                  [-1, num_boxes, 4])
                    this_level_anchors = tf.cast(
                        tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
                        dtype=this_level_scores.dtype)

                    if rpn_object_scores:
                        # Objectness scores fully replace the RPN scores for
                        # this level (cast to the RPN score dtype).
                        this_level_object_scores = rpn_object_scores[level]
                        this_level_object_scores = tf.reshape(
                            this_level_object_scores, [-1, num_boxes])
                        this_level_object_scores = tf.cast(
                            this_level_object_scores, this_level_scores.dtype)
                        this_level_scores = this_level_object_scores

                    if apply_sigmoid_to_score:
                        this_level_scores = tf.sigmoid(this_level_scores)

                    if decode_boxes:
                        if is_box_lrtb:  # Box in left-right-top-bottom format.
                            this_level_boxes = box_utils.decode_boxes_lrtb(
                                this_level_boxes, this_level_anchors)
                        else:  # Box in standard x-y-h-w format.
                            this_level_boxes = box_utils.decode_boxes(
                                this_level_boxes, this_level_anchors)

                    if clip_boxes:
                        this_level_boxes = box_utils.clip_boxes(
                            this_level_boxes, image_shape)

                    if rpn_min_size_threshold > 0.0:
                        this_level_boxes, this_level_scores = (
                            box_utils.filter_boxes(
                                this_level_boxes, this_level_scores,
                                image_shape, rpn_min_size_threshold))

                    # A level can never yield more proposals than it has
                    # candidate boxes.
                    this_level_pre_nms_top_k = min(num_boxes,
                                                   rpn_pre_nms_top_k)
                    this_level_post_nms_top_k = min(num_boxes,
                                                    rpn_post_nms_top_k)
                    if rpn_nms_threshold > 0.0:
                        if use_batched_nms:
                            # Boxes get a length-1 axis at position 2 and
                            # scores a trailing length-1 axis for the call.
                            this_level_rois, this_level_roi_scores, _, _ = (
                                tf.image.combined_non_max_suppression(
                                    tf.expand_dims(this_level_boxes, axis=2),
                                    tf.expand_dims(this_level_scores, axis=-1),
                                    max_output_size_per_class=(
                                        this_level_pre_nms_top_k),
                                    max_total_size=this_level_post_nms_top_k,
                                    iou_threshold=rpn_nms_threshold,
                                    score_threshold=rpn_score_threshold,
                                    pad_per_class=False,
                                    clip_boxes=False))
                        else:
                            if rpn_score_threshold > 0.0:
                                this_level_boxes, this_level_scores = (
                                    box_utils.filter_boxes_by_scores(
                                        this_level_boxes, this_level_scores,
                                        rpn_score_threshold))
                            this_level_boxes, this_level_scores = (
                                box_utils.top_k_boxes(
                                    this_level_boxes,
                                    this_level_scores,
                                    k=this_level_pre_nms_top_k))
                            # Note the (scores, boxes) argument and return
                            # order of this NMS helper.
                            this_level_roi_scores, this_level_rois = (
                                nms.sorted_non_max_suppression_padded(
                                    this_level_scores,
                                    this_level_boxes,
                                    max_output_size=this_level_post_nms_top_k,
                                    iou_threshold=rpn_nms_threshold))
                    else:
                        # NMS disabled: keep the top-scoring boxes directly.
                        # The input must be `this_level_boxes`; the previous
                        # code passed `this_level_rois`, which is unassigned
                        # on this path and raised UnboundLocalError.
                        this_level_rois, this_level_roi_scores = (
                            box_utils.top_k_boxes(
                                this_level_boxes,
                                this_level_scores,
                                k=this_level_post_nms_top_k))

                    rois.append(this_level_rois)
                    roi_scores.append(this_level_roi_scores)

            # Aggregate the per-level proposals along the box axis.
            all_rois = tf.concat(rois, axis=1)
            all_roi_scores = tf.concat(roi_scores, axis=1)

            with tf.name_scope('top_k_rois'):
                _, num_valid_rois = all_roi_scores.get_shape().as_list()
                overall_top_k = min(num_valid_rois, rpn_post_nms_top_k)

                selected_rois, selected_roi_scores = box_utils.top_k_boxes(
                    all_rois, all_roi_scores, k=overall_top_k)

            return selected_rois, selected_roi_scores