def build_decoding_layer(
    num_classes=2,
    confidence_threshold=0.05,
    nms_iou_threshold=0.5,
    max_detections_per_class=100,
    max_detections=100,
    box_variance=[0.1, 0.1, 0.2, 0.2],
):
    """Builds a function that decodes predictions of the RetinaNet model.

    Args:
      num_classes: Number of classes in the dataset.
      confidence_threshold: Minimum class probability, below which detections
        are pruned.
      nms_iou_threshold: IOU threshold for the NMS operation.
      max_detections_per_class: Maximum number of detections to retain per
        class.
      max_detections: Maximum number of detections to retain across all
        classes.
      box_variance: The scaling factors used to scale the bounding box
        predictions.

    Returns:
      A `forward(images, predictions)` callable that returns the output of
      `tf.image.combined_non_max_suppression`.
    """
    _anchor_box = AnchorBox()
    _box_variance = tf.convert_to_tensor(box_variance, dtype=tf.float32)

    def _decode_box_predictions(anchor_boxes, box_predictions):
        boxes = box_predictions * _box_variance
        boxes = tf.concat(
            [
                boxes[:, :, :2] * anchor_boxes[:, :, 2:] + anchor_boxes[:, :, :2],
                tf.math.exp(boxes[:, :, 2:]) * anchor_boxes[:, :, 2:],
            ],
            axis=-1,
        )
        boxes_transformed = convert_to_corners(boxes)
        return boxes_transformed

    def forward(images, predictions):
        image_shape = tf.cast(tf.shape(images), dtype=tf.float32)
        anchor_boxes = _anchor_box.get_anchors(image_shape[1], image_shape[2])
        box_predictions = predictions[:, :, :4]
        cls_predictions = tf.nn.sigmoid(predictions[:, :, 4:])
        boxes = _decode_box_predictions(anchor_boxes[None, ...], box_predictions)
        return tf.image.combined_non_max_suppression(
            tf.expand_dims(boxes, axis=2),
            cls_predictions,
            max_detections_per_class,
            max_detections,
            nms_iou_threshold,
            confidence_threshold,
            clip_boxes=False,
        )

    return forward
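
# Usage sketch for `build_decoding_layer` (not part of the original pipeline).
# It assumes `AnchorBox` and `convert_to_corners` are defined earlier in this
# module and that the model emits predictions of shape
# `(batch, num_anchors, 4 + num_classes)`. The zero tensors below are only
# placeholders standing in for real images and model outputs.
demo_images = tf.zeros([1, 512, 512, 3], dtype=tf.float32)
demo_num_anchors = AnchorBox().get_anchors(512.0, 512.0).shape[0]
demo_predictions = tf.zeros([1, demo_num_anchors, 4 + 2], dtype=tf.float32)
demo_decode = build_decoding_layer(num_classes=2)
demo_detections = demo_decode(demo_images, demo_predictions)
print("valid detections per image:", demo_detections.valid_detections.numpy())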
class LabelEncoder:
    """Transforms the raw labels into targets for training.

    This class has operations to generate targets for a batch of samples which
    is made up of the input images, bounding boxes for the objects present and
    their class ids.

    Attributes:
      anchor_box: Anchor box generator to encode the bounding boxes.
      box_variance: The scaling factors used to scale the bounding box targets.
    """

    def __init__(self):
        self._anchor_box = AnchorBox()
        self._box_variance = tf.convert_to_tensor(
            [0.1, 0.1, 0.2, 0.2], dtype=tf.float32
        )

    def _match_anchor_boxes(
        self, anchor_boxes, gt_boxes, match_iou=0.5, ignore_iou=0.4
    ):
        """Matches ground truth boxes to anchor boxes based on IOU.

        1. Calculates the pairwise IOU for the M `anchor_boxes` and N `gt_boxes`
           to get a `(M, N)` shaped matrix.
        2. The ground truth box with the maximum IOU in each row is assigned to
           the anchor box provided the IOU is greater than `match_iou`.
        3. If the maximum IOU in a row is less than `ignore_iou`, the anchor
           box is assigned with the background class.
        4. The remaining anchor boxes that do not have any class assigned are
           ignored during training.

        Arguments:
          anchor_boxes: A float tensor with the shape `(total_anchors, 4)`
            representing all the anchor boxes for a given input image shape,
            where each anchor box is of the format `[x, y, width, height]`.
          gt_boxes: A float tensor with shape `(num_objects, 4)` representing
            the ground truth boxes, where each box is of the format
            `[x, y, width, height]`.
          match_iou: A float value representing the minimum IOU threshold for
            determining if a ground truth box can be assigned to an anchor box.
          ignore_iou: A float value representing the IOU threshold under which
            an anchor box is assigned to the background class.

        Returns:
          matched_gt_idx: Index of the matched object.
          positive_mask: A mask for anchor boxes that have been assigned ground
            truth boxes.
          ignore_mask: A mask for anchor boxes that need to be ignored during
            training.
        """
        iou_matrix = compute_iou(anchor_boxes, gt_boxes)
        max_iou = tf.reduce_max(iou_matrix, axis=1)
        matched_gt_idx = tf.argmax(iou_matrix, axis=1)
        positive_mask = tf.greater_equal(max_iou, match_iou)
        negative_mask = tf.less(max_iou, ignore_iou)
        ignore_mask = tf.logical_not(tf.logical_or(positive_mask, negative_mask))
        return (
            matched_gt_idx,
            tf.cast(positive_mask, dtype=tf.float32),
            tf.cast(ignore_mask, dtype=tf.float32),
        )

    def _compute_box_target(self, anchor_boxes, matched_gt_boxes):
        """Transforms the ground truth boxes into targets for training"""
        box_target = tf.concat(
            [
                (matched_gt_boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:],
                tf.math.log(matched_gt_boxes[:, 2:] / anchor_boxes[:, 2:]),
            ],
            axis=-1,
        )
        box_target = box_target / self._box_variance
        return box_target

    def _encode_sample(self, image_shape, gt_boxes, cls_ids):
        """Creates box and classification targets for a single sample"""
        anchor_boxes = self._anchor_box.get_anchors(image_shape[1], image_shape[2])
        cls_ids = tf.cast(cls_ids, dtype=tf.float32)
        matched_gt_idx, positive_mask, ignore_mask = self._match_anchor_boxes(
            anchor_boxes, gt_boxes
        )
        matched_gt_boxes = tf.gather(gt_boxes, matched_gt_idx)
        box_target = self._compute_box_target(anchor_boxes, matched_gt_boxes)
        matched_gt_cls_ids = tf.gather(cls_ids, matched_gt_idx)
        cls_target = tf.where(
            tf.not_equal(positive_mask, 1.0), -1.0, matched_gt_cls_ids
        )
        cls_target = tf.where(tf.equal(ignore_mask, 1.0), -2.0, cls_target)
        cls_target = tf.expand_dims(cls_target, axis=-1)
        label = tf.concat([box_target, cls_target], axis=-1)
        return label

    def encode_batch(self, batch_images, gt_boxes, cls_ids):
        """Creates box and classification targets for a batch"""
        images_shape = tf.shape(batch_images)
        batch_size = images_shape[0]
        labels = tf.TensorArray(dtype=tf.float32, size=batch_size, dynamic_size=True)
        for i in range(batch_size):
            label = self._encode_sample(images_shape, gt_boxes[i], cls_ids[i])
            labels = labels.write(i, label)
        batch_images = tf.keras.applications.resnet.preprocess_input(batch_images)
        return batch_images, labels.stack()
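
# Usage sketch for `LabelEncoder` (not from the original example): wires the
# encoder into a toy `tf.data` pipeline the same way a training pipeline
# would. It assumes `AnchorBox` and `compute_iou` are defined earlier in this
# module; the images, boxes (in `[x, y, width, height]` pixel format), and
# class ids below are synthetic placeholders.
demo_batch_images = tf.zeros([2, 512, 512, 3], dtype=tf.float32)
demo_gt_boxes = tf.constant(
    [[[256.0, 256.0, 128.0, 128.0]], [[128.0, 128.0, 64.0, 64.0]]]
)
demo_cls_ids = tf.constant([[0], [1]], dtype=tf.int32)

demo_label_encoder = LabelEncoder()
demo_dataset = (
    tf.data.Dataset.from_tensor_slices((demo_batch_images, demo_gt_boxes, demo_cls_ids))
    .batch(2)
    .map(demo_label_encoder.encode_batch)
)
for demo_encoded_images, demo_encoded_labels in demo_dataset.take(1):
    # Each target row is 4 box offsets plus 1 class id: (batch, num_anchors, 5).
    print("encoded labels shape:", demo_encoded_labels.shape)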
class DecodePredictions(tf.keras.layers.Layer):
    """A Keras layer that decodes predictions of the RetinaNet model.

    Attributes:
      num_classes: Number of classes in the dataset.
      confidence_threshold: Minimum class probability, below which detections
        are pruned.
      nms_iou_threshold: IOU threshold for the NMS operation.
      max_detections_per_class: Maximum number of detections to retain per
        class.
      max_detections: Maximum number of detections to retain across all
        classes.
      box_variance: The scaling factors used to scale the bounding box
        predictions.
    """

    def __init__(
        self,
        num_classes=80,
        confidence_threshold=0.05,
        nms_iou_threshold=0.5,
        max_detections_per_class=100,
        max_detections=100,
        box_variance=[0.1, 0.1, 0.2, 0.2],
        **kwargs
    ):
        super(DecodePredictions, self).__init__(**kwargs)
        self.num_classes = num_classes
        self.confidence_threshold = confidence_threshold
        self.nms_iou_threshold = nms_iou_threshold
        self.max_detections_per_class = max_detections_per_class
        self.max_detections = max_detections
        self._anchor_box = AnchorBox()
        self._box_variance = tf.convert_to_tensor(box_variance, dtype=tf.float32)

    def _decode_box_predictions(self, anchor_boxes, box_predictions):
        boxes = box_predictions * self._box_variance
        boxes = tf.concat(
            [
                boxes[:, :, :2] * anchor_boxes[:, :, 2:] + anchor_boxes[:, :, :2],
                tf.math.exp(boxes[:, :, 2:]) * anchor_boxes[:, :, 2:],
            ],
            axis=-1,
        )
        boxes_transformed = convert_to_corners(boxes)
        return boxes_transformed

    def call(self, images, predictions):
        image_shape = tf.cast(tf.shape(images), dtype=tf.float32)
        anchor_boxes = self._anchor_box.get_anchors(image_shape[1], image_shape[2])
        box_predictions = predictions[:, :, :4]
        cls_predictions = tf.nn.sigmoid(predictions[:, :, 4:])
        boxes = self._decode_box_predictions(anchor_boxes[None, ...], box_predictions)
        return tf.image.combined_non_max_suppression(
            tf.expand_dims(boxes, axis=2),
            cls_predictions,
            self.max_detections_per_class,
            self.max_detections,
            self.nms_iou_threshold,
            self.confidence_threshold,
            clip_boxes=False,
        )
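
# Inference-wiring sketch for `DecodePredictions` (not from the original
# pipeline). A trained RetinaNet would take the place of `demo_backbone`
# below; it is a hypothetical stand-in layer that only reproduces the expected
# prediction shape `(batch, num_anchors, 4 + num_classes)` so the decoding
# layer can be attached to a Keras model and exercised end to end.
demo_num_classes = 2
demo_anchor_count = AnchorBox().get_anchors(512.0, 512.0).shape[0]
demo_image = tf.keras.Input(shape=[512, 512, 3], name="image")
demo_backbone = tf.keras.layers.Lambda(
    lambda x: tf.zeros([tf.shape(x)[0], demo_anchor_count, 4 + demo_num_classes])
)
demo_model_predictions = demo_backbone(demo_image)
demo_model_detections = DecodePredictions(
    num_classes=demo_num_classes, confidence_threshold=0.5
)(demo_image, demo_model_predictions)
demo_inference_model = tf.keras.Model(inputs=demo_image, outputs=demo_model_detections)

demo_output = demo_inference_model.predict(tf.zeros([1, 512, 512, 3]))
print("valid detections:", demo_output.valid_detections[0])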