Esempio n. 1
0
def _greedy_nms_debug(predictions,
                      iou_threshold=0.45,
                      coords='corners',
                      border_pixels='half'):
    """
    The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
    function for per-class NMS in `decode_detections_debug()`. The difference is that it keeps the indices of all
    left-over boxes for each batch item, which allows you to know which predictor layer predicted a given output
    box and is thus useful for debugging.
    """
    boxes_left = np.copy(predictions)
    maxima = [
    ]  # This is where we store the boxes that make it through the non-maximum suppression
    while boxes_left.shape[
            0] > 0:  # While there are still boxes left to compare...
        maximum_index = np.argmax(
            boxes_left[:, 1]
        )  # ...get the index of the next box with the highest confidence...
        maximum_box = np.copy(
            boxes_left[maximum_index])  # ...copy that box and...
        maxima.append(
            maximum_box
        )  # ...append it to `maxima` because we'll definitely keep it
        boxes_left = np.delete(
            boxes_left, maximum_index,
            axis=0)  # Now remove the maximum box from `boxes_left`
        if boxes_left.shape[0] == 0:
            break  # If there are no boxes left after this step, break. Otherwise...
        similarities = iou(
            boxes_left[:, 2:],
            maximum_box[2:],
            coords=coords,
            mode='element-wise',
            border_pixels=border_pixels
        )  # ...compare (IoU) the other left over boxes to the maximum box...
        boxes_left = boxes_left[
            similarities <=
            iou_threshold]  # ...so that we can remove the ones that overlap too much with the maximum box
    return np.array(maxima)
    def __call__(self, ground_truth_labels, diagnostics=False):
        """
        Converts ground truth bounding box data into a suitable format to train an SSD model.

        Arguments:
            ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array
                for each batch image. Each such array has `k` rows for the `k` ground truth bounding boxes belonging
                to the respective image, and the data for each ground truth bounding box has the format
                `(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format), and `class_id` must be
                an integer greater than 0 for all boxes as class ID 0 is reserved for the background class.
            diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned,
                but also a copy of it with anchor box coordinates in place of the ground truth coordinates.
                This can be very useful if you want to visualize which anchor boxes got matched to which ground truth
                boxes.

        Returns:
            `y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the
            ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the
            model per image, and the classes are one-hot-encoded. The four elements after the class vecotrs in
            the last axis are the box coordinates, the next four elements after that are just dummy elements, and
            the last four elements are the variances.
        """

        # Mapping to define which indices represent which coordinates in the ground truth.
        class_id = 0
        xmin = 1
        ymin = 2
        xmax = 3
        ymax = 4

        batch_size = len(ground_truth_labels)

        ##################################################################################
        # Generate the template for y_encoded.
        ##################################################################################

        y_encoded = self.generate_encoding_template(batch_size=batch_size,
                                                    diagnostics=False)

        ##################################################################################
        # Match ground truth boxes to anchor boxes.
        ##################################################################################

        # Match the ground truth boxes to the anchor boxes. Every anchor box that does not have
        # a ground truth match and for which the maximal IoU overlap with any ground truth box is less
        # than or equal to `neg_iou_limit` will be a negative (background) box.

        y_encoded[:, :, self.
                  background_id] = 1  # All boxes are background boxes by default.
        n_boxes = y_encoded.shape[
            1]  # The total number of boxes that the model predicts per batch item
        class_vectors = np.eye(
            self.n_classes
        )  # An identity matrix that we'll use as one-hot class vectors

        for i in range(batch_size):  # For each batch item...

            if ground_truth_labels[i].size == 0:
                continue  # If there is no ground truth for this batch item, there is nothing to match.
            labels = ground_truth_labels[i].astype(
                np.float)  # The labels for this batch item

            # Check for degenerate ground truth bounding boxes before attempting any computations.
            if np.any(labels[:, [xmax]] - labels[:, [xmin]] <= 0) or np.any(
                    labels[:, [ymax]] - labels[:, [ymin]] <= 0):
                raise DegenerateBoxError(
                    "SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, "
                    .format(i, labels) +
                    "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. Degenerate ground truth "
                    +
                    "bounding boxes will lead to NaN errors during the training."
                )

            # Maybe normalize the box coordinates.
            if self.normalize_coords:
                labels[:, [
                    ymin, ymax
                ]] /= self.img_height  # Normalize ymin and ymax relative to the image height
                labels[:, [
                    xmin, xmax
                ]] /= self.img_width  # Normalize xmin and xmax relative to the image width

            # Maybe convert the box coordinate format.
            if self.coords == 'centroids':
                labels = convert_coordinates(labels,
                                             start_index=xmin,
                                             conversion='corners2centroids',
                                             border_pixels=self.border_pixels)
            elif self.coords == 'minmax':
                labels = convert_coordinates(labels,
                                             start_index=xmin,
                                             conversion='corners2minmax')

            classes_one_hot = class_vectors[labels[:, class_id].astype(
                np.int
            )]  # The one-hot class IDs for the ground truth boxes of this batch item
            labels_one_hot = np.concatenate(
                [classes_one_hot, labels[:, [xmin, ymin, xmax, ymax]]],
                axis=-1
            )  # The one-hot version of the labels for this batch item

            # Compute the IoU similarities between all anchor boxes and all ground truth boxes for this batch item.
            # This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`.
            similarities = iou(labels[:, [xmin, ymin, xmax, ymax]],
                               y_encoded[i, :, -12:-8],
                               coords=self.coords,
                               mode='outer_product',
                               border_pixels=self.border_pixels)

            # First: Do bipartite matching, i.e. match each ground truth box to the one anchor box with the highest IoU.
            #        This ensures that each ground truth box will have at least one good match.

            # For each ground truth box, get the anchor box to match with it.
            bipartite_matches = match_bipartite_greedy(
                weight_matrix=similarities)

            # Write the ground truth data to the matched anchor boxes.
            y_encoded[i, bipartite_matches, :-8] = labels_one_hot

            # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
            similarities[:, bipartite_matches] = 0

            # Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to its most similar
            #         ground truth box with an IoU of at least `pos_iou_threshold`, or not matched if there is no
            #         such ground truth box.

            if self.matching_type == 'multi':

                # Get all matches that satisfy the IoU threshold.
                matches = match_multi(weight_matrix=similarities,
                                      threshold=self.pos_iou_threshold)

                # Write the ground truth data to the matched anchor boxes.
                y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]

                # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
                similarities[:, matches[1]] = 0

            # Third: Now after the matching is done, all negative (background) anchor boxes that have
            #        an IoU of `neg_iou_limit` or more with any ground truth box will be set to netral,
            #        i.e. they will no longer be background boxes. These anchors are "too close" to a
            #        ground truth box to be valid background boxes.

            max_background_similarities = np.amax(similarities, axis=0)
            neutral_boxes = np.nonzero(
                max_background_similarities >= self.neg_iou_limit)[0]
            y_encoded[i, neutral_boxes, self.background_id] = 0

        ##################################################################################
        # Convert box coordinates to anchor box offsets.
        ##################################################################################

        if self.coords == 'centroids':
            y_encoded[:, :, [-12, -11]] -= y_encoded[:, :, [
                -8, -7
            ]]  # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
            y_encoded[:, :, [
                -12, -11
            ]] /= y_encoded[:, :, [-6, -5]] * y_encoded[:, :, [
                -4, -3
            ]]  # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
            y_encoded[:, :, [-10, -9]] /= y_encoded[:, :, [
                -6, -5
            ]]  # w(gt) / w(anchor), h(gt) / h(anchor)
            y_encoded[:, :, [-10, -9]] = np.log(
                y_encoded[:, :, [-10, -9]]
            ) / y_encoded[:, :, [
                -2, -1
            ]]  # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
        elif self.coords == 'corners':
            y_encoded[:, :, -12:
                      -8] -= y_encoded[:, :, -8:
                                       -4]  # (gt - anchor) for all four coordinates
            y_encoded[:, :, [-12, -10]] /= np.expand_dims(
                y_encoded[:, :, -6] - y_encoded[:, :, -8], axis=-1
            )  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:, :, [-11, -9]] /= np.expand_dims(
                y_encoded[:, :, -5] - y_encoded[:, :, -7], axis=-1
            )  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:, :, -12:
                      -8] /= y_encoded[:, :,
                                       -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
        elif self.coords == 'minmax':
            y_encoded[:, :, -12:
                      -8] -= y_encoded[:, :, -8:
                                       -4]  # (gt - anchor) for all four coordinates
            y_encoded[:, :, [-12, -11]] /= np.expand_dims(
                y_encoded[:, :, -7] - y_encoded[:, :, -8], axis=-1
            )  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:, :, [-10, -9]] /= np.expand_dims(
                y_encoded[:, :, -5] - y_encoded[:, :, -6], axis=-1
            )  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:, :, -12:
                      -8] /= y_encoded[:, :,
                                       -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively

        if diagnostics:
            # Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a ground truth box, but keeping the anchor box coordinates).
            y_matched_anchors = np.copy(y_encoded)
            y_matched_anchors[:, :, -12:
                              -8] = 0  # Keeping the anchor box coordinates means setting the offsets to zero.
            return y_encoded, y_matched_anchors
        else:
            return y_encoded
Esempio n. 3
0
def greedy_nms(y_pred_decoded,
               iou_threshold=0.45,
               coords='corners',
               border_pixels='half'):
    """
    Perform greedy non-maximum suppression on the input boxes.

    Greedy NMS works by selecting the box with the highest score and
    removing all boxes around it that are too close to it measured by IoU-similarity.
    Out of the boxes that are left over, once again the one with the highest
    score is selected and so on, until no boxes with too much overlap are left.

    Arguments:
        y_pred_decoded (list): A batch of decoded predictions. For a given batch size `n` this
            is a list of length `n` where each list element is a 2D Numpy array.
            For a batch item with `k` predicted boxes this 2D Numpy array has
            shape `(k, 6)`, where each row contains the coordinates of the respective
            box in the format `[class_id, score, xmin, xmax, ymin, ymax]`.
            Technically, the number of columns doesn't have to be 6, it can be
            arbitrary as long as the first four elements of each row are
            `xmin`, `xmax`, `ymin`, `ymax` (in this order) and the last element
            is the score assigned to the prediction. Note that this function is
            agnostic to the scale of the score or what it represents.
        iou_threshold (float, optional): All boxes with a Jaccard similarity of
            greater than `iou_threshold` with a locally maximal box will be removed
            from the set of predictions, where 'maximal' refers to the box score.
        coords (str, optional): The coordinate format of `y_pred_decoded`.
            Can be one of the formats supported by `iou()`.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
            Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
            to the boxes. If 'exclude', the border pixels do not belong to the boxes.
            If 'half', then one of each of the two horizontal and vertical borders belong
            to the boxex, but not the other.

    Returns:
        The predictions after removing non-maxima. The format is the same as the input format.
    """
    y_pred_decoded_nms = []
    for batch_item in y_pred_decoded:  # For the labels of each batch item...
        boxes_left = np.copy(batch_item)
        maxima = [
        ]  # This is where we store the boxes that make it through the non-maximum suppression
        while boxes_left.shape[
                0] > 0:  # While there are still boxes left to compare...
            maximum_index = np.argmax(
                boxes_left[:, 1]
            )  # ...get the index of the next box with the highest confidence...
            maximum_box = np.copy(
                boxes_left[maximum_index])  # ...copy that box and...
            maxima.append(
                maximum_box
            )  # ...append it to `maxima` because we'll definitely keep it
            boxes_left = np.delete(
                boxes_left, maximum_index,
                axis=0)  # Now remove the maximum box from `boxes_left`
            if boxes_left.shape[0] == 0:
                break  # If there are no boxes left after this step, break. Otherwise...
            similarities = iou(
                boxes_left[:, 2:],
                maximum_box[2:],
                coords=coords,
                mode='element-wise',
                border_pixels=border_pixels
            )  # ...compare (IoU) the other left over boxes to the maximum box...
            boxes_left = boxes_left[
                similarities <=
                iou_threshold]  # ...so that we can remove the ones that overlap too much with the maximum box
        y_pred_decoded_nms.append(np.array(maxima))

    return y_pred_decoded_nms
    def __call__(self, labels, image_height=None, image_width=None):
        """
        Arguments:
            labels (array): The labels to be filtered. This is an array with shape `(m,n)`, where
                `m` is the number of bounding boxes and `n` is the number of elements that defines
                each bounding box (box coordinates, class ID, etc.). The box coordinates are expected
                to be in the image's coordinate system.
            image_height (int): Only relevant if `check_overlap == True`. The height of the image
                (in pixels) to compare the box coordinates to.
            image_width (int): `check_overlap == True`. The width of the image (in pixels) to compare
                the box coordinates to.

        Returns:
            An array containing the labels of all boxes that are valid.
        """

        labels = np.copy(labels)

        xmin = self.labels_format['xmin']
        ymin = self.labels_format['ymin']
        xmax = self.labels_format['xmax']
        ymax = self.labels_format['ymax']

        # Record the boxes that pass all checks here.
        requirements_met = np.ones(shape=labels.shape[0], dtype=np.bool)

        if self.check_degenerate:

            non_degenerate = (labels[:, xmax] > labels[:, xmin]) * (
                labels[:, ymax] > labels[:, ymin])
            requirements_met *= non_degenerate

        if self.check_min_area:

            min_area_met = (labels[:, xmax] - labels[:, xmin]) * (
                labels[:, ymax] - labels[:, ymin]) >= self.min_area
            requirements_met *= min_area_met

        if self.check_overlap:

            # Get the lower and upper bounds.
            if isinstance(self.overlap_bounds, BoundGenerator):
                lower, upper = self.overlap_bounds()
            else:
                lower, upper = self.overlap_bounds

            # Compute which boxes are valid.

            if self.overlap_criterion == 'iou':
                # Compute the patch coordinates.
                image_coords = np.array([0, 0, image_width, image_height])
                # Compute the IoU between the patch and all of the ground truth boxes.
                image_boxes_iou = iou(image_coords,
                                      labels[:, [xmin, ymin, xmax, ymax]],
                                      coords='corners',
                                      mode='element-wise',
                                      border_pixels=self.border_pixels)
                requirements_met *= (image_boxes_iou >
                                     lower) * (image_boxes_iou <= upper)

            elif self.overlap_criterion == 'area':
                if self.border_pixels == 'half':
                    d = 0
                elif self.border_pixels == 'include':
                    d = 1  # If border pixels are supposed to belong to the bounding boxes, we have to add one pixel to any difference `xmax - xmin` or `ymax - ymin`.
                elif self.border_pixels == 'exclude':
                    d = -1  # If border pixels are not supposed to belong to the bounding boxes, we have to subtract one pixel from any difference `xmax - xmin` or `ymax - ymin`.
                # Compute the areas of the boxes.
                box_areas = (labels[:, xmax] - labels[:, xmin] +
                             d) * (labels[:, ymax] - labels[:, ymin] + d)
                # Compute the intersection area between the patch and all of the ground truth boxes.
                clipped_boxes = np.copy(labels)
                clipped_boxes[:,
                              [ymin, ymax]] = np.clip(labels[:, [ymin, ymax]],
                                                      a_min=0,
                                                      a_max=image_height - 1)
                clipped_boxes[:, [xmin, xmax]] = np.clip(labels[:,
                                                                [xmin, xmax]],
                                                         a_min=0,
                                                         a_max=image_width - 1)
                intersection_areas = (
                    clipped_boxes[:, xmax] - clipped_boxes[:, xmin] + d) * (
                        clipped_boxes[:, ymax] - clipped_boxes[:, ymin] + d
                    )  # +1 because the border pixels belong to the box areas.
                # Check which boxes meet the overlap requirements.
                if lower == 0.0:
                    mask_lower = intersection_areas > lower * box_areas  # If `self.lower == 0`, we want to make sure that boxes with area 0 don't count, hence the ">" sign instead of the ">=" sign.
                else:
                    mask_lower = intersection_areas >= lower * box_areas  # Especially for the case `self.lower == 1` we want the ">=" sign, otherwise no boxes would count at all.
                mask_upper = intersection_areas <= upper * box_areas
                requirements_met *= mask_lower * mask_upper

            elif self.overlap_criterion == 'center_point':
                # Compute the center points of the boxes.
                cy = (labels[:, ymin] + labels[:, ymax]) / 2
                cx = (labels[:, xmin] + labels[:, xmax]) / 2
                # Check which of the boxes have center points within the cropped patch remove those that don't.
                requirements_met *= (cy >= 0.0) * (cy <= image_height - 1) * (
                    cx >= 0.0) * (cx <= image_width - 1)

        return labels[requirements_met]