Example #1
def decode_detections_debug(pred,
                            confidence_thresh=0.01,
                            iou_threshold=0.45,
                            top_k=200,
                            input_coords='centroids',
                            normalize_coords=True,
                            img_h=None,
                            img_w=None,
                            variance_encoded_in_target=False,
                            border_pixels='half'):
    if normalize_coords and ((img_h is None) or (img_w is None)):
        raise ValueError(
            "If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_h == {}` and `img_w == {}`"
            .format(img_h, img_w))

    # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates

    pred_decoded_raw = np.copy(
        pred[:, :, :-8]
    )  # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`

    if input_coords == 'centroids':
        if variance_encoded_in_target:
            # Decode the predicted box center x and y coordinates.
            pred_decoded_raw[:, :, [-4, -3]] = pred_decoded_raw[:, :, [
                -4, -3
            ]] * pred[:, :, [-6, -5]] + pred[:, :, [-8, -7]]
            # Decode the predicted box width and height.
            pred_decoded_raw[:, :, [-2, -1]] = np.exp(
                pred_decoded_raw[:, :, [-2, -1]]) * pred[:, :, [-6, -5]]
        else:
            # Decode the predicted box center x and y coordinates.
            pred_decoded_raw[:, :, [-4, -3]] = pred_decoded_raw[:, :, [
                -4, -3
            ]] * pred[:, :, [-6, -5]] * pred[:, :, [-4, -3]] + pred[:, :,
                                                                    [-8, -7]]
            # Decode the predicted box width and height.
            pred_decoded_raw[:, :, [-2, -1]] = np.exp(
                pred_decoded_raw[:, :, [-2, -1]] *
                pred[:, :, [-2, -1]]) * pred[:, :, [-6, -5]]
        pred_decoded_raw = convert_coordinates(pred_decoded_raw,
                                               start_index=-4,
                                               conversion='centroids2corners')
    elif input_coords == 'minmax':
        pred_decoded_raw[:, :,
                         -4:] *= pred[:, :,
                                      -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        pred_decoded_raw[:, :, [-4, -3]] *= np.expand_dims(
            pred[:, :, -7] - pred[:, :, -8], axis=-1
        )  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        pred_decoded_raw[:, :, [-2, -1]] *= np.expand_dims(
            pred[:, :, -5] - pred[:, :, -6], axis=-1
        )  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        pred_decoded_raw[:, :,
                         -4:] += pred[:, :, -8:
                                      -4]  # delta(pred) + anchor == pred for all four coordinates
        pred_decoded_raw = convert_coordinates(pred_decoded_raw,
                                               start_index=-4,
                                               conversion='minmax2corners')
    elif input_coords == 'corners':
        pred_decoded_raw[:, :,
                         -4:] *= pred[:, :,
                                      -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        pred_decoded_raw[:, :, [-4, -2]] *= np.expand_dims(
            pred[:, :, -6] - pred[:, :, -8], axis=-1
        )  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        pred_decoded_raw[:, :, [-3, -1]] *= np.expand_dims(
            pred[:, :, -5] - pred[:, :, -7], axis=-1
        )  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        pred_decoded_raw[:, :,
                         -4:] += pred[:, :, -8:
                                      -4]  # delta(pred) + anchor == pred for all four coordinates
    else:
        raise ValueError(
            "Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'."
        )

    # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that

    if normalize_coords:
        pred_decoded_raw[:, :, [
            -4, -2
        ]] *= img_w  # Convert xmin, xmax back to absolute coordinates
        pred_decoded_raw[:, :, [
            -3, -1
        ]] *= img_h  # Convert ymin, ymax back to absolute coordinates

    # 3: For each batch item, prepend each box's internal index to its coordinates.

    pred_decoded_raw2 = np.zeros(
        (pred_decoded_raw.shape[0], pred_decoded_raw.shape[1],
         pred_decoded_raw.shape[2] + 1))  # Expand the last axis by one.
    pred_decoded_raw2[:, :, 1:] = pred_decoded_raw
    pred_decoded_raw2[:, :, 0] = np.arange(
        pred_decoded_raw.shape[1]
    )  # Put the box indices as the first element for each box via broadcasting.
    pred_decoded_raw = pred_decoded_raw2

    # 4: Apply confidence thresholding and non-maximum suppression per class

    n_classes = pred_decoded_raw.shape[
        -1] - 5  # The number of classes is the length of the last axis minus the four box coordinates and minus the index

    pred_decoded = []  # Store the final predictions in this list
    for batch_item in pred_decoded_raw:  # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`
        pred = []  # Store the final predictions for this batch item here
        for class_id in range(
                1, n_classes
        ):  # For each class except the background class (which has class ID 0)...
            single_class = batch_item[:, [
                0, class_id + 1, -4, -3, -2, -1
            ]]  # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 6]` and...
            threshold_met = single_class[
                single_class[:, 1] >
                confidence_thresh]  # ...keep only those boxes with a confidence above the set threshold.
            if threshold_met.shape[0] > 0:  # If any boxes made the threshold...
                maxima = _greedy_nms_debug(
                    threshold_met,
                    iou_threshold=iou_threshold,
                    coords='corners',
                    border_pixels=border_pixels)  # ...perform NMS on them.
                maxima_output = np.zeros(
                    (maxima.shape[0], maxima.shape[1] + 1)
                )  # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 7]`
                maxima_output[:,
                              0] = maxima[:,
                                          0]  # Write the box index to the first column...
                maxima_output[:,
                              1] = class_id  # ...and write the class ID to the second column...
                maxima_output[:,
                              2:] = maxima[:,
                                           1:]  # ...and write the rest of the maxima data to the other columns...
                pred.append(
                    maxima_output
                )  # ...and append the maxima for this class to the list of maxima for this batch item.
        # Once we're through with all classes, keep only the `top_k` maxima with the highest scores
        pred = np.concatenate(pred, axis=0)
        if pred.shape[
                0] > top_k:  # If we have more than `top_k` results left at this point, otherwise there is nothing to filter,...
            top_k_indices = np.argpartition(
                pred[:, 2], kth=pred.shape[0] - top_k, axis=0
            )[pred.shape[0] -
              top_k:]  # ...get the indices of the `top_k` highest-score maxima...
            pred = pred[
                top_k_indices]  # ...and keep only those entries of `pred`...
        pred_decoded.append(
            pred
        )  # ...and now that we're done, append the array of final predictions for this batch item to the output list

    return pred_decoded
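
A minimal call sketch for the decoder above (the tensor layout, class count, and image size are invented for illustration, and the helper functions `convert_coordinates` and `_greedy_nms_debug` from the surrounding module are assumed to be available):

import numpy as np

# Hypothetical raw SSD output: 2 images, 100 boxes each, 3 classes (incl. background),
# laid out as [3 class scores, 4 predicted offsets, 4 anchor coordinates, 4 variances].
dummy_pred = np.random.rand(2, 100, 3 + 4 + 8).astype(np.float32)

decoded = decode_detections_debug(dummy_pred,
                                  confidence_thresh=0.5,
                                  iou_threshold=0.45,
                                  top_k=200,
                                  input_coords='centroids',
                                  normalize_coords=True,
                                  img_h=300,
                                  img_w=300)

for i, boxes in enumerate(decoded):
    # Each row is [box index, class ID, confidence, xmin, ymin, xmax, ymax].
    print("Image {}: {} detections".format(i, boxes.shape[0]))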
Example #2
def decode_detections_fast(pred,
                           confidence_thresh=0.5,
                           iou_threshold=0.45,
                           top_k='all',
                           input_coords='centroids',
                           normalize_coords=True,
                           img_h=None,
                           img_w=None,
                           border_pixels='half'):
    if normalize_coords and ((img_h is None) or (img_w is None)):
        raise ValueError(
            "If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_h == {}` and `img_w == {}`"
            .format(img_h, img_w))

    # 1: Convert the classes from one-hot encoding to their class ID
    pred_converted = np.copy(
        pred[:, :, -14:-8]
    )  # Slice out the four offset predictions plus two elements into which we'll write the class IDs and confidences in the next step
    pred_converted[:, :, 0] = np.argmax(
        pred[:, :, :-12], axis=-1
    )  # The indices of the highest confidence values in the one-hot class vectors are the class ID
    pred_converted[:, :, 1] = np.amax(
        pred[:, :, :-12],
        axis=-1)  # Store the confidence values themselves, too

    # 2: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
    if input_coords == 'centroids':
        pred_converted[:, :, [4, 5]] = np.exp(
            pred_converted[:, :, [4, 5]] * pred[:, :, [-2, -1]]
        )  # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
        pred_converted[:, :, [4, 5]] *= pred[:, :, [
            -6, -5
        ]]  # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
        pred_converted[:, :, [2, 3]] *= pred[:, :, [-4, -3]] * pred[:, :, [
            -6, -5
        ]]  # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
        pred_converted[:, :, [2, 3]] += pred[:, :, [
            -8, -7
        ]]  # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
        pred_converted = convert_coordinates(pred_converted,
                                             start_index=-4,
                                             conversion='centroids2corners')
    elif input_coords == 'minmax':
        pred_converted[:, :,
                       2:] *= pred[:, :,
                                   -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        pred_converted[:, :, [2, 3]] *= np.expand_dims(
            pred[:, :, -7] - pred[:, :, -8], axis=-1
        )  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        pred_converted[:, :, [4, 5]] *= np.expand_dims(
            pred[:, :, -5] - pred[:, :, -6], axis=-1
        )  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        pred_converted[:, :,
                       2:] += pred[:, :, -8:
                                   -4]  # delta(pred) + anchor == pred for all four coordinates
        pred_converted = convert_coordinates(pred_converted,
                                             start_index=-4,
                                             conversion='minmax2corners')
    elif input_coords == 'corners':
        pred_converted[:, :,
                       2:] *= pred[:, :,
                                   -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        pred_converted[:, :, [2, 4]] *= np.expand_dims(
            pred[:, :, -6] - pred[:, :, -8], axis=-1
        )  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        pred_converted[:, :, [3, 5]] *= np.expand_dims(
            pred[:, :, -5] - pred[:, :, -7], axis=-1
        )  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        pred_converted[:, :,
                       2:] += pred[:, :, -8:
                                   -4]  # delta(pred) + anchor == pred for all four coordinates
    else:
        raise ValueError(
            "Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'."
        )

    # 3: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
    if normalize_coords:
        pred_converted[:, :, [
            2, 4
        ]] *= img_w  # Convert xmin, xmax back to absolute coordinates
        pred_converted[:, :, [
            3, 5
        ]] *= img_h  # Convert ymin, ymax back to absolute coordinates

    # 4: Decode our huge `(batch, #boxes, 6)` tensor into a list of length `batch` where each list entry is an array containing only the positive predictions
    pred_decoded = []
    for batch_item in pred_converted:  # For each image in the batch...
        boxes = batch_item[np.nonzero(
            batch_item[:, 0]
        )]  # ...get all boxes that don't belong to the background class,...
        boxes = boxes[
            boxes[:, 1] >=
            confidence_thresh]  # ...then filter out those positive boxes for which the prediction confidence is too low and after that...
        if iou_threshold:  # ...if an IoU threshold is set...
            boxes = _greedy_nms2(boxes,
                                 iou_threshold=iou_threshold,
                                 coords='corners',
                                 border_pixels=border_pixels
                                 )  # ...perform NMS on the remaining boxes.
        if top_k != 'all' and boxes.shape[
                0] > top_k:  # If we have more than `top_k` results left at this point...
            top_k_indices = np.argpartition(
                boxes[:, 1], kth=boxes.shape[0] - top_k, axis=0
            )[boxes.shape[0] -
              top_k:]  # ...get the indices of the `top_k` highest-scoring boxes...
            boxes = boxes[top_k_indices]  # ...and keep only those boxes...
        pred_decoded.append(
            boxes
        )  # ...and now that we're done, append the array of final predictions for this batch item to the output list

    return pred_decoded
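
Unlike the per-class decoder in Example #1, this variant collapses the one-hot scores into a single [class ID, confidence] pair per box and runs one NMS pass over all remaining boxes, so each output row has six elements. A hedged call sketch (the input tensor below is the same kind of invented array as in the sketch after Example #1):

import numpy as np

dummy_pred = np.random.rand(2, 100, 3 + 4 + 8).astype(np.float32)  # hypothetical (batch, n_boxes, n_classes + 12) predictions

decoded_fast = decode_detections_fast(dummy_pred,
                                      confidence_thresh=0.5,
                                      iou_threshold=0.45,
                                      top_k='all',
                                      input_coords='centroids',
                                      normalize_coords=True,
                                      img_h=300,
                                      img_w=300)
# Each row is [class ID, confidence, xmin, ymin, xmax, ymax].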
Example #3
    def generate_anchor_boxes_for_layer(self,
                                        feature_map_size,
                                        aspect_ratios,
                                        this_scale,
                                        next_scale,
                                        this_steps=None,
                                        this_offsets=None,
                                        diagnostics=False):
        size = min(self.img_h, self.img_w)
        # Compute the box widths and heights for all aspect ratios
        wh_list = []
        for ar in aspect_ratios:
            if (ar == 1):
                # Compute the regular anchor box for aspect ratio 1.
                box_height = box_width = this_scale * size
                wh_list.append((box_width, box_height))
                if self.two_anchor_box:
                    # Compute one slightly larger version using the geometric mean of this scale value and the next.
                    box_height = box_width = np.sqrt(
                        this_scale * next_scale) * size
                    wh_list.append((box_width, box_height))
            else:
                box_width = this_scale * size * np.sqrt(ar)
                box_height = this_scale * size / np.sqrt(ar)
                wh_list.append((box_width, box_height))
        wh_list = np.array(wh_list)
        n_boxes = len(wh_list)

        # Compute the grid of box center points. They are identical for all aspect ratios.

        # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally.
        if (this_steps is None):
            step_height = self.img_h / feature_map_size[0]
            step_width = self.img_w / feature_map_size[1]
        else:
            if isinstance(this_steps,
                          (list, tuple)) and (len(this_steps) == 2):
                step_height = this_steps[0]
                step_width = this_steps[1]
            elif isinstance(this_steps, (int, float)):
                step_height = this_steps
                step_width = this_steps
        # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image.
        if (this_offsets is None):
            offset_height = 0.5
            offset_width = 0.5
        else:
            if isinstance(this_offsets,
                          (list, tuple)) and (len(this_offsets) == 2):
                offset_height = this_offsets[0]
                offset_width = this_offsets[1]
            elif isinstance(this_offsets, (int, float)):
                offset_height = this_offsets
                offset_width = this_offsets
        # Now that we have the offsets and step sizes, compute the grid of anchor box center points.
        cy = np.linspace(offset_height * step_height,
                         (offset_height + feature_map_size[0] - 1) *
                         step_height, feature_map_size[0])
        cx = np.linspace(offset_width * step_width,
                         (offset_width + feature_map_size[1] - 1) * step_width,
                         feature_map_size[1])
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        cx_grid = np.expand_dims(
            cx_grid, -1
        )  # This is necessary for np.tile() to do what we want further down
        cy_grid = np.expand_dims(
            cy_grid, -1
        )  # This is necessary for np.tile() to do what we want further down

        # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        # where the last dimension will contain `(cx, cy, w, h)`
        boxes_tensor = np.zeros(
            (feature_map_size[0], feature_map_size[1], n_boxes, 4))

        boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, n_boxes))  # Set cx
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, n_boxes))  # Set cy
        boxes_tensor[:, :, :, 2] = wh_list[:, 0]  # Set w
        boxes_tensor[:, :, :, 3] = wh_list[:, 1]  # Set h

        # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='centroids2corners')

        # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries
        if self.clip_boxes:
            x_coords = boxes_tensor[:, :, :, [0, 2]]
            x_coords[x_coords >= self.img_w] = self.img_w - 1
            x_coords[x_coords < 0] = 0
            boxes_tensor[:, :, :, [0, 2]] = x_coords
            y_coords = boxes_tensor[:, :, :, [1, 3]]
            y_coords[y_coords >= self.img_h] = self.img_h - 1
            y_coords[y_coords < 0] = 0
            boxes_tensor[:, :, :, [1, 3]] = y_coords

        # If `normalize_coords` is enabled, normalize the coordinates to be within [0,1]
        if self.normalize_coords:
            boxes_tensor[:, :, :, [0, 2]] /= self.img_w
            boxes_tensor[:, :, :, [1, 3]] /= self.img_h

        # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth.
        if self.coords == 'centroids':
            # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2centroids',
                                               border_pixels='half')
        elif self.coords == 'minmax':
            # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2minmax',
                                               border_pixels='half')

        if diagnostics:
            return boxes_tensor, (cy,
                                  cx), wh_list, (step_height,
                                                 step_width), (offset_height,
                                                               offset_width)
        else:
            return boxes_tensor
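
To make the center-point grid computation above concrete, here is a small standalone sketch with made-up values for the image size, feature map size, and offsets:

import numpy as np

img_h, img_w = 300, 300
feature_map_size = (3, 3)            # toy 3x3 feature map
offset_height = offset_width = 0.5

step_height = img_h / feature_map_size[0]   # 100.0
step_width = img_w / feature_map_size[1]    # 100.0

cy = np.linspace(offset_height * step_height,
                 (offset_height + feature_map_size[0] - 1) * step_height,
                 feature_map_size[0])        # [ 50. 150. 250.]
cx = np.linspace(offset_width * step_width,
                 (offset_width + feature_map_size[1] - 1) * step_width,
                 feature_map_size[1])        # [ 50. 150. 250.]

cx_grid, cy_grid = np.meshgrid(cx, cy)       # each of shape (3, 3)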
Example #4
    def call(self, x, mask=None):
        size = min(self.img_h, self.img_w)
        # Compute the box widths and heights for all aspect ratios
        wh_list = []
        for ar in self.aspect_ratios:
            if (ar == 1):
                # Compute the regular anchor box for aspect ratio 1.
                box_height = box_width = self.this_scale * size
                wh_list.append((box_width, box_height))
                if self.two_anchor_box:
                    # Compute one slightly larger version using the geometric mean of this scale value and the next.
                    box_height = box_width = np.sqrt(self.this_scale * self.next_scale) * size
                    wh_list.append((box_width, box_height))
            else:
                box_height = self.this_scale * size / np.sqrt(ar)
                box_width = self.this_scale * size * np.sqrt(ar)
                wh_list.append((box_width, box_height))
        wh_list = np.array(wh_list)

        # We need the shape of the input tensor
        if K.image_dim_ordering() == 'tf':
            batch_size, feature_map_height, feature_map_width, feature_map_channels = x._keras_shape
        else: # Not yet relevant since TensorFlow is the only supported backend right now, but it doesn't hurt to have this in here for the future
            batch_size, feature_map_channels, feature_map_height, feature_map_width = x._keras_shape

        # Compute the grid of box center points. They are identical for all aspect ratios.

        # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally.
        if (self.this_steps is None):
            step_height = self.img_h / feature_map_height
            step_width = self.img_w / feature_map_width
        else:
            if isinstance(self.this_steps, (list, tuple)) and (len(self.this_steps) == 2):
                step_height = self.this_steps[0]
                step_width = self.this_steps[1]
            elif isinstance(self.this_steps, (int, float)):
                step_height = self.this_steps
                step_width = self.this_steps
        # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image.
        if (self.this_offsets is None):
            offset_height = 0.5
            offset_width = 0.5
        else:
            if isinstance(self.this_offsets, (list, tuple)) and (len(self.this_offsets) == 2):
                offset_height = self.this_offsets[0]
                offset_width = self.this_offsets[1]
            elif isinstance(self.this_offsets, (int, float)):
                offset_height = self.this_offsets
                offset_width = self.this_offsets
        # Now that we have the offsets and step sizes, compute the grid of anchor box center points.
        cy = np.linspace(offset_height * step_height, (offset_height + feature_map_height - 1) * step_height, feature_map_height)
        cx = np.linspace(offset_width * step_width, (offset_width + feature_map_width - 1) * step_width, feature_map_width)
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down
        cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down

        # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        # where the last dimension will contain `(cx, cy, w, h)`
        boxes_tensor = np.zeros((feature_map_height, feature_map_width, self.n_boxes, 4))

        boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, self.n_boxes)) # Set cx
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, self.n_boxes)) # Set cy
        boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w
        boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h

        # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
        boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners')

        # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries
        if self.clip_boxes:
            x_coords = boxes_tensor[:,:,:,[0, 2]]
            x_coords[x_coords >= self.img_w] = self.img_w - 1
            x_coords[x_coords < 0] = 0
            boxes_tensor[:,:,:,[0, 2]] = x_coords
            y_coords = boxes_tensor[:,:,:,[1, 3]]
            y_coords[y_coords >= self.img_h] = self.img_h - 1
            y_coords[y_coords < 0] = 0
            boxes_tensor[:,:,:,[1, 3]] = y_coords

        # If `normalize_coords` is enabled, normalize the coordinates to be within [0,1]
        if self.normalize_coords:
            boxes_tensor[:, :, :, [0, 2]] /= self.img_w
            boxes_tensor[:, :, :, [1, 3]] /= self.img_h

        # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth.
        if self.coords == 'centroids':
            # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
            boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half')
        elif self.coords == 'minmax':
            # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
            boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half')

        # Create a tensor to contain the variances and append it to `boxes_tensor`. This tensor has the same shape
        # as `boxes_tensor` and simply contains the same 4 variance values for every position in the last axis.
        variances_tensor = np.zeros_like(boxes_tensor) # Has shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        variances_tensor += self.variances # Long live broadcasting
        # Now `boxes_tensor` becomes a tensor of shape `(feature_map_height, feature_map_width, n_boxes, 8)`
        boxes_tensor = np.concatenate((boxes_tensor, variances_tensor), axis=-1)

        # Now prepend one dimension to `boxes_tensor` to account for the batch size and tile it along that dimension.
        # The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)`
        boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
        boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'), (K.shape(x)[0], 1, 1, 1, 1))

        return boxes_tensor
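
The variance step at the end only appends the same four values to every anchor via broadcasting; a minimal standalone illustration (the shapes and the variance values are example assumptions):

import numpy as np

boxes_tensor = np.zeros((3, 3, 4, 4))        # (feature_map_height, feature_map_width, n_boxes, 4 coords)
variances = np.array([0.1, 0.1, 0.2, 0.2])   # example variance values

variances_tensor = np.zeros_like(boxes_tensor) + variances               # broadcast along the last axis
boxes_tensor = np.concatenate((boxes_tensor, variances_tensor), axis=-1)
print(boxes_tensor.shape)                    # (3, 3, 4, 8)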
Example #5
    def __call__(self, ground_truth_labels, diagnostics=False):
        class_id = 0
        xmin = 1
        ymin = 2
        xmax = 3
        ymax = 4

        batch_size = len(ground_truth_labels)
        y_encoded = self.generate_encoding_template(batch_size=batch_size,
                                                    diagnostics=False)
        y_encoded[:, :, self.
                  background_id] = 1  # All boxes are background boxes by default.
        n_boxes = y_encoded.shape[
            1]  # The total number of boxes that the model predicts per batch item
        class_vectors = np.eye(
            self.n_classes
        )  # An identity matrix that we'll use as one-hot class vectors
        for i in range(batch_size):  # For each batch item...

            if ground_truth_labels[i].size == 0:
                continue  # If there is no ground truth for this batch item, there is nothing to match.
            labels = ground_truth_labels[i].astype(
                float)  # The labels for this batch item

            # Check for degenerate ground truth bounding boxes before attempting any computations.
            if np.any(labels[:, [xmax]] - labels[:, [xmin]] <= 0) or np.any(
                    labels[:, [ymax]] - labels[:, [ymin]] <= 0):
                raise DegenerateBoxError(
                    "SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, "
                    .format(i, labels) +
                    "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. Degenerate ground truth "
                    +
                    "bounding boxes will lead to NaN errors during the training."
                )

            # Maybe normalize the box coordinates.
            if self.normalize_coords:
                labels[:, [
                    ymin, ymax
                ]] /= self.img_h  # Normalize ymin and ymax relative to the image height
                labels[:, [
                    xmin, xmax
                ]] /= self.img_w  # Normalize xmin and xmax relative to the image width

            # Maybe convert the box coordinate format.
            if self.coords == 'centroids':
                labels = convert_coordinates(labels,
                                             start_index=xmin,
                                             conversion='corners2centroids',
                                             border_pixels=self.border_pixels)
            elif self.coords == 'minmax':
                labels = convert_coordinates(labels,
                                             start_index=xmin,
                                             conversion='corners2minmax')

            classes_one_hot = class_vectors[labels[:, class_id].astype(
                int
            )]  # The one-hot class IDs for the ground truth boxes of this batch item
            labels_one_hot = np.concatenate(
                [classes_one_hot, labels[:, [xmin, ymin, xmax, ymax]]],
                axis=-1
            )  # The one-hot version of the labels for this batch item

            # Compute the IoU similarities between all anchor boxes and all ground truth boxes for this batch item.
            # This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`.
            similarities = iou(labels[:, [xmin, ymin, xmax, ymax]],
                               y_encoded[i, :, -12:-8],
                               coords=self.coords,
                               mode='outer_product',
                               border_pixels=self.border_pixels)

            # First: Do bipartite matching, i.e. match each ground truth box to the one anchor box with the highest IoU.
            #        This ensures that each ground truth box will have at least one good match.

            # For each ground truth box, get the anchor box to match with it.
            bipartite_matches = match_bipartite_greedy(
                weight_matrix=similarities)

            # Write the ground truth data to the matched anchor boxes.
            y_encoded[i, bipartite_matches, :-8] = labels_one_hot

            # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
            similarities[:, bipartite_matches] = 0

            # Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to its most similar
            #         ground truth box with an IoU of at least `pos_iou_threshold`, or not matched if there is no
            #         such ground truth box.

            if self.matching_type == 'multi':

                # Get all matches that satisfy the IoU threshold.
                matches = match_multi(weight_matrix=similarities,
                                      threshold=self.pos_iou_threshold)

                # Write the ground truth data to the matched anchor boxes.
                y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]

                # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
                similarities[:, matches[1]] = 0

            # Third: Now after the matching is done, all negative (background) anchor boxes that have
            #        an IoU of `neg_iou_limit` or more with any ground truth box will be set to neutral,
            #        i.e. they will no longer be background boxes. These anchors are "too close" to a
            #        ground truth box to be valid background boxes.

            max_background_similarities = np.amax(similarities, axis=0)
            neutral_boxes = np.nonzero(
                max_background_similarities >= self.neg_iou_limit)[0]
            y_encoded[i, neutral_boxes, self.background_id] = 0

        ##################################################################################
        # Convert box coordinates to anchor box offsets.
        ##################################################################################

        if self.coords == 'centroids':
            y_encoded[:, :, [-12, -11]] -= y_encoded[:, :, [
                -8, -7
            ]]  # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
            y_encoded[:, :, [
                -12, -11
            ]] /= y_encoded[:, :, [-6, -5]] * y_encoded[:, :, [
                -4, -3
            ]]  # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
            y_encoded[:, :, [-10, -9]] /= y_encoded[:, :, [
                -6, -5
            ]]  # w(gt) / w(anchor), h(gt) / h(anchor)
            y_encoded[:, :, [-10, -9]] = np.log(
                y_encoded[:, :, [-10, -9]]
            ) / y_encoded[:, :, [
                -2, -1
            ]]  # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
        elif self.coords == 'corners':
            y_encoded[:, :, -12:
                      -8] -= y_encoded[:, :, -8:
                                       -4]  # (gt - anchor) for all four coordinates
            y_encoded[:, :, [-12, -10]] /= np.expand_dims(
                y_encoded[:, :, -6] - y_encoded[:, :, -8], axis=-1
            )  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:, :, [-11, -9]] /= np.expand_dims(
                y_encoded[:, :, -5] - y_encoded[:, :, -7], axis=-1
            )  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:, :, -12:
                      -8] /= y_encoded[:, :,
                                       -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
        elif self.coords == 'minmax':
            y_encoded[:, :, -12:
                      -8] -= y_encoded[:, :, -8:
                                       -4]  # (gt - anchor) for all four coordinates
            y_encoded[:, :, [-12, -11]] /= np.expand_dims(
                y_encoded[:, :, -7] - y_encoded[:, :, -8], axis=-1
            )  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:, :, [-10, -9]] /= np.expand_dims(
                y_encoded[:, :, -5] - y_encoded[:, :, -6], axis=-1
            )  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:, :, -12:
                      -8] /= y_encoded[:, :,
                                       -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively

        if diagnostics:
            # Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a ground truth box, but keeping the anchor box coordinates).
            y_matched_anchors = np.copy(y_encoded)
            y_matched_anchors[:, :, -12:
                              -8] = 0  # Keeping the anchor box coordinates means setting the offsets to zero.
            return y_encoded, y_matched_anchors
        else:
            return y_encoded
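
For the 'centroids' branch, the offsets written into `y_encoded` follow the usual SSD encoding; a small numeric sketch of those formulas (the box and variance values below are invented):

import numpy as np

# Hypothetical matched pair in (cx, cy, w, h), plus the four variances.
cx_gt, cy_gt, w_gt, h_gt = 0.52, 0.48, 0.30, 0.40   # ground truth box
cx_a,  cy_a,  w_a,  h_a  = 0.50, 0.50, 0.25, 0.35   # matched anchor box
var_cx, var_cy, var_w, var_h = 0.1, 0.1, 0.2, 0.2

dcx = (cx_gt - cx_a) / (w_a * var_cx)   # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance
dcy = (cy_gt - cy_a) / (h_a * var_cy)   # (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
dw  = np.log(w_gt / w_a) / var_w        # ln(w(gt) / w(anchor)) / w_variance
dh  = np.log(h_gt / h_a) / var_h        # ln(h(gt) / h(anchor)) / h_variance
print(dcx, dcy, dw, dh)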