def _greedy_nms(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
    """
    Run greedy non-maximum suppression over a single set of boxes.

    The highest-scoring box is kept, every remaining box whose IoU with it
    exceeds `iou_threshold` is discarded, and the procedure repeats on what
    is left until no boxes remain.

    Arguments:
        predictions (array): 2D array with one row per box. Column 0 is used
            as the score; columns 1 onward are handed to `iou()` as the box
            coordinates.
        iou_threshold (float, optional): Boxes with an IoU strictly greater
            than this value with a kept box are removed.
        coords (str, optional): Coordinate format, forwarded to `iou()`.
        border_pixels (str, optional): Border pixel handling, forwarded to `iou()`.

    Returns:
        `np.array` built from the list of kept rows, in the order they were
        selected (descending score). For an input without rows this is
        `np.array([])`.
    """
    candidates = np.copy(predictions)
    # Rows that survive suppression, in selection order.
    kept = []

    while candidates.shape[0] > 0:
        # The current best box is always kept.
        best = np.argmax(candidates[:, 0])
        winner = np.copy(candidates[best])
        kept.append(winner)

        # Take the winner out of the candidate pool.
        candidates = np.delete(candidates, best, axis=0)
        if candidates.shape[0] == 0:
            break

        # NOTE(review): other call sites in this file spell the mode
        # 'element-wise'; confirm which spelling `iou()` accepts.
        overlaps = iou(candidates[:, 1:],
                       winner[1:],
                       coords=coords,
                       mode='element_wise',
                       border_pixels=border_pixels)

        # Only boxes that do not overlap the winner too much stay in the pool.
        keep_mask = overlaps <= iou_threshold
        candidates = candidates[keep_mask]

    return np.array(kept)
def _greedy_nms_debug(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
    """
    Greedy non-maximum suppression for rows that carry one extra leading column.

    Column 1 of each row is used as the score and columns 2 onward are
    handed to `iou()` as the box coordinates; column 0 is never read and is
    passed through unchanged. (Presumably column 0 is the box index used by
    `decode_detections_debug()` to trace which predictor layer produced a
    box - confirm at the caller.)

    Arguments:
        predictions (array): 2D array with one row per box, laid out as
            described above.
        iou_threshold (float, optional): Boxes with an IoU strictly greater
            than this value with a kept box are removed.
        coords (str, optional): Coordinate format, forwarded to `iou()`.
        border_pixels (str, optional): Border pixel handling, forwarded to `iou()`.

    Returns:
        `np.array` built from the list of kept rows in selection order
        (descending score); `np.array([])` when the input has no rows.
    """
    remaining = np.copy(predictions)
    selected = []  # Rows that made it through the suppression.

    while remaining.shape[0] > 0:
        # Pick the highest-scoring row; it is kept unconditionally.
        top_index = np.argmax(remaining[:, 1])
        top_box = np.copy(remaining[top_index])
        selected.append(top_box)

        # Drop the picked row from the pool.
        remaining = np.delete(remaining, top_index, axis=0)

        if remaining.shape[0] > 0:
            # NOTE(review): some call sites in this file spell the mode
            # 'element_wise'; confirm which spelling `iou()` accepts.
            overlaps = iou(remaining[:, 2:],
                           top_box[2:],
                           coords=coords,
                           mode='element-wise',
                           border_pixels=border_pixels)
            # Discard everything that overlaps the picked row too much.
            remaining = remaining[overlaps <= iou_threshold]

    return np.array(selected)
def __call__(self, ground_truth_labels, diagnostics=False):
    """
    Converts ground truth bounding box data into a suitable format to train an SSD model.

    Arguments:
        ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array
            for each batch image. Each such array has `k` rows for the `k` ground truth bounding boxes belonging
            to the respective image, and the data for each ground truth bounding box has the format
            `(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format), and `class_id` must be
            an integer greater than 0 for all boxes as class ID 0 is reserved for the background class.
            The input arrays are not modified; each one is converted to a float copy first.
        diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned,
            but also a copy of it in which the four box offset entries are set to zero, i.e. a tensor that
            keeps the anchor box coordinates of the matched boxes. This can be very useful if you want to
            visualize which anchor boxes got matched to which ground truth boxes.

    Returns:
        `y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the
        ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the
        model per image, and the classes are one-hot-encoded. The four elements after the class vectors in
        the last axis are the encoded box offsets, the next four elements are the anchor box coordinates
        taken from the encoding template, and the last four elements are the variances.
        If `diagnostics` is `True`, the tuple `(y_encoded, y_matched_anchors)` is returned instead.

    Raises:
        DegenerateBoxError: If any ground truth box has `xmax <= xmin` and/or `ymax <= ymin`.
    """

    # Mapping to define which indices represent which coordinates in the ground truth.
    class_id = 0
    xmin = 1
    ymin = 2
    xmax = 3
    ymax = 4

    batch_size = len(ground_truth_labels)

    ##################################################################################
    # Generate the template for y_encoded.
    ##################################################################################

    y_encoded = self.generate_encoding_template(batch_size=batch_size, diagnostics=False)

    ##################################################################################
    # Match ground truth boxes to anchor boxes.
    ##################################################################################

    # Every anchor box that does not have a ground truth match and for which the maximal IoU overlap
    # with any ground truth box is less than `neg_iou_limit` will be a negative (background) box.

    y_encoded[:, :, self.background_id] = 1  # All boxes are background boxes by default.
    class_vectors = np.eye(self.n_classes)  # An identity matrix that we'll use as one-hot class vectors

    for i in range(batch_size):

        # If there is no ground truth for this batch item, there is nothing to match.
        if ground_truth_labels[i].size == 0:
            continue

        # Work on a float copy of the labels for this batch item. `np.float` was removed
        # in NumPy 1.24, so the explicit `np.float64` is used (it is the same dtype).
        labels = ground_truth_labels[i].astype(np.float64)

        # Check for degenerate ground truth bounding boxes before attempting any computations.
        if np.any(labels[:, [xmax]] - labels[:, [xmin]] <= 0) or np.any(labels[:, [ymax]] - labels[:, [ymin]] <= 0):
            raise DegenerateBoxError(
                "SSDInputEncoder detected degenerate ground truth bounding boxes "
                "for batch item {} with bounding boxes {}, i.e. bounding boxes where "
                "xmax <= xmin and/or ymax <= ymin. Degenerate ground truth bounding boxes "
                "will lead to NaN errors during the training.".format(i, labels))

        # Maybe normalize the box coordinates.
        if self.normalize_coords:
            labels[:, [ymin, ymax]] /= self.img_height  # Normalize ymin and ymax relative to the image height
            labels[:, [xmin, xmax]] /= self.img_width  # Normalize xmin and xmax relative to the image width

        # Maybe convert the box coordinate format.
        if self.coords == 'centroids':
            labels = convert_coordinates(labels,
                                         start_index=xmin,
                                         conversion='corners2centroids',
                                         border_pixels=self.border_pixels)
        elif self.coords == 'minmax':
            labels = convert_coordinates(labels, start_index=xmin, conversion='corners2minmax')

        # The one-hot class IDs for the ground truth boxes of this batch item.
        # `np.int` was removed in NumPy 1.24; the builtin `int` is what it aliased.
        classes_one_hot = class_vectors[labels[:, class_id].astype(int)]
        # The one-hot version of the labels for this batch item.
        labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin, ymin, xmax, ymax]]], axis=-1)

        # Compute the IoU similarities between all anchor boxes and all ground truth boxes for this batch item.
        # This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`.
        similarities = iou(labels[:, [xmin, ymin, xmax, ymax]],
                           y_encoded[i, :, -12:-8],
                           coords=self.coords,
                           mode='outer_product',
                           border_pixels=self.border_pixels)

        # First: Do bipartite matching, i.e. match each ground truth box to the one anchor box with the highest IoU.
        #        This ensures that each ground truth box will have at least one good match.

        # For each ground truth box, get the anchor box to match with it.
        bipartite_matches = match_bipartite_greedy(weight_matrix=similarities)

        # Write the ground truth data to the matched anchor boxes.
        y_encoded[i, bipartite_matches, :-8] = labels_one_hot

        # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
        similarities[:, bipartite_matches] = 0

        # Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to its most similar
        #         ground truth box with an IoU of at least `pos_iou_threshold`, or not matched if there is no
        #         such ground truth box.

        if self.matching_type == 'multi':

            # Get all matches that satisfy the IoU threshold.
            matches = match_multi(weight_matrix=similarities, threshold=self.pos_iou_threshold)

            # Write the ground truth data to the matched anchor boxes.
            y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]

            # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
            similarities[:, matches[1]] = 0

        # Third: Now after the matching is done, all negative (background) anchor boxes that have
        #        an IoU of `neg_iou_limit` or more with any ground truth box will be set to neutral,
        #        i.e. they will no longer be background boxes. These anchors are "too close" to a
        #        ground truth box to be valid background boxes.

        max_background_similarities = np.amax(similarities, axis=0)
        neutral_boxes = np.nonzero(max_background_similarities >= self.neg_iou_limit)[0]
        y_encoded[i, neutral_boxes, self.background_id] = 0

    ##################################################################################
    # Convert box coordinates to anchor box offsets.
    ##################################################################################

    if self.coords == 'centroids':
        # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
        y_encoded[:, :, [-12, -11]] -= y_encoded[:, :, [-8, -7]]
        # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
        y_encoded[:, :, [-12, -11]] /= y_encoded[:, :, [-6, -5]] * y_encoded[:, :, [-4, -3]]
        # w(gt) / w(anchor), h(gt) / h(anchor)
        y_encoded[:, :, [-10, -9]] /= y_encoded[:, :, [-6, -5]]
        # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
        y_encoded[:, :, [-10, -9]] = np.log(y_encoded[:, :, [-10, -9]]) / y_encoded[:, :, [-2, -1]]
    elif self.coords == 'corners':
        # (gt - anchor) for all four coordinates
        y_encoded[:, :, -12:-8] -= y_encoded[:, :, -8:-4]
        # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
        y_encoded[:, :, [-12, -10]] /= np.expand_dims(y_encoded[:, :, -6] - y_encoded[:, :, -8], axis=-1)
        # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
        y_encoded[:, :, [-11, -9]] /= np.expand_dims(y_encoded[:, :, -5] - y_encoded[:, :, -7], axis=-1)
        # (gt - anchor) / size(anchor) / variance for all four coordinates,
        # where 'size' refers to w and h respectively
        y_encoded[:, :, -12:-8] /= y_encoded[:, :, -4:]
    elif self.coords == 'minmax':
        # (gt - anchor) for all four coordinates
        y_encoded[:, :, -12:-8] -= y_encoded[:, :, -8:-4]
        # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
        y_encoded[:, :, [-12, -11]] /= np.expand_dims(y_encoded[:, :, -7] - y_encoded[:, :, -8], axis=-1)
        # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
        y_encoded[:, :, [-10, -9]] /= np.expand_dims(y_encoded[:, :, -5] - y_encoded[:, :, -6], axis=-1)
        # (gt - anchor) / size(anchor) / variance for all four coordinates,
        # where 'size' refers to w and h respectively
        y_encoded[:, :, -12:-8] /= y_encoded[:, :, -4:]

    if diagnostics:
        # Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a ground truth box,
        # but keeping the anchor box coordinates).
        y_matched_anchors = np.copy(y_encoded)
        # Keeping the anchor box coordinates means setting the offsets to zero.
        y_matched_anchors[:, :, -12:-8] = 0
        return y_encoded, y_matched_anchors
    else:
        return y_encoded
def __call__(self, labels, image_height=None, image_width=None):
    """
    Filters out the bounding boxes that do not meet the configured requirements.

    Which checks run is controlled by `self.check_degenerate`, `self.check_min_area`
    and `self.check_overlap`; a box is kept only if it passes every enabled check.

    Arguments:
        labels (array): The labels to be filtered. This is an array with shape `(m,n)`, where
            `m` is the number of bounding boxes and `n` is the number of elements that define
            each bounding box (box coordinates, class ID, etc.). The box coordinates are expected
            to be in the image's coordinate system. The input is copied and never modified.
        image_height (int): Only relevant if `check_overlap == True`. The height of the image
            (in pixels) to compare the box coordinates to.
        image_width (int): Only relevant if `check_overlap == True`. The width of the image
            (in pixels) to compare the box coordinates to.

    Returns:
        An array containing the labels of all boxes that are valid.

    Raises:
        ValueError: If the overlap criterion is 'area' and `self.border_pixels` is not one of
            'half', 'include' or 'exclude'.
    """

    labels = np.copy(labels)

    xmin = self.labels_format['xmin']
    ymin = self.labels_format['ymin']
    xmax = self.labels_format['xmax']
    ymax = self.labels_format['ymax']

    # Record the boxes that pass all checks here. `np.bool` was removed in NumPy 1.24;
    # the builtin `bool` yields the same boolean dtype.
    requirements_met = np.ones(shape=labels.shape[0], dtype=bool)

    if self.check_degenerate:
        # A box is degenerate if it has no positive extent along either axis.
        non_degenerate = (labels[:, xmax] > labels[:, xmin]) & (labels[:, ymax] > labels[:, ymin])
        requirements_met &= non_degenerate

    if self.check_min_area:
        box_width = labels[:, xmax] - labels[:, xmin]
        box_height = labels[:, ymax] - labels[:, ymin]
        min_area_met = box_width * box_height >= self.min_area
        requirements_met &= min_area_met

    if self.check_overlap:

        # Get the lower and upper bounds.
        if isinstance(self.overlap_bounds, BoundGenerator):
            lower, upper = self.overlap_bounds()  # Call BoundGenerator's __call__ method.
        else:
            lower, upper = self.overlap_bounds

        # Compute which boxes are valid.

        if self.overlap_criterion == 'iou':
            # Compute the patch coordinates.
            image_coords = np.array([0, 0, image_width, image_height])
            # Compute the IoU between the patch and all of the ground truth boxes.
            # NOTE(review): other call sites in this file spell the mode 'element-wise';
            # confirm which spelling `iou()` accepts.
            image_boxes_iou = iou(image_coords,
                                  labels[:, [xmin, ymin, xmax, ymax]],
                                  coords='corners',
                                  mode='element_wise',
                                  border_pixels=self.border_pixels)
            requirements_met &= (image_boxes_iou > lower) & (image_boxes_iou <= upper)

        elif self.overlap_criterion == 'area':
            # Correction added to every `xmax - xmin` / `ymax - ymin` difference:
            # 'include' counts both border pixels (+1), 'exclude' counts neither (-1),
            # 'half' counts one of the two (0).
            border_corrections = {'half': 0, 'include': 1, 'exclude': -1}
            if self.border_pixels not in border_corrections:
                raise ValueError(
                    "`border_pixels` must be one of 'half', 'include' or 'exclude', "
                    "but got {!r}.".format(self.border_pixels))
            d = border_corrections[self.border_pixels]

            # Compute the area of the boxes.
            box_areas = (labels[:, xmax] - labels[:, xmin] + d) * (labels[:, ymax] - labels[:, ymin] + d)

            # Compute the intersection area between the patch and all of the ground truth boxes.
            clipped_boxes = np.copy(labels)
            clipped_boxes[:, [ymin, ymax]] = np.clip(labels[:, [ymin, ymax]], a_min=0, a_max=image_height - 1)
            clipped_boxes[:, [xmin, xmax]] = np.clip(labels[:, [xmin, xmax]], a_min=0, a_max=image_width - 1)
            intersection_areas = (clipped_boxes[:, xmax] - clipped_boxes[:, xmin] + d) * \
                                 (clipped_boxes[:, ymax] - clipped_boxes[:, ymin] + d)

            # Check which boxes meet the overlap requirements.
            if lower == 0.0:
                # If `lower == 0`, we want to make sure that boxes with area 0 don't count,
                # hence the ">" sign instead of the ">=" sign.
                mask_lower = intersection_areas > 0.0
            else:
                mask_lower = intersection_areas >= lower * box_areas
            mask_upper = intersection_areas <= upper * box_areas
            requirements_met &= mask_lower & mask_upper

        elif self.overlap_criterion == 'center_point':
            # Compute the center points of the boxes.
            cy = (labels[:, ymin] + labels[:, ymax]) / 2
            cx = (labels[:, xmin] + labels[:, xmax]) / 2
            # A box passes if its center point lies within the image.
            image_box_center = ((cy >= 0.0) & (cy <= image_height - 1) &
                                (cx >= 0.0) & (cx <= image_width - 1))
            requirements_met &= image_box_center

    return labels[requirements_met]
def greedy_nms(y_pred_decoded, iou_threshold=0.45, coords='corners', border_pixels='half'):
    """
    Perform greedy non-maximum suppression on every item of a batch of decoded predictions.

    For each batch item the box with the highest score is kept and all remaining
    boxes that overlap it too much (measured by IoU) are removed. The same step is
    then repeated on the boxes that are left until none remain.

    Arguments:
        y_pred_decoded (list): A batch of decoded predictions: an iterable with one
            2D Numpy array per batch item, one row per predicted box. Column 1 of each
            row is used as the score and columns 2 onward are handed to `iou()` as the
            box coordinates, e.g. rows of the form `[class_id, score, <4 box coordinates>]`
            with the coordinates in the format named by `coords`. Column 0 is not read.
            This function is agnostic to the scale of the score or what it represents.
        iou_threshold (float, optional): All boxes with a Jaccard similarity of greater
            than `iou_threshold` with a locally maximal box will be removed from the set
            of predictions, where 'maximal' refers to the box score.
        coords (str, optional): The coordinate format of the boxes. Forwarded to `iou()`.
        border_pixels (str, optional): How to treat the border pixels of the bounding
            boxes ('include', 'exclude' or 'half'). Forwarded to `iou()`.

    Returns:
        A list with one array per batch item holding the rows that survived, in the
        order they were selected (descending score). A batch item without rows yields
        `np.array([])`.
    """

    def suppress(batch_item):
        # Run the greedy selection for a single batch item.
        remaining = np.copy(batch_item)
        survivors = []
        while remaining.shape[0] > 0:
            # The highest-scoring row is kept unconditionally...
            top_index = np.argmax(remaining[:, 1])
            top_box = np.copy(remaining[top_index])
            survivors.append(top_box)
            # ...and removed from the pool of rows still to be examined.
            remaining = np.delete(remaining, top_index, axis=0)
            if remaining.shape[0] == 0:
                break
            # NOTE(review): some call sites in this file spell the mode
            # 'element_wise'; confirm which spelling `iou()` accepts.
            overlaps = iou(remaining[:, 2:],
                           top_box[2:],
                           coords=coords,
                           mode='element-wise',
                           border_pixels=border_pixels)
            # Rows overlapping the kept box by more than the threshold are dropped.
            remaining = remaining[overlaps <= iou_threshold]
        return np.array(survivors)

    return [suppress(batch_item) for batch_item in y_pred_decoded]