def decode_detections_debug(pred,
                            confidence_thresh=0.01,
                            iou_threshold=0.45,
                            top_k=200,
                            input_coords='centroids',
                            normalize_coords=True,
                            img_h=None,
                            img_w=None,
                            variance_encoded_in_target=False,
                            border_pixels='half'):
    if normalize_coords and ((img_h is None) or (img_w is None)):
        raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, "
                         "the decoder needs the image size in order to decode the predictions, "
                         "but `img_h == {}` and `img_w == {}`".format(img_h, img_w))

    # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates.

    # Slice out the classes and the four offsets, throw away the anchor coordinates and variances,
    # resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`.
    pred_decoded_raw = np.copy(pred[:, :, :-8])

    if input_coords == 'centroids':
        if variance_encoded_in_target:
            # Decode the predicted box center x and y coordinates.
            pred_decoded_raw[:, :, [-4, -3]] = pred_decoded_raw[:, :, [-4, -3]] * pred[:, :, [-6, -5]] + pred[:, :, [-8, -7]]
            # Decode the predicted box width and height.
            pred_decoded_raw[:, :, [-2, -1]] = np.exp(pred_decoded_raw[:, :, [-2, -1]]) * pred[:, :, [-6, -5]]
        else:
            # Decode the predicted box center x and y coordinates.
            pred_decoded_raw[:, :, [-4, -3]] = pred_decoded_raw[:, :, [-4, -3]] * pred[:, :, [-6, -5]] * pred[:, :, [-4, -3]] + pred[:, :, [-8, -7]]
            # Decode the predicted box width and height.
            pred_decoded_raw[:, :, [-2, -1]] = np.exp(pred_decoded_raw[:, :, [-2, -1]] * pred[:, :, [-2, -1]]) * pred[:, :, [-6, -5]]
        pred_decoded_raw = convert_coordinates(pred_decoded_raw, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor)
        # for all four coordinates, where 'size' refers to w or h, respectively.
        pred_decoded_raw[:, :, -4:] *= pred[:, :, -4:]
        # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred),
        # delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        pred_decoded_raw[:, :, [-4, -3]] *= np.expand_dims(pred[:, :, -7] - pred[:, :, -8], axis=-1)
        # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred),
        # delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        pred_decoded_raw[:, :, [-2, -1]] *= np.expand_dims(pred[:, :, -5] - pred[:, :, -6], axis=-1)
        # delta(pred) + anchor == pred for all four coordinates.
        pred_decoded_raw[:, :, -4:] += pred[:, :, -8:-4]
        pred_decoded_raw = convert_coordinates(pred_decoded_raw, start_index=-4, conversion='minmax2corners')
    elif input_coords == 'corners':
        # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor)
        # for all four coordinates, where 'size' refers to w or h, respectively.
        pred_decoded_raw[:, :, -4:] *= pred[:, :, -4:]
        # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred),
        # delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        pred_decoded_raw[:, :, [-4, -2]] *= np.expand_dims(pred[:, :, -6] - pred[:, :, -8], axis=-1)
        # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred),
        # delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        pred_decoded_raw[:, :, [-3, -1]] *= np.expand_dims(pred[:, :, -5] - pred[:, :, -7], axis=-1)
        # delta(pred) + anchor == pred for all four coordinates.
        pred_decoded_raw[:, :, -4:] += pred[:, :, -8:-4]
    else:
        raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats "
                         "are 'minmax', 'corners' and 'centroids'.")

    # 2: If the model predicts normalized box coordinates and they are supposed to be converted
    #    back to absolute coordinates, do that.

    if normalize_coords:
        pred_decoded_raw[:, :, [-4, -2]] *= img_w  # Convert xmin, xmax back to absolute coordinates.
        pred_decoded_raw[:, :, [-3, -1]] *= img_h  # Convert ymin, ymax back to absolute coordinates.

    # 3: For each batch item, prepend each box's internal index to its coordinates.

    pred_decoded_raw2 = np.zeros((pred_decoded_raw.shape[0],
                                  pred_decoded_raw.shape[1],
                                  pred_decoded_raw.shape[2] + 1))  # Expand the last axis by one.
    pred_decoded_raw2[:, :, 1:] = pred_decoded_raw
    pred_decoded_raw2[:, :, 0] = np.arange(pred_decoded_raw.shape[1])  # Put the box indices as the first element for each box via broadcasting.
    pred_decoded_raw = pred_decoded_raw2

    # 4: Apply confidence thresholding and non-maximum suppression per class.

    # The number of classes is the length of the last axis minus the four box coordinates and minus the index.
    n_classes = pred_decoded_raw.shape[-1] - 5

    pred_decoded = []  # Store the final predictions in this list.

    for batch_item in pred_decoded_raw:  # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`.
        pred = []  # Store the final predictions for this batch item here.
        for class_id in range(1, n_classes):  # For each class except the background class (which has class ID 0)...
            # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 6]` and...
            single_class = batch_item[:, [0, class_id + 1, -4, -3, -2, -1]]
            # ...keep only those boxes with a confidence above the set threshold.
            threshold_met = single_class[single_class[:, 1] > confidence_thresh]
            if threshold_met.shape[0] > 0:  # If any boxes made the threshold...
                maxima = _greedy_nms_debug(threshold_met,
                                           iou_threshold=iou_threshold,
                                           coords='corners',
                                           border_pixels=border_pixels)  # ...perform NMS on them.
                # Expand the last dimension by one element to have room for the class ID.
                # This is now an array of shape `[n_boxes, 7]`.
                maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1))
                maxima_output[:, 0] = maxima[:, 0]  # Write the box index to the first column...
                maxima_output[:, 1] = class_id  # ...and write the class ID to the second column...
                maxima_output[:, 2:] = maxima[:, 1:]  # ...and write the rest of the maxima data to the other columns...
                pred.append(maxima_output)  # ...and append the maxima for this class to the list of maxima for this batch item.
        # Once we're through with all classes, keep only the `top_k` maxima with the highest scores.
        if pred:
            pred = np.concatenate(pred, axis=0)
        else:
            # No box met the confidence threshold for any class, so there is nothing to concatenate.
            pred = np.empty(shape=(0, 7))
        if pred.shape[0] > top_k:  # If we have more than `top_k` results left at this point, otherwise there is nothing to filter,...
            # ...get the indices of the `top_k` highest-score maxima...
            top_k_indices = np.argpartition(pred[:, 2], kth=pred.shape[0] - top_k, axis=0)[pred.shape[0] - top_k:]
            pred = pred[top_k_indices]  # ...and keep only those entries of `pred`...
        pred_decoded.append(pred)  # ...and now that we're done, append the array of final predictions for this batch item to the output list.

    return pred_decoded
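# A minimal sketch of how `decode_detections_debug` might be called. The tensor layout
# below (class confidences, then 4 offsets, 4 anchor coordinates in centroids format,
# and 4 variances) matches what the function slices out above; the concrete numbers
# and the helper name `_example_decode_debug` are illustrative assumptions only.
def _example_decode_debug():
    n_classes = 3  # Including the background class.
    pred = np.zeros((1, 2, n_classes + 12))  # One batch item, two boxes.
    pred[0, :, :n_classes] = [[0.1, 0.8, 0.1],    # Box 0: confident in class 1.
                              [0.9, 0.05, 0.05]]  # Box 1: mostly background.
    pred[0, :, -8:-4] = [[0.5, 0.5, 0.2, 0.2],    # Anchors as (cx, cy, w, h),
                         [0.3, 0.3, 0.1, 0.1]]    # normalized to [0, 1].
    pred[0, :, -4:] = [0.1, 0.1, 0.2, 0.2]        # Variances.
    # All offsets are zero, so each decoded box should coincide with its anchor.
    decoded = decode_detections_debug(pred, confidence_thresh=0.5,
                                      input_coords='centroids',
                                      normalize_coords=True, img_h=300, img_w=300)
    # `decoded[0]` has one row per kept box:
    # [box index, class ID, confidence, xmin, ymin, xmax, ymax]
    print(decoded[0])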
def decode_detections_fast(pred,
                           confidence_thresh=0.5,
                           iou_threshold=0.45,
                           top_k='all',
                           input_coords='centroids',
                           normalize_coords=True,
                           img_h=None,
                           img_w=None,
                           border_pixels='half'):
    if normalize_coords and ((img_h is None) or (img_w is None)):
        raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, "
                         "the decoder needs the image size in order to decode the predictions, "
                         "but `img_h == {}` and `img_w == {}`".format(img_h, img_w))

    # 1: Convert the classes from one-hot encoding to their class ID.

    # Slice out the four offset predictions plus two elements whereto we'll write the class IDs
    # and confidences in the next step.
    pred_converted = np.copy(pred[:, :, -14:-8])
    # The indices of the highest confidence values in the one-hot class vectors are the class IDs.
    pred_converted[:, :, 0] = np.argmax(pred[:, :, :-12], axis=-1)
    # Store the confidence values themselves, too.
    pred_converted[:, :, 1] = np.amax(pred[:, :, :-12], axis=-1)

    # 2: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates.

    if input_coords == 'centroids':
        # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor),
        # exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
        pred_converted[:, :, [4, 5]] = np.exp(pred_converted[:, :, [4, 5]] * pred[:, :, [-2, -1]])
        # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
        pred_converted[:, :, [4, 5]] *= pred[:, :, [-6, -5]]
        # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred),
        # (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
        pred_converted[:, :, [2, 3]] *= pred[:, :, [-4, -3]] * pred[:, :, [-6, -5]]
        # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
        pred_converted[:, :, [2, 3]] += pred[:, :, [-8, -7]]
        pred_converted = convert_coordinates(pred_converted, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor)
        # for all four coordinates, where 'size' refers to w or h, respectively.
        pred_converted[:, :, 2:] *= pred[:, :, -4:]
        # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred),
        # delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        pred_converted[:, :, [2, 3]] *= np.expand_dims(pred[:, :, -7] - pred[:, :, -8], axis=-1)
        # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred),
        # delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        pred_converted[:, :, [4, 5]] *= np.expand_dims(pred[:, :, -5] - pred[:, :, -6], axis=-1)
        # delta(pred) + anchor == pred for all four coordinates.
        pred_converted[:, :, 2:] += pred[:, :, -8:-4]
        pred_converted = convert_coordinates(pred_converted, start_index=-4, conversion='minmax2corners')
    elif input_coords == 'corners':
        # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor)
        # for all four coordinates, where 'size' refers to w or h, respectively.
        pred_converted[:, :, 2:] *= pred[:, :, -4:]
        # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred),
        # delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        pred_converted[:, :, [2, 4]] *= np.expand_dims(pred[:, :, -6] - pred[:, :, -8], axis=-1)
        # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred),
        # delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        pred_converted[:, :, [3, 5]] *= np.expand_dims(pred[:, :, -5] - pred[:, :, -7], axis=-1)
        # delta(pred) + anchor == pred for all four coordinates.
        pred_converted[:, :, 2:] += pred[:, :, -8:-4]
    else:
        raise ValueError("Unexpected value for `input_coords`. Supported values are 'minmax', 'corners' and 'centroids'.")

    # 3: If the model predicts normalized box coordinates and they are supposed to be converted
    #    back to absolute coordinates, do that.

    if normalize_coords:
        pred_converted[:, :, [2, 4]] *= img_w  # Convert xmin, xmax back to absolute coordinates.
        pred_converted[:, :, [3, 5]] *= img_h  # Convert ymin, ymax back to absolute coordinates.

    # 4: Decode our huge `(batch, #boxes, 6)` tensor into a list of length `batch` where each
    #    list entry is an array containing only the positive predictions.

    pred_decoded = []
    for batch_item in pred_converted:  # For each image in the batch...
        # ...get all boxes that don't belong to the background class,...
        boxes = batch_item[np.nonzero(batch_item[:, 0])]
        # ...then filter out those positive boxes for which the prediction confidence is too low and after that...
        boxes = boxes[boxes[:, 1] >= confidence_thresh]
        if iou_threshold:  # ...if an IoU threshold is set...
            boxes = _greedy_nms2(boxes,
                                 iou_threshold=iou_threshold,
                                 coords='corners',
                                 border_pixels=border_pixels)  # ...perform NMS on the remaining boxes.
        if top_k != 'all' and boxes.shape[0] > top_k:  # If we have more than `top_k` results left at this point...
            # ...get the indices of the `top_k` highest-scoring boxes...
            top_k_indices = np.argpartition(boxes[:, 1], kth=boxes.shape[0] - top_k, axis=0)[boxes.shape[0] - top_k:]
            boxes = boxes[top_k_indices]  # ...and keep only those boxes...
        pred_decoded.append(boxes)  # ...and now that we're done, append the array of final predictions for this batch item to the output list.

    return pred_decoded
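# A quick sketch contrasting the fast decoder with the debug one above: the same toy
# tensor can be reused, but `decode_detections_fast` keeps at most one detection per
# box (the argmax class) and its output rows carry no box index. The helper name is,
# again, an illustrative assumption.
def _example_decode_fast():
    n_classes = 3
    pred = np.zeros((1, 2, n_classes + 12))
    pred[0, :, :n_classes] = [[0.1, 0.8, 0.1], [0.9, 0.05, 0.05]]
    pred[0, :, -8:-4] = [[0.5, 0.5, 0.2, 0.2], [0.3, 0.3, 0.1, 0.1]]
    pred[0, :, -4:] = [0.1, 0.1, 0.2, 0.2]
    decoded = decode_detections_fast(pred, confidence_thresh=0.5,
                                     input_coords='centroids',
                                     normalize_coords=True, img_h=300, img_w=300)
    # Each row of `decoded[0]` is [class ID, confidence, xmin, ymin, xmax, ymax].
    print(decoded[0])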
def generate_anchor_boxes_for_layer(self,
                                    feature_map_size,
                                    aspect_ratios,
                                    this_scale,
                                    next_scale,
                                    this_steps=None,
                                    this_offsets=None,
                                    diagnostics=False):
    size = min(self.img_h, self.img_w)

    # Compute the box widths and heights for all aspect ratios.
    wh_list = []
    for ar in aspect_ratios:
        if (ar == 1):
            # Compute the regular anchor box for aspect ratio 1.
            box_height = box_width = this_scale * size
            wh_list.append((box_width, box_height))
            if self.two_anchor_box:
                # Compute one slightly larger version using the geometric mean of this scale value and the next.
                box_height = box_width = np.sqrt(this_scale * next_scale) * size
                wh_list.append((box_width, box_height))
        else:
            box_width = this_scale * size * np.sqrt(ar)
            box_height = this_scale * size / np.sqrt(ar)
            wh_list.append((box_width, box_height))
    wh_list = np.array(wh_list)
    n_boxes = len(wh_list)

    # Compute the grid of box center points. They are identical for all aspect ratios.

    # Compute the step sizes, i.e. how far apart the anchor box center points will be
    # vertically and horizontally.
    if (this_steps is None):
        step_height = self.img_h / feature_map_size[0]
        step_width = self.img_w / feature_map_size[1]
    else:
        if isinstance(this_steps, (list, tuple)) and (len(this_steps) == 2):
            step_height = this_steps[0]
            step_width = this_steps[1]
        elif isinstance(this_steps, (int, float)):
            step_height = this_steps
            step_width = this_steps

    # Compute the offsets, i.e. at what pixel values the first anchor box center point
    # will be from the top and from the left of the image.
    if (this_offsets is None):
        offset_height = 0.5
        offset_width = 0.5
    else:
        if isinstance(this_offsets, (list, tuple)) and (len(this_offsets) == 2):
            offset_height = this_offsets[0]
            offset_width = this_offsets[1]
        elif isinstance(this_offsets, (int, float)):
            offset_height = this_offsets
            offset_width = this_offsets

    # Now that we have the offsets and step sizes, compute the grid of anchor box center points.
    cy = np.linspace(offset_height * step_height,
                     (offset_height + feature_map_size[0] - 1) * step_height,
                     feature_map_size[0])
    cx = np.linspace(offset_width * step_width,
                     (offset_width + feature_map_size[1] - 1) * step_width,
                     feature_map_size[1])
    cx_grid, cy_grid = np.meshgrid(cx, cy)
    cx_grid = np.expand_dims(cx_grid, -1)  # This is necessary for np.tile() to do what we want further down.
    cy_grid = np.expand_dims(cy_grid, -1)  # This is necessary for np.tile() to do what we want further down.

    # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
    # where the last dimension will contain `(cx, cy, w, h)`.
    boxes_tensor = np.zeros((feature_map_size[0], feature_map_size[1], n_boxes, 4))

    boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, n_boxes))  # Set cx
    boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, n_boxes))  # Set cy
    boxes_tensor[:, :, :, 2] = wh_list[:, 0]  # Set w
    boxes_tensor[:, :, :, 3] = wh_list[:, 1]  # Set h

    # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`.
    boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners')

    # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries.
    if self.clip_boxes:
        x_coords = boxes_tensor[:, :, :, [0, 2]]
        x_coords[x_coords >= self.img_w] = self.img_w - 1
        x_coords[x_coords < 0] = 0
        boxes_tensor[:, :, :, [0, 2]] = x_coords
        y_coords = boxes_tensor[:, :, :, [1, 3]]
        y_coords[y_coords >= self.img_h] = self.img_h - 1
        y_coords[y_coords < 0] = 0
        boxes_tensor[:, :, :, [1, 3]] = y_coords

    # If `normalize_coords` is enabled, normalize the coordinates to be within [0,1].
    if self.normalize_coords:
        boxes_tensor[:, :, :, [0, 2]] /= self.img_w
        boxes_tensor[:, :, :, [1, 3]] /= self.img_h

    # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to
    #       unnecessarily convert back and forth.
    if self.coords == 'centroids':
        # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
        boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half')
    elif self.coords == 'minmax':
        # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
        boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half')

    if diagnostics:
        return boxes_tensor, (cy, cx), wh_list, (step_height, step_width), (offset_height, offset_width)
    else:
        return boxes_tensor
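# A standalone sketch of the width/height computation above for one predictor layer,
# assuming a 300-pixel image, aspect ratios (0.5, 1.0, 2.0), `this_scale=0.2`,
# `next_scale=0.34`, and the extra aspect-ratio-1 box enabled. The helper name and
# all numbers are illustrative assumptions.
def _example_wh_list():
    size = 300
    this_scale, next_scale = 0.2, 0.34
    wh_list = []
    for ar in (0.5, 1.0, 2.0):
        if ar == 1:
            wh_list.append((this_scale * size, this_scale * size))  # Regular ar-1 box.
            s = np.sqrt(this_scale * next_scale) * size             # Geometric-mean box.
            wh_list.append((s, s))
        else:
            wh_list.append((this_scale * size * np.sqrt(ar),
                            this_scale * size / np.sqrt(ar)))
    print(np.array(wh_list))  # Four (width, height) pairs for this layer.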
def call(self, x, mask=None):
    size = min(self.img_h, self.img_w)

    # Compute the box widths and heights for all aspect ratios.
    wh_list = []
    for ar in self.aspect_ratios:
        if (ar == 1):
            # Compute the regular anchor box for aspect ratio 1.
            box_height = box_width = self.this_scale * size
            wh_list.append((box_width, box_height))
            if self.two_anchor_box:
                # Compute one slightly larger version using the geometric mean of this scale value and the next.
                box_height = box_width = np.sqrt(self.this_scale * self.next_scale) * size
                wh_list.append((box_width, box_height))
        else:
            box_height = self.this_scale * size / np.sqrt(ar)
            box_width = self.this_scale * size * np.sqrt(ar)
            wh_list.append((box_width, box_height))
    wh_list = np.array(wh_list)

    # We need the shape of the input tensor.
    if K.image_dim_ordering() == 'tf':
        batch_size, feature_map_height, feature_map_width, feature_map_channels = x._keras_shape
    else:
        # Not yet relevant since TensorFlow is the only supported backend right now,
        # but it can't harm to have this in here for the future.
        batch_size, feature_map_channels, feature_map_height, feature_map_width = x._keras_shape

    # Compute the grid of box center points. They are identical for all aspect ratios.

    # Compute the step sizes, i.e. how far apart the anchor box center points will be
    # vertically and horizontally.
    if (self.this_steps is None):
        step_height = self.img_h / feature_map_height
        step_width = self.img_w / feature_map_width
    else:
        if isinstance(self.this_steps, (list, tuple)) and (len(self.this_steps) == 2):
            step_height = self.this_steps[0]
            step_width = self.this_steps[1]
        elif isinstance(self.this_steps, (int, float)):
            step_height = self.this_steps
            step_width = self.this_steps

    # Compute the offsets, i.e. at what pixel values the first anchor box center point
    # will be from the top and from the left of the image.
    if (self.this_offsets is None):
        offset_height = 0.5
        offset_width = 0.5
    else:
        if isinstance(self.this_offsets, (list, tuple)) and (len(self.this_offsets) == 2):
            offset_height = self.this_offsets[0]
            offset_width = self.this_offsets[1]
        elif isinstance(self.this_offsets, (int, float)):
            offset_height = self.this_offsets
            offset_width = self.this_offsets

    # Now that we have the offsets and step sizes, compute the grid of anchor box center points.
    cy = np.linspace(offset_height * step_height, (offset_height + feature_map_height - 1) * step_height, feature_map_height)
    cx = np.linspace(offset_width * step_width, (offset_width + feature_map_width - 1) * step_width, feature_map_width)
    cx_grid, cy_grid = np.meshgrid(cx, cy)
    cx_grid = np.expand_dims(cx_grid, -1)  # This is necessary for np.tile() to do what we want further down.
    cy_grid = np.expand_dims(cy_grid, -1)  # This is necessary for np.tile() to do what we want further down.

    # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
    # where the last dimension will contain `(cx, cy, w, h)`.
    boxes_tensor = np.zeros((feature_map_height, feature_map_width, self.n_boxes, 4))

    boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, self.n_boxes))  # Set cx
    boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, self.n_boxes))  # Set cy
    boxes_tensor[:, :, :, 2] = wh_list[:, 0]  # Set w
    boxes_tensor[:, :, :, 3] = wh_list[:, 1]  # Set h

    # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`.
    boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners')

    # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries.
    if self.clip_boxes:
        x_coords = boxes_tensor[:, :, :, [0, 2]]
        x_coords[x_coords >= self.img_w] = self.img_w - 1
        x_coords[x_coords < 0] = 0
        boxes_tensor[:, :, :, [0, 2]] = x_coords
        y_coords = boxes_tensor[:, :, :, [1, 3]]
        y_coords[y_coords >= self.img_h] = self.img_h - 1
        y_coords[y_coords < 0] = 0
        boxes_tensor[:, :, :, [1, 3]] = y_coords

    # If `normalize_coords` is enabled, normalize the coordinates to be within [0,1].
    if self.normalize_coords:
        boxes_tensor[:, :, :, [0, 2]] /= self.img_w
        boxes_tensor[:, :, :, [1, 3]] /= self.img_h

    # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to
    #       unnecessarily convert back and forth.
    if self.coords == 'centroids':
        # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
        boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half')
    elif self.coords == 'minmax':
        # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
        boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half')

    # Create a tensor to contain the variances and append it to `boxes_tensor`. This tensor has the same shape
    # as `boxes_tensor` and simply contains the same 4 variance values for every position in the last axis.
    variances_tensor = np.zeros_like(boxes_tensor)  # Has shape `(feature_map_height, feature_map_width, n_boxes, 4)`.
    variances_tensor += self.variances  # Long live broadcasting.
    # Now `boxes_tensor` becomes a tensor of shape `(feature_map_height, feature_map_width, n_boxes, 8)`.
    boxes_tensor = np.concatenate((boxes_tensor, variances_tensor), axis=-1)

    # Now prepend one dimension to `boxes_tensor` to account for the batch size and tile it along the batch dimension.
    # The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)`.
    boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
    boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'), (K.shape(x)[0], 1, 1, 1, 1))

    return boxes_tensor
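# A small sketch of the variance broadcasting trick used at the end of `call`, under
# the assumption of a 2x2 feature map with 4 boxes per cell. `np.zeros_like` plus `+=`
# replicates the same four variance values at every grid position before concatenation.
# The helper name `_example_variances_tensor` is an illustrative assumption.
def _example_variances_tensor():
    boxes_tensor = np.zeros((2, 2, 4, 4))  # (height, width, n_boxes, 4 coordinates)
    variances = np.array([0.1, 0.1, 0.2, 0.2])
    variances_tensor = np.zeros_like(boxes_tensor) + variances  # Broadcast along the last axis.
    out = np.concatenate((boxes_tensor, variances_tensor), axis=-1)
    print(out.shape)  # (2, 2, 4, 8)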
def __call__(self, ground_truth_labels, diagnostics=False):
    # The indices of the items in a ground truth label: `(class_id, xmin, ymin, xmax, ymax)`.
    class_id = 0
    xmin = 1
    ymin = 2
    xmax = 3
    ymax = 4

    batch_size = len(ground_truth_labels)

    y_encoded = self.generate_encoding_template(batch_size=batch_size, diagnostics=False)

    y_encoded[:, :, self.background_id] = 1  # All boxes are background boxes by default.
    n_boxes = y_encoded.shape[1]  # The total number of boxes that the model predicts per batch item.
    class_vectors = np.eye(self.n_classes)  # An identity matrix that we'll use as one-hot class vectors.

    for i in range(batch_size):  # For each batch item...
        if ground_truth_labels[i].size == 0:
            continue  # If there is no ground truth for this batch item, there is nothing to match.
        labels = ground_truth_labels[i].astype(float)  # The labels for this batch item.

        # Check for degenerate ground truth bounding boxes before attempting any computations.
        if np.any(labels[:, [xmax]] - labels[:, [xmin]] <= 0) or np.any(labels[:, [ymax]] - labels[:, [ymin]] <= 0):
            raise DegenerateBoxError("SSDInputEncoder detected degenerate ground truth bounding boxes "
                                     "for batch item {} with bounding boxes {}, ".format(i, labels) +
                                     "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. "
                                     "Degenerate ground truth bounding boxes will lead to NaN errors during the training.")

        # Maybe normalize the box coordinates.
        if self.normalize_coords:
            labels[:, [ymin, ymax]] /= self.img_h  # Normalize ymin and ymax relative to the image height.
            labels[:, [xmin, xmax]] /= self.img_w  # Normalize xmin and xmax relative to the image width.

        # Maybe convert the box coordinate format.
        if self.coords == 'centroids':
            labels = convert_coordinates(labels, start_index=xmin, conversion='corners2centroids', border_pixels=self.border_pixels)
        elif self.coords == 'minmax':
            labels = convert_coordinates(labels, start_index=xmin, conversion='corners2minmax')

        # The one-hot class IDs for the ground truth boxes of this batch item.
        classes_one_hot = class_vectors[labels[:, class_id].astype(int)]
        # The one-hot version of the labels for this batch item.
        labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin, ymin, xmax, ymax]]], axis=-1)

        # Compute the IoU similarities between all anchor boxes and all ground truth boxes for this batch item.
        # This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`.
        similarities = iou(labels[:, [xmin, ymin, xmax, ymax]],
                           y_encoded[i, :, -12:-8],
                           coords=self.coords,
                           mode='outer_product',
                           border_pixels=self.border_pixels)

        # First: Do bipartite matching, i.e. match each ground truth box to the one anchor box
        #        with the highest IoU. This ensures that each ground truth box will have at
        #        least one good match.

        # For each ground truth box, get the anchor box to match with it.
        bipartite_matches = match_bipartite_greedy(weight_matrix=similarities)

        # Write the ground truth data to the matched anchor boxes.
        y_encoded[i, bipartite_matches, :-8] = labels_one_hot

        # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
        similarities[:, bipartite_matches] = 0

        # Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to
        #         its most similar ground truth box with an IoU of at least `pos_iou_threshold`,
        #         or not matched if there is no such ground truth box.
        if self.matching_type == 'multi':
            # Get all matches that satisfy the IoU threshold.
            matches = match_multi(weight_matrix=similarities, threshold=self.pos_iou_threshold)

            # Write the ground truth data to the matched anchor boxes.
            y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]

            # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
            similarities[:, matches[1]] = 0

        # Third: Now after the matching is done, all negative (background) anchor boxes that have
        #        an IoU of `neg_iou_limit` or more with any ground truth box will be set to neutral,
        #        i.e. they will no longer be background boxes. These anchors are "too close" to a
        #        ground truth box to be valid background boxes.
        max_background_similarities = np.amax(similarities, axis=0)
        neutral_boxes = np.nonzero(max_background_similarities >= self.neg_iou_limit)[0]
        y_encoded[i, neutral_boxes, self.background_id] = 0

    ##################################################################################
    # Convert box coordinates to anchor box offsets.
    ##################################################################################

    if self.coords == 'centroids':
        y_encoded[:, :, [-12, -11]] -= y_encoded[:, :, [-8, -7]]  # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
        # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
        y_encoded[:, :, [-12, -11]] /= y_encoded[:, :, [-6, -5]] * y_encoded[:, :, [-4, -3]]
        y_encoded[:, :, [-10, -9]] /= y_encoded[:, :, [-6, -5]]  # w(gt) / w(anchor), h(gt) / h(anchor)
        # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
        y_encoded[:, :, [-10, -9]] = np.log(y_encoded[:, :, [-10, -9]]) / y_encoded[:, :, [-2, -1]]
    elif self.coords == 'corners':
        y_encoded[:, :, -12:-8] -= y_encoded[:, :, -8:-4]  # (gt - anchor) for all four coordinates.
        # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
        y_encoded[:, :, [-12, -10]] /= np.expand_dims(y_encoded[:, :, -6] - y_encoded[:, :, -8], axis=-1)
        # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
        y_encoded[:, :, [-11, -9]] /= np.expand_dims(y_encoded[:, :, -5] - y_encoded[:, :, -7], axis=-1)
        # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively.
        y_encoded[:, :, -12:-8] /= y_encoded[:, :, -4:]
    elif self.coords == 'minmax':
        y_encoded[:, :, -12:-8] -= y_encoded[:, :, -8:-4]  # (gt - anchor) for all four coordinates.
        # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
        y_encoded[:, :, [-12, -11]] /= np.expand_dims(y_encoded[:, :, -7] - y_encoded[:, :, -8], axis=-1)
        # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
        y_encoded[:, :, [-10, -9]] /= np.expand_dims(y_encoded[:, :, -5] - y_encoded[:, :, -6], axis=-1)
        # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively.
        y_encoded[:, :, -12:-8] /= y_encoded[:, :, -4:]

    if diagnostics:
        # Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a
        # ground truth box, but keeping the anchor box coordinates).
        y_matched_anchors = np.copy(y_encoded)
        y_matched_anchors[:, :, -12:-8] = 0  # Keeping the anchor box coordinates means setting the offsets to zero.
        return y_encoded, y_matched_anchors
    else:
        return y_encoded
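# A worked sketch of the 'centroids' offset encoding above for a single ground truth box
# matched to a single anchor. The numbers and the helper name are illustrative assumptions;
# in the encoder these four values end up in `y_encoded[..., -12:-8]`.
def _example_centroid_offsets():
    gt = np.array([0.55, 0.50, 0.24, 0.20])      # (cx, cy, w, h) of the ground truth box.
    anchor = np.array([0.50, 0.50, 0.20, 0.20])  # (cx, cy, w, h) of the matched anchor.
    variances = np.array([0.1, 0.1, 0.2, 0.2])
    d_cx = (gt[0] - anchor[0]) / anchor[2] / variances[0]  # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance
    d_cy = (gt[1] - anchor[1]) / anchor[3] / variances[1]  # (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
    d_w = np.log(gt[2] / anchor[2]) / variances[2]         # ln(w(gt) / w(anchor)) / w_variance
    d_h = np.log(gt[3] / anchor[3]) / variances[3]         # ln(h(gt) / h(anchor)) / h_variance
    print(d_cx, d_cy, d_w, d_h)  # 2.5, 0.0, ~0.912, 0.0 for these numbers.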