def _greedy_nms(predictions, iou_threshold=0.45, coords='corners'):
    '''
    The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
    function for per-class NMS in `decode_detections()`.
    '''
    boxes_left = np.copy(predictions)
    maxima = []  # This is where we store the boxes that make it through the non-maximum suppression
    while boxes_left.shape[0] > 0:  # While there are still boxes left to compare...
        maximum_index = np.argmax(boxes_left[:, 0])  # ...get the index of the next box with the highest confidence...
        maximum_box = np.copy(boxes_left[maximum_index])  # ...copy that box and...
        maxima.append(maximum_box)  # ...append it to `maxima` because we'll definitely keep it
        boxes_left = np.delete(boxes_left, maximum_index, axis=0)  # Now remove the maximum box from `boxes_left`
        if boxes_left.shape[0] == 0: break  # If there are no boxes left after this step, break. Otherwise...
        similarities = iou(boxes_left[:, 1:], maximum_box[1:], coords=coords, mode='element-wise')  # ...compare (IoU) the other left-over boxes to the maximum box...
        boxes_left = boxes_left[similarities <= iou_threshold]  # ...so that we can remove the ones that overlap too much with the maximum box
    return np.array(maxima)
def _greedy_nms2(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
    boxes_left = np.copy(predictions)
    maxima = []  # This is where we store the boxes that make it through the non-maximum suppression
    while boxes_left.shape[0] > 0:  # While there are still boxes left to compare...
        maximum_index = np.argmax(boxes_left[:, 1])  # ...get the index of the next box with the highest confidence...
        maximum_box = np.copy(boxes_left[maximum_index])  # ...copy that box and...
        maxima.append(maximum_box)  # ...append it to `maxima` because we'll definitely keep it
        boxes_left = np.delete(boxes_left, maximum_index, axis=0)  # Now remove the maximum box from `boxes_left`
        if boxes_left.shape[0] == 0: break  # If there are no boxes left after this step, break. Otherwise...
        similarities = iou(boxes_left[:, 2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels)  # ...compare (IoU) the other left-over boxes to the maximum box...
        boxes_left = boxes_left[similarities <= iou_threshold]  # ...so that we can remove the ones that overlap too much with the maximum box
    return np.array(maxima)
def _greedy_nms2(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
    """
    The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
    function in `decode_detections_fast()`.
    """
    boxes_left = np.copy(predictions)
    # This is where we store the boxes that make it through the non-maximum suppression
    maxima = []
    # While there are still boxes left to compare...
    while boxes_left.shape[0] > 0:
        # get the index of the next box with the highest confidence
        maximum_index = np.argmax(boxes_left[:, 1])
        # copy that box
        maximum_box = np.copy(boxes_left[maximum_index])
        # append it to `maxima` because we'll definitely keep it
        maxima.append(maximum_box)
        # Now remove the maximum box from `boxes_left`
        boxes_left = np.delete(boxes_left, maximum_index, axis=0)
        # If there are no boxes left after this step, break. Otherwise...
        if boxes_left.shape[0] == 0:
            break
        # compare (IoU) the other left-over boxes to the maximum box
        similarities = iou(boxes_left[:, 2:], maximum_box[2:], coords=coords,
                           mode='element-wise', border_pixels=border_pixels)
        # so that we can remove the ones that overlap too much with the maximum box
        boxes_left = boxes_left[similarities <= iou_threshold]
    return np.array(maxima)
def _greedy_nms(predictions, iou_threshold=0.45):
    '''
    Perform greedy non-maximum suppression on the input boxes.

    Greedy NMS works by selecting the box with the highest score and removing all boxes around it that are too
    close to it measured by IoU-similarity. Out of the boxes that are left over, once again the one with the
    highest score is selected and so on, until no boxes with too much overlap are left.

    Arguments:
        predictions (array): The decoded predictions for one class of one batch item. For `k` predicted boxes
            this is a 2D Numpy array of shape `(k, 6)`, where each row contains the data for the respective
            box in the format `[score, cx, cy, w, h, angle]`. Technically, the number of columns doesn't have
            to be 6, it can be arbitrary as long as the first element of each row is the score assigned to the
            prediction and the remaining elements are the box coordinates `cx`, `cy`, `w`, `h`, `angle` (in
            this order). Note that this function is agnostic to the scale of the score or what it represents.
        iou_threshold (float, optional): All boxes with a Jaccard similarity of greater than `iou_threshold`
            with a locally maximal box will be removed from the set of predictions, where 'maximal' refers to
            the box score.

    Returns:
        The predictions after removing non-maxima. The format is the same as the input format.
    '''
    boxes_left = np.copy(predictions)
    maxima = []  # This is where we store the boxes that make it through the non-maximum suppression
    while boxes_left.shape[0] > 0:  # While there are still boxes left to compare...
        maximum_index = np.argmax(boxes_left[:, 0])  # ...get the index of the next box with the highest confidence...
        maximum_box = np.copy(boxes_left[maximum_index])  # ...copy that box and...
        maxima.append(maximum_box)  # ...append it to `maxima` because we'll definitely keep it
        boxes_left = np.delete(boxes_left, maximum_index, axis=0)  # Now remove the maximum box from `boxes_left`
        if boxes_left.shape[0] == 0: break  # If there are no boxes left after this step, break. Otherwise...
        similarities = iou(boxes_left[:, 1:], maximum_box[1:])  # ...compare (IoU) the other left-over boxes to the maximum box...
        similarities = np.reshape(similarities, (-1,))
        boxes_left = boxes_left[similarities <= iou_threshold]  # ...so that we can remove the ones that overlap too much with the maximum box
    return np.array(maxima)
def greedy_nms(y_pred_decoded, iou_threshold=0.45, coords='corners', border_pixels='half'):
    '''
    Perform greedy non-maximum suppression on the input boxes.

    Greedy NMS works by selecting the box with the highest score and removing all boxes around it that are too
    close to it measured by IoU-similarity. Out of the boxes that are left over, once again the one with the
    highest score is selected and so on, until no boxes with too much overlap are left.

    Arguments:
        y_pred_decoded (list): A batch of decoded predictions. For a given batch size `n` this is a list of
            length `n` where each list element is a 2D Numpy array. For a batch item with `k` predicted boxes
            this 2D Numpy array has shape `(k, 6)`, where each row contains the class ID and score of the
            respective box followed by its four coordinates in the format indicated by `coords` (e.g.
            `[class_id, score, xmin, ymin, xmax, ymax]` for the default 'corners' format). Technically, the
            number of columns doesn't have to be 6, it can be arbitrary as long as the first two elements of
            each row are the class ID and the score (in this order) and the remaining elements are the box
            coordinates. Note that this function is agnostic to the scale of the score or what it represents.
        iou_threshold (float, optional): All boxes with a Jaccard similarity of greater than `iou_threshold`
            with a locally maximal box will be removed from the set of predictions, where 'maximal' refers to
            the box score.
        coords (str, optional): The coordinate format of `y_pred_decoded`. Can be one of the formats supported
            by `iou()`.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes. Can be 'include',
            'exclude', or 'half'. If 'include', the border pixels belong to the boxes. If 'exclude', the
            border pixels do not belong to the boxes. If 'half', then one of each of the two horizontal and
            vertical borders belongs to the boxes, but not the other.

    Returns:
        The predictions after removing non-maxima. The format is the same as the input format.
    '''
    y_pred_decoded_nms = []
    for batch_item in y_pred_decoded:  # For the labels of each batch item...
        boxes_left = np.copy(batch_item)
        maxima = []  # This is where we store the boxes that make it through the non-maximum suppression
        while boxes_left.shape[0] > 0:  # While there are still boxes left to compare...
            maximum_index = np.argmax(boxes_left[:, 1])  # ...get the index of the next box with the highest confidence...
            maximum_box = np.copy(boxes_left[maximum_index])  # ...copy that box and...
            maxima.append(maximum_box)  # ...append it to `maxima` because we'll definitely keep it
            boxes_left = np.delete(boxes_left, maximum_index, axis=0)  # Now remove the maximum box from `boxes_left`
            if boxes_left.shape[0] == 0: break  # If there are no boxes left after this step, break. Otherwise...
            similarities = iou(boxes_left[:, 2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels)  # ...compare (IoU) the other left-over boxes to the maximum box...
            boxes_left = boxes_left[similarities <= iou_threshold]  # ...so that we can remove the ones that overlap too much with the maximum box
        y_pred_decoded_nms.append(np.array(maxima))
    return y_pred_decoded_nms
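# A minimal usage sketch for `greedy_nms()` above. The boxes and scores are made
# up for illustration; the only assumption is that the `iou()` helper this module
# relies on is importable. Rows follow [class_id, score, xmin, ymin, xmax, ymax]
# with coords='corners'.
if __name__ == '__main__':
    demo_batch = [np.array([[1.0, 0.9, 10, 10, 60, 60],
                            [1.0, 0.8, 12, 12, 62, 62],      # heavy overlap with the first box
                            [1.0, 0.7, 200, 200, 260, 260]])]
    demo_result = greedy_nms(demo_batch, iou_threshold=0.45, coords='corners')
    # Expected: the 0.8-score box is suppressed, the 0.9 and 0.7 boxes survive.
    print(demo_result[0])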
def greedy_nms(y_pred_decoded, iou_threshold=0.45, coords='corners', border_pixels='half'):
    '''
    Perform greedy non-maximum suppression: select the box with the highest score and remove all other boxes
    whose IoU with it is too large. Among the remaining boxes, again select the one with the highest score and
    remove the boxes whose IoU with it is too large. Repeat until no two boxes overlap too much.

    Arguments:
        y_pred_decoded (list): The decoded predictions. For a batch size of `n`, this list contains `n` 2D
            Numpy arrays. For an image with `k` predicted bounding boxes, the corresponding 2D Numpy array has
            shape `(k, 6)`, and each row holds the values `[class_id, score, xmin, xmax, ymin, ymax]`.
        iou_threshold (float, optional): All boxes whose overlap with the box of maximal score is greater than
            `iou_threshold` will be filtered out.
        coords (str, optional): The coordinate format of `y_pred_decoded`.
        border_pixels (str, optional): How to treat the pixels on the bounding box borders. Can be 'include',
            'exclude', or 'half'.

    Returns:
        The result after removing the non-maxima. The output coordinate format is the same as the input.
    '''
    y_pred_decoded_nms = []
    for batch_item in y_pred_decoded:  # For each image...
        boxes_left = np.copy(batch_item)
        maxima = []  # Stores the boxes that survive the non-maximum suppression
        while boxes_left.shape[0] > 0:  # While there are boxes left...
            maximum_index = np.argmax(boxes_left[:, 1])  # ...get the next box with the highest score...
            maximum_box = np.copy(boxes_left[maximum_index])  # ...copy it...
            maxima.append(maximum_box)  # ...and put `maximum_box` into `maxima`, since it will definitely be kept
            boxes_left = np.delete(boxes_left, maximum_index, axis=0)  # Remove `maximum_box` from `boxes_left`
            if boxes_left.shape[0] == 0: break  # If there are no boxes left, break. Otherwise...
            similarities = iou(boxes_left[:, 2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels)  # ...compute the IoU...
            boxes_left = boxes_left[similarities <= iou_threshold]  # ...and remove the boxes that overlap too much with `maximum_box`
        y_pred_decoded_nms.append(np.array(maxima))
    return y_pred_decoded_nms
def _greedy_nms_debug(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
    '''
    The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
    function for per-class NMS in `decode_detections_debug()`. The difference is that it keeps the indices of
    all left-over boxes for each batch item, which allows you to know which predictor layer predicted a given
    output box and is thus useful for debugging.
    '''
    boxes_left = np.copy(predictions)
    maxima = []  # This is where we store the boxes that make it through the non-maximum suppression
    while boxes_left.shape[0] > 0:  # While there are still boxes left to compare...
        maximum_index = np.argmax(boxes_left[:, 1])  # ...get the index of the next box with the highest confidence...
        maximum_box = np.copy(boxes_left[maximum_index])  # ...copy that box and...
        maxima.append(maximum_box)  # ...append it to `maxima` because we'll definitely keep it
        boxes_left = np.delete(boxes_left, maximum_index, axis=0)  # Now remove the maximum box from `boxes_left`
        if boxes_left.shape[0] == 0: break  # If there are no boxes left after this step, break. Otherwise...
        similarities = iou(boxes_left[:, 2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels)  # ...compare (IoU) the other left-over boxes to the maximum box...
        boxes_left = boxes_left[similarities <= iou_threshold]  # ...so that we can remove the ones that overlap too much with the maximum box
    return np.array(maxima)
def greedy_nms(y_pred_decoded, iou_threshold=0.45, coords='corners', border_pixels='half'):
    '''
    Arguments:
        y_pred_decoded (list): A batch of decoded predictions. For a given batch size `n` this is a list of
            length `n` where each list element is a 2D Numpy array. For a batch item with `k` predicted boxes
            this 2D Numpy array has shape `(k, 6)`, where each row contains the class ID and score of the
            respective box followed by its four coordinates in the format indicated by `coords` (e.g.
            `[class_id, score, xmin, ymin, xmax, ymax]` for the default 'corners' format). Technically, the
            number of columns doesn't have to be 6, it can be arbitrary as long as the first two elements of
            each row are the class ID and the score (in this order) and the remaining elements are the box
            coordinates. Note that this function is agnostic to the scale of the score or what it represents.
        iou_threshold (float, optional): All boxes with a Jaccard similarity of greater than `iou_threshold`
            with a locally maximal box will be removed from the set of predictions, where 'maximal' refers to
            the box score.
        coords (str, optional): The coordinate format of `y_pred_decoded`. Can be one of the formats supported
            by `iou()`.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes. Can be 'include',
            'exclude', or 'half'. If 'include', the border pixels belong to the boxes. If 'exclude', the
            border pixels do not belong to the boxes. If 'half', then one of each of the two horizontal and
            vertical borders belongs to the boxes, but not the other.

    Returns:
        The predictions after removing non-maxima. The format is the same as the input format.
    '''
    y_pred_decoded_nms = []
    for batch_item in y_pred_decoded:  # For the labels of each batch item...
        boxes_left = np.copy(batch_item)  # The boxes still left to process
        maxima = []  # Stores the boxes that survive NMS
        while boxes_left.shape[0] > 0:  # While there are still boxes to compare...
            maximum_index = np.argmax(boxes_left[:, 1])  # ...get the index of the next box with the highest score
            maximum_box = np.copy(boxes_left[maximum_index])  # ...copy that box and...
            maxima.append(maximum_box)  # ...append the maximal box to `maxima`
            boxes_left = np.delete(boxes_left, maximum_index, axis=0)  # Remove the maximal box from `boxes_left`
            if boxes_left.shape[0] == 0: break  # If `boxes_left` is empty, stop the loop.
            similarities = iou(boxes_left[:, 2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels)  # Compute the IoU between the remaining boxes and the maximal box
            boxes_left = boxes_left[similarities <= iou_threshold]  # Remove the boxes whose IoU with the maximal box exceeds the threshold
        y_pred_decoded_nms.append(np.array(maxima))
    return y_pred_decoded_nms
def _greedy_nms(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
    '''
    The same non-maximum suppression as above, modified to perform per-class
    non-maximum suppression inside `decode_detections()`.
    '''
    boxes_left = np.copy(predictions)
    maxima = []
    while boxes_left.shape[0] > 0:
        maximum_index = np.argmax(boxes_left[:, 0])
        maximum_box = np.copy(boxes_left[maximum_index])
        maxima.append(maximum_box)
        boxes_left = np.delete(boxes_left, maximum_index, axis=0)
        if boxes_left.shape[0] == 0:
            break
        similarities = iou(boxes_left[:, 1:], maximum_box[1:], coords=coords,
                           mode='element-wise', border_pixels=border_pixels)
        boxes_left = boxes_left[similarities <= iou_threshold]
    return np.array(maxima)
def _greedy_nms(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
    """
    The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
    function for per-class NMS in `decode_detections()`.

    Args:
        predictions (np.array): All confident predictions (i.e. those above the confidence threshold) for one
            class of one batch item. The shape is `(num_conf_predictions, 5)`, where the last axis holds
            `confidence, xmin, ymin, xmax, ymax`.
    """
    boxes_left = np.copy(predictions)
    # This is where we store the boxes that make it through the non-maximum suppression
    maxima = []
    # While there are still boxes left to compare
    while boxes_left.shape[0] > 0:
        # get the index of the next box with the highest confidence...
        maximum_index = np.argmax(boxes_left[:, 0])
        # copy that box
        maximum_box = np.copy(boxes_left[maximum_index])
        # append it to `maxima` because we'll definitely keep it
        maxima.append(maximum_box)
        # Now remove the maximum box from `boxes_left`
        boxes_left = np.delete(boxes_left, maximum_index, axis=0)
        # If there are no boxes left after this step, break.
        if boxes_left.shape[0] == 0:
            break
        # Otherwise compare (IoU) the other left-over boxes to the maximum box...
        similarities = iou(boxes_left[:, 1:], maximum_box[1:], coords=coords,
                           mode='element-wise', border_pixels=border_pixels)
        # so that we can remove the ones that overlap too much with the maximum box
        boxes_left = boxes_left[similarities <= iou_threshold]
    return np.array(maxima)
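# The `_greedy_nms*` helpers above differ only in which column holds the score:
# when the class is implicit (one call per class inside `decode_detections()`),
# the score sits in column 0; the `_greedy_nms2()` variants keep the class ID in
# column 0 and the score in column 1. A hedged sketch of how such a per-class
# driver typically looks -- `confidence_thresh` and the decoded row layout
# [class_id, confidence, xmin, ymin, xmax, ymax] are assumptions for
# illustration, not this repo's exact decoding code.
import numpy as np

def _per_class_nms_sketch(decoded_item, n_classes, confidence_thresh=0.5, iou_threshold=0.45):
    results = []
    for class_id in range(1, n_classes + 1):  # Skip the background class 0.
        single_class = decoded_item[decoded_item[:, 0] == class_id]
        # Keep only confident predictions and drop the class column, so the helper
        # sees rows of [confidence, xmin, ymin, xmax, ymax] with the score in column 0.
        confident = single_class[single_class[:, 1] > confidence_thresh][:, 1:]
        if confident.shape[0] > 0:
            maxima = _greedy_nms(confident, iou_threshold=iou_threshold, coords='corners')
            # Re-attach the class ID in column 0.
            results.append(np.concatenate([np.full((maxima.shape[0], 1), class_id), maxima], axis=1))
    return np.concatenate(results, axis=0) if results else np.empty((0, 6))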
def __call__(self, labels, image_height=None, image_width=None):
    """
    Arguments:
        labels (np.array): The labels to be filtered. This is an array with shape `(m,n)`, where `m` is the
            number of bounding boxes and `n` is the number of elements that defines each bounding box (box
            coordinates, class ID, etc.). The box coordinates are expected to be in the image's coordinate
            system.
        image_height (int): Only relevant if `check_overlap == True`. The height of the image (in pixels) to
            compare the box coordinates to.
        image_width (int): Only relevant if `check_overlap == True`. The width of the image (in pixels) to
            compare the box coordinates to.

    Returns:
        An array containing the labels of all boxes that are valid.
    """
    labels = np.copy(labels)
    xmin = self.labels_format.index('xmin')
    ymin = self.labels_format.index('ymin')
    xmax = self.labels_format.index('xmax')
    ymax = self.labels_format.index('ymax')

    # Record the boxes that pass all checks here.
    # (Marks for each box whether it passed the checks.)
    requirements_met = np.ones(shape=labels.shape[0], dtype=bool)

    if self.check_degenerate:
        non_degenerate = (labels[:, xmax] > labels[:, xmin]) * (labels[:, ymax] > labels[:, ymin])
        requirements_met *= non_degenerate

    if self.check_min_area:
        min_area_met = (labels[:, xmax] - labels[:, xmin]) * (labels[:, ymax] - labels[:, ymin]) >= self.min_area
        requirements_met *= min_area_met

    if self.check_overlap:
        # Get the lower and upper bounds.
        if isinstance(self.overlap_bounds, BoundGenerator):
            lower, upper = self.overlap_bounds()
        else:
            lower, upper = self.overlap_bounds

        # Compute which boxes are valid.
        if self.overlap_criterion == 'iou':
            # Compute the patch coordinates.
            # UNCLEAR: why not np.array([0, 0, image_width - 1, image_height - 1])
            image_coords = np.array([0, 0, image_width, image_height])
            # Compute the IoU between the patch and all of the ground truth boxes.
            # The result has shape (num_boxes,).
            image_boxes_iou = iou(image_coords, labels[:, [xmin, ymin, xmax, ymax]], coords='corners',
                                  mode='element-wise', border_pixels=self.border_pixels)
            # Check which boxes meet the overlap requirements.
            # If `self.lower == 0`, we want to make sure that boxes with area 0 don't count,
            # hence the ">" sign instead of the ">=" sign.
            if lower == 0.0:
                mask_lower = image_boxes_iou > lower
            # Especially for the case `self.lower == 1` we want the ">=" sign,
            # otherwise no boxes would count at all.
            else:
                mask_lower = image_boxes_iou >= lower
            mask_upper = image_boxes_iou <= upper
            requirements_met *= mask_lower * mask_upper
        elif self.overlap_criterion == 'area':
            if self.border_pixels == 'half':
                d = 0
            # If border pixels are supposed to belong to the bounding boxes, we have to
            # add one pixel to any difference `x_max - x_min` or `y_max - y_min`.
            elif self.border_pixels == 'include':
                d = 1
            # If border pixels are not supposed to belong to the bounding boxes, we have to
            # subtract one pixel from any difference `x_max - x_min` or `y_max - y_min`.
            else:
                d = -1
            # Compute the areas of the boxes.
            box_areas = (labels[:, xmax] - labels[:, xmin] + d) * (labels[:, ymax] - labels[:, ymin] + d)
            # Compute the intersection area between the patch and all of the ground truth boxes.
            clipped_boxes = np.copy(labels)
            clipped_boxes[:, [ymin, ymax]] = np.clip(labels[:, [ymin, ymax]], a_min=0, a_max=image_height - 1)
            clipped_boxes[:, [xmin, xmax]] = np.clip(labels[:, [xmin, xmax]], a_min=0, a_max=image_width - 1)
            intersection_areas = (clipped_boxes[:, xmax] - clipped_boxes[:, xmin] + d) * \
                                 (clipped_boxes[:, ymax] - clipped_boxes[:, ymin] + d)
            # Check which boxes meet the overlap requirements.
            # If `self.lower == 0`, we want to make sure that boxes with area 0 don't count,
            # hence the ">" sign instead of the ">=" sign.
            if lower == 0.0:
                mask_lower = intersection_areas > lower * box_areas
            # Especially for the case `self.lower == 1` we want the ">=" sign,
            # otherwise no boxes would count at all.
            else:
                mask_lower = intersection_areas >= lower * box_areas
            mask_upper = intersection_areas <= upper * box_areas
            requirements_met *= mask_lower * mask_upper
        elif self.overlap_criterion == 'center_point':
            # Compute the center points of the boxes.
            cy = (labels[:, ymin] + labels[:, ymax]) / 2
            cx = (labels[:, xmin] + labels[:, xmax]) / 2
            # Check which of the boxes have center points within the cropped patch and remove those that don't.
            requirements_met *= (cy >= 0.0) * (cy <= image_height - 1) * (cx >= 0.0) * (cx <= image_width - 1)

    return labels[requirements_met]
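# A small numeric sketch of the 'area' criterion above; the numbers are made up.
# With border_pixels='include' (d = +1), a 10x10-pixel box whose right half is
# cut off by the image border keeps intersection_area / box_area = 0.5, so it
# survives a lower bound of 0.3 but not one of 0.6.
import numpy as np

xmin_b, ymin_b, xmax_b, ymax_b = 95, 10, 104, 19      # a 10x10 box (with d = +1)
image_width, image_height, d = 100, 200, 1
box_area = (xmax_b - xmin_b + d) * (ymax_b - ymin_b + d)             # 100 pixels
clipped_xmax = np.clip(xmax_b, 0, image_width - 1)                   # 99
intersection = (clipped_xmax - xmin_b + d) * (ymax_b - ymin_b + d)   # 50 pixels
print(intersection / box_area)                                       # 0.5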
def __call__(self, ground_truth_labels):
    '''
    Converts the ground truth data into the format needed for training.

    Arguments:
        ground_truth_labels (list): One array per batch image, with one row per ground truth
            box in the format `(class_id, xmin, ymin, xmax, ymax)`.

    Returns:
        `y_encoded`, an array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`.
    '''
    # 1: Index layout of the ground truth labels.
    class_id = 0
    xmin = 1
    ymin = 2
    xmax = 3
    ymax = 4

    batch_size = len(ground_truth_labels)

    # Generate the anchor box template of shape (batch_size, #boxes, #classes + 12).
    y_encoded = self.generate_encoding_template(batch_size=batch_size)

    # Match ground truth boxes to anchor boxes.
    y_encoded[:, :, self.background_id] = 1  # All boxes are background by default.
    n_boxes = y_encoded.shape[1]
    class_vectors = np.eye(self.n_classes)  # one-hot class vectors

    for i in range(batch_size):  # For each batch item...
        if ground_truth_labels[i].size == 0:
            continue  # If there is no ground truth for this batch item, there is nothing to match.
        labels = ground_truth_labels[i].astype(float)  # The labels for this batch item

        # Check for degenerate ground truth bounding boxes before attempting any computations.
        if np.any(labels[:, [xmax]] - labels[:, [xmin]] <= 0) or np.any(labels[:, [ymax]] - labels[:, [ymin]] <= 0):
            raise DegenerateBoxError(
                "SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, "
                .format(i, labels) +
                "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. Degenerate ground truth " +
                "bounding boxes will lead to NaN errors during the training.")

        # Normalize the coordinates.
        if self.normalize_coords:
            labels[:, [ymin, ymax]] /= self.img_height
            labels[:, [xmin, xmax]] /= self.img_width

        # Maybe convert the coordinate format.
        if self.coords == 'centroids':
            labels = convert_coordinates(labels, start_index=xmin, conversion='corners2centroids',
                                         border_pixels=self.border_pixels)
        elif self.coords == 'minmax':
            labels = convert_coordinates(labels, start_index=xmin, conversion='corners2minmax')

        classes_one_hot = class_vectors[labels[:, class_id].astype(int)]  # The one-hot class IDs for the ground truth boxes of this batch item
        labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin, ymin, xmax, ymax]]], axis=-1)  # The one-hot version of the labels for this batch item

        # Compute the IoU, a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`.
        similarities = iou(labels[:, [xmin, ymin, xmax, ymax]], y_encoded[i, :, -12:-8], coords=self.coords,
                           mode='outer_product', border_pixels=self.border_pixels)

        # 1: Find the one default box with the highest IoU for each ground truth box. This guarantees
        #    that every ground truth box is matched to at least one default box.
        bipartite_matches = match_bipartite_greedy(weight_matrix=similarities)
        # Write the ground truth labels to the matched default boxes.
        y_encoded[i, bipartite_matches, :-8] = labels_one_hot
        # Zero out the matched default boxes' columns to mark them as matched.
        similarities[:, bipartite_matches] = 0

        # 2: Each remaining default box looks for the ground truth box with which it has the highest IoU;
        #    if that IoU is greater than the threshold `pos_iou_threshold`, the match succeeds.
        if self.matching_type == 'multi':
            matches = match_multi(weight_matrix=similarities, threshold=self.pos_iou_threshold)
            y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]
            similarities[:, matches[1]] = 0

        # Finally: among the remaining boxes, those whose IoU with any ground truth box is at least
        # `neg_iou_limit` are set to neutral, because they are too close to a ground truth box to be
        # suitable as background-class training examples.
        max_background_similarities = np.amax(similarities, axis=0)
        neutral_boxes = np.nonzero(max_background_similarities >= self.neg_iou_limit)[0]
        y_encoded[i, neutral_boxes, self.background_id] = 0

    # 2: Convert the coordinates into offset values.
    if self.coords == 'centroids':
        y_encoded[:, :, [-12, -11]] -= y_encoded[:, :, [-8, -7]]  # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
        y_encoded[:, :, [-12, -11]] /= y_encoded[:, :, [-6, -5]] * y_encoded[:, :, [-4, -3]]  # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
        y_encoded[:, :, [-10, -9]] /= y_encoded[:, :, [-6, -5]]  # w(gt) / w(anchor), h(gt) / h(anchor)
        y_encoded[:, :, [-10, -9]] = np.log(y_encoded[:, :, [-10, -9]]) / y_encoded[:, :, [-2, -1]]  # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
    elif self.coords == 'corners':
        y_encoded[:, :, -12:-8] -= y_encoded[:, :, -8:-4]  # (gt - anchor) for all four coordinates
        y_encoded[:, :, [-12, -10]] /= np.expand_dims(y_encoded[:, :, -6] - y_encoded[:, :, -8], axis=-1)  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
        y_encoded[:, :, [-11, -9]] /= np.expand_dims(y_encoded[:, :, -5] - y_encoded[:, :, -7], axis=-1)  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
        y_encoded[:, :, -12:-8] /= y_encoded[:, :, -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
    elif self.coords == 'minmax':
        y_encoded[:, :, -12:-8] -= y_encoded[:, :, -8:-4]  # (gt - anchor) for all four coordinates
        y_encoded[:, :, [-12, -11]] /= np.expand_dims(y_encoded[:, :, -7] - y_encoded[:, :, -8], axis=-1)  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
        y_encoded[:, :, [-10, -9]] /= np.expand_dims(y_encoded[:, :, -5] - y_encoded[:, :, -6], axis=-1)  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
        y_encoded[:, :, -12:-8] /= y_encoded[:, :, -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively

    return y_encoded
def __call__(self, ground_truth_labels, diagnostics=False):
    '''
    Converts ground truth bounding box data into a suitable format to train an SSD model.

    Arguments:
        ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array for
            each batch image. Each such array has `k` rows for the `k` ground truth bounding boxes belonging
            to the respective image, and the data for each ground truth bounding box has the format
            `(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format), and `class_id` must
            be an integer greater than 0 for all boxes as class ID 0 is reserved for the background class.
        diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned,
            but also a copy of it with anchor box coordinates in place of the ground truth coordinates. This
            can be very useful if you want to visualize which anchor boxes got matched to which ground truth
            boxes.

    Returns:
        `y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the
        ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the
        model per image, and the classes are one-hot-encoded. The four elements after the class vectors in the
        last axis are the box coordinates, the next four elements after that are just dummy elements, and the
        last four elements are the variances.
    '''
    # Mapping to define which indices represent which coordinates in the ground truth.
    class_id = 0
    xmin = 1
    ymin = 2
    xmax = 3
    ymax = 4

    batch_size = len(ground_truth_labels)

    ##################################################################################
    # Generate the template for y_encoded.
    ##################################################################################

    y_encoded = self.generate_encoding_template(batch_size=batch_size, diagnostics=False)

    ##################################################################################
    # Match ground truth boxes to anchor boxes.
    ##################################################################################

    # Match the ground truth boxes to the anchor boxes. Every anchor box that does not have
    # a ground truth match and for which the maximal IoU overlap with any ground truth box is less
    # than or equal to `neg_iou_limit` will be a negative (background) box.

    y_encoded[:, :, self.background_id] = 1  # All boxes are background boxes by default.
    n_boxes = y_encoded.shape[1]  # The total number of boxes that the model predicts per batch item
    class_vectors = np.eye(self.n_classes)  # An identity matrix that we'll use as one-hot class vectors

    for i in range(batch_size):  # For each batch item...
        if ground_truth_labels[i].size == 0:
            continue  # If there is no ground truth for this batch item, there is nothing to match.
        labels = ground_truth_labels[i].astype(float)  # The labels for this batch item

        # Check for degenerate ground truth bounding boxes before attempting any computations.
        if np.any(labels[:, [xmax]] - labels[:, [xmin]] <= 0) or np.any(labels[:, [ymax]] - labels[:, [ymin]] <= 0):
            raise DegenerateBoxError(
                "SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, "
                .format(i, labels) +
                "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. Degenerate ground truth " +
                "bounding boxes will lead to NaN errors during the training.")

        # Maybe normalize the box coordinates.
        if self.normalize_coords:
            labels[:, [ymin, ymax]] /= self.img_height  # Normalize ymin and ymax relative to the image height
            labels[:, [xmin, xmax]] /= self.img_width  # Normalize xmin and xmax relative to the image width

        # Maybe convert the box coordinate format.
        if self.coords == 'centroids':
            labels = convert_coordinates(labels, start_index=xmin, conversion='corners2centroids',
                                         border_pixels=self.border_pixels)
        elif self.coords == 'minmax':
            labels = convert_coordinates(labels, start_index=xmin, conversion='corners2minmax')

        classes_one_hot = class_vectors[labels[:, class_id].astype(int)]  # The one-hot class IDs for the ground truth boxes of this batch item
        labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin, ymin, xmax, ymax]]], axis=-1)  # The one-hot version of the labels for this batch item

        # Compute the IoU similarities between all anchor boxes and all ground truth boxes for this batch item.
        # This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`.
        similarities = iou(labels[:, [xmin, ymin, xmax, ymax]], y_encoded[i, :, -12:-8], coords=self.coords,
                           mode='outer_product', border_pixels=self.border_pixels)

        # First: Do bipartite matching, i.e. match each ground truth box to the one anchor box with the highest IoU.
        #        This ensures that each ground truth box will have at least one good match.

        # For each ground truth box, get the anchor box to match with it.
        bipartite_matches = match_bipartite_greedy(weight_matrix=similarities)

        # Write the ground truth data to the matched anchor boxes.
        y_encoded[i, bipartite_matches, :-8] = labels_one_hot

        # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
        similarities[:, bipartite_matches] = 0

        # Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to its most similar
        #         ground truth box with an IoU of at least `pos_iou_threshold`, or not matched if there is no
        #         such ground truth box.

        if self.matching_type == 'multi':
            # Get all matches that satisfy the IoU threshold.
            matches = match_multi(weight_matrix=similarities, threshold=self.pos_iou_threshold)

            # Write the ground truth data to the matched anchor boxes.
            y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]

            # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
            similarities[:, matches[1]] = 0

        # Third: Now after the matching is done, all negative (background) anchor boxes that have
        #        an IoU of `neg_iou_limit` or more with any ground truth box will be set to neutral,
        #        i.e. they will no longer be background boxes. These anchors are "too close" to a
        #        ground truth box to be valid background boxes.

        max_background_similarities = np.amax(similarities, axis=0)
        neutral_boxes = np.nonzero(max_background_similarities >= self.neg_iou_limit)[0]
        y_encoded[i, neutral_boxes, self.background_id] = 0

    ##################################################################################
    # Convert box coordinates to anchor box offsets.
    ##################################################################################

    if self.coords == 'centroids':
        y_encoded[:, :, [-12, -11]] -= y_encoded[:, :, [-8, -7]]  # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
        y_encoded[:, :, [-12, -11]] /= y_encoded[:, :, [-6, -5]] * y_encoded[:, :, [-4, -3]]  # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
        y_encoded[:, :, [-10, -9]] /= y_encoded[:, :, [-6, -5]]  # w(gt) / w(anchor), h(gt) / h(anchor)
        y_encoded[:, :, [-10, -9]] = np.log(y_encoded[:, :, [-10, -9]]) / y_encoded[:, :, [-2, -1]]  # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
    elif self.coords == 'corners':
        y_encoded[:, :, -12:-8] -= y_encoded[:, :, -8:-4]  # (gt - anchor) for all four coordinates
        y_encoded[:, :, [-12, -10]] /= np.expand_dims(y_encoded[:, :, -6] - y_encoded[:, :, -8], axis=-1)  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
        y_encoded[:, :, [-11, -9]] /= np.expand_dims(y_encoded[:, :, -5] - y_encoded[:, :, -7], axis=-1)  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
        y_encoded[:, :, -12:-8] /= y_encoded[:, :, -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
    elif self.coords == 'minmax':
        y_encoded[:, :, -12:-8] -= y_encoded[:, :, -8:-4]  # (gt - anchor) for all four coordinates
        y_encoded[:, :, [-12, -11]] /= np.expand_dims(y_encoded[:, :, -7] - y_encoded[:, :, -8], axis=-1)  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
        y_encoded[:, :, [-10, -9]] /= np.expand_dims(y_encoded[:, :, -5] - y_encoded[:, :, -6], axis=-1)  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
        y_encoded[:, :, -12:-8] /= y_encoded[:, :, -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively

    if diagnostics:
        # Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a ground truth box, but keeping the anchor box coordinates).
        y_matched_anchors = np.copy(y_encoded)
        y_matched_anchors[:, :, -12:-8] = 0  # Keeping the anchor box coordinates means setting the offsets to zero.
        return y_encoded, y_matched_anchors
    else:
        return y_encoded
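# `match_bipartite_greedy()` is not shown in this file. A minimal sketch of the
# greedy bipartite step used above, assuming `weight_matrix` has shape
# (num_ground_truth_boxes, num_anchor_boxes); this is an illustration, not the
# repo's exact implementation.
import numpy as np

def match_bipartite_greedy_sketch(weight_matrix):
    weight_matrix = np.copy(weight_matrix)
    num_gt = weight_matrix.shape[0]
    matches = np.zeros(num_gt, dtype=np.int64)
    for _ in range(num_gt):
        # Pick the (ground truth, anchor) pair with the globally highest remaining IoU...
        anchor_per_gt = np.argmax(weight_matrix, axis=1)
        best_per_gt = weight_matrix[np.arange(num_gt), anchor_per_gt]
        gt_index = np.argmax(best_per_gt)
        matches[gt_index] = anchor_per_gt[gt_index]
        # ...and remove both that ground truth box and that anchor from further consideration.
        weight_matrix[gt_index, :] = 0
        weight_matrix[:, anchor_per_gt[gt_index]] = 0
    return matches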
def match_predictions(self,
                      ignore_neutral_boxes=True,
                      matching_iou_threshold=0.5,
                      border_pixels='include',
                      sorting_algorithm='quicksort',
                      verbose=True,
                      ret=False):
    '''
    Matches predictions to ground truth boxes.

    Note that `predict_on_dataset()` must be called before calling this method.

    Arguments:
        ignore_neutral_boxes (bool, optional): In case the data generator provides annotations indicating
            whether a ground truth bounding box is supposed to either count or be neutral for the evaluation,
            this argument decides what to do with these annotations. If `False`, even boxes that are annotated
            as neutral will be counted into the evaluation. If `True`, neutral boxes will be ignored for the
            evaluation. An example for evaluation-neutrality are the ground truth boxes annotated as
            "difficult" in the Pascal VOC datasets, which are usually treated as neutral for the evaluation.
        matching_iou_threshold (float, optional): A prediction will be considered a true positive if it has a
            Jaccard overlap of at least `matching_iou_threshold` with any ground truth bounding box of the
            same class.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes. Can be 'include',
            'exclude', or 'half'. If 'include', the border pixels belong to the boxes. If 'exclude', the
            border pixels do not belong to the boxes. If 'half', then one of each of the two horizontal and
            vertical borders belongs to the boxes, but not the other.
        sorting_algorithm (str, optional): Which sorting algorithm the matching algorithm should use. This
            argument accepts any valid sorting algorithm for Numpy's `argsort()` function. You will usually
            want to choose between 'quicksort' (fastest and most memory efficient, but not stable) and
            'mergesort' (slightly slower and less memory efficient, but stable). The official Matlab
            evaluation algorithm uses a stable sorting algorithm, so this algorithm is only guaranteed to
            behave identically if you choose 'mergesort' as the sorting algorithm, but it will almost always
            behave identically even if you choose 'quicksort' (but no guarantees).
        verbose (bool, optional): If `True`, will print out the progress during runtime.
        ret (bool, optional): If `True`, returns the true and false positives.

    Returns:
        None by default. Optionally, four nested lists containing the true positives, false positives,
        cumulative true positives, and cumulative false positives for each class.
    '''
    if self.data_generator.labels is None:
        raise ValueError("Matching predictions to ground truth boxes not possible, no ground truth given.")
    if self.prediction_results is None:
        raise ValueError("There are no prediction results. You must run `predict_on_dataset()` before calling this method.")

    class_id_gt = self.gt_format['class_id']
    xmin_gt = self.gt_format['xmin']
    ymin_gt = self.gt_format['ymin']
    xmax_gt = self.gt_format['xmax']
    ymax_gt = self.gt_format['ymax']

    # Convert the ground truth to a more efficient format for what we need
    # to do, which is access ground truth by image ID repeatedly.
    ground_truth = {}
    eval_neutral_available = not (self.data_generator.eval_neutral is None)  # Whether or not we have annotations to decide whether ground truth boxes should be neutral or not.
    for i in range(len(self.data_generator.image_ids)):
        image_id = str(self.data_generator.image_ids[i])
        labels = self.data_generator.labels[i]
        if ignore_neutral_boxes and eval_neutral_available:
            ground_truth[image_id] = (np.asarray(labels), np.asarray(self.data_generator.eval_neutral[i]))
        else:
            ground_truth[image_id] = np.asarray(labels)

    true_positives = [[]]  # The true positives for each class, sorted by descending confidence.
    false_positives = [[]]  # The false positives for each class, sorted by descending confidence.
    cumulative_true_positives = [[]]
    cumulative_false_positives = [[]]

    # Iterate over all classes.
    for class_id in range(1, self.n_classes + 1):
        predictions = self.prediction_results[class_id]

        # Store the matching results in these lists:
        true_pos = np.zeros(len(predictions), dtype=int)  # 1 for every prediction that is a true positive, 0 otherwise
        false_pos = np.zeros(len(predictions), dtype=int)  # 1 for every prediction that is a false positive, 0 otherwise

        # In case there are no predictions at all for this class, we're done here.
        if len(predictions) == 0:
            print("No predictions for class {}/{}".format(class_id, self.n_classes))
            true_positives.append(true_pos)
            false_positives.append(false_pos)
            continue

        # Convert the predictions list for this class into a structured array so that we can sort it by confidence.
        # Get the number of characters needed to store the image ID strings in the structured array.
        num_chars_per_image_id = len(str(predictions[0][0])) + 6  # Keep a few characters buffer in case some image IDs are longer than others.
        # Create the data type for the structured array.
        preds_data_type = np.dtype([('image_id', 'U{}'.format(num_chars_per_image_id)),
                                    ('confidence', 'f4'),
                                    ('xmin', 'f4'),
                                    ('ymin', 'f4'),
                                    ('xmax', 'f4'),
                                    ('ymax', 'f4')])
        # Create the structured array
        predictions = np.array(predictions, dtype=preds_data_type)

        # Sort the detections by decreasing confidence.
        descending_indices = np.argsort(-predictions['confidence'], kind=sorting_algorithm)
        predictions_sorted = predictions[descending_indices]

        if verbose:
            tr = trange(len(predictions), file=sys.stdout)
            tr.set_description("Matching predictions to ground truth, class {}/{}.".format(class_id, self.n_classes))
        else:
            tr = range(len(predictions_sorted))

        # Keep track of which ground truth boxes were already matched to a detection.
        gt_matched = {}

        # Iterate over all predictions.
        for i in tr:
            prediction = predictions_sorted[i]
            image_id = prediction['image_id']
            pred_box = np.asarray(list(prediction[['xmin', 'ymin', 'xmax', 'ymax']]))  # Convert the structured array element to a regular array.

            # Get the relevant ground truth boxes for this prediction,
            # i.e. all ground truth boxes that match the prediction's
            # image ID and class ID.

            # The ground truth could either be a tuple with `(ground_truth_boxes, eval_neutral_boxes)`
            # or only `ground_truth_boxes`.
            if ignore_neutral_boxes and eval_neutral_available:
                gt, eval_neutral = ground_truth[image_id]
            else:
                gt = ground_truth[image_id]
            gt = np.asarray(gt)
            class_mask = gt[:, class_id_gt] == class_id
            gt = gt[class_mask]
            if ignore_neutral_boxes and eval_neutral_available:
                eval_neutral = eval_neutral[class_mask]

            if gt.size == 0:
                # If the image doesn't contain any objects of this class,
                # the prediction becomes a false positive.
                false_pos[i] = 1
                continue

            # Compute the IoU of this prediction with all ground truth boxes of the same class.
            overlaps = iou(boxes1=gt[:, [xmin_gt, ymin_gt, xmax_gt, ymax_gt]],
                           boxes2=pred_box,
                           coords='corners',
                           mode='element-wise',
                           border_pixels=border_pixels)

            # For each detection, match the ground truth box with the highest overlap.
            # It's possible that the same ground truth box will be matched to multiple
            # detections.
            gt_match_index = np.argmax(overlaps)
            gt_match_overlap = overlaps[gt_match_index]

            if gt_match_overlap < matching_iou_threshold:
                # False positive, IoU threshold violated:
                # Those predictions whose matched overlap is below the threshold become
                # false positives.
                false_pos[i] = 1
            else:
                if not (ignore_neutral_boxes and eval_neutral_available) or (eval_neutral[gt_match_index] == False):
                    # If this is not a ground truth that is supposed to be evaluation-neutral
                    # (i.e. should be skipped for the evaluation) or if we don't even have the
                    # concept of neutral boxes.
                    if not (image_id in gt_matched):
                        # True positive:
                        # If the matched ground truth box for this prediction hasn't been matched to a
                        # different prediction already, we have a true positive.
                        true_pos[i] = 1
                        gt_matched[image_id] = np.zeros(shape=(gt.shape[0]), dtype=bool)
                        gt_matched[image_id][gt_match_index] = True
                    elif not gt_matched[image_id][gt_match_index]:
                        # True positive:
                        # If the matched ground truth box for this prediction hasn't been matched to a
                        # different prediction already, we have a true positive.
                        true_pos[i] = 1
                        gt_matched[image_id][gt_match_index] = True
                    else:
                        # False positive, duplicate detection:
                        # If the matched ground truth box for this prediction has already been matched
                        # to a different prediction previously, it is a duplicate detection for an
                        # already detected object, which counts as a false positive.
                        false_pos[i] = 1

        true_positives.append(true_pos)
        false_positives.append(false_pos)

        cumulative_true_pos = np.cumsum(true_pos)  # Cumulative sums of the true positives
        cumulative_false_pos = np.cumsum(false_pos)  # Cumulative sums of the false positives

        cumulative_true_positives.append(cumulative_true_pos)
        cumulative_false_positives.append(cumulative_false_pos)

    self.true_positives = true_positives
    self.false_positives = false_positives
    self.cumulative_true_positives = cumulative_true_positives
    self.cumulative_false_positives = cumulative_false_positives

    if ret:
        return true_positives, false_positives, cumulative_true_positives, cumulative_false_positives
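# Once `match_predictions()` has stored `cumulative_true_positives` and
# `cumulative_false_positives`, per-class precision and recall follow directly.
# A hedged sketch; `num_gt` (the number of ground truth boxes of the class) is
# an assumed input that the evaluator computes elsewhere.
import numpy as np

def precision_recall_sketch(cumulative_tp, cumulative_fp, num_gt):
    cumulative_tp = np.asarray(cumulative_tp, dtype=np.float64)
    cumulative_fp = np.asarray(cumulative_fp, dtype=np.float64)
    # Precision at each detection rank; guard against division by zero.
    precision = cumulative_tp / np.maximum(cumulative_tp + cumulative_fp, 1e-12)
    # Recall at each detection rank.
    recall = cumulative_tp / max(num_gt, 1)
    return precision, recall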
def __call__(self, labels, image_height=None, image_width=None):
    '''
    Arguments:
        labels (array): An array of shape `(m, n)`, where `m` is the number of bounding boxes and `n` is the
            number of elements that defines each bounding box.
        image_height (int): Only relevant if `check_overlap == True`. The height of the image (in pixels) to
            compare the box coordinates to.
        image_width (int): Only relevant if `check_overlap == True`. The width of the image (in pixels) to
            compare the box coordinates to.

    Returns:
        An array containing the labels of all boxes that are valid.
    '''
    labels = np.copy(labels)
    xmin = self.labels_format['xmin']
    ymin = self.labels_format['ymin']
    xmax = self.labels_format['xmax']
    ymax = self.labels_format['ymax']

    # Record the boxes that pass all checks here.
    requirements_met = np.ones(shape=labels.shape[0], dtype=bool)

    if self.check_degenerate:
        non_degenerate = (labels[:, xmax] > labels[:, xmin]) * (labels[:, ymax] > labels[:, ymin])
        requirements_met *= non_degenerate

    if self.check_min_area:
        min_area_met = (labels[:, xmax] - labels[:, xmin]) * (labels[:, ymax] - labels[:, ymin]) >= self.min_area
        requirements_met *= min_area_met

    if self.check_overlap:
        # Get the lower and upper bounds.
        if isinstance(self.overlap_bounds, BoundGenerator):
            lower, upper = self.overlap_bounds()
        else:
            lower, upper = self.overlap_bounds

        # Compute which boxes are valid.
        if self.overlap_criterion == 'iou':
            # Compute the patch coordinates.
            image_coords = np.array([0, 0, image_width, image_height])
            # Compute the IoU between the patch and all of the ground truth boxes.
            image_boxes_iou = iou(image_coords, labels[:, [xmin, ymin, xmax, ymax]], coords='corners',
                                  mode='element-wise', border_pixels=self.border_pixels)
            requirements_met *= (image_boxes_iou > lower) * (image_boxes_iou <= upper)
        elif self.overlap_criterion == 'area':
            if self.border_pixels == 'half':
                d = 0
            elif self.border_pixels == 'include':
                d = 1  # `xmax - xmin + 1` or `ymax - ymin + 1`.
            elif self.border_pixels == 'exclude':
                d = -1  # `xmax - xmin - 1` or `ymax - ymin - 1`.
            # Compute the areas of the boxes.
            box_areas = (labels[:, xmax] - labels[:, xmin] + d) * (labels[:, ymax] - labels[:, ymin] + d)
            # Compute the intersection area between the patch and all of the ground truth boxes.
            clipped_boxes = np.copy(labels)
            clipped_boxes[:, [ymin, ymax]] = np.clip(labels[:, [ymin, ymax]], a_min=0, a_max=image_height - 1)
            clipped_boxes[:, [xmin, xmax]] = np.clip(labels[:, [xmin, xmax]], a_min=0, a_max=image_width - 1)
            intersection_areas = (clipped_boxes[:, xmax] - clipped_boxes[:, xmin] + d) * \
                                 (clipped_boxes[:, ymax] - clipped_boxes[:, ymin] + d)  # `d` accounts for whether the border pixels belong to the box areas.
            # Check which boxes meet the overlap requirements.
            if lower == 0.0:
                mask_lower = intersection_areas > lower * box_areas  # If `self.lower == 0`, we want to make sure that boxes with area 0 don't count, hence the ">" sign instead of the ">=" sign.
            else:
                mask_lower = intersection_areas >= lower * box_areas  # Especially for the case `self.lower == 1` we want the ">=" sign, otherwise no boxes would count at all.
            mask_upper = intersection_areas <= upper * box_areas
            requirements_met *= mask_lower * mask_upper
        elif self.overlap_criterion == 'center_point':
            # Compute the center points of the boxes.
            cy = (labels[:, ymin] + labels[:, ymax]) / 2
            cx = (labels[:, xmin] + labels[:, xmax]) / 2
            # Check which of the boxes have center points within the cropped patch and remove those that don't.
            requirements_met *= (cy >= 0.0) * (cy <= image_height - 1) * (cx >= 0.0) * (cx <= image_width - 1)

    return labels[requirements_met]
for boxp in y_pred_decoded[i]:
    # If the box predicted a bond that is not cut...
    if boxp[0] == 1:
        # We consider the predicted box as not matched until we can prove it is.
        matched = False
        max_iou = 0
        ind = -1
        # For every ground truth box...
        for row, boxgt in enumerate(gt_labels):
            # ...if the bond is not cut...
            if boxgt[0] == 1:
                # ...calculate the IoU between the predicted box and the ground truth box
                inter_over_union = iou(boxp[2:], boxgt[1:])
                # Keep track of the best-matching ground truth box.
                if inter_over_union > max_iou:
                    max_iou = inter_over_union
                    ind = row
        # If the best match is close enough, we consider the prediction a good prediction => a true positive
        if max_iou > iou_threshold:
            true_positive += 1
            # We remove the ground truth box so that it is not matched twice
            gt_labels = np.delete(gt_labels, ind, 0)
            # We now consider the predicted box as matched
            matched = True
def __call__(self, ground_truth_labels, diagnostics=False):
    '''
    Converts ground truth bounding box data into a suitable format to train an SSD model.

    For each image in the batch, each ground truth bounding box belonging to that image will be compared
    against each anchor box in a template with respect to their jaccard similarity. If the jaccard similarity
    is greater than or equal to the set threshold, the boxes will be matched, meaning that the ground truth
    box coordinates and class will be written to the specific position of the matched anchor box in the
    template.

    The class for all anchor boxes for which there was no match with any ground truth box will be set to the
    background class, except for those anchor boxes whose IoU similarity with any ground truth box is higher
    than the set negative upper bound (see the `neg_iou_limit` argument in `__init__()`).

    Arguments:
        ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array for
            each batch image. Each such array has `k` rows for the `k` ground truth bounding boxes belonging
            to the respective image, and the data for each ground truth bounding box has the format
            `(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format), and `class_id` must
            be an integer greater than 0 for all boxes as class ID 0 is reserved for the background class.
        diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned,
            but also a copy of it with anchor box coordinates in place of the ground truth coordinates. This
            can be very useful if you want to visualize which anchor boxes got matched to which ground truth
            boxes.

    Returns:
        `y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the
        ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the
        model per image, and the classes are one-hot-encoded. The four elements after the class vectors in the
        last axis are the box coordinates, the next four elements after that are just dummy elements, and the
        last four elements are the variances.
    '''
    # 1: Generate the template for y_encoded
    y_encode_template = self.generate_encode_template(batch_size=len(ground_truth_labels), diagnostics=False)
    y_encoded = np.copy(y_encode_template)  # We'll write the ground truth box data to this array

    # 2: Match the boxes from `ground_truth_labels` to the anchor boxes in `y_encode_template`
    #    and for each matched box record the ground truth coordinates in `y_encoded`.
    #    Every time there is no match for an anchor box, record `class_id` 0 in `y_encoded` for that anchor box.
    class_vector = np.eye(self.n_classes)  # An identity matrix that we'll use as one-hot class vectors

    for i in range(y_encode_template.shape[0]):  # For each batch item...
        available_boxes = np.ones((y_encode_template.shape[1]))  # 1 for all anchor boxes that are not yet matched to a ground truth box, 0 otherwise
        negative_boxes = np.ones((y_encode_template.shape[1]))  # 1 for all negative boxes, 0 otherwise
        for true_box in ground_truth_labels[i]:  # For each ground truth box belonging to the current batch item...
            true_box = true_box.astype(float)
            if (true_box[3] - true_box[1] < 0.001) or (true_box[4] - true_box[2] < 0.001):
                continue  # Protect ourselves against bad ground truth data: boxes with width or height equal to zero
            if self.normalize_coords:
                true_box[[1, 3]] /= self.img_width  # Normalize xmin and xmax to be within [0,1]
                true_box[[2, 4]] /= self.img_height  # Normalize ymin and ymax to be within [0,1]
            if self.coords == 'centroids':
                true_box = convert_coordinates(true_box, start_index=1, conversion='corners2centroids')
            elif self.coords == 'minmax':
                true_box = convert_coordinates(true_box, start_index=1, conversion='corners2minmax')
            similarities = iou(y_encode_template[i, :, -12:-8], true_box[1:], coords=self.coords)  # The iou similarities for all anchor boxes
            negative_boxes[similarities >= self.neg_iou_limit] = 0  # If a negative box gets an IoU match >= `self.neg_iou_limit`, it's no longer a valid negative box
            similarities *= available_boxes  # Filter out anchor boxes which aren't available anymore (i.e. already matched to a different ground truth box)
            available_and_thresh_met = np.copy(similarities)
            available_and_thresh_met[available_and_thresh_met < self.pos_iou_threshold] = 0  # Filter out anchor boxes which don't meet the iou threshold
            assign_indices = np.nonzero(available_and_thresh_met)[0]  # Get the indices of the left-over anchor boxes to which we want to assign this ground truth box
            if len(assign_indices) > 0:  # If we have any matches
                y_encoded[i, assign_indices, :-8] = np.concatenate((class_vector[int(true_box[0])], true_box[1:]), axis=0)  # Write the ground truth box coordinates and class to all assigned anchor box positions. Remember that the last four elements of `y_encoded` are just dummy entries.
                available_boxes[assign_indices] = 0  # Make the assigned anchor boxes unavailable for the next ground truth box
            else:  # If we don't have any matches
                best_match_index = np.argmax(similarities)  # Get the index of the best iou match out of all available boxes
                y_encoded[i, best_match_index, :-8] = np.concatenate((class_vector[int(true_box[0])], true_box[1:]), axis=0)  # Write the ground truth box coordinates and class to the best match anchor box position
                available_boxes[best_match_index] = 0  # Make the assigned anchor box unavailable for the next ground truth box
                negative_boxes[best_match_index] = 0  # The assigned anchor box is no longer a negative box
        # Set the classes of all remaining available anchor boxes to class zero
        background_class_indices = np.nonzero(negative_boxes)[0]
        y_encoded[i, background_class_indices, 0] = 1

    # 3: Convert absolute box coordinates to offsets from the anchor boxes and normalize them
    if self.coords == 'centroids':
        y_encoded[:, :, [-12, -11]] -= y_encode_template[:, :, [-12, -11]]  # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
        y_encoded[:, :, [-12, -11]] /= y_encode_template[:, :, [-10, -9]] * y_encode_template[:, :, [-4, -3]]  # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
        y_encoded[:, :, [-10, -9]] /= y_encode_template[:, :, [-10, -9]]  # w(gt) / w(anchor), h(gt) / h(anchor)
        y_encoded[:, :, [-10, -9]] = np.log(y_encoded[:, :, [-10, -9]]) / y_encode_template[:, :, [-2, -1]]  # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
    elif self.coords == 'corners':
        y_encoded[:, :, -12:-8] -= y_encode_template[:, :, -12:-8]  # (gt - anchor) for all four coordinates
        y_encoded[:, :, [-12, -10]] /= np.expand_dims(y_encode_template[:, :, -10] - y_encode_template[:, :, -12], axis=-1)  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
        y_encoded[:, :, [-11, -9]] /= np.expand_dims(y_encode_template[:, :, -9] - y_encode_template[:, :, -11], axis=-1)  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
        y_encoded[:, :, -12:-8] /= y_encode_template[:, :, -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
    elif self.coords == 'minmax':
        y_encoded[:, :, -12:-8] -= y_encode_template[:, :, -12:-8]  # (gt - anchor) for all four coordinates
        y_encoded[:, :, [-12, -11]] /= np.expand_dims(y_encode_template[:, :, -11] - y_encode_template[:, :, -12], axis=-1)  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
        y_encoded[:, :, [-10, -9]] /= np.expand_dims(y_encode_template[:, :, -9] - y_encode_template[:, :, -10], axis=-1)  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
        y_encoded[:, :, -12:-8] /= y_encode_template[:, :, -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively

    if diagnostics:
        # Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a ground truth box, but keeping the anchor box coordinates).
        y_matched_anchors = np.copy(y_encoded)
        y_matched_anchors[:, :, -12:-8] = 0  # Keeping the anchor box coordinates means setting the offsets to zero.
        return y_encoded, y_matched_anchors
    else:
        return y_encoded
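# A worked example of the 'centroids' offset encoding above; all numbers are
# made up. Ground truth box (cx=0.52, cy=0.50, w=0.30, h=0.40) matched to an
# anchor (cx=0.50, cy=0.50, w=0.25, h=0.50) with variances (0.1, 0.1, 0.2, 0.2):
import numpy as np

cx_gt, cy_gt, w_gt, h_gt = 0.52, 0.50, 0.30, 0.40
cx_a, cy_a, w_a, h_a = 0.50, 0.50, 0.25, 0.50
var_cx, var_cy, var_w, var_h = 0.1, 0.1, 0.2, 0.2

d_cx = (cx_gt - cx_a) / w_a / var_cx    # 0.02 / 0.25 / 0.1 = 0.8
d_cy = (cy_gt - cy_a) / h_a / var_cy    # 0.0
d_w = np.log(w_gt / w_a) / var_w        # ln(1.2) / 0.2 ~= 0.91
d_h = np.log(h_gt / h_a) / var_h        # ln(0.8) / 0.2 ~= -1.12
print(d_cx, d_cy, d_w, d_h)             # These four values are what lands in y_encoded[..., -12:-8]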
def __call__(self, image_height, image_width, labels):
    '''
    Arguments:
        image_height (int): The height of the image to compare the box coordinates to.
        image_width (int): The width of the image to compare the box coordinates to.
        labels (array): The labels to be tested. The box coordinates are expected to be in the image's
            coordinate system.

    Returns:
        An array containing the labels of all boxes that are valid.
    '''
    labels = np.copy(labels)
    xmin = self.labels_format['xmin']
    ymin = self.labels_format['ymin']
    xmax = self.labels_format['xmax']
    ymax = self.labels_format['ymax']

    # Get the lower and upper bounds.
    if isinstance(self.bounds, BoundGenerator):
        lower, upper = self.bounds()
    else:
        lower, upper = self.bounds

    # Compute which boxes are valid.
    if self.overlap_criterion == 'iou':
        # Compute the patch coordinates.
        image_coords = np.array([0, 0, image_width, image_height])
        # Compute the IoU between the patch and all of the ground truth boxes.
        image_boxes_iou = iou(image_coords, labels[:, [xmin, ymin, xmax, ymax]], coords='corners')
        requirements_met = (image_boxes_iou > lower) * (image_boxes_iou <= upper)
    elif self.overlap_criterion == 'area':
        # Compute the areas of the boxes.
        box_areas = (labels[:, xmax] - labels[:, xmin]) * (labels[:, ymax] - labels[:, ymin])
        # Compute the intersection area between the patch and all of the ground truth boxes.
        clipped_boxes = np.copy(labels)
        clipped_boxes[:, [ymin, ymax]] = np.clip(labels[:, [ymin, ymax]], a_min=0, a_max=image_height - 1)
        clipped_boxes[:, [xmin, xmax]] = np.clip(labels[:, [xmin, xmax]], a_min=0, a_max=image_width - 1)
        intersection_areas = (clipped_boxes[:, xmax] - clipped_boxes[:, xmin]) * (clipped_boxes[:, ymax] - clipped_boxes[:, ymin])
        # Check which boxes meet the overlap requirements.
        if lower == 0.0:
            mask_lower = intersection_areas > lower * box_areas  # If `self.lower == 0`, we want to make sure that boxes with area 0 don't count, hence the ">" sign instead of the ">=" sign.
        else:
            mask_lower = intersection_areas >= lower * box_areas  # Especially for the case `self.lower == 1` we want the ">=" sign, otherwise no boxes would count at all.
        mask_upper = intersection_areas <= upper * box_areas
        requirements_met = mask_lower * mask_upper
    elif self.overlap_criterion == 'center_point':
        # Compute the center points of the boxes.
        cy = (labels[:, ymin] + labels[:, ymax]) / 2
        cx = (labels[:, xmin] + labels[:, xmax]) / 2
        # Check which of the boxes have center points within the cropped patch and remove those that don't.
        requirements_met = (cy >= 0.0) * (cy <= image_height - 1) * (cx >= 0.0) * (cx <= image_width - 1)

    return labels[requirements_met]
print('Num of gt buses : {}'.format(len(gt_label_ls[i][0])))

if len(gt_label_ls[i][0]) > 1:
    a = gt_label_ls[i][0][:, 1:]
    gt_cls = gt_label_ls[i][0][:, 0]
else:
    a = gt_label_ls[i][0][0][1:]
    gt_cls = gt_label_ls[i][0][:, 0]

if len(y_pred_thresh[i]) == 0:
    print('No IOU')
    continue

if len(y_pred_thresh[i]) > 1:
    b = y_pred_thresh[i][:, 2:]
    p_cls = y_pred_thresh[i][:, 0]
else:
    b = y_pred_thresh[i][0][2:]
    p_cls = y_pred_thresh[i][:, 0]

iou_score = iou(np.array(a), np.array(b), coords='corners', mode='outer_product', border_pixels='half')

if len(iou_score) == 0:
    print('No IOU')
else:
    for row in iou_score:
        print('IOU is {}'.format(max(row)))
    print('gt class is {} predicted class is {}'.format(gt_cls, p_cls))
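The script above relies on `iou()` in 'outer_product' mode, which returns an (m, n) matrix holding the IoU of every ground truth box against every predicted box. Below is a pure-NumPy sketch of that computation for illustration; it ignores the `border_pixels` convention that the repository's `iou()` additionally handles.

import numpy as np

def iou_matrix(boxes1, boxes2):
    '''Pairwise IoU of two sets of (xmin, ymin, xmax, ymax) boxes.'''
    x1 = np.maximum(boxes1[:, None, 0], boxes2[None, :, 0])
    y1 = np.maximum(boxes1[:, None, 1], boxes2[None, :, 1])
    x2 = np.minimum(boxes1[:, None, 2], boxes2[None, :, 2])
    y2 = np.minimum(boxes1[:, None, 3], boxes2[None, :, 3])
    intersection = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
    areas1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    areas2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
    return intersection / (areas1[:, None] + areas2[None, :] - intersection)

gt = np.array([[10., 10., 100., 100.]])
preds = np.array([[12., 15., 95., 105.], [200., 200., 250., 250.]])
print(iou_matrix(gt, preds).max(axis=1))  # best IoU per ground truth box, ~[0.83]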
def match_predictions(self,
                      ignore_neutral_boxes=True,
                      matching_iou_threshold=0.5,
                      border_pixels='include',
                      sorting_algorithm='quicksort',
                      verbose=True,
                      ret=False):

    if self.data_generator.labels is None:
        raise ValueError("Matching predictions to ground truth boxes not possible, no ground truth given.")

    if self.prediction_results is None:
        raise ValueError("There are no prediction results. You must run `predict_on_dataset()` before calling this method.")

    class_id_gt = self.gt_format['class_id']
    xmin_gt = self.gt_format['xmin']
    ymin_gt = self.gt_format['ymin']
    xmax_gt = self.gt_format['xmax']
    ymax_gt = self.gt_format['ymax']

    # Convert the ground truth to a more efficient format for what we need
    # to do, which is access ground truth by image ID repeatedly.
    ground_truth = {}
    # Whether or not we have annotations to decide whether ground truth boxes should be neutral or not.
    eval_neutral_available = not (self.data_generator.eval_neutral is None)
    for i in range(len(self.data_generator.image_ids)):
        image_id = str(self.data_generator.image_ids[i])
        labels = self.data_generator.labels[i]
        if ignore_neutral_boxes and eval_neutral_available:
            ground_truth[image_id] = (np.asarray(labels), np.asarray(self.data_generator.eval_neutral[i]))
        else:
            ground_truth[image_id] = np.asarray(labels)

    true_positives = [[]]   # The true positives for each class, sorted by descending confidence.
    false_positives = [[]]  # The false positives for each class, sorted by descending confidence.
    cumulative_true_positives = [[]]
    cumulative_false_positives = [[]]

    # Iterate over all classes.
    for class_id in range(1, self.n_classes + 1):

        predictions = self.prediction_results[class_id]

        # Store the matching results in these lists:
        true_pos = np.zeros(len(predictions), dtype=int)   # 1 for every prediction that is a true positive, 0 otherwise
        false_pos = np.zeros(len(predictions), dtype=int)  # 1 for every prediction that is a false positive, 0 otherwise

        # In case there are no predictions at all for this class, we're done here.
        if len(predictions) == 0:
            print("No predictions for class {}/{}".format(class_id, self.n_classes))
            true_positives.append(true_pos)
            false_positives.append(false_pos)
            continue

        # Convert the predictions list for this class into a structured array so that we can sort it by confidence.
        # Get the number of characters needed to store the image ID strings in the structured array.
        # Keep a few characters buffer in case some image IDs are longer than others.
        num_chars_per_image_id = len(str(predictions[0][0])) + 6
        # Create the data type for the structured array.
        preds_data_type = np.dtype([('image_id', 'U{}'.format(num_chars_per_image_id)),
                                    ('confidence', 'f4'),
                                    ('xmin', 'f4'),
                                    ('ymin', 'f4'),
                                    ('xmax', 'f4'),
                                    ('ymax', 'f4')])
        # Create the structured array.
        predictions = np.array(predictions, dtype=preds_data_type)

        # Sort the detections by decreasing confidence.
        descending_indices = np.argsort(-predictions['confidence'], kind=sorting_algorithm)
        predictions_sorted = predictions[descending_indices]

        if verbose:
            tr = trange(len(predictions), file=sys.stdout)
            tr.set_description("Matching predictions to ground truth, class {}/{}.".format(class_id, self.n_classes))
        else:
            tr = range(len(predictions))

        # Keep track of which ground truth boxes were already matched to a detection.
        gt_matched = {}

        # Iterate over all predictions.
        for i in tr:

            prediction = predictions_sorted[i]
            image_id = prediction['image_id']
            # Convert the structured array element to a regular array.
            pred_box = np.asarray(list(prediction[['xmin', 'ymin', 'xmax', 'ymax']]))

            # Get the relevant ground truth boxes for this prediction,
            # i.e. all ground truth boxes that match the prediction's
            # image ID and class ID.

            # The ground truth could either be a tuple with `(ground_truth_boxes, eval_neutral_boxes)`
            # or only `ground_truth_boxes`.
            if ignore_neutral_boxes and eval_neutral_available:
                gt, eval_neutral = ground_truth[image_id]
            else:
                gt = ground_truth[image_id]
            gt = np.asarray(gt)
            class_mask = gt[:, class_id_gt] == class_id
            gt = gt[class_mask]
            if ignore_neutral_boxes and eval_neutral_available:
                eval_neutral = eval_neutral[class_mask]

            if gt.size == 0:
                # If the image doesn't contain any objects of this class,
                # the prediction becomes a false positive.
                false_pos[i] = 1
                continue

            # Compute the IoU of this prediction with all ground truth boxes of the same class.
            overlaps = iou(boxes1=gt[:, [xmin_gt, ymin_gt, xmax_gt, ymax_gt]],
                           boxes2=pred_box,
                           coords='corners',
                           mode='element-wise',
                           border_pixels=border_pixels)

            # For each detection, match the ground truth box with the highest overlap.
            # It's possible that the same ground truth box will be matched to multiple
            # detections.
            gt_match_index = np.argmax(overlaps)
            gt_match_overlap = overlaps[gt_match_index]

            if gt_match_overlap < matching_iou_threshold:
                # False positive, IoU threshold violated:
                # Those predictions whose matched overlap is below the threshold become
                # false positives.
                false_pos[i] = 1
            else:
                if not (ignore_neutral_boxes and eval_neutral_available) or (eval_neutral[gt_match_index] == False):
                    # If this is not a ground truth that is supposed to be evaluation-neutral
                    # (i.e. should be skipped for the evaluation) or if we don't even have the
                    # concept of neutral boxes.
                    if not (image_id in gt_matched):
                        # True positive:
                        # If the matched ground truth box for this prediction hasn't been matched to a
                        # different prediction already, we have a true positive.
                        true_pos[i] = 1
                        gt_matched[image_id] = np.zeros(shape=(gt.shape[0]), dtype=bool)
                        gt_matched[image_id][gt_match_index] = True
                    elif not gt_matched[image_id][gt_match_index]:
                        # True positive:
                        # If the matched ground truth box for this prediction hasn't been matched to a
                        # different prediction already, we have a true positive.
                        true_pos[i] = 1
                        gt_matched[image_id][gt_match_index] = True
                    else:
                        # False positive, duplicate detection:
                        # If the matched ground truth box for this prediction has already been matched
                        # to a different prediction previously, it is a duplicate detection for an
                        # already detected object, which counts as a false positive.
                        false_pos[i] = 1

        true_positives.append(true_pos)
        false_positives.append(false_pos)

        cumulative_true_pos = np.cumsum(true_pos)    # Cumulative sums of the true positives
        cumulative_false_pos = np.cumsum(false_pos)  # Cumulative sums of the false positives

        cumulative_true_positives.append(cumulative_true_pos)
        cumulative_false_positives.append(cumulative_false_pos)

    self.true_positives = true_positives
    self.false_positives = false_positives
    self.cumulative_true_positives = cumulative_true_positives
    self.cumulative_false_positives = cumulative_false_positives

    if ret:
        return true_positives, false_positives, cumulative_true_positives, cumulative_false_positives
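Once `match_predictions()` has stored the cumulative true and false positive counts, per-class precision and recall follow directly (in the evaluator this happens in a later step, e.g. a `compute_precision_recall()`-style method). A minimal sketch with made-up counts:

import numpy as np

# Made-up cumulative counts for one class, e.g. from np.cumsum(true_pos) and np.cumsum(false_pos).
cumulative_true_pos = np.array([1, 2, 2, 3])
cumulative_false_pos = np.array([0, 0, 1, 1])
num_gt = 4  # total number of ground truth boxes of this class (assumed)

precision = cumulative_true_pos / (cumulative_true_pos + cumulative_false_pos)
recall = cumulative_true_pos / num_gt
print(precision)  # [1. 1. 0.667 0.75] (approx.)
print(recall)     # [0.25 0.5 0.5 0.75]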
        boxes_left = boxes_left[similarities <= iou_threshold]  # ...so that we can remove the ones that overlap too much with the maximum box

    return np.array(maxima)


def _greedy_nms2(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
    '''
    The same greedy non-maximum suppression algorithm as above, but slightly modified
    for use as an internal function in `decode_detections_fast()`.
    '''
    boxes_left = np.copy(predictions)
    maxima = []  # This is where we store the boxes that make it through the non-maximum suppression
    while boxes_left.shape[0] > 0:  # While there are still boxes left to compare...
        maximum_index = np.argmax(boxes_left[:, 1])  # ...get the index of the next box with the highest confidence...
        maximum_box = np.copy(boxes_left[maximum_index])  # ...copy that box and...
        maxima.append(maximum_box)  # ...append it to `maxima` because we'll definitely keep it
        boxes_left = np.delete(boxes_left, maximum_index, axis=0)  # Now remove the maximum box from `boxes_left`
        if boxes_left.shape[0] == 0:
            break  # If there are no boxes left after this step, break. Otherwise...
        similarities = iou(boxes_left[:, 2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels)  # ...compare (IoU) the other left over boxes to the maximum box...
        boxes_left = boxes_left[similarities <= iou_threshold]  # ...so that we can remove the ones that overlap too much with the maximum box
    return np.array(maxima)


def decode_detections(y_pred,
                      confidence_thresh=0.01,
                      iou_threshold=0.45,
                      top_k=200,
                      input_coords='centroids',
                      normalize_coords=True,
                      img_height=None,
                      img_width=None,
                      border_pixels='half'):
    '''
    Convert the raw model prediction output into detections that contain only the
    positive box predictions (i.e. the same format that `SSDInputEncoder` takes as input).