Example #1
def _greedy_nms_debug(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
    boxes_left = np.copy(predictions)
    maxima = []  # This is where we store the boxes that make it through the non-maximum suppression
    while boxes_left.shape[0] > 0:  # While there are still boxes left to compare...
        maximum_index = np.argmax(boxes_left[:, 1])  # ...get the index of the next box with the highest confidence...
        maximum_box = np.copy(boxes_left[maximum_index])  # ...copy that box and...
        maxima.append(maximum_box)  # ...append it to `maxima` because we'll definitely keep it
        boxes_left = np.delete(boxes_left, maximum_index, axis=0)  # Now remove the maximum box from `boxes_left`
        if boxes_left.shape[0] == 0: break  # If there are no boxes left after this step, break. Otherwise...
        similarities = iou(boxes_left[:, 2:], maximum_box[2:], coords=coords, mode='element-wise',
                           border_pixels=border_pixels)  # ...compare (IoU) the other left over boxes to the maximum box...
        boxes_left = boxes_left[similarities <= iou_threshold]  # ...so that we can remove the ones that overlap too much with the maximum box
    return np.array(maxima)
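
A minimal usage sketch with hypothetical data. The tiny `iou` helper below is a simplified stand-in for the repository's `bounding_box_utils.iou` (corner coordinates, mode='element-wise' only), so that the snippet is self-contained:

import numpy as np

def iou(boxes, box, coords='corners', mode='element-wise', border_pixels='half'):
    # Element-wise IoU of each row of `boxes` (xmin, ymin, xmax, ymax) with `box`.
    inter_w = np.maximum(np.minimum(boxes[:, 2], box[2]) - np.maximum(boxes[:, 0], box[0]), 0)
    inter_h = np.maximum(np.minimum(boxes[:, 3], box[3]) - np.maximum(boxes[:, 1], box[1]), 0)
    intersection = inter_w * inter_h
    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    area = (box[2] - box[0]) * (box[3] - box[1])
    return intersection / (areas + area - intersection)

# Rows: (class_id, confidence, xmin, ymin, xmax, ymax). The first two boxes overlap heavily.
predictions = np.array([[1.0, 0.9, 10, 10, 50, 50],
                        [1.0, 0.8, 12, 12, 52, 52],
                        [1.0, 0.7, 80, 80, 120, 120]])
print(_greedy_nms_debug(predictions, iou_threshold=0.45))
# The 0.8 box is suppressed by the 0.9 box; the 0.9 and 0.7 boxes survive.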
Example #2
def _greedy_nms(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
    '''
    The same greedy non-maximum suppression as above, but modified to perform NMS for
    a single class inside `decode_detections()`.
    '''
    boxes_left = np.copy(predictions)
    maxima = []
    while boxes_left.shape[0] > 0:
        maximum_index = np.argmax(boxes_left[:, 0])
        maximum_box = np.copy(boxes_left[maximum_index])
        maxima.append(maximum_box)
        boxes_left = np.delete(boxes_left, maximum_index, axis=0)
        if boxes_left.shape[0] == 0: break
        similarities = iou(boxes_left[:, 1:], maximum_box[1:], coords=coords, mode='element-wise',
                           border_pixels=border_pixels)
        boxes_left = boxes_left[similarities <= iou_threshold]
    return np.array(maxima)
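
A hypothetical sketch of how `decode_detections()` might hand this helper one class at a time. The loop and array names below are illustrative, and `iou` must be available as in the repository's `bounding_box_utils`:

import numpy as np

# Decoded predictions for one image, rows: (class_id, confidence, xmin, ymin, xmax, ymax).
single_image_preds = np.array([[1.0, 0.9, 10, 10, 50, 50],
                               [1.0, 0.8, 12, 12, 52, 52],
                               [2.0, 0.6, 30, 30, 70, 70]])

pruned = []
for class_id in np.unique(single_image_preds[:, 0]):
    # Keep only this class and drop the class_id column, so each row
    # becomes (confidence, xmin, ymin, xmax, ymax), matching `_greedy_nms`.
    single_class = single_image_preds[single_image_preds[:, 0] == class_id, 1:]
    maxima = _greedy_nms(single_class, iou_threshold=0.45)
    # Re-attach the class ID to the surviving boxes.
    pruned.append(np.hstack([np.full((len(maxima), 1), class_id), maxima]))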
Example #3
def _greedy_nms_debug(predictions,
                      iou_threshold=0.45,
                      coords='corners',
                      border_pixels='half'):
    '''
    The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
    function for per-class NMS in `decode_detections_debug()`. The difference is that it keeps the indices of all
    left-over boxes for each batch item, which allows you to know which predictor layer predicted a given output
    box and is thus useful for debugging.
    '''
    boxes_left = np.copy(predictions)
    maxima = []  # This is where we store the boxes that make it through the non-maximum suppression
    while boxes_left.shape[0] > 0:  # While there are still boxes left to compare...
        maximum_index = np.argmax(boxes_left[:, 1])  # ...get the index of the next box with the highest confidence...
        maximum_box = np.copy(boxes_left[maximum_index])  # ...copy that box and...
        maxima.append(maximum_box)  # ...append it to `maxima` because we'll definitely keep it
        boxes_left = np.delete(boxes_left, maximum_index, axis=0)  # Now remove the maximum box from `boxes_left`
        if boxes_left.shape[0] == 0: break  # If there are no boxes left after this step, break. Otherwise...
        similarities = iou(boxes_left[:, 2:], maximum_box[2:], coords=coords, mode='element-wise',
                           border_pixels=border_pixels)  # ...compare (IoU) the other left-over boxes to the maximum box...
        boxes_left = boxes_left[similarities <= iou_threshold]  # ...so that we can remove the ones that overlap too much with the maximum box
    return np.array(maxima)
Example #4
def _greedy_nms(
        predictions,
        iou_threshold=0.45,
        #coords='corners',
        border_pixels='half'):
    # Filter the bounding boxes via non-maximum suppression.
    boxes_left = np.copy(predictions)
    maxima = []  # This is where we store the boxes that make it through the non-maximum suppression
    while boxes_left.shape[0] > 0:
        maximum_index = np.argmax(boxes_left[:, 0])  # ...get the index of the next box with the highest confidence...
        maximum_box = np.copy(boxes_left[maximum_index])  # ...copy that box...
        maxima.append(maximum_box)
        boxes_left = np.delete(boxes_left, maximum_index, axis=0)  # Now remove the maximum box from `boxes_left`
        if boxes_left.shape[0] == 0: break
        similarities = iou(
            boxes_left[:, 1:],
            maximum_box[1:],
            #coords=coords,
            mode='element-wise',
            border_pixels=border_pixels)
        boxes_left = boxes_left[similarities <= iou_threshold]
    return np.array(maxima)
Example #5
def greedy_nms(y_pred_decoded, iou_threshold=0.45, coords='corners', border_pixels='half'):
    '''
    Perform greedy non-maximum suppression.

    Select the bounding box with the highest score and remove all other boxes whose IoU
    with it is too large. Among the remaining boxes, again select the one with the highest
    score and remove the boxes that overlap it too much. Repeat until no pair of remaining
    boxes has too large an IoU.

    Arguments:
        y_pred_decoded (list): The decoded predictions. For a batch size of `n`, this list
            contains `n` 2D Numpy arrays. If an image has `k` predicted bounding boxes, the
            corresponding 2D Numpy array has shape `(k, 6)` and each row has the format
            `[class_id, score, xmin, ymin, xmax, ymax]`.
        iou_threshold (float, optional): All boxes whose overlap with the box of locally
            maximal score is greater than `iou_threshold` will be removed.
        coords (str, optional): The coordinate format of `y_pred_decoded`.
        border_pixels (str, optional): How to treat the pixels on the borders of the bounding
            boxes. Can be 'include', 'exclude', or 'half'.

    Returns:
        The predictions after removing the non-maxima. The output coordinate format is the
        same as the input format.
    '''
    y_pred_decoded_nms = []
    for batch_item in y_pred_decoded:  # For each image...
        boxes_left = np.copy(batch_item)
        maxima = []  # This is where we store the boxes that survive the non-maximum suppression
        while boxes_left.shape[0] > 0:  # While there are still boxes left...
            maximum_index = np.argmax(boxes_left[:, 1])  # ...get the next box with the highest score...
            maximum_box = np.copy(boxes_left[maximum_index])  # ...copy it...
            maxima.append(maximum_box)  # ...and append it to `maxima`, because it will definitely be kept
            boxes_left = np.delete(boxes_left, maximum_index, axis=0)  # Remove `maximum_box` from `boxes_left`
            if boxes_left.shape[0] == 0: break  # If no boxes are left, break. Otherwise...
            similarities = iou(boxes_left[:, 2:], maximum_box[2:], coords=coords, mode='element-wise',
                               border_pixels=border_pixels)  # ...compute the IoU between the remaining boxes and `maximum_box`...
            boxes_left = boxes_left[similarities <= iou_threshold]  # ...and remove the boxes that overlap `maximum_box` too much
        y_pred_decoded_nms.append(np.array(maxima))

    return y_pred_decoded_nms
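
A short usage sketch with hypothetical decoded predictions for a batch of two images (assuming `iou` is importable as in the repository):

import numpy as np

# One 2D array per image, rows: (class_id, score, xmin, ymin, xmax, ymax).
y_pred_decoded = [
    np.array([[1.0, 0.95, 10, 10, 50, 50],
              [1.0, 0.60, 11, 11, 51, 51]]),  # near-duplicate of the box above
    np.array([[2.0, 0.80, 0, 0, 30, 30]]),
]
for result in greedy_nms(y_pred_decoded, iou_threshold=0.45):
    print(result)  # one surviving box per image: the near-duplicate is suppressed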
Example #6
    def __call__(self, ground_truth_labels, diagnostics=False):
        '''
        Converts ground truth bounding box data into a suitable format to train an SSD model.

        Arguments:
            ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array
                for each batch image. Each such array has `k` rows for the `k` ground truth bounding boxes belonging
                to the respective image, and the data for each ground truth bounding box has the format
                `(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format), and `class_id` must be
                an integer greater than 0 for all boxes as class ID 0 is reserved for the background class.
            diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned,
                but also a copy of it with anchor box coordinates in place of the ground truth coordinates.
                This can be very useful if you want to visualize which anchor boxes got matched to which ground truth
                boxes.

        Returns:
            `y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the
            ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the
            model per image, and the classes are one-hot-encoded. The four elements after the class vectors in
            the last axis are the box coordinates, the next four elements after that are just dummy elements, and
            the last four elements are the variances.
        '''

        # Mapping to define which indices represent which coordinates in the ground truth.
        class_id = 0
        xmin = 1
        ymin = 2
        xmax = 3
        ymax = 4

        batch_size = len(ground_truth_labels)

        ##################################################################################
        # Generate the template for y_encoded.
        ##################################################################################

        y_encoded = self.generate_encoding_template(batch_size=batch_size, diagnostics=False)

        ##################################################################################
        # Match ground truth boxes to anchor boxes.
        ##################################################################################

        # Match the ground truth boxes to the anchor boxes. Every anchor box that does not have
        # a ground truth match and for which the maximal IoU overlap with any ground truth box is
        # less than `neg_iou_limit` will be a negative (background) box.

        y_encoded[:, :, self.background_id] = 1 # All boxes are background boxes by default.
        n_boxes = y_encoded.shape[1] # The total number of boxes that the model predicts per batch item
        class_vectors = np.eye(self.n_classes) # An identity matrix that we'll use as one-hot class vectors

        for i in range(batch_size): # For each batch item...

            if ground_truth_labels[i].size == 0: continue # If there is no ground truth for this batch item, there is nothing to match.
            labels = ground_truth_labels[i].astype(np.float64) # The labels for this batch item

            # Check for degenerate ground truth bounding boxes before attempting any computations.
            if np.any(labels[:,[xmax]] - labels[:,[xmin]] <= 0) or np.any(labels[:,[ymax]] - labels[:,[ymin]] <= 0):
                raise DegenerateBoxError("SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, ".format(i, labels) +
                                         "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. Degenerate ground truth " +
                                         "bounding boxes will lead to NaN errors during the training.")

            # Maybe normalize the box coordinates.
            if self.normalize_coords:
                labels[:,[ymin,ymax]] /= self.img_height # Normalize ymin and ymax relative to the image height
                labels[:,[xmin,xmax]] /= self.img_width # Normalize xmin and xmax relative to the image width

            # Maybe convert the box coordinate format.
            if self.coords == 'centroids':
                labels = convert_coordinates(labels, start_index=xmin, conversion='corners2centroids', border_pixels=self.border_pixels)
            elif self.coords == 'minmax':
                labels = convert_coordinates(labels, start_index=xmin, conversion='corners2minmax')

            classes_one_hot = class_vectors[labels[:, class_id].astype(np.int64)] # The one-hot class IDs for the ground truth boxes of this batch item
            labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin,ymin,xmax,ymax]]], axis=-1) # The one-hot version of the labels for this batch item

            # Compute the IoU similarities between all anchor boxes and all ground truth boxes for this batch item.
            # This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`.
            similarities = iou(labels[:,[xmin,ymin,xmax,ymax]], y_encoded[i,:,-12:-8], coords=self.coords, mode='outer_product', border_pixels=self.border_pixels)

            # First: Do bipartite matching, i.e. match each ground truth box to the one anchor box with the highest IoU.
            #        This ensures that each ground truth box will have at least one good match.

            # For each ground truth box, get the anchor box to match with it.
            bipartite_matches = match_bipartite_greedy(weight_matrix=similarities)

            # Write the ground truth data to the matched anchor boxes.
            y_encoded[i, bipartite_matches, :-8] = labels_one_hot

            # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
            similarities[:, bipartite_matches] = 0

            # Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to its most similar
            #         ground truth box with an IoU of at least `pos_iou_threshold`, or not matched if there is no
            #         such ground truth box.

            if self.matching_type == 'multi':

                # Get all matches that satisfy the IoU threshold.
                matches = match_multi(weight_matrix=similarities, threshold=self.pos_iou_threshold)

                # Write the ground truth data to the matched anchor boxes.
                y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]

                # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
                similarities[:, matches[1]] = 0

            # Third: Now after the matching is done, all negative (background) anchor boxes that have
            #        an IoU of `neg_iou_limit` or more with any ground truth box will be set to neutral,
            #        i.e. they will no longer be background boxes. These anchors are "too close" to a
            #        ground truth box to be valid background boxes.

            max_background_similarities = np.amax(similarities, axis=0)
            neutral_boxes = np.nonzero(max_background_similarities >= self.neg_iou_limit)[0]
            y_encoded[i, neutral_boxes, self.background_id] = 0

        ##################################################################################
        # Convert box coordinates to anchor box offsets.
        ##################################################################################

        if self.coords == 'centroids':
            y_encoded[:,:,[-12,-11]] -= y_encoded[:,:,[-8,-7]] # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
            y_encoded[:,:,[-12,-11]] /= y_encoded[:,:,[-6,-5]] * y_encoded[:,:,[-4,-3]] # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
            y_encoded[:,:,[-10,-9]] /= y_encoded[:,:,[-6,-5]] # w(gt) / w(anchor), h(gt) / h(anchor)
            y_encoded[:,:,[-10,-9]] = np.log(y_encoded[:,:,[-10,-9]]) / y_encoded[:,:,[-2,-1]] # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
        elif self.coords == 'corners':
            y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates
            y_encoded[:,:,[-12,-10]] /= np.expand_dims(y_encoded[:,:,-6] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:,:,[-11,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-7], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
        elif self.coords == 'minmax':
            y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates
            y_encoded[:,:,[-12,-11]] /= np.expand_dims(y_encoded[:,:,-7] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:,:,[-10,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-6], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively

        if diagnostics:
            # Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a ground truth box, but keeping the anchor box coordinates).
            y_matched_anchors = np.copy(y_encoded)
            y_matched_anchors[:,:,-12:-8] = 0 # Keeping the anchor box coordinates means setting the offsets to zero.
            return y_encoded, y_matched_anchors
        else:
            return y_encoded
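
To make the 'centroids' branch of the offset conversion above concrete, here is a minimal numeric sketch for a single matched anchor. The values are hypothetical; the variances 0.1/0.1/0.2/0.2 are the ones commonly used for SSD:

import numpy as np

gt     = np.array([0.52, 0.50, 0.40, 0.30])  # cx(gt), cy(gt), w(gt), h(gt)
anchor = np.array([0.50, 0.50, 0.36, 0.28])  # cx(anchor), cy(anchor), w(anchor), h(anchor)
var    = np.array([0.1, 0.1, 0.2, 0.2])      # cx_variance, cy_variance, w_variance, h_variance

offsets = np.empty(4)
offsets[:2] = (gt[:2] - anchor[:2]) / (anchor[2:] * var[:2])  # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, analogously for cy
offsets[2:] = np.log(gt[2:] / anchor[2:]) / var[2:]           # ln(w(gt) / w(anchor)) / w_variance, analogously for h
print(offsets)  # these four values are the regression targets stored in y_encoded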
Example #7
def greedy_nms(y_pred_decoded,
               iou_threshold=0.45,
               coords='corners',
               border_pixels='half'):
    '''
    Perform greedy non-maximum suppression on the input boxes.

    Greedy NMS works by selecting the box with the highest score and
    removing all boxes around it that are too close to it measured by IoU-similarity.
    Out of the boxes that are left over, once again the one with the highest
    score is selected and so on, until no boxes with too much overlap are left.

    Arguments:
        y_pred_decoded (list): A batch of decoded predictions. For a given batch size `n` this
            is a list of length `n` where each list element is a 2D Numpy array.
            For a batch item with `k` predicted boxes this 2D Numpy array has
            shape `(k, 6)`, where each row contains the data for the respective
            box in the format `[class_id, score, xmin, ymin, xmax, ymax]`.
            Technically, the number of columns doesn't have to be 6; it can be
            arbitrary as long as the second element of each row is the score
            assigned to the prediction and the elements from index 2 onward are
            the box coordinates `xmin`, `ymin`, `xmax`, `ymax` (in this order,
            for the default 'corners' format). Note that this function is
            agnostic to the scale of the score or what it represents.
        iou_threshold (float, optional): All boxes with a Jaccard similarity of
            greater than `iou_threshold` with a locally maximal box will be removed
            from the set of predictions, where 'maximal' refers to the box score.
        coords (str, optional): The coordinate format of `y_pred_decoded`.
            Can be one of the formats supported by `iou()`.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
            Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
            to the boxes. If 'exclude', the border pixels do not belong to the boxes.
            If 'half', then one of each of the two horizontal and vertical borders belongs
            to the boxes, but not the other.

    Returns:
        The predictions after removing non-maxima. The format is the same as the input format.
    '''
    y_pred_decoded_nms = []
    for batch_item in y_pred_decoded:  # For the labels of each batch item...
        boxes_left = np.copy(batch_item)
        maxima = []  # This is where we store the boxes that make it through the non-maximum suppression
        while boxes_left.shape[0] > 0:  # While there are still boxes left to compare...
            maximum_index = np.argmax(boxes_left[:, 1])  # ...get the index of the next box with the highest confidence...
            maximum_box = np.copy(boxes_left[maximum_index])  # ...copy that box and...
            maxima.append(maximum_box)  # ...append it to `maxima` because we'll definitely keep it
            boxes_left = np.delete(boxes_left, maximum_index, axis=0)  # Now remove the maximum box from `boxes_left`
            if boxes_left.shape[0] == 0: break  # If there are no boxes left after this step, break. Otherwise...
            similarities = iou(boxes_left[:, 2:], maximum_box[2:], coords=coords, mode='element-wise',
                               border_pixels=border_pixels)  # ...compare (IoU) the other left-over boxes to the maximum box...
            boxes_left = boxes_left[similarities <= iou_threshold]  # ...so that we can remove the ones that overlap too much with the maximum box
        y_pred_decoded_nms.append(np.array(maxima))

    return y_pred_decoded_nms
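
The `border_pixels` argument deserves a quick illustration. A minimal sketch of the convention (the correction term `d` mirrors the one used in Example #9 below):

# Pixel width of a box with xmin=10, xmax=20 under each convention:
xmin, xmax = 10, 20
for border_pixels, d in [('half', 0), ('include', 1), ('exclude', -1)]:
    print(border_pixels, xmax - xmin + d)  # half: 10, include: 11, exclude: 9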
Example #8
    def __call__(self, ground_truth_labels, diagnostics=False):
        # Mapping to define which indices represent which coordinates in the ground truth.
        class_id = 0
        xmin = 1
        ymin = 2
        xmax = 3
        ymax = 4

        batch_size = len(ground_truth_labels)

        ##################################################################################
        # Generate the template for y_encoded.
        ##################################################################################

        y_encoded = self.generate_encoding_template(batch_size=batch_size,
                                                    diagnostics=False)

        ##################################################################################
        # Match ground truth boxes to anchor boxes.
        ##################################################################################
        y_encoded[:, :, self.background_id] = 1  # All boxes are background boxes by default
        class_vectors = np.eye(self.n_classes)  # An identity matrix that we'll use as one-hot class vectors

        for i in range(batch_size):
            if ground_truth_labels[i].size == 0: continue
            labels = ground_truth_labels[i].astype(np.float64)

            # Maybe normalize the box coordinates.
            if self.normalize_coords:
                labels[:, [ymin, ymax]] /= self.img_height
                labels[:, [xmin, xmax]] /= self.img_width

            # Convert the box coordinate format from corners to centroids.
            labels = convert_coordinates(labels,
                                         start_index=xmin,
                                         conversion='corners2centroids')

            classes_one_hot = class_vectors[labels[:, class_id].astype(np.int64)]
            labels_one_hot = np.concatenate(
                [classes_one_hot, labels[:, [xmin, ymin, xmax, ymax]]],
                axis=-1)

            similarities = iou(labels[:, [xmin, ymin, xmax, ymax]],
                               y_encoded[i, :, -12:-8])

            # For each ground truth box, get the anchor box that matches it best.
            bipartite_matches = match_bipartite_greedy(
                weight_matrix=similarities)

            y_encoded[i, bipartite_matches, :-8] = labels_one_hot

            similarities[:, bipartite_matches] = 0

            if self.matching_type == 'multi':  # Perform 'multi' matching
                # Get all matches that satisfy the IoU threshold.
                matches = match_multi(weight_matrix=similarities,
                                      threshold=self.pos_iou_threshold)

                y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]

                similarities[:, matches[1]] = 0

            max_background_similarities = np.amax(similarities, axis=0)
            neutral_boxes = np.nonzero(
                max_background_similarities >= self.neg_iou_limit)[0]
            y_encoded[i, neutral_boxes, self.background_id] = 0

        ##################################################################################
        # Convert box coordinates to anchor box offsets.
        ##################################################################################
        y_encoded[:, :, [-12, -11]] -= y_encoded[:, :, [-8, -7]]  # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
        y_encoded[:, :, [-12, -11]] /= y_encoded[:, :, [-6, -5]] * y_encoded[:, :, [-4, -3]]  # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
        y_encoded[:, :, [-10, -9]] /= y_encoded[:, :, [-6, -5]]  # w(gt) / w(anchor), h(gt) / h(anchor)
        y_encoded[:, :, [-10, -9]] = np.log(y_encoded[:, :, [-10, -9]]) / y_encoded[:, :, [-2, -1]]  # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)

        if diagnostics:
            y_matched_anchors = np.copy(y_encoded)
            y_matched_anchors[:, :, -12:-8] = 0
            return y_encoded, y_matched_anchors
        else:
            return y_encoded  # [batch_size, total_boxes, [one-hot label, 4 ground truth box coords, 4 anchor box coords, 4 variances]]
Example #9
    def __call__(self, labels, image_height=None, image_width=None):
        '''
        Arguments:
            labels (array): The labels to be filtered. This is an array with shape `(m,n)`, where
                `m` is the number of bounding boxes and `n` is the number of elements that defines
                each bounding box (box coordinates, class ID, etc.). The box coordinates are expected
                to be in the image's coordinate system.
            image_height (int): Only relevant if `check_overlap == True`. The height of the image
                (in pixels) to compare the box coordinates to.
            image_width (int): Only relevant if `check_overlap == True`. The width of the image (in pixels) to compare
                the box coordinates to.

        Returns:
            An array containing the labels of all boxes that are valid.
        '''

        labels = np.copy(labels)

        xmin = self.labels_format['xmin']
        ymin = self.labels_format['ymin']
        xmax = self.labels_format['xmax']
        ymax = self.labels_format['ymax']

        # Record the boxes that pass all checks here.
        requirements_met = np.ones(shape=labels.shape[0], dtype=bool)

        if self.check_degenerate:

            non_degenerate = (labels[:, xmax] > labels[:, xmin]) * (labels[:, ymax] > labels[:, ymin])
            requirements_met *= non_degenerate

        if self.check_min_area:

            min_area_met = (labels[:, xmax] - labels[:, xmin]) * (labels[:, ymax] - labels[:, ymin]) >= self.min_area
            requirements_met *= min_area_met

        if self.check_overlap:

            # Get the lower and upper bounds.
            if isinstance(self.overlap_bounds, BoundGenerator):
                lower, upper = self.overlap_bounds()
            else:
                lower, upper = self.overlap_bounds

            # Compute which boxes are valid.

            if self.overlap_criterion == 'iou':
                # Compute the patch coordinates.
                image_coords = np.array([0, 0, image_width, image_height])
                # Compute the IoU between the patch and all of the ground truth boxes.
                image_boxes_iou = iou(image_coords,
                                      labels[:, [xmin, ymin, xmax, ymax]],
                                      coords='corners',
                                      mode='element-wise',
                                      border_pixels=self.border_pixels)
                requirements_met *= (image_boxes_iou > lower) * (image_boxes_iou <= upper)

            elif self.overlap_criterion == 'area':
                if self.border_pixels == 'half':
                    d = 0
                elif self.border_pixels == 'include':
                    d = 1  # If border pixels are supposed to belong to the bounding boxes, we have to add one pixel to any difference `xmax - xmin` or `ymax - ymin`.
                elif self.border_pixels == 'exclude':
                    d = -1  # If border pixels are not supposed to belong to the bounding boxes, we have to subtract one pixel from any difference `xmax - xmin` or `ymax - ymin`.
                # Compute the areas of the boxes.
                box_areas = (labels[:, xmax] - labels[:, xmin] + d) * (labels[:, ymax] - labels[:, ymin] + d)
                # Compute the intersection areas between the patch and all of the ground truth boxes.
                clipped_boxes = np.copy(labels)
                clipped_boxes[:, [ymin, ymax]] = np.clip(labels[:, [ymin, ymax]], a_min=0, a_max=image_height - 1)
                clipped_boxes[:, [xmin, xmax]] = np.clip(labels[:, [xmin, xmax]], a_min=0, a_max=image_width - 1)
                intersection_areas = (clipped_boxes[:, xmax] - clipped_boxes[:, xmin] + d) * (clipped_boxes[:, ymax] - clipped_boxes[:, ymin] + d)  # `+ d` applies the chosen border-pixel convention to the area computation.
                # Check which boxes meet the overlap requirements.
                if lower == 0.0:
                    mask_lower = intersection_areas > lower * box_areas  # If `self.lower == 0`, we want to make sure that boxes with area 0 don't count, hence the ">" sign instead of the ">=" sign.
                else:
                    mask_lower = intersection_areas >= lower * box_areas  # Especially for the case `self.lower == 1` we want the ">=" sign, otherwise no boxes would count at all.
                mask_upper = intersection_areas <= upper * box_areas
                requirements_met *= mask_lower * mask_upper

            elif self.overlap_criterion == 'center_point':
                # Compute the center points of the boxes.
                cy = (labels[:, ymin] + labels[:, ymax]) / 2
                cx = (labels[:, xmin] + labels[:, xmax]) / 2
                # Check which of the boxes have center points within the cropped patch and remove those that don't.
                requirements_met *= (cy >= 0.0) * (cy <= image_height - 1) * (cx >= 0.0) * (cx <= image_width - 1)

        return labels[requirements_met]
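
A hypothetical usage sketch. This `__call__` belongs to a box-validation class whose constructor is not shown in this listing, so the instance below (`box_filter`) and its configuration are assumptions:

import numpy as np

# Boxes in (class_id, xmin, ymin, xmax, ymax) order, matching a labels_format of
# {'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}.
labels = np.array([[1, 10, 10, 50, 50],    # valid box
                   [1, 60, 60, 60, 80],    # degenerate: xmax == xmin
                   [2, -40, -40, 5, 5]])   # center point lies outside the image

# Assuming `box_filter` was created with check_degenerate=True and
# overlap_criterion='center_point':
valid = box_filter(labels, image_height=300, image_width=300)
print(valid)  # only the boxes that pass all enabled checks remain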