def __call__(self, ground_truth_labels, diagnostics=False):
        # Mapping to define which indices represent which coordinates in the ground truth.
        class_id = 0
        xmin = 1
        ymin = 2
        xmax = 3
        ymax = 4

        batch_size = len(ground_truth_labels)

        ##################################################################################
        # Generate the template for y_encoded.
        ##################################################################################

        y_encoded = self.generate_encoding_template(batch_size=batch_size,
                                                    diagnostics=False)

        ##################################################################################
        # Match ground truth boxes to anchor boxes.
        ##################################################################################
        y_encoded[:, :, self.background_id] = 1  # All boxes are background boxes by default.
        class_vectors = np.eye(self.n_classes)  # An identity matrix used as one-hot class vectors.

        for i in range(batch_size):
            if ground_truth_labels[i].size == 0: continue  # Nothing to match for this batch item.
            labels = ground_truth_labels[i].astype(np.float64)  # The labels for this batch item.

            # Maybe normalize the box coordinates.
            if self.normalize_coords:
                labels[:, [ymin, ymax]] /= self.img_height  # Normalize ymin and ymax relative to the image height.
                labels[:, [xmin, xmax]] /= self.img_width   # Normalize xmin and xmax relative to the image width.

            # Convert the box coordinate format from corners to centroids.
            labels = convert_coordinates(labels,
                                         start_index=xmin,
                                         conversion='corners2centroids')

            classes_one_hot = class_vectors[labels[:, class_id].astype(int)]  # One-hot class IDs for this batch item's boxes.
            labels_one_hot = np.concatenate(
                [classes_one_hot, labels[:, [xmin, ymin, xmax, ymax]]],
                axis=-1)

            # Compute the IoU similarities between all anchor boxes and all ground truth boxes
            # for this batch item: a matrix of shape (num_ground_truth_boxes, num_anchor_boxes).
            similarities = iou(labels[:, [xmin, ymin, xmax, ymax]],
                               y_encoded[i, :, -12:-8])

            # For each ground truth box, get the anchor box with the highest IoU.
            bipartite_matches = match_bipartite_greedy(
                weight_matrix=similarities)

            y_encoded[i, bipartite_matches, :-8] = labels_one_hot  # Write the ground truth data to the matched anchors.

            similarities[:, bipartite_matches] = 0  # Zero the matched anchors' columns so they can't be matched again.

            if self.matching_type == 'multi':  # Perform multi-matching.
                # Get all matches that satisfy the IoU threshold.
                matches = match_multi(weight_matrix=similarities,
                                      threshold=self.pos_iou_threshold)

                y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]

                similarities[:, matches[1]] = 0

            # Anchors that are still background but overlap any ground truth box by at least
            # `neg_iou_limit` become neutral, i.e. neither positive nor background.
            max_background_similarities = np.amax(similarities, axis=0)
            neutral_boxes = np.nonzero(
                max_background_similarities >= self.neg_iou_limit)[0]
            y_encoded[i, neutral_boxes, self.background_id] = 0

        ##################################################################################
        # Convert box coordinates to anchor box offsets.
        ##################################################################################
        # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
        y_encoded[:, :, [-12, -11]] -= y_encoded[:, :, [-8, -7]]
        # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
        y_encoded[:, :, [-12, -11]] /= y_encoded[:, :, [-6, -5]] * y_encoded[:, :, [-4, -3]]
        # w(gt) / w(anchor), h(gt) / h(anchor)
        y_encoded[:, :, [-10, -9]] /= y_encoded[:, :, [-6, -5]]
        # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
        y_encoded[:, :, [-10, -9]] = np.log(y_encoded[:, :, [-10, -9]]) / y_encoded[:, :, [-2, -1]]

        if diagnostics:
            # A copy of y_encoded with the offsets zeroed out, i.e. with the anchor box
            # coordinates kept in place of the ground truth coordinates.
            y_matched_anchors = np.copy(y_encoded)
            y_matched_anchors[:, :, -12:-8] = 0
            return y_encoded, y_matched_anchors
        else:
            return y_encoded  # Shape: (batch_size, total_boxes, [one-hot label, 4 ground truth box values, 4 anchor box values, 4 variances])
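Below is a minimal usage sketch (not part of the original source) showing the expected input and output, assuming `encoder` is an already-constructed `SSDInputEncoder` instance configured elsewhere:

import numpy as np

# A toy batch of two images. Each list entry is a 2D array with one row per ground
# truth box in the format (class_id, xmin, ymin, xmax, ymax); class ID 0 is reserved
# for the background class, so object classes start at 1.
ground_truth_labels = [
    np.array([[1,  20.,  30., 120., 180.],    # image 0: one box of class 1...
              [2, 200.,  40., 300., 140.]]),  # ...and one box of class 2
    np.array([[1,  50.,  60., 150., 160.]]),  # image 1: a single box of class 1
]

y_encoded = encoder(ground_truth_labels)      # `encoder` is assumed, see above
print(y_encoded.shape)                        # (2, total_anchor_boxes, n_classes + 4 + 4 + 4)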
    def __call__(self, ground_truth_labels, diagnostics=False):
        '''
        Converts ground truth bounding box data into a suitable format to train an SSD model.

        Arguments:
            ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array
                for each batch image. Each such array has `k` rows for the `k` ground truth bounding boxes belonging
                to the respective image, and the data for each ground truth bounding box has the format
                `(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format), and `class_id` must be
                an integer greater than 0 for all boxes as class ID 0 is reserved for the background class.
            diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned,
                but also a copy of it with anchor box coordinates in place of the ground truth coordinates.
                This can be very useful if you want to visualize which anchor boxes got matched to which ground truth
                boxes.

        Returns:
            `y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the
            ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the
            model per image, and the classes are one-hot-encoded. The four elements after the class vectors in
            the last axis are the box coordinates, the next four elements after that are just dummy elements, and
            the last four elements are the variances.
        '''

        # Mapping to define which indices represent which coordinates in the ground truth.
        class_id = 0
        xmin = 1
        ymin = 2
        xmax = 3
        ymax = 4

        batch_size = len(ground_truth_labels)

        ##################################################################################
        # Generate the template for y_encoded.
        ##################################################################################

        y_encoded = self.generate_encoding_template(batch_size=batch_size,
                                                    diagnostics=False)

        ##################################################################################
        # Match ground truth boxes to anchor boxes.
        ##################################################################################

        # Match the ground truth boxes to the anchor boxes. Every anchor box that does not have
        # a ground truth match and for which the maximal IoU overlap with any ground truth box is less
        # than or equal to `neg_iou_limit` will be a negative (background) box.

        y_encoded[:, :, self.background_id] = 1  # All boxes are background boxes by default.
        n_boxes = y_encoded.shape[1]  # The total number of boxes that the model predicts per batch item
        class_vectors = np.eye(self.n_classes)  # An identity matrix that we'll use as one-hot class vectors

        for i in range(batch_size):  # For each batch item...

            if ground_truth_labels[i].size == 0:
                continue  # If there is no ground truth for this batch item, there is nothing to match.
            labels = ground_truth_labels[i].astype(np.float64)  # The labels for this batch item

            # Check for degenerate ground truth bounding boxes before attempting any computations.
            if np.any(labels[:, [xmax]] - labels[:, [xmin]] <= 0) or np.any(labels[:, [ymax]] - labels[:, [ymin]] <= 0):
                raise DegenerateBoxError(
                    "SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} "
                    "with bounding boxes {}, i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. "
                    "Degenerate ground truth bounding boxes will lead to NaN errors during the training.".format(i, labels))

            # Maybe normalize the box coordinates.
            if self.normalize_coords:
                labels[:, [ymin, ymax]] /= self.img_height  # Normalize ymin and ymax relative to the image height
                labels[:, [xmin, xmax]] /= self.img_width   # Normalize xmin and xmax relative to the image width

            # Maybe convert the box coordinate format.
            if self.coords == 'centroids':
                labels = convert_coordinates(labels,
                                             start_index=xmin,
                                             conversion='corners2centroids',
                                             border_pixels=self.border_pixels)
            elif self.coords == 'minmax':
                labels = convert_coordinates(labels,
                                             start_index=xmin,
                                             conversion='corners2minmax')

            classes_one_hot = class_vectors[labels[:, class_id].astype(int)]  # The one-hot class IDs for the ground truth boxes of this batch item
            labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin, ymin, xmax, ymax]]],
                                            axis=-1)  # The one-hot version of the labels for this batch item

            # Compute the IoU similarities between all anchor boxes and all ground truth boxes for this batch item.
            # This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`.
            similarities = iou(labels[:, [xmin, ymin, xmax, ymax]],
                               y_encoded[i, :, -12:-8],
                               coords=self.coords,
                               mode='outer_product',
                               border_pixels=self.border_pixels)

            # First: Do bipartite matching, i.e. match each ground truth box to the one anchor box with the highest IoU.
            #        This ensures that each ground truth box will have at least one good match.

            # For each ground truth box, get the anchor box to match with it.
            bipartite_matches = match_bipartite_greedy(
                weight_matrix=similarities)

            # Write the ground truth data to the matched anchor boxes.
            y_encoded[i, bipartite_matches, :-8] = labels_one_hot

            # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
            similarities[:, bipartite_matches] = 0

            # Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to its most similar
            #         ground truth box with an IoU of at least `pos_iou_threshold`, or not matched if there is no
            #         such ground truth box.

            if self.matching_type == 'multi':

                # Get all matches that satisfy the IoU threshold.
                matches = match_multi(weight_matrix=similarities,
                                      threshold=self.pos_iou_threshold)

                # Write the ground truth data to the matched anchor boxes.
                y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]

                # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
                similarities[:, matches[1]] = 0

            # Third: Now after the matching is done, all negative (background) anchor boxes that have
            #        an IoU of `neg_iou_limit` or more with any ground truth box will be set to neutral,
            #        i.e. they will no longer be background boxes. These anchors are "too close" to a
            #        ground truth box to be valid background boxes.

            max_background_similarities = np.amax(similarities, axis=0)
            neutral_boxes = np.nonzero(
                max_background_similarities >= self.neg_iou_limit)[0]
            y_encoded[i, neutral_boxes, self.background_id] = 0

        ##################################################################################
        # Convert box coordinates to anchor box offsets.
        ##################################################################################

        if self.coords == 'centroids':
            # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
            y_encoded[:, :, [-12, -11]] -= y_encoded[:, :, [-8, -7]]
            # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
            y_encoded[:, :, [-12, -11]] /= y_encoded[:, :, [-6, -5]] * y_encoded[:, :, [-4, -3]]
            # w(gt) / w(anchor), h(gt) / h(anchor)
            y_encoded[:, :, [-10, -9]] /= y_encoded[:, :, [-6, -5]]
            # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
            y_encoded[:, :, [-10, -9]] = np.log(y_encoded[:, :, [-10, -9]]) / y_encoded[:, :, [-2, -1]]
        elif self.coords == 'corners':
            # (gt - anchor) for all four coordinates
            y_encoded[:, :, -12:-8] -= y_encoded[:, :, -8:-4]
            # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:, :, [-12, -10]] /= np.expand_dims(y_encoded[:, :, -6] - y_encoded[:, :, -8], axis=-1)
            # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:, :, [-11, -9]] /= np.expand_dims(y_encoded[:, :, -5] - y_encoded[:, :, -7], axis=-1)
            # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' is w or h respectively
            y_encoded[:, :, -12:-8] /= y_encoded[:, :, -4:]
        elif self.coords == 'minmax':
            # (gt - anchor) for all four coordinates
            y_encoded[:, :, -12:-8] -= y_encoded[:, :, -8:-4]
            # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:, :, [-12, -11]] /= np.expand_dims(y_encoded[:, :, -7] - y_encoded[:, :, -8], axis=-1)
            # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:, :, [-10, -9]] /= np.expand_dims(y_encoded[:, :, -5] - y_encoded[:, :, -6], axis=-1)
            # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' is w or h respectively
            y_encoded[:, :, -12:-8] /= y_encoded[:, :, -4:]

        if diagnostics:
            # Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched
            # to a ground truth box, but keeping the anchor box coordinates).
            y_matched_anchors = np.copy(y_encoded)
            y_matched_anchors[:, :, -12:-8] = 0  # Keeping the anchor box coordinates means setting the offsets to zero.
            return y_encoded, y_matched_anchors
        else:
            return y_encoded
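To make the `centroids` offset arithmetic above concrete, here is a small self-contained sketch for a single matched box. The variance values are assumptions (0.1/0.2 are common SSD defaults), not values read from this encoder:

import numpy as np

# One ground truth box and its matched anchor, both in centroids format (cx, cy, w, h).
gt     = np.array([0.52, 0.48, 0.30, 0.40])
anchor = np.array([0.50, 0.50, 0.25, 0.35])
var    = np.array([0.10, 0.10, 0.20, 0.20])  # assumed cx, cy, w, h variances

offsets = np.empty(4)
offsets[0] = (gt[0] - anchor[0]) / anchor[2] / var[0]  # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance
offsets[1] = (gt[1] - anchor[1]) / anchor[3] / var[1]  # (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
offsets[2] = np.log(gt[2] / anchor[2]) / var[2]        # ln(w(gt) / w(anchor)) / w_variance
offsets[3] = np.log(gt[3] / anchor[3]) / var[3]        # ln(h(gt) / h(anchor)) / h_variance
print(np.round(offsets, 3))                            # [ 0.8   -0.571  0.912  0.668]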
Example #3
    def __call__(self, ground_truth_labels):
        '''
        Converts the ground truth data into the format required for training an SSD model.
        Arguments: ground_truth_labels (list): one array per image, rows of (class_id, xmin, ymin, xmax, ymax)
        Returns: y_encoded, shape (batch_size, #boxes, #classes + 4 + 4 + 4)
        '''

        # 1. Index order of the ground truth labels.
        class_id = 0
        xmin = 1
        ymin = 2
        xmax = 3
        ymax = 4

        batch_size = len(ground_truth_labels)

        # Build the anchor box template of shape (batch_size, #boxes, #classes + 12).
        y_encoded = self.generate_encoding_template(batch_size=batch_size)

        # Match ground truth boxes to anchor boxes.
        y_encoded[:, :, self.background_id] = 1  # All boxes are background by default.
        n_boxes = y_encoded.shape[1]
        class_vectors = np.eye(self.n_classes)  # One-hot class vectors.

        for i in range(batch_size):  # For each batch item...
            if ground_truth_labels[i].size == 0:
                continue  # If there is no ground truth for this batch item, there is nothing to match.
            labels = ground_truth_labels[i].astype(np.float64)  # The labels for this batch item

            # Check for degenerate ground truth bounding boxes before attempting any computations.
            if np.any(labels[:, [xmax]] - labels[:, [xmin]] <= 0) or np.any(labels[:, [ymax]] - labels[:, [ymin]] <= 0):
                raise DegenerateBoxError(
                    "SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} "
                    "with bounding boxes {}, i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. "
                    "Degenerate ground truth bounding boxes will lead to NaN errors during the training.".format(i, labels))

            # Maybe normalize the box coordinates.
            if self.normalize_coords:
                labels[:, [ymin, ymax]] /= self.img_height
                labels[:, [xmin, xmax]] /= self.img_width

            # Maybe convert the box coordinate format.
            if self.coords == 'centroids':
                labels = convert_coordinates(labels,
                                             start_index=xmin,
                                             conversion='corners2centroids',
                                             border_pixels=self.border_pixels)
            elif self.coords == 'minmax':
                labels = convert_coordinates(labels,
                                             start_index=xmin,
                                             conversion='corners2minmax')

            classes_one_hot = class_vectors[labels[:, class_id].astype(int)]  # The one-hot class IDs for the ground truth boxes of this batch item
            labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin, ymin, xmax, ymax]]],
                                            axis=-1)  # The one-hot version of the labels for this batch item

            # Compute the IoU similarities, a matrix of shape (num_ground_truth_boxes, num_anchor_boxes).
            similarities = iou(labels[:, [xmin, ymin, xmax, ymax]],
                               y_encoded[i, :, -12:-8],
                               coords=self.coords,
                               mode='outer_product',
                               border_pixels=self.border_pixels)

            # 1. Find the one default box with the highest IoU for each ground truth box. This guarantees
            #    that every ground truth box is matched to at least one default box.
            bipartite_matches = match_bipartite_greedy(
                weight_matrix=similarities)
            # Write the ground truth labels to the matched default boxes.
            y_encoded[i, bipartite_matches, :-8] = labels_one_hot
            # Zero out the matched default boxes' columns to mark them as matched.
            similarities[:, bipartite_matches] = 0

            # 2. Each remaining default box seeks the ground truth box with which it has the highest IoU;
            #    if that IoU exceeds the threshold pos_iou_threshold, the match succeeds.

            if self.matching_type == 'multi':
                matches = match_multi(weight_matrix=similarities,
                                      threshold=self.pos_iou_threshold)
                y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]
                similarities[:, matches[1]] = 0

            # Finally: any remaining box whose IoU with a ground truth box reaches neg_iou_limit is set to
            # neutral; it is too close to a ground truth box to serve as a background example during training.
            max_background_similarities = np.amax(similarities, axis=0)
            neutral_boxes = np.nonzero(
                max_background_similarities >= self.neg_iou_limit)[0]
            y_encoded[i, neutral_boxes, self.background_id] = 0

        # 2. Convert the coordinates into offsets.
        if self.coords == 'centroids':
            # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
            y_encoded[:, :, [-12, -11]] -= y_encoded[:, :, [-8, -7]]
            # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
            y_encoded[:, :, [-12, -11]] /= y_encoded[:, :, [-6, -5]] * y_encoded[:, :, [-4, -3]]
            # w(gt) / w(anchor), h(gt) / h(anchor)
            y_encoded[:, :, [-10, -9]] /= y_encoded[:, :, [-6, -5]]
            # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
            y_encoded[:, :, [-10, -9]] = np.log(y_encoded[:, :, [-10, -9]]) / y_encoded[:, :, [-2, -1]]
        elif self.coords == 'corners':
            # (gt - anchor) for all four coordinates
            y_encoded[:, :, -12:-8] -= y_encoded[:, :, -8:-4]
            # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:, :, [-12, -10]] /= np.expand_dims(y_encoded[:, :, -6] - y_encoded[:, :, -8], axis=-1)
            # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:, :, [-11, -9]] /= np.expand_dims(y_encoded[:, :, -5] - y_encoded[:, :, -7], axis=-1)
            # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' is w or h respectively
            y_encoded[:, :, -12:-8] /= y_encoded[:, :, -4:]
        elif self.coords == 'minmax':
            # (gt - anchor) for all four coordinates
            y_encoded[:, :, -12:-8] -= y_encoded[:, :, -8:-4]
            # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:, :, [-12, -11]] /= np.expand_dims(y_encoded[:, :, -7] - y_encoded[:, :, -8], axis=-1)
            # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:, :, [-10, -9]] /= np.expand_dims(y_encoded[:, :, -5] - y_encoded[:, :, -6], axis=-1)
            # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' is w or h respectively
            y_encoded[:, :, -12:-8] /= y_encoded[:, :, -4:]
        return y_encoded
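The greedy bipartite matching in step 1 can be illustrated in plain NumPy. This is only a sketch of the behavior implied by the calls above, not the actual `match_bipartite_greedy` implementation:

import numpy as np

def greedy_bipartite_match(weight_matrix):
    # Repeatedly pick the globally highest IoU, match that ground truth box to that
    # anchor, then remove both from further consideration, so each ground truth box
    # and each anchor is used at most once.
    weights = np.copy(weight_matrix)       # shape (num_gt, num_anchors)
    num_gt = weights.shape[0]
    matches = np.zeros(num_gt, dtype=int)  # matches[g] = index of the anchor matched to gt box g
    for _ in range(num_gt):
        g, a = np.unravel_index(np.argmax(weights), weights.shape)
        matches[g] = a
        weights[g, :] = -1.0  # this ground truth box is matched
        weights[:, a] = -1.0  # this anchor cannot be matched again
    return matches

# Toy IoU matrix: 2 ground truth boxes vs. 4 anchors.
iou_matrix = np.array([[0.1, 0.7, 0.3, 0.0],
                       [0.2, 0.6, 0.1, 0.4]])
print(greedy_bipartite_match(iou_matrix))  # [1 3]: gt 0 -> anchor 1, gt 1 -> anchor 3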