Code Example #1
def preview_anchor_boxes():
    # Visualize the anchor boxes encoded in the label tensors produced by `train_generator`.
    # Assumes `cv2`, `np`, `convert_coordinates`, `img_width` and `img_height` are available at module level,
    # and that the labels yielded in this position are the SSD-encoded label tensors.
    for _, _, original_images, original_labels in train_generator:
        for i in range(len(original_images)):
            original_image = original_images[i]
            original_label = original_labels[i]
            # Columns [-8:-4] of the encoded label tensor hold the anchor box coordinates in centroids format.
            anchor_boxes = original_label[:, -8:-4]
            anchor_boxes = convert_coordinates(anchor_boxes,
                                               start_index=0,
                                               conversion='centroids2corners',
                                               border_pixels='half')
            # Scale the normalized coordinates back to pixel coordinates and round them.
            anchor_boxes[:, [0, 2]] *= img_width
            anchor_boxes[:, [1, 3]] *= img_height
            anchor_boxes = np.round(anchor_boxes).astype('int')
            print(anchor_boxes[0])
            image = original_image.astype('uint8')
            for anchor_box in anchor_boxes:
                cv2.rectangle(image, (anchor_box[0], anchor_box[1]), (anchor_box[2], anchor_box[3]), (0, 255, 0), 2)
            cv2.namedWindow('image', cv2.WINDOW_NORMAL)
            cv2.imshow('image', image)
            cv2.waitKey(0)
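For reference, the `convert_coordinates(..., conversion='centroids2corners')` call above turns each
`(cx, cy, w, h)` box into `(xmin, ymin, xmax, ymax)`. A minimal sketch of that conversion (not the
library implementation, just the underlying arithmetic):

import numpy as np

def centroids_to_corners(boxes):
    # boxes: array of shape (n, 4) in (cx, cy, w, h) format.
    corners = np.empty_like(boxes)
    corners[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.0  # xmin = cx - w/2
    corners[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.0  # ymin = cy - h/2
    corners[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.0  # xmax = cx + w/2
    corners[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0  # ymax = cy + h/2
    return corners

# e.g. a normalized anchor (0.5, 0.5, 0.2, 0.3) becomes (0.4, 0.35, 0.6, 0.65), which the preview
# function then scales by `img_width`/`img_height` and rounds to pixel coordinates.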
Code Example #2
    def __call__(self, ground_truth_labels, diagnostics=False):
        '''
        Converts ground truth bounding box data into a suitable format to train an SSD model.

        Arguments:
            ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array
                for each batch image. Each such array has `k` rows for the `k` ground truth bounding boxes belonging
                to the respective image, and the data for each ground truth bounding box has the format
                `(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format), and `class_id` must be
                an integer greater than 0 for all boxes as class ID 0 is reserved for the background class.
            diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned,
                but also a copy of it with anchor box coordinates in place of the ground truth coordinates.
                This can be very useful if you want to visualize which anchor boxes got matched to which ground truth
                boxes.

        Returns:
            `y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the
            ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the
            model per image, and the classes are one-hot-encoded. The four elements after the class vectors in
            the last axis are the box coordinates, the next four elements after that are just dummy elements, and
            the last four elements are the variances.
        '''

        # Mapping to define which indices represent which coordinates in the ground truth.
        class_id = 0
        xmin = 1
        ymin = 2
        xmax = 3
        ymax = 4
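
        # For reference, the layout of the last axis of `y_encoded` is:
        #   [0:n_classes]   one-hot class vector
        #   [-12:-8]        ground truth box coordinates (initialized with the anchor box coordinates
        #                   by the template, overwritten for matched boxes, then converted to offsets below)
        #   [-8:-4]         anchor box coordinates
        #   [-4:]           variances
        # e.g. with `n_classes = 21` (20 object classes plus background) the last axis has 21 + 12 = 33 elements.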

        batch_size = len(ground_truth_labels)

        ##################################################################################
        # Generate the template for y_encoded.
        ##################################################################################

        y_encoded = self.generate_encoding_template(batch_size=batch_size,
                                                    diagnostics=False)

        ##################################################################################
        # Match ground truth boxes to anchor boxes.
        ##################################################################################

        # Match the ground truth boxes to the anchor boxes. Every anchor box that does not have
        # a ground truth match and for which the maximal IoU overlap with any ground truth box is less
        # than or equal to `neg_iou_limit` will be a negative (background) box.

        y_encoded[:, :, self.
                  background_id] = 1  # All boxes are background boxes by default.
        n_boxes = y_encoded.shape[
            1]  # The total number of boxes that the model predicts per batch item
        class_vectors = np.eye(
            self.n_classes
        )  # An identity matrix that we'll use as one-hot class vectors
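        # e.g. with `n_classes = 3`, `class_vectors[2]` is `[0., 0., 1.]`.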

        for i in range(batch_size):  # For each batch item...

            if ground_truth_labels[i].size == 0:
                continue  # If there is no ground truth for this batch item, there is nothing to match.
            labels = ground_truth_labels[i].astype(
                np.float)  # The labels for this batch item

            # Check for degenerate ground truth bounding boxes before attempting any computations.
            if np.any(labels[:, [xmax]] - labels[:, [xmin]] <= 0) or np.any(
                    labels[:, [ymax]] - labels[:, [ymin]] <= 0):
                raise DegenerateBoxError(
                    "SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, "
                    .format(i, labels) +
                    "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. Degenerate ground truth "
                    +
                    "bounding boxes will lead to NaN errors during the training."
                )

            # Maybe normalize the box coordinates.
            if self.normalize_coords:
                labels[:, [
                    ymin, ymax
                ]] /= self.img_height  # Normalize ymin and ymax relative to the image height
                labels[:, [
                    xmin, xmax
                ]] /= self.img_width  # Normalize xmin and xmax relative to the image width

            # Maybe convert the box coordinate format.
            if self.coords == 'centroids':
                labels = convert_coordinates(labels,
                                             start_index=xmin,
                                             conversion='corners2centroids',
                                             border_pixels=self.border_pixels)
            elif self.coords == 'minmax':
                labels = convert_coordinates(labels,
                                             start_index=xmin,
                                             conversion='corners2minmax')

            classes_one_hot = class_vectors[labels[:, class_id].astype(
                np.int
            )]  # The one-hot class IDs for the ground truth boxes of this batch item
            labels_one_hot = np.concatenate(
                [classes_one_hot, labels[:, [xmin, ymin, xmax, ymax]]],
                axis=-1
            )  # The one-hot version of the labels for this batch item

            # Compute the IoU similarities between all anchor boxes and all ground truth boxes for this batch item.
            # This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`.
            similarities = iou(labels[:, [xmin, ymin, xmax, ymax]],
                               y_encoded[i, :, -12:-8],
                               coords=self.coords,
                               mode='outer_product',
                               border_pixels=self.border_pixels)

            # First: Do bipartite matching, i.e. match each ground truth box to the one anchor box with the highest IoU.
            #        This ensures that each ground truth box will have at least one good match.

            # For each ground truth box, get the anchor box to match with it.
            bipartite_matches = match_bipartite_greedy(
                weight_matrix=similarities)
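            # For illustration (assuming the matcher greedily picks the globally highest-IoU pair first):
            # with 2 ground truth boxes, 3 anchor boxes and similarities [[0.1, 0.7, 0.3],
            #                                                             [0.6, 0.8, 0.2]],
            # gt box 1 is matched to anchor 1 (IoU 0.8) and gt box 0 to the best remaining
            # anchor, anchor 2 (IoU 0.3), so `bipartite_matches` would be [2, 1].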

            # Write the ground truth data to the matched anchor boxes.
            y_encoded[i, bipartite_matches, :-8] = labels_one_hot

            # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
            similarities[:, bipartite_matches] = 0

            # Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to its most similar
            #         ground truth box with an IoU of at least `pos_iou_threshold`, or not matched if there is no
            #         such ground truth box.

            if self.matching_type == 'multi':

                # Get all matches that satisfy the IoU threshold.
                matches = match_multi(weight_matrix=similarities,
                                      threshold=self.pos_iou_threshold)

                # Write the ground truth data to the matched anchor boxes.
                y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]

                # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
                similarities[:, matches[1]] = 0

            # Third: Now after the matching is done, all negative (background) anchor boxes that have
            #        an IoU of `neg_iou_limit` or more with any ground truth box will be set to neutral,
            #        i.e. they will no longer be background boxes. These anchors are "too close" to a
            #        ground truth box to be valid background boxes.

            max_background_similarities = np.amax(similarities, axis=0)
            neutral_boxes = np.nonzero(
                max_background_similarities >= self.neg_iou_limit)[0]
            y_encoded[i, neutral_boxes, self.background_id] = 0

        ##################################################################################
        # Convert box coordinates to anchor box offsets.
        ##################################################################################

        if self.coords == 'centroids':
            y_encoded[:, :, [-12, -11]] -= y_encoded[:, :, [
                -8, -7
            ]]  # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
            y_encoded[:, :, [
                -12, -11
            ]] /= y_encoded[:, :, [-6, -5]] * y_encoded[:, :, [
                -4, -3
            ]]  # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
            y_encoded[:, :, [-10, -9]] /= y_encoded[:, :, [
                -6, -5
            ]]  # w(gt) / w(anchor), h(gt) / h(anchor)
            y_encoded[:, :, [-10, -9]] = np.log(
                y_encoded[:, :, [-10, -9]]
            ) / y_encoded[:, :, [
                -2, -1
            ]]  # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
        elif self.coords == 'corners':
            y_encoded[:, :, -12:
                      -8] -= y_encoded[:, :, -8:
                                       -4]  # (gt - anchor) for all four coordinates
            y_encoded[:, :, [-12, -10]] /= np.expand_dims(
                y_encoded[:, :, -6] - y_encoded[:, :, -8], axis=-1
            )  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:, :, [-11, -9]] /= np.expand_dims(
                y_encoded[:, :, -5] - y_encoded[:, :, -7], axis=-1
            )  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:, :, -12:
                      -8] /= y_encoded[:, :,
                                       -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
        elif self.coords == 'minmax':
            y_encoded[:, :, -12:
                      -8] -= y_encoded[:, :, -8:
                                       -4]  # (gt - anchor) for all four coordinates
            y_encoded[:, :, [-12, -11]] /= np.expand_dims(
                y_encoded[:, :, -7] - y_encoded[:, :, -8], axis=-1
            )  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:, :, [-10, -9]] /= np.expand_dims(
                y_encoded[:, :, -5] - y_encoded[:, :, -6], axis=-1
            )  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:, :, -12:
                      -8] /= y_encoded[:, :,
                                       -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively

        if diagnostics:
            # Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a ground truth box, but keeping the anchor box coordinates).
            y_matched_anchors = np.copy(y_encoded)
            y_matched_anchors[:, :, -12:
                              -8] = 0  # Keeping the anchor box coordinates means setting the offsets to zero.
            return y_encoded, y_matched_anchors
        else:
            return y_encoded
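A minimal usage sketch for the encoder above, assuming `ssd_input_encoder` is an already-constructed
`SSDInputEncoder` instance (its constructor is not shown in this example):

import numpy as np

# One batch item with a single ground truth box in `(class_id, xmin, ymin, xmax, ymax)` format,
# here for a hypothetical 300x300 input image.
ground_truth_labels = [np.array([[12, 41, 62, 199, 270]])]

y_encoded = ssd_input_encoder(ground_truth_labels)
# `y_encoded` has shape `(1, #boxes, n_classes + 4 + 4 + 4)` and serves as the training target,
# typically produced inside a data generator alongside the corresponding batch of images.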
Code Example #3
    def call(self, x, mask=None):
        """
        Return an anchor box tensor based on the shape of the input tensor.
        The logic implemented here is identical to the logic of function `generate_anchor_boxes_for_layer` in the module
        `ssd_box_encode_decode_utils.py`.
        Note that this tensor does not participate in any graph computations at runtime.
        It is being created as a constant once during graph creation and is just being output along with the rest of the
        model output during runtime.
        Because of this, all logic is implemented as Numpy array operations and it is sufficient to convert the
        resulting Numpy array into a Keras tensor at the very end before outputting it.

        Arguments:
            x (tensor): 4D tensor of shape
                `(batch, channels, height, width)` if `dim_ordering = 'th'`
                or `(batch, height, width, channels)` if `dim_ordering = 'tf'`.
                The input for this layer must be the output of the localization predictor layer.
            mask: Not used by this layer; it is only part of the signature because the standard
                Keras `Layer.call()` interface passes it.
        """

        # Compute box width and height for each aspect ratio
        # The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`.
        size = min(self.img_height, self.img_width)
        # Compute the box widths and heights for all aspect ratios
        wh_list = []
        for aspect_ratio in self.aspect_ratios:
            if aspect_ratio == 1:
                # Compute the regular anchor box for aspect ratio 1.
                box_height = box_width = self.this_scale * size
                wh_list.append((box_width, box_height))
                if self.two_boxes_for_ar1:
                    # Compute one slightly larger version using the geometric mean of this scale value and the next.
                    # NOTE: Geometric mean; this is why two boxes are generated when the aspect ratio is 1.
                    box_height = box_width = np.sqrt(
                        self.this_scale * self.next_scale) * size
                    wh_list.append((box_width, box_height))
            else:
                box_height = self.this_scale * size / np.sqrt(aspect_ratio)
                box_width = self.this_scale * size * np.sqrt(aspect_ratio)
                wh_list.append((box_width, box_height))
        # Shape is (n_boxes, 2)
        wh_list = np.array(wh_list)
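        # For illustration (hypothetical values): with size = 300, this_scale = 0.1, next_scale = 0.2,
        # aspect_ratios = [0.5, 1.0, 2.0] and two_boxes_for_ar1 = True, wh_list would be
        # [(~21.2, ~42.4), (30, 30), (~42.4, ~42.4), (~42.4, ~21.2)] in (width, height) order.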

        # We need the shape of the input tensor
        if K.image_dim_ordering() == 'tf':
            # FIXME
            batch_size, feature_map_height, feature_map_width, feature_map_channels = K.int_shape(
                x)
            # batch_size, feature_map_height, feature_map_width, feature_map_channels = x._keras_shape
        else:
            # Not yet relevant since TensorFlow is the only supported backend right now,
            # but it can't harm to have this in here for the future
            batch_size, feature_map_height, feature_map_width, feature_map_channels = K.int_shape(
                x)
            # batch_size, feature_map_channels, feature_map_height, feature_map_width = x._keras_shape

        ##################################################################################
        # Compute the grid of box center points. They are identical for all aspect ratios.
        ##################################################################################

        # 1. Compute the step sizes,
        # i.e. how far apart the anchor box center points will be vertically and horizontally.
        if self.this_steps is None:
            # For example, for box4 with img_height = img_width = 512, the feature map is
            # 512 / 2^3 = 64 cells on each side, so step_height = step_width = 512 / 64 = 8.
            # I.e. the feature map is a 64x64 grid where each cell covers 8x8 pixels of the
            # original image, and each step moves by one cell.
            step_height = self.img_height / feature_map_height
            step_width = self.img_width / feature_map_width
        else:
            if isinstance(self.this_steps, (list, tuple)):
                step_height = self.this_steps[0]
                step_width = self.this_steps[1]
            # Equivalent to `elif isinstance(self.this_steps, (int, float)):`
            else:
                step_height = self.this_steps
                step_width = self.this_steps

        # 2. Compute the offsets, i.e.
        # at what pixel values the first anchor box center point will be from the top and from the left of the image.
        if self.this_offsets is None:
            offset_height = 0.5
            offset_width = 0.5
        else:
            if isinstance(self.this_offsets, (list, tuple)):
                offset_height = self.this_offsets[0]
                offset_width = self.this_offsets[1]
            # Equivalent to `elif isinstance(self.this_offsets, (int, float)):`
            else:
                offset_height = self.this_offsets
                offset_width = self.this_offsets

        # 3. Now that we have the offsets and step sizes, compute the grid of anchor box center points.
        # np.linspace: see https://docs.scipy.org/doc/numpy/reference/generated/numpy.linspace.html
        # The first argument `start` is the beginning of the interval, the second argument `stop` is its end,
        # and the third argument `num` is the number of samples; `stop` is included by default.
        # E.g. for box4: np.linspace(0.5 * 8, 63.5 * 8, 64) gives cy = np.array([4, 12, ..., 500, 508]).
        cy = np.linspace(offset_height * step_height,
                         (offset_height + feature_map_height - 1) *
                         step_height, feature_map_height)
        # E.g. for box4: np.linspace(0.5 * 8, 63.5 * 8, 64) gives cx = np.array([4, 12, ..., 500, 508]).
        cx = np.linspace(offset_width * step_width,
                         (offset_width + feature_map_width - 1) * step_width,
                         feature_map_width)
        # E.g. for box4: cx_grid = np.array([[4, 12, ..., 508], [4, 12, ..., 508], ..., [4, 12, ..., 508]]) with shape (64, 64),
        # and cy_grid = np.array([[4, 4, ..., 4], [12, 12, ..., 12], ..., [508, 508, ..., 508]]) with shape (64, 64).
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        # This is necessary for np.tile() to do what we want further down
        # E.g. for box4, the shape becomes (64, 64, 1).
        cx_grid = np.expand_dims(cx_grid, -1)
        cy_grid = np.expand_dims(cy_grid, -1)

        # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        # where the last dimension will contain `(cx, cy, w, h)`
        boxes_tensor = np.zeros(
            (feature_map_height, feature_map_width, self.n_boxes, 4))
        # np.tile() returns arrays of shape (feature_map_height, feature_map_width, n_boxes).
        # Set cx
        boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, self.n_boxes))
        # Set cy
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, self.n_boxes))
        # Set w
        boxes_tensor[:, :, :, 2] = wh_list[:, 0]
        # Set h
        boxes_tensor[:, :, :, 3] = wh_list[:, 1]

        # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
        # The conversion is done so that the coordinates can be clipped below.
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='centroids2corners')

        # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries
        if self.clip_boxes:
            x_coords = boxes_tensor[:, :, :, [0, 2]]
            x_coords[x_coords >= self.img_width] = self.img_width - 1
            x_coords[x_coords < 0] = 0
            # Note that TensorFlow tensors do not support this kind of in-place assignment.
            boxes_tensor[:, :, :, [0, 2]] = x_coords
            y_coords = boxes_tensor[:, :, :, [1, 3]]
            y_coords[y_coords >= self.img_height] = self.img_height - 1
            y_coords[y_coords < 0] = 0
            boxes_tensor[:, :, :, [1, 3]] = y_coords

        # If `normalize_coords` is enabled, normalize the coordinates to be within [0,1]
        if self.normalize_coords:
            boxes_tensor[:, :, :, [0, 2]] /= self.img_width
            boxes_tensor[:, :, :, [1, 3]] /= self.img_height

        # TODO: Implement box limiting directly for `(cx, cy, w, h)`
        #  so that we don't have to unnecessarily convert back and forth.
        if self.coords == 'centroids':
            # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2centroids',
                                               border_pixels='half')
        elif self.coords == 'minmax':
            # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2minmax',
                                               border_pixels='half')

        # Create a tensor to contain the variances and append it to `boxes_tensor`.
        # This tensor has the same shape as `boxes_tensor`
        # and simply contains the same 4 variance values for every position in the last axis.
        # Has shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        variances_tensor = np.zeros_like(boxes_tensor)
        # Long live broadcasting
        variances_tensor += self.variances
        # Now `boxes_tensor` becomes a tensor of shape `(feature_map_height, feature_map_width, n_boxes, 8)`
        boxes_tensor = np.concatenate((boxes_tensor, variances_tensor),
                                      axis=-1)
        # Now prepend one dimension to `boxes_tensor` to account for the batch size and tile it along that dimension.
        # The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)`
        boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
        boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'),
                              (K.shape(x)[0], 1, 1, 1, 1))

        return boxes_tensor
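A minimal sketch of how a layer with this `call()` method is typically used inside a model (the constructor
arguments shown here are assumptions; the actual `__init__()` signature lives elsewhere in the layer class):

# `loc4` is a hypothetical localization predictor output of shape `(batch, 64, 64, n_boxes * 4)`
# for a 512x512 input image.
anchors4 = AnchorBoxes(img_height=512,
                       img_width=512,
                       this_scale=0.1,
                       next_scale=0.2,
                       aspect_ratios=[0.5, 1.0, 2.0],
                       two_boxes_for_ar1=True)(loc4)
# `anchors4` then has shape `(batch, 64, 64, n_boxes, 8)`: four anchor box coordinates plus four variances per box.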
Code Example #4
File: AnchorBoxes.py  Project: Eva05/ILFU
        def call(self, x, mask=None):

            #STAGE 1

            # Compute box width and height for each aspect ratio
            # The shorter side of the image will be used to compute width and height using {scale} and {aspect_ratios}

            size = min(self.img_height, self.img_width)

            # Compute the box widths and heights for all aspect ratios
            wh_list = []

            for ar in self.aspect_ratios:

                if (ar == 1):

                    #Compute the regular anchor box for aspect ratio 1

                    box_height = box_width = self.this_scale * size
                    wh_list.append((box_width, box_height))

                    if self.two_boxes_for_ar1:
                        # Compute one slightly larger version for mode {two_boxes_for_ar1}
                        # using the geometric mean of this scale value and the next.
                        box_height = box_width = np.sqrt(
                            self.this_scale * self.next_scale) * size
                        wh_list.append((box_width, box_height))

                else:
                    box_height = self.this_scale * size / np.sqrt(ar)
                    box_width = self.this_scale * size * np.sqrt(ar)
                    wh_list.append((box_width, box_height))

            wh_list = np.array(wh_list)

            #STAGE 1 -END

            #STAGE 2

            #Compute the grid of box center points. They are identical for all aspect ratios.

            # The feature map's spatial size is needed below; get it from the shape of the input tensor
            # (this assumes the TensorFlow 'channels last' dimension ordering).
            batch_size, feature_map_height, feature_map_width, feature_map_channels = x.get_shape().as_list()

            #Compute the step sizes, i.e. how far apart the anchor box center points will
            #be vertically and horizontally

            if (self.this_steps is None):

                step_height = self.img_height / feature_map_height
                step_width = self.img_width / feature_map_width

            else:
                if isinstance(self.this_steps,
                              (list, tuple)) and (len(self.this_steps) == 2):
                    step_height = self.this_steps[0]
                    step_width = self.this_steps[1]

                elif isinstance(self.this_steps, (int, float)):

                    step_height = self.this_steps
                    step_width = self.this_steps

            #Compute the offsets , i.e. at what pixel values the first anchor box center point
            #will be from the top and from the left of the image

            if (self.this_offsets is None):

                offset_height = 0.5
                offset_width = 0.5
            else:

                if isinstance(self.this_offsets,
                              (list, tuple)) and (len(self.this_offsets) == 2):

                    offset_height = self.this_offsets[0]
                    offset_width = self.this_offsets[1]

                elif isinstance(self.this_offsets, (int, float)):

                    offset_height = self.this_offsets
                    offset_width = self.this_offsets

            # STAGE 2 -END

            #STAGE 3

            #Now that we have the offsets and step sizes , compute the grid of anchor
            #box center points

            cy = np.linspace(offset_height * step_height,
                             (offset_height + feature_map_height - 1) *
                             step_height, feature_map_height)
            cx = np.linspace(offset_width * step_width,
                             (offset_width + feature_map_width - 1) *
                             step_width, feature_map_width)
            cx_grid, cy_grid = np.meshgrid(cx, cy)

            cx_grid = np.expand_dims(
                cx_grid, -1
            )  # This is necessary for np.tile() to do what we want further down
            cy_grid = np.expand_dims(
                cy_grid, -1
            )  # This is necessary for np.tile() to do what we want further down

            # Create a 4D tensor template of shape (feature_map_height, feature_map_width, n_boxes, 4)
            # where the last dimension will contain (cx, cy, w, h)
            boxes_tensor = np.zeros(
                (feature_map_height, feature_map_width, self.n_boxes, 4))

            boxes_tensor[:, :, :, 0] = np.tile(cx_grid,
                                               (1, 1, self.n_boxes))  #Set cx
            boxes_tensor[:, :, :, 1] = np.tile(cy_grid,
                                               (1, 1, self.n_boxes))  #Set cy
            boxes_tensor[:, :, :, 2] = wh_list[:, 0]  #Set w
            boxes_tensor[:, :, :, 3] = wh_list[:, 1]  #Set h

            #Convert (cx,cy,w,h) to (xmin,ymin,xmax,ymax)

            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='centroids2corners')

            #if clip_boxes is enabled, clip the coordinates to lie within the image boundaries
            if self.clip_boxes:
                x_coords = boxes_tensor[:, :, :, [0, 2]]
                x_coords[x_coords >= self.img_width] = self.img_width - 1
                x_coords[x_coords < 0] = 0
                boxes_tensor[:, :, :, [0, 2]] = x_coords

                y_coords = boxes_tensor[:, :, :, [1, 3]]
                y_coords[y_coords >= self.img_height] = self.img_height - 1
                y_coords[y_coords < 0] = 0
                boxes_tensor[:, :, :, [1, 3]] = y_coords

            #if normalize_coords is enabled, normalize the coordinates to be within [0,1]
            if self.normalize_coords:
                boxes_tensor[:, :, :, [0, 2]] /= self.img_width
                boxes_tensor[:, :, :, [1, 3]] /= self.img_height

            # Implement box limiting directly for (cx, cy, w, h) so that we don't have to unnecessarily convert back and forth.
            if self.coords == 'centroids':
                # Convert `(xmin, ymin, xmax, ymax)` back to (cx, cy, w, h).
                boxes_tensor = convert_coordinates(
                    boxes_tensor,
                    start_index=0,
                    conversion='corners2centroids',
                    border_pixels='half')
            elif self.coords == 'minmax':
                # Convert `(xmin, ymin, xmax, ymax)` to (xmin, xmax, ymin, ymax).
                boxes_tensor = convert_coordinates(boxes_tensor,
                                                   start_index=0,
                                                   conversion='corners2minmax',
                                                   border_pixels='half')

            # Create a tensor to contain the variances and append it to boxes_tensor. This tensor has the same shape
            # as boxes_tensor and simply contains the same 4 variance values for every position in the last axis.

            variances_tensor = np.zeros_like(
                boxes_tensor
            )  # shape (feature_map_height, feature_map_width, n_boxes, 4)
            variances_tensor += self.variances

            # Now boxes_tensor becomes a tensor of shape (feature_map_height, feature_map_width, n_boxes, 8)
            boxes_tensor = np.concatenate((boxes_tensor, variances_tensor),
                                          axis=-1)

            # Now prepend one dimension to `boxes_tensor` to account for the batch size and tile it along
            # The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)`

            boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
            boxes_tensor = tf.Variable(initial_value=boxes_tensor,
                                       dtype=tf.float32)
            boxes_tensor = tf.reshape(
                boxes_tensor,
                [-1, feature_map_height, feature_map_width, self.n_boxes, 8])

            #STAGE 3 -END

            return boxes_tensor
Code Example #5
File: ssd_input_encoder.py  Project: Asteur/ssd-keras
    def __call__(self, ground_truth_labels, diagnostics=False):
        '''
        Converts ground truth bounding box data into a suitable format to train an SSD model.

        For each image in the batch, each ground truth bounding box belonging to that image will be compared against each
        anchor box in a template with respect to their jaccard similarity. If the jaccard similarity is greater than
        or equal to the set threshold, the boxes will be matched, meaning that the ground truth box coordinates and class
        will be written to the specific position of the matched anchor box in the template.

        The class for all anchor boxes for which there was no match with any ground truth box will be set to the
        background class, except for those anchor boxes whose IoU similarity with any ground truth box is higher than
        the set negative upper bound (see the `neg_iou_limit` argument in `__init__()`).

        Arguments:
            ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array
                for each batch image. Each such array has `k` rows for the `k` ground truth bounding boxes belonging
                to the respective image, and the data for each ground truth bounding box has the format
                `(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format), and `class_id` must be
                an integer greater than 0 for all boxes as class ID 0 is reserved for the background class.
            diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned,
                but also a copy of it with anchor box coordinates in place of the ground truth coordinates.
                This can be very useful if you want to visualize which anchor boxes got matched to which ground truth
                boxes.

        Returns:
            `y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the
            ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the
            model per image, and the classes are one-hot-encoded. The four elements after the class vectors in
            the last axis are the box coordinates, the next four elements after that are just dummy elements, and
            the last four elements are the variances.
        '''

        # 1: Generate the template for y_encoded
        y_encode_template = self.generate_encode_template(
            batch_size=len(ground_truth_labels), diagnostics=False)
        y_encoded = np.copy(
            y_encode_template
        )  # We'll write the ground truth box data to this array

        # 2: Match the boxes from `ground_truth_labels` to the anchor boxes in `y_encode_template`
        #    and for each matched box record the ground truth coordinates in `y_encoded`.
        #    Every time there is no match for an anchor box, record `class_id` 0 in `y_encoded` for that anchor box.

        class_vector = np.eye(
            self.n_classes
        )  # An identity matrix that we'll use as one-hot class vectors

        for i in range(y_encode_template.shape[0]):  # For each batch item...
            available_boxes = np.ones(
                (y_encode_template.shape[1])
            )  # 1 for all anchor boxes that are not yet matched to a ground truth box, 0 otherwise
            negative_boxes = np.ones(
                (y_encode_template.shape[1]
                 ))  # 1 for all negative boxes, 0 otherwise
            for true_box in ground_truth_labels[
                    i]:  # For each ground truth box belonging to the current batch item...
                true_box = true_box.astype(np.float)
                if abs(true_box[3] - true_box[1]) < 0.001 or abs(
                        true_box[4] - true_box[2]) < 0.001:
                    continue  # Protect ourselves against bad ground truth data: boxes with width or height equal to zero
                if self.normalize_coords:
                    true_box[[
                        1, 3
                    ]] /= self.img_width  # Normalize xmin and xmax to be within [0,1]
                    true_box[[
                        2, 4
                    ]] /= self.img_height  # Normalize ymin and ymax to be within [0,1]
                if self.coords == 'centroids':
                    true_box = convert_coordinates(
                        true_box,
                        start_index=1,
                        conversion='corners2centroids')
                elif self.coords == 'minmax':
                    true_box = convert_coordinates(true_box,
                                                   start_index=1,
                                                   conversion='corners2minmax')
                similarities = iou(
                    y_encode_template[i, :, -12:-8],
                    true_box[1:],
                    coords=self.coords
                )  # The iou similarities for all anchor boxes
                negative_boxes[
                    similarities >= self.
                    neg_iou_limit] = 0  # If a negative box gets an IoU match >= `self.neg_iou_limit`, it's no longer a valid negative box
                similarities *= available_boxes  # Filter out anchor boxes which aren't available anymore (i.e. already matched to a different ground truth box)
                available_and_thresh_met = np.copy(similarities)
                available_and_thresh_met[
                    available_and_thresh_met < self.
                    pos_iou_threshold] = 0  # Filter out anchor boxes which don't meet the iou threshold
                assign_indices = np.nonzero(
                    available_and_thresh_met
                )[0]  # Get the indices of the left-over anchor boxes to which we want to assign this ground truth box
                if len(assign_indices) > 0:  # If we have any matches
                    y_encoded[i, assign_indices, :-8] = np.concatenate(
                        (class_vector[int(true_box[0])], true_box[1:]), axis=0
                    )  # Write the ground truth box coordinates and class to all assigned anchor box positions. Remember that the last four elements of `y_encoded` are just dummy entries.
                    available_boxes[
                        assign_indices] = 0  # Make the assigned anchor boxes unavailable for the next ground truth box
                else:  # If we don't have any matches
                    best_match_index = np.argmax(
                        similarities
                    )  # Get the index of the best iou match out of all available boxes
                    y_encoded[i, best_match_index, :-8] = np.concatenate(
                        (class_vector[int(true_box[0])], true_box[1:]), axis=0
                    )  # Write the ground truth box coordinates and class to the best match anchor box position
                    available_boxes[
                        best_match_index] = 0  # Make the assigned anchor box unavailable for the next ground truth box
                    negative_boxes[
                        best_match_index] = 0  # The assigned anchor box is no longer a negative box
            # Set the classes of all remaining available anchor boxes to class zero
            background_class_indices = np.nonzero(negative_boxes)[0]
            y_encoded[i, background_class_indices, 0] = 1

        # 3: Convert absolute box coordinates to offsets from the anchor boxes and normalize them
        if self.coords == 'centroids':
            y_encoded[:, :, [-12, -11]] -= y_encode_template[:, :, [
                -12, -11
            ]]  # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
            y_encoded[:, :, [-12, -11]] /= y_encode_template[:, :, [
                -10, -9
            ]] * y_encode_template[:, :, [
                -4, -3
            ]]  # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
            y_encoded[:, :, [-10, -9]] /= y_encode_template[:, :, [
                -10, -9
            ]]  # w(gt) / w(anchor), h(gt) / h(anchor)
            y_encoded[:, :, [-10, -9]] = np.log(
                y_encoded[:, :, [-10, -9]]
            ) / y_encode_template[:, :, [
                -2, -1
            ]]  # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
        elif self.coords == 'corners':
            y_encoded[:, :, -12:
                      -8] -= y_encode_template[:, :, -12:
                                               -8]  # (gt - anchor) for all four coordinates
            y_encoded[:, :, [-12, -10]] /= np.expand_dims(
                y_encode_template[:, :, -10] - y_encode_template[:, :, -12],
                axis=-1
            )  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:, :, [-11, -9]] /= np.expand_dims(
                y_encode_template[:, :, -9] - y_encode_template[:, :, -11],
                axis=-1
            )  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:, :, -12:
                      -8] /= y_encode_template[:, :,
                                               -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
        elif self.coords == 'minmax':
            y_encoded[:, :, -12:
                      -8] -= y_encode_template[:, :, -12:
                                               -8]  # (gt - anchor) for all four coordinates
            y_encoded[:, :, [-12, -11]] /= np.expand_dims(
                y_encode_template[:, :, -11] - y_encode_template[:, :, -12],
                axis=-1
            )  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:, :, [-10, -9]] /= np.expand_dims(
                y_encode_template[:, :, -9] - y_encode_template[:, :, -10],
                axis=-1
            )  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:, :, -12:
                      -8] /= y_encode_template[:, :,
                                               -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively

        if diagnostics:
            # Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a ground truth box, but keeping the anchor box coordinates).
            y_matched_anchors = np.copy(y_encoded)
            y_matched_anchors[:, :, -12:
                              -8] = 0  # Keeping the anchor box coordinates means setting the offsets to zero.
            return y_encoded, y_matched_anchors
        else:
            return y_encoded
Code Example #6
    def __call__(self, ground_truth_labels):
        '''
        Converts the ground truth data into the format needed for training.
        Arguments: ground_truth_labels (list): (class_id, xmin, ymin, xmax, ymax)
        Returns: y_encoded, (batch_size, #boxes, #classes + 4 + 4 + 4)
        '''

        # 1. Index order of the ground truth labels
        class_id = 0
        xmin = 1
        ymin = 2
        xmax = 3
        ymax = 4

        batch_size = len(ground_truth_labels)

        # Generate the anchor box template of shape (batch_size, #boxes, #classes + 12)
        y_encoded = self.generate_encoding_template(batch_size=batch_size)

        # Match ground truth boxes to anchor boxes
        y_encoded[:, :, self.background_id] = 1  # All boxes are background by default.
        n_boxes = y_encoded.shape[1]
        class_vectors = np.eye(self.n_classes)  #  one-hot class vectors

        for i in range(batch_size):  # For each batch item...
            if ground_truth_labels[i].size == 0:
                continue  # If there is no ground truth for this batch item, there is nothing to match.
            labels = ground_truth_labels[i].astype(
                np.float)  # The labels for this batch item

            # Check for degenerate ground truth bounding boxes before attempting any computations.
            if np.any(labels[:, [xmax]] - labels[:, [xmin]] <= 0) or np.any(
                    labels[:, [ymax]] - labels[:, [ymin]] <= 0):
                raise DegenerateBoxError(
                    "SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, "
                    .format(i, labels) +
                    "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. Degenerate ground truth "
                    +
                    "bounding boxes will lead to NaN errors during the training."
                )

            # normalize
            if self.normalize_coords:
                labels[:, [ymin, ymax]] /= self.img_height
                labels[:, [xmin, xmax]] /= self.img_width

            # 可能需要转换坐标格式
            if self.coords == 'centroids':
                labels = convert_coordinates(labels,
                                             start_index=xmin,
                                             conversion='corners2centroids',
                                             border_pixels=self.border_pixels)
            elif self.coords == 'minmax':
                labels = convert_coordinates(labels,
                                             start_index=xmin,
                                             conversion='corners2minmax')

            classes_one_hot = class_vectors[labels[:, class_id].astype(
                np.int
            )]  # The one-hot class IDs for the ground truth boxes of this batch item
            labels_one_hot = np.concatenate(
                [classes_one_hot, labels[:, [xmin, ymin, xmax, ymax]]],
                axis=-1
            )  # The one-hot version of the labels for this batch item

            # Compute the IoU similarities, shape `(num_ground_truth_boxes, num_anchor_boxes)`.
            similarities = iou(labels[:, [xmin, ymin, xmax, ymax]],
                               y_encoded[i, :, -12:-8],
                               coords=self.coords,
                               mode='outer_product',
                               border_pixels=self.border_pixels)

            # 1. For each ground truth box, find the one default box with the highest IoU. This guarantees that every ground truth box is matched to at least one default box.
            bipartite_matches = match_bipartite_greedy(
                weight_matrix=similarities)
            # Write the ground truth data to the matched default boxes
            y_encoded[i, bipartite_matches, :-8] = labels_one_hot
            # Set the similarities of the matched default boxes to 0 to mark them as matched
            similarities[:, bipartite_matches] = 0

            # 2. Each remaining default box is matched to the ground truth box with which it has the highest IoU, provided that IoU exceeds the threshold pos_iou_threshold.

            if self.matching_type == 'multi':
                matches = match_multi(weight_matrix=similarities,
                                      threshold=self.pos_iou_threshold)
                y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]
                similarities[:, matches[1]] = 0

            # Finally: Any remaining box whose IoU with some ground truth box is at least neg_iou_limit is set to neutral, because it is too close to a ground truth box to be used as a background example during training.
            max_background_similarities = np.amax(similarities, axis=0)
            neutral_boxes = np.nonzero(
                max_background_similarities >= self.neg_iou_limit)[0]
            y_encoded[i, neutral_boxes, self.background_id] = 0

        # 2. Convert the box coordinates into offsets from the anchor boxes
        if self.coords == 'centroids':
            y_encoded[:, :, [-12, -11]] -= y_encoded[:, :, [
                -8, -7
            ]]  # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
            y_encoded[:, :, [
                -12, -11
            ]] /= y_encoded[:, :, [-6, -5]] * y_encoded[:, :, [
                -4, -3
            ]]  # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
            y_encoded[:, :, [-10, -9]] /= y_encoded[:, :, [
                -6, -5
            ]]  # w(gt) / w(anchor), h(gt) / h(anchor)
            y_encoded[:, :, [-10, -9]] = np.log(
                y_encoded[:, :, [-10, -9]]
            ) / y_encoded[:, :, [
                -2, -1
            ]]  # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
        elif self.coords == 'corners':
            y_encoded[:, :, -12:
                      -8] -= y_encoded[:, :, -8:
                                       -4]  # (gt - anchor) for all four coordinates
            y_encoded[:, :, [-12, -10]] /= np.expand_dims(
                y_encoded[:, :, -6] - y_encoded[:, :, -8], axis=-1
            )  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:, :, [-11, -9]] /= np.expand_dims(
                y_encoded[:, :, -5] - y_encoded[:, :, -7], axis=-1
            )  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:, :, -12:
                      -8] /= y_encoded[:, :,
                                       -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
        elif self.coords == 'minmax':
            y_encoded[:, :, -12:
                      -8] -= y_encoded[:, :, -8:
                                       -4]  # (gt - anchor) for all four coordinates
            y_encoded[:, :, [-12, -11]] /= np.expand_dims(
                y_encoded[:, :, -7] - y_encoded[:, :, -8], axis=-1
            )  # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:, :, [-10, -9]] /= np.expand_dims(
                y_encoded[:, :, -5] - y_encoded[:, :, -6], axis=-1
            )  # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:, :, -12:
                      -8] /= y_encoded[:, :,
                                       -4:]  # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
        return y_encoded
Code Example #7
File: ssd_output_decoder.py  Project: jbutle55/ssd
def decode_detections_debug(y_pred,
                            confidence_thresh=0.01,
                            iou_threshold=0.45,
                            top_k=200,
                            input_coords='centroids',
                            normalize_coords=True,
                            img_height=None,
                            img_width=None,
                            variance_encoded_in_target=False,
                            border_pixels='half'):
    '''
    This decoder performs the same processing as `decode_detections()`, but the output format for each left-over
    predicted box is `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.

    That is, in addition to the usual data, each predicted box has the internal index of that box within
    the model (`box_id`) prepended to it. This allows you to know exactly which part of the model made a given
    box prediction; in particular, it allows you to know which predictor layer made a given prediction.
    This can be useful for debugging.

    Arguments:
        y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
            of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
            boxes predicted by the model per image and the last axis contains
            `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
            positive class in order to be considered for the non-maximum suppression stage for the respective class.
            A lower value will result in a larger part of the selection process being done by the non-maximum suppression
            stage, while a larger value will result in a larger part of the selection process happening in the confidence
            thresholding stage.
        iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
            with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
            to the box score.
        top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
            non-maximum suppression stage.
        input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
            for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
            `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
            and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
            relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
            Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
            coordinates. Requires `img_height` and `img_width` if set to `True`.
        img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
        img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
            Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
            to the boxes. If 'exclude', the border pixels do not belong to the boxes.
            If 'half', then one of each of the two horizontal and vertical borders belong
            to the boxes, but not the other.

    Returns:
        A python list of length `batch_size` where each list element represents the predicted boxes
        for one image and contains a Numpy array of shape `(boxes, 7)` where each row is a box prediction for
        a non-background class for the respective image in the format `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.
    '''
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))

    # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates

    y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`

    if input_coords == 'centroids':
        if variance_encoded_in_target:
            # Decode the predicted box center x and y coordinates.
            y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] + y_pred[:,:,[-8,-7]]
            # Decode the predicted box width and height.
            y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
        else:
            # Decode the predicted box center x and y coordinates.
            y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] * y_pred[:,:,[-4,-3]] + y_pred[:,:,[-8,-7]]
            # Decode the predicted box width and height.
            y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
    elif input_coords == 'corners':
        y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
    else:
        raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")

    # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that

    if normalize_coords:
        y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
        y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates

    # 3: For each batch item, prepend each box's internal index to its coordinates.

    y_pred_decoded_raw2 = np.zeros((y_pred_decoded_raw.shape[0], y_pred_decoded_raw.shape[1], y_pred_decoded_raw.shape[2] + 1)) # Expand the last axis by one.
    y_pred_decoded_raw2[:,:,1:] = y_pred_decoded_raw
    y_pred_decoded_raw2[:,:,0] = np.arange(y_pred_decoded_raw.shape[1]) # Put the box indices as the first element for each box via broadcasting.
    y_pred_decoded_raw = y_pred_decoded_raw2

    # 4: Apply confidence thresholding and non-maximum suppression per class

    n_classes = y_pred_decoded_raw.shape[-1] - 5 # The number of classes is the length of the last axis minus the four box coordinates and minus the index

    y_pred_decoded = [] # Store the final predictions in this list
    for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`
        pred = [] # Store the final predictions for this batch item here
        for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
            single_class = batch_item[:,[0, class_id + 1, -4, -3, -2, -1]] # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 6]` and...
            threshold_met = single_class[single_class[:,1] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
            if threshold_met.shape[0] > 0: # If any boxes made the threshold...
                maxima = _greedy_nms_debug(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
                maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 7]`
                maxima_output[:,0] = maxima[:,0] # Write the box index to the first column...
                maxima_output[:,1] = class_id # ...and write the class ID to the second column...
                maxima_output[:,2:] = maxima[:,1:] # ...and write the rest of the maxima data to the other columns...
                pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
        # Once we're through with all classes, keep only the `top_k` maxima with the highest scores
        pred = np.concatenate(pred, axis=0)
        if pred.shape[0] > top_k: # If we have more than `top_k` results left at this point, otherwise there is nothing to filter,...
            top_k_indices = np.argpartition(pred[:,2], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
            pred = pred[top_k_indices] # ...and keep only those entries of `pred`...
        y_pred_decoded.append(pred) # ...and now that we're done, append the array of final predictions for this batch item to the output list

    return y_pred_decoded
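
The decoders in these examples call helpers such as `_greedy_nms` and `_greedy_nms_debug`, which are not shown in this listing. As a rough, self-contained sketch of what such a helper does (the function name, the `[confidence, xmin, ymin, xmax, ymax]` row layout, and the simplified IoU that ignores the `border_pixels` convention are assumptions for illustration only, not the repository's actual implementation):

import numpy as np

def greedy_nms_sketch(boxes, iou_threshold=0.45):
    '''Minimal greedy NMS sketch. `boxes` is assumed to have shape (n, 5) with
    rows [confidence, xmin, ymin, xmax, ymax].'''
    boxes = np.copy(boxes)
    maxima = []
    while boxes.shape[0] > 0:
        i = np.argmax(boxes[:, 0])           # Pick the highest-confidence box...
        best = np.copy(boxes[i])
        maxima.append(best)
        boxes = np.delete(boxes, i, axis=0)  # ...remove it from the candidates...
        if boxes.shape[0] == 0:
            break
        # ...and drop every remaining box that overlaps it too strongly.
        xmin = np.maximum(boxes[:, 1], best[1])
        ymin = np.maximum(boxes[:, 2], best[2])
        xmax = np.minimum(boxes[:, 3], best[3])
        ymax = np.minimum(boxes[:, 4], best[4])
        intersection = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)
        area_best = (best[3] - best[1]) * (best[4] - best[2])
        area_rest = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 4] - boxes[:, 2])
        iou = intersection / (area_best + area_rest - intersection)
        boxes = boxes[iou <= iou_threshold]
    return np.array(maxima)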
コード例 #8
0
ファイル: ssd_output_decoder.py プロジェクト: jbutle55/ssd
def decode_detections(y_pred,
                      confidence_thresh=0.01,
                      iou_threshold=0.45,
                      top_k=200,
                      input_coords='centroids',
                      normalize_coords=True,
                      img_height=None,
                      img_width=None,
                      border_pixels='half'):
    '''
    Convert model prediction output back to a format that contains only the positive box predictions
    (i.e. the same format that `SSDInputEncoder` takes as input).

    After the decoding, two stages of prediction filtering are performed for each class individually:
    First confidence thresholding, then greedy non-maximum suppression. The filtering results for all
    classes are concatenated and the `top_k` overall highest confidence results constitute the final
    predictions for a given batch item. This procedure follows the original Caffe implementation.
    For a slightly different and more efficient alternative to decode raw model output that performs
    non-maximum suppression globally instead of per class, see `decode_detections_fast()` below.

    Arguments:
        y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
            of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
            boxes predicted by the model per image and the last axis contains
            `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
            positive class in order to be considered for the non-maximum suppression stage for the respective class.
            A lower value will result in a larger part of the selection process being done by the non-maximum suppression
            stage, while a larger value will result in a larger part of the selection process happening in the confidence
            thresholding stage.
        iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
            with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
            to the box score.
        top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
            non-maximum suppression stage.
        input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
            for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
            `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
            and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
            relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
            Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
            coordinates. Requires `img_height` and `img_width` if set to `True`.
        img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
        img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
            Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
            to the boxes. If 'exclude', the border pixels do not belong to the boxes.
            If 'half', then one of each of the two horizontal and vertical borders belong
            to the boxes, but not the other.

    Returns:
        A python list of length `batch_size` where each list element represents the predicted boxes
        for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
        a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
    '''
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))

    # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates

    y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`

    if input_coords == 'centroids':
        y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
        y_pred_decoded_raw[:,:,[-2,-1]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
        y_pred_decoded_raw[:,:,[-4,-3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
        y_pred_decoded_raw[:,:,[-4,-3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
    elif input_coords == 'corners':
        y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
    else:
        raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")

    # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that

    if normalize_coords:
        y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
        y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates

    # 3: Apply confidence thresholding and non-maximum suppression per class

    n_classes = y_pred_decoded_raw.shape[-1] - 4 # The number of classes is the length of the last axis minus the four box coordinates

    print('Number of classes: ', n_classes)

    y_pred_decoded = [] # Store the final predictions in this list
    print('Final pred decoded: ', y_pred_decoded)
    for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`
        pred = [] # Store the final predictions for this batch item here
        for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
            single_class = batch_item[:,[class_id, -4, -3, -2, -1]] # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 5]` and...
            print('Single Class - {}: {}'.format(class_id, single_class[:,0]))
            threshold_met = single_class[single_class[:,0] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
            if threshold_met.shape[0] > 0: # If any boxes made the threshold...
                maxima = _greedy_nms(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
                maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 6]`
                maxima_output[:,0] = class_id # Write the class ID to the first column...
                maxima_output[:,1:] = maxima # ...and write the maxima to the other columns...
                pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
        # Once we're through with all classes, keep only the `top_k` maxima with the highest scores
        if pred: # If there are any predictions left after confidence-thresholding...
            pred = np.concatenate(pred, axis=0)
            if top_k != 'all' and pred.shape[0] > top_k: # If we have more than `top_k` results left at this point, otherwise there is nothing to filter,...
                top_k_indices = np.argpartition(pred[:,1], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
                pred = pred[top_k_indices] # ...and keep only those entries of `pred`...
        else:
            pred = np.array(pred) # Even if empty, `pred` must become a Numpy array.
        y_pred_decoded.append(pred) # ...and now that we're done, append the array of final predictions for this batch item to the output list

    return y_pred_decoded
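
As a usage sketch for the decoder above (the `model`, `batch_images`, and 300x300 image size are placeholders, not objects defined in these examples):

# Hypothetical usage sketch for decode_detections().
y_pred = model.predict(batch_images)  # shape (batch_size, #boxes, #classes + 4 + 4 + 4)

y_pred_decoded = decode_detections(y_pred,
                                   confidence_thresh=0.5,
                                   iou_threshold=0.45,
                                   top_k=200,
                                   input_coords='centroids',
                                   normalize_coords=True,
                                   img_height=300,
                                   img_width=300)

# One array of shape (n_kept_boxes, 6) per image, each row being
# [class_id, confidence, xmin, ymin, xmax, ymax].
for boxes in y_pred_decoded:
    for class_id, confidence, xmin, ymin, xmax, ymax in boxes:
        print(int(class_id), round(float(confidence), 2), xmin, ymin, xmax, ymax)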
コード例 #9
0
def decode_detections(y_pred,
                      confidence_thresh=0.01,
                      iou_threshold=0.45,
                      top_k=200,
                      input_coords='centroids',
                      normalize_coords=True,
                      img_height=None,
                      img_width=None,
                      border_pixels='half'):
    '''
    Convert the model's prediction output into a format that contains only the positive box predictions
    (i.e. the same format that `SSDInputEncoder` takes as input).

    Filtering happens in three stages:
    1. confidence thresholding,
    2. greedy non-maximum suppression,
    3. keeping only the `top_k` overall highest-confidence results.

    Arguments:
        y_pred (array): Array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the number
            of boxes and `#classes` is the number of classes. The last axis contains
            `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
        confidence_thresh (float, optional): A float in [0,1); boxes with a confidence below this value are filtered out.
        iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity greater than
            `iou_threshold` with a locally maximal box are removed from the set of predictions for a given class,
            where 'maximal' refers to the box score.
        top_k (int, optional): The number of highest-scoring predictions to keep for each batch item after the
            non-maximum suppression stage.
        input_coords (str, optional): The coordinate format that the model outputs: 'centroids' for `(cx, cy, w, h)`,
            'minmax' for `(xmin, xmax, ymin, ymax)`, or 'corners' for `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates and you want to
            convert them back to absolute coordinates.
        img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
        img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes: 'include', 'exclude', or 'half'.
    Returns:
        A list of length `batch_size` where each element contains the predicted boxes for one image as a Numpy array
        of shape `(boxes, 6)` with each row in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
    '''
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))

    # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates

    y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`

    if input_coords == 'centroids':
        y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
        y_pred_decoded_raw[:,:,[-2,-1]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
        y_pred_decoded_raw[:,:,[-4,-3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
        y_pred_decoded_raw[:,:,[-4,-3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
    elif input_coords == 'corners':
        y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
    else:
        raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")

    # 2: normalized box coordinates -> absolute coordinates

    if normalize_coords:
        y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
        y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates

    # 3: Apply confidence thresholding and non-maximum suppression per class
    n_classes = y_pred_decoded_raw.shape[-1] - 4 # The number of classes is the length of the last axis minus the four box coordinates
    y_pred_decoded = [] # Store the final predictions in this list
    for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`
        pred = [] # Store the final predictions for this batch item here
        for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
            single_class = batch_item[:,[class_id, -4, -3, -2, -1]] # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 5]` and...
            threshold_met = single_class[single_class[:,0] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
            if threshold_met.shape[0] > 0: # If any boxes made the threshold...
                maxima = _greedy_nms(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
                maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 6]`
                maxima_output[:,0] = class_id # Write the class ID to the first column...
                maxima_output[:,1:] = maxima # ...and write the maxima to the other columns...
                pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
        # Once we're through with all classes, keep only the `top_k` maxima with the highest scores
        if pred: # If any predictions are left after confidence thresholding...
            pred = np.concatenate(pred, axis=0)
            if top_k != 'all' and pred.shape[0] > top_k: # ...and more than `top_k` results are left at this point, keep only the best ones.
                top_k_indices = np.argpartition(pred[:,1], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
                pred = pred[top_k_indices] # ...and keep only those entries of `pred`...
        else:
            pred = np.array(pred) # Even if empty, `pred` must become a Numpy array.
        y_pred_decoded.append(pred) # Now that we're done, append the array of final predictions for this batch item to the output list

    return y_pred_decoded
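
To see the 'centroids' decoding arithmetic on a single box, here is a small standalone sketch with made-up numbers that follows the same convention as the branch above (offsets were divided by the variances during encoding, so decoding multiplies them back):

import numpy as np

# Assumed example values for one anchor box and its raw predicted offsets.
cx_a, cy_a, w_a, h_a = 150.0, 150.0, 60.0, 60.0  # anchor box in centroids format
variances = np.array([0.1, 0.1, 0.2, 0.2])       # cx, cy, w, h variances
offsets = np.array([0.5, -0.5, 0.2, -0.2])       # raw model output for this box

# Decode exactly as in the 'centroids' branch:
cx = offsets[0] * variances[0] * w_a + cx_a      # 0.5 * 0.1 * 60 + 150 = 153.0
cy = offsets[1] * variances[1] * h_a + cy_a      # -0.5 * 0.1 * 60 + 150 = 147.0
w = np.exp(offsets[2] * variances[2]) * w_a      # exp(0.04) * 60 ≈ 62.4
h = np.exp(offsets[3] * variances[3]) * h_a      # exp(-0.04) * 60 ≈ 57.6

# Convert centroids to corners:
xmin, ymin = cx - w / 2, cy - h / 2
xmax, ymax = cx + w / 2, cy + h / 2
print(round(xmin, 1), round(ymin, 1), round(xmax, 1), round(ymax, 1))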
コード例 #10
0
    def call(self, x, mask=None):

        # Compute the box width and height for each aspect ratio
        size = min(self.img_height, self.img_width)
        wh_list = []
        for ar in self.aspect_ratios:
            if (ar == 1):
                box_height = box_width = self.this_scale * size
                wh_list.append((box_width, box_height))
                if self.two_boxes_for_ar1:
                    # Compute a slightly larger scale using the geometric mean of this scale and the next.
                    box_height = box_width = np.sqrt(
                        self.this_scale * self.next_scale) * size
                    wh_list.append((box_width, box_height))
            else:
                box_height = self.this_scale * size / np.sqrt(ar)
                box_width = self.this_scale * size * np.sqrt(ar)
                wh_list.append((box_width, box_height))
        wh_list = np.array(wh_list)

        if K.image_dim_ordering() == 'tf':
            batch_size, feature_map_height, feature_map_width, feature_map_channels = x._keras_shape
        else:
            batch_size, feature_map_channels, feature_map_height, feature_map_width = x._keras_shape

        # Compute the grid of box center points. They are identical for all aspect ratios.

        # Compute the step sizes
        if (self.this_steps is None):
            step_height = self.img_height / feature_map_height
            step_width = self.img_width / feature_map_width
        else:
            if isinstance(self.this_steps,
                          (list, tuple)) and (len(self.this_steps) == 2):
                step_height = self.this_steps[0]
                step_width = self.this_steps[1]
            elif isinstance(self.this_steps, (int, float)):
                step_height = self.this_steps
                step_width = self.this_steps
        # Compute the offsets
        if (self.this_offsets is None):
            offset_height = 0.5
            offset_width = 0.5
        else:
            if isinstance(self.this_offsets,
                          (list, tuple)) and (len(self.this_offsets) == 2):
                offset_height = self.this_offsets[0]
                offset_width = self.this_offsets[1]
            elif isinstance(self.this_offsets, (int, float)):
                offset_height = self.this_offsets
                offset_width = self.this_offsets
        # Compute the center point coordinates
        cy = np.linspace(offset_height * step_height,
                         (offset_height + feature_map_height - 1) *
                         step_height, feature_map_height)
        cx = np.linspace(offset_width * step_width,
                         (offset_width + feature_map_width - 1) * step_width,
                         feature_map_width)
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        cx_grid = np.expand_dims(cx_grid, -1)
        cy_grid = np.expand_dims(cy_grid, -1)

        # Build the output tensor: (feature_map_height, feature_map_width, n_boxes, 4)

        boxes_tensor = np.zeros(
            (feature_map_height, feature_map_width, self.n_boxes, 4))

        # The last dimension contains (cx, cy, w, h)
        boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, self.n_boxes))  # cx
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, self.n_boxes))  # cy
        boxes_tensor[:, :, :, 2] = wh_list[:, 0]  # w
        boxes_tensor[:, :, :, 3] = wh_list[:, 1]  # h

        # Convert coordinates: (cx, cy, w, h) -> (xmin, ymin, xmax, ymax)
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='centroids2corners')

        # Clip the coordinates to lie within the image boundaries
        if self.clip_boxes:
            x_coords = boxes_tensor[:, :, :, [0, 2]]
            x_coords[x_coords >= self.img_width] = self.img_width - 1
            x_coords[x_coords < 0] = 0
            boxes_tensor[:, :, :, [0, 2]] = x_coords
            y_coords = boxes_tensor[:, :, :, [1, 3]]
            y_coords[y_coords >= self.img_height] = self.img_height - 1
            y_coords[y_coords < 0] = 0
            boxes_tensor[:, :, :, [1, 3]] = y_coords

        # Normalize the coordinates to [0, 1]
        if self.normalize_coords:
            boxes_tensor[:, :, :, [0, 2]] /= self.img_width
            boxes_tensor[:, :, :, [1, 3]] /= self.img_height

        # Convert the coordinate format
        if self.coords == 'centroids':
            # Convert back from (xmin, ymin, xmax, ymax) to (cx, cy, w, h)
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2centroids',
                                               border_pixels='half')
        elif self.coords == 'minmax':
            # Convert (xmin, ymin, xmax, ymax) to (xmin, xmax, ymin, ymax)
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2minmax',
                                               border_pixels='half')

        # Create a variances tensor of the same shape as boxes_tensor,
        # i.e. (feature_map_height, feature_map_width, n_boxes, 4)
        variances_tensor = np.zeros_like(boxes_tensor)
        variances_tensor += self.variances

        # After concatenating the variances below, boxes_tensor has shape (feature_map_height, feature_map_width, n_boxes, 8)
        boxes_tensor = np.concatenate((boxes_tensor, variances_tensor),
                                      axis=-1)

        # Result: (batch_size, feature_map_height, feature_map_width, n_boxes, 8)
        boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
        boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'),
                              (K.shape(x)[0], 1, 1, 1, 1))

        return boxes_tensor
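
To make the center-point arithmetic in the layer above concrete, here is a small standalone sketch (the 4x4 feature map, the 300x300 image size, and the 0.5 offsets are made-up example values):

import numpy as np

img_height = img_width = 300
feature_map_height = feature_map_width = 4

step_height = img_height / feature_map_height  # 75.0 pixels between box centers vertically
step_width = img_width / feature_map_width     # 75.0 pixels between box centers horizontally
offset_height = offset_width = 0.5             # centers sit half a step into each cell

cy = np.linspace(offset_height * step_height,
                 (offset_height + feature_map_height - 1) * step_height,
                 feature_map_height)            # [ 37.5 112.5 187.5 262.5]
cx = np.linspace(offset_width * step_width,
                 (offset_width + feature_map_width - 1) * step_width,
                 feature_map_width)             # [ 37.5 112.5 187.5 262.5]

cx_grid, cy_grid = np.meshgrid(cx, cy)
print(cx_grid.shape, cy_grid.shape)             # (4, 4) (4, 4): one (cx, cy) pair per cell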
コード例 #11
0
    def generate_anchor_boxes_for_layer(self,
                                        feature_map_size,
                                        aspect_ratios,
                                        this_scale,
                                        next_scale,
                                        this_steps=None,
                                        this_offsets=None,
                                        diagnostics=False):
        '''
        Computes an array of the spatial positions and sizes of the anchor boxes for one
        predictor layer of size `feature_map_size == [feature_map_height, feature_map_width]`.

        Arguments:
            * feature_map_size (tuple):
                - A list or tuple `[feature_map_height, feature_map_width]` with the spatial
                  dimensions of the feature map for which to generate the anchor boxes.
            * aspect_ratios (list):
                - A list of floats containing the aspect ratios for which the anchor boxes
                  are to be generated. All elements must be unique.
            * this_scale (float):
                - A float in [0, 1], the current scaling factor for generating the anchor boxes.
            * next_scale (float):
                - A float in [0, 1], the next larger scaling factor. Only relevant for the
                  case `self.two_boxes_for_ar1 == True`.
            * diagnostics (bool, optional): If True, the following additional outputs are returned:
                1) A list of the center point `x` and `y` coordinates for each spatial location.
                2) A list containing `(width, height)` for each box aspect ratio.
                3) A tuple containing `(step_height, step_width)`.
                4) A tuple containing `(offset_height, offset_width)`.
                This information can be useful to understand in just a few numbers what the
                generated grid of anchor boxes actually looks like, i.e. how large the
                different boxes are and how dense their spatial distribution is, in order to
                determine whether the box grid covers the objects in the input images
                appropriately.

        Returns:
            A 4D Numpy tensor of shape
            `(feature_map_height, feature_map_width, n_boxes_per_cell, 4)` where the last
            dimension contains xmin, xmax, ymin, ymax for each anchor box in each cell of
            the feature map.
        '''
        # Compute the box width and height for each aspect ratio

        # The shorter side of the image will be used to compute w and h
        # from the scale and the aspect ratios

        # Get the shorter side of the input image
        size = min(self.img_height, self.img_width)

        # Compute the box widths and heights for all aspect ratios
        wh_list = []

        # Iterate over the aspect ratios in the list
        for ar in aspect_ratios:

            # If the aspect ratio is 1...
            if (ar == 1):
                # ...compute the regular anchor box by multiplying the scale of the
                # current predictor layer with `size` (the shorter of the input
                # image's width and height)
                box_height = box_width = this_scale * size
                # Append the box width and height to wh_list
                wh_list.append((box_width, box_height))

                # If two anchor boxes are to be generated for aspect ratio 1...
                if self.two_boxes_for_ar1:
                    # ...compute a slightly larger version using the geometric mean
                    # of this predictor layer's scale and the next predictor layer's
                    # scale, multiplied by the shorter side of the input image
                    box_height = box_width = np.sqrt(
                        this_scale * next_scale) * size
                    # Append the box width and height to wh_list
                    wh_list.append((box_width, box_height))
            else:
                # For aspect ratios other than 1, compute the box width and height
                # in the usual way
                box_width = this_scale * size * np.sqrt(ar)
                box_height = this_scale * size / np.sqrt(ar)
                wh_list.append((box_width, box_height))

        # Convert wh_list to a Numpy array
        wh_list = np.array(wh_list)
        # The number of boxes per cell
        n_boxes = len(wh_list)

        # Compute the grid of box center points. They are identical for all
        # aspect ratios.

        # Compute the step sizes, i.e. how far apart the anchor box center
        # points will be vertically and horizontally
        if (this_steps is None):
            # Vertical step
            step_height = self.img_height / feature_map_size[0]
            # Horizontal step
            step_width = self.img_width / feature_map_size[1]
        else:
            # If this_steps is a list or tuple of length 2...
            if isinstance(this_steps,
                          (list, tuple)) and (len(this_steps) == 2):
                # ...the vertical step is this_steps[0]...
                step_height = this_steps[0]
                # ...and the horizontal step is this_steps[1]
                step_width = this_steps[1]
            elif isinstance(this_steps, (int, float)):
                step_height = this_steps
                step_width = this_steps

        # Compute the offsets, i.e. at what pixel values from the top and from
        # the left of the image the center point of the first anchor box will be

        # If this_offsets is None, use the default offset of half a step
        if (this_offsets is None):
            offset_height = 0.5
            offset_width = 0.5
        else:
            if isinstance(this_offsets,
                          (list, tuple)) and (len(this_offsets) == 2):
                offset_height = this_offsets[0]
                offset_width = this_offsets[1]
            elif isinstance(this_offsets, (int, float)):
                offset_height = this_offsets
                offset_width = this_offsets

        # Now that we have the offsets and step sizes, compute the grid of
        # anchor box center point coordinates
        cy = np.linspace(offset_height * step_height,
                         (offset_height + feature_map_size[0] - 1) *
                         step_height, feature_map_size[0])
        cx = np.linspace(offset_width * step_width,
                         (offset_width + feature_map_size[1] - 1) * step_width,
                         feature_map_size[1])
        # e.g. cx=5, cy=6 => np.meshgrid(cx, cy): [array([[5]]), array([[6]])]
        cx_grid, cy_grid = np.meshgrid(cx, cy)

        # This is necessary for np.tile() further down to do what we want
        cx_grid = np.expand_dims(cx_grid, -1)
        # This is necessary for np.tile() further down to do what we want
        cy_grid = np.expand_dims(cy_grid, -1)

        # Create a 4D template tensor of shape
        # `(feature_map_height, feature_map_width, n_boxes, 4)` where the last
        # dimension will contain `(cx, cy, w, h)`
        boxes_tensor = np.zeros(
            (feature_map_size[0], feature_map_size[1], n_boxes, 4))

        boxes_tensor[:, :, :, 0] = np.tile(cx_grid,
                                           (1, 1, n_boxes))  # Set cx
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid,
                                           (1, 1, n_boxes))  # Set cy
        boxes_tensor[:, :, :, 2] = wh_list[:, 0]  # Set w
        boxes_tensor[:, :, :, 3] = wh_list[:, 1]  # Set h

        # Convert from the format `(cx, cy, w, h)` to the format `(xmin, ymin, xmax, ymax)`
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='centroids2corners')

        # If clip_boxes is enabled, clip the coordinates so that the boxes lie within the image boundaries
        if self.clip_boxes:
            x_coords = boxes_tensor[:, :, :, [0, 2]]
            x_coords[x_coords >= self.img_width] = self.img_width - 1
            x_coords[x_coords < 0] = 0
            boxes_tensor[:, :, :, [0, 2]] = x_coords
            y_coords = boxes_tensor[:, :, :, [1, 3]]
            y_coords[y_coords >= self.img_height] = self.img_height - 1
            y_coords[y_coords < 0] = 0
            boxes_tensor[:, :, :, [1, 3]] = y_coords

        # If normalize_coords is enabled, normalize the box coordinates to be within [0, 1]
        if self.normalize_coords:
            boxes_tensor[:, :, :, [0, 2]] /= self.img_width
            boxes_tensor[:, :, :, [1, 3]] /= self.img_height

        # TODO: Implement box limiting directly for (cx, cy, w, h) so that we
        # don't have to convert back and forth unnecessarily
        if self.coords == 'centroids':
            # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2centroids',
                                               border_pixels='half')
        elif self.coords == 'minmax':
            # Convert the format `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2minmax',
                                               border_pixels='half')

        if diagnostics:
            return boxes_tensor, (cy,
                                  cx), wh_list, (step_height,
                                                 step_width), (offset_height,
                                                               offset_width)
        else:
            return boxes_tensor
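
For a feel of the box sizes this produces, here is a short standalone sketch of the width/height computation for one predictor layer; the image size, scales, and aspect ratios are made-up example values:

import numpy as np

img_height, img_width = 300, 300
this_scale, next_scale = 0.2, 0.37
aspect_ratios = [0.5, 1.0, 2.0]
two_boxes_for_ar1 = True

size = min(img_height, img_width)  # 300
wh_list = []
for ar in aspect_ratios:
    if ar == 1:
        wh_list.append((this_scale * size, this_scale * size))  # (60.0, 60.0)
        if two_boxes_for_ar1:
            s = np.sqrt(this_scale * next_scale) * size         # about 81.6
            wh_list.append((s, s))
    else:
        wh_list.append((this_scale * size * np.sqrt(ar),        # width
                        this_scale * size / np.sqrt(ar)))       # height

print(np.round(np.array(wh_list), 1))
# [[ 42.4  84.9]   ar = 0.5
#  [ 60.   60. ]   ar = 1
#  [ 81.6  81.6]   ar = 1, geometric-mean scale
#  [ 84.9  42.4]]  ar = 2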
コード例 #12
0
def decode_detections_debug(y_pred,
                            confidence_thresh=0.01,
                            iou_threshold=0.45,
                            top_k=200,
                            input_coords='centroids',
                            normalize_coords=True,
                            img_height=None,
                            img_width=None,
                            variance_encoded_in_target=False,
                            border_pixels='half'):

    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError(
            "If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`"
            .format(img_height, img_width))

    # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates

    y_pred_decoded_raw = np.copy(
        y_pred[:, :, :-8]
    )  # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`

    if input_coords == 'centroids':
        if variance_encoded_in_target:
            # Decode the predicted box center x and y coordinates.
            y_pred_decoded_raw[:, :, [-4, -3]] = y_pred_decoded_raw[:, :, [
                -4, -3
            ]] * y_pred[:, :, [-6, -5]] + y_pred[:, :, [-8, -7]]
            # Decode the predicted box width and height.
            y_pred_decoded_raw[:, :, [-2, -1]] = np.exp(
                y_pred_decoded_raw[:, :, [-2, -1]]) * y_pred[:, :, [-6, -5]]
        else:
            # Decode the predicted box center x and y coordinates.
            y_pred_decoded_raw[:, :, [-4, -3]] = y_pred_decoded_raw[:, :, [
                -4, -3
            ]] * y_pred[:, :, [-6, -5]] * y_pred[:, :,
                                                 [-4, -3]] + y_pred[:, :,
                                                                    [-8, -7]]
            # Decode the predicted box width and height.
            y_pred_decoded_raw[:, :, [-2, -1]] = np.exp(
                y_pred_decoded_raw[:, :, [-2, -1]] *
                y_pred[:, :, [-2, -1]]) * y_pred[:, :, [-6, -5]]
        y_pred_decoded_raw = convert_coordinates(
            y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        y_pred_decoded_raw[:, :,
                           -4:] *= y_pred[:, :,
                                          -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:, :, [-4, -3]] *= np.expand_dims(
            y_pred[:, :, -7] - y_pred[:, :, -8], axis=-1
        )  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:, :, [-2, -1]] *= np.expand_dims(
            y_pred[:, :, -5] - y_pred[:, :, -6], axis=-1
        )  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:, :,
                           -4:] += y_pred[:, :, -8:
                                          -4]  # delta(pred) + anchor == pred for all four coordinates
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw,
                                                 start_index=-4,
                                                 conversion='minmax2corners')
    elif input_coords == 'corners':
        y_pred_decoded_raw[:, :,
                           -4:] *= y_pred[:, :,
                                          -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:, :, [-4, -2]] *= np.expand_dims(
            y_pred[:, :, -6] - y_pred[:, :, -8], axis=-1
        )  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:, :, [-3, -1]] *= np.expand_dims(
            y_pred[:, :, -5] - y_pred[:, :, -7], axis=-1
        )  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:, :,
                           -4:] += y_pred[:, :, -8:
                                          -4]  # delta(pred) + anchor == pred for all four coordinates
    else:
        raise ValueError(
            "Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'."
        )

    # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that

    if normalize_coords:
        y_pred_decoded_raw[:, :, [
            -4, -2
        ]] *= img_width  # Convert xmin, xmax back to absolute coordinates
        y_pred_decoded_raw[:, :, [
            -3, -1
        ]] *= img_height  # Convert ymin, ymax back to absolute coordinates

    # 3: For each batch item, prepend each box's internal index to its coordinates.

    y_pred_decoded_raw2 = np.zeros(
        (y_pred_decoded_raw.shape[0], y_pred_decoded_raw.shape[1],
         y_pred_decoded_raw.shape[2] + 1))  # Expand the last axis by one.
    y_pred_decoded_raw2[:, :, 1:] = y_pred_decoded_raw
    y_pred_decoded_raw2[:, :, 0] = np.arange(
        y_pred_decoded_raw.shape[1]
    )  # Put the box indices as the first element for each box via broadcasting.
    y_pred_decoded_raw = y_pred_decoded_raw2

    # 4: Apply confidence thresholding and non-maximum suppression per class

    n_classes = y_pred_decoded_raw.shape[
        -1] - 5  # The number of classes is the length of the last axis minus the four box coordinates and minus the index

    y_pred_decoded = []  # Store the final predictions in this list
    for batch_item in y_pred_decoded_raw:  # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`
        pred = []  # Store the final predictions for this batch item here
        for class_id in range(
                1, n_classes
        ):  # For each class except the background class (which has class ID 0)...
            single_class = batch_item[:, [
                0, class_id + 1, -4, -3, -2, -1
            ]]  # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 6]` and...
            threshold_met = single_class[
                single_class[:, 1] >
                confidence_thresh]  # ...keep only those boxes with a confidence above the set threshold.
            if threshold_met.shape[0] > 0:  # If any boxes made the threshold...
                maxima = _greedy_nms_debug(
                    threshold_met,
                    iou_threshold=iou_threshold,
                    coords='corners',
                    border_pixels=border_pixels)  # ...perform NMS on them.
                maxima_output = np.zeros(
                    (maxima.shape[0], maxima.shape[1] + 1)
                )  # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 7]`
                maxima_output[:,
                              0] = maxima[:,
                                          0]  # Write the box index to the first column...
                maxima_output[:,
                              1] = class_id  # ...and write the class ID to the second column...
                maxima_output[:,
                              2:] = maxima[:,
                                           1:]  # ...and write the rest of the maxima data to the other columns...
                pred.append(
                    maxima_output
                )  # ...and append the maxima for this class to the list of maxima for this batch item.
        # Once we're through with all classes, keep only the `top_k` maxima with the highest scores
        pred = np.concatenate(pred, axis=0)
        if pred.shape[
                0] > top_k:  # If we have more than `top_k` results left at this point, otherwise there is nothing to filter,...
            top_k_indices = np.argpartition(
                pred[:, 2], kth=pred.shape[0] - top_k, axis=0
            )[pred.shape[0] -
              top_k:]  # ...get the indices of the `top_k` highest-score maxima...
            pred = pred[
                top_k_indices]  # ...and keep only those entries of `pred`...
        y_pred_decoded.append(
            pred
        )  # ...and now that we're done, append the array of final predictions for this batch item to the output list

    return y_pred_decoded
コード例 #13
0
def decode_detections_fast(y_pred,
                           confidence_thresh=0.5,
                           iou_threshold=0.45,
                           top_k='all',
                           input_coords='centroids',
                           normalize_coords=True,
                           img_height=None,
                           img_width=None,
                           border_pixels='half'):

    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError(
            "If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`"
            .format(img_height, img_width))

    # 1: Convert the classes from one-hot encoding to their class ID
    y_pred_converted = np.copy(
        y_pred[:, :, -14:-8]
    )  # Slice out the four offset predictions plus two elements whereto we'll write the class IDs and confidences in the next step
    y_pred_converted[:, :, 0] = np.argmax(
        y_pred[:, :, :-12], axis=-1
    )  # The indices of the highest confidence values in the one-hot class vectors are the class ID
    y_pred_converted[:, :, 1] = np.amax(
        y_pred[:, :, :-12],
        axis=-1)  # Store the confidence values themselves, too

    # 2: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
    if input_coords == 'centroids':
        y_pred_converted[:, :, [4, 5]] = np.exp(
            y_pred_converted[:, :, [4, 5]] * y_pred[:, :, [-2, -1]]
        )  # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
        y_pred_converted[:, :, [4, 5]] *= y_pred[:, :, [
            -6, -5
        ]]  # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
        y_pred_converted[:, :, [
            2, 3
        ]] *= y_pred[:, :, [-4, -3]] * y_pred[:, :, [
            -6, -5
        ]]  # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
        y_pred_converted[:, :, [2, 3]] += y_pred[:, :, [
            -8, -7
        ]]  # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
        y_pred_converted = convert_coordinates(y_pred_converted,
                                               start_index=-4,
                                               conversion='centroids2corners')
    elif input_coords == 'minmax':
        y_pred_converted[:, :,
                         2:] *= y_pred[:, :,
                                       -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_converted[:, :, [2, 3]] *= np.expand_dims(
            y_pred[:, :, -7] - y_pred[:, :, -8], axis=-1
        )  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_converted[:, :, [4, 5]] *= np.expand_dims(
            y_pred[:, :, -5] - y_pred[:, :, -6], axis=-1
        )  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_converted[:, :,
                         2:] += y_pred[:, :, -8:
                                       -4]  # delta(pred) + anchor == pred for all four coordinates
        y_pred_converted = convert_coordinates(y_pred_converted,
                                               start_index=-4,
                                               conversion='minmax2corners')
    elif input_coords == 'corners':
        y_pred_converted[:, :,
                         2:] *= y_pred[:, :,
                                       -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_converted[:, :, [2, 4]] *= np.expand_dims(
            y_pred[:, :, -6] - y_pred[:, :, -8], axis=-1
        )  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_converted[:, :, [3, 5]] *= np.expand_dims(
            y_pred[:, :, -5] - y_pred[:, :, -7], axis=-1
        )  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_converted[:, :,
                         2:] += y_pred[:, :, -8:
                                       -4]  # delta(pred) + anchor == pred for all four coordinates
    else:
        raise ValueError(
            "Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'."
        )

    # 3: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
    if normalize_coords:
        y_pred_converted[:, :, [
            2, 4
        ]] *= img_width  # Convert xmin, xmax back to absolute coordinates
        y_pred_converted[:, :, [
            3, 5
        ]] *= img_height  # Convert ymin, ymax back to absolute coordinates

    # 4: Decode our huge `(batch, #boxes, 6)` tensor into a list of length `batch` where each list entry is an array containing only the positive predictions
    y_pred_decoded = []
    for batch_item in y_pred_converted:  # For each image in the batch...
        boxes = batch_item[np.nonzero(
            batch_item[:, 0]
        )]  # ...get all boxes that don't belong to the background class,...
        boxes = boxes[
            boxes[:, 1] >=
            confidence_thresh]  # ...then filter out those positive boxes for which the prediction confidence is too low and after that...
        if iou_threshold:  # ...if an IoU threshold is set...
            boxes = _greedy_nms2(boxes,
                                 iou_threshold=iou_threshold,
                                 coords='corners',
                                 border_pixels=border_pixels
                                 )  # ...perform NMS on the remaining boxes.
        if top_k != 'all' and boxes.shape[
                0] > top_k:  # If we have more than `top_k` results left at this point...
            top_k_indices = np.argpartition(
                boxes[:, 1], kth=boxes.shape[0] - top_k, axis=0
            )[boxes.shape[0] -
              top_k:]  # ...get the indices of the `top_k` highest-scoring boxes...
            boxes = boxes[top_k_indices]  # ...and keep only those boxes...
        y_pred_decoded.append(
            boxes
        )  # ...and now that we're done, append the array of final predictions for this batch item to the output list

    return y_pred_decoded
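
As a usage sketch for `decode_detections_fast` (the `model` and `batch_images` objects and the 300x300 image size are placeholders): unlike the per-class decoder, it first reduces each box to its single highest-confidence class and then runs one global NMS pass, which is faster but lets overlapping boxes of different classes suppress each other.

# Hypothetical usage sketch for decode_detections_fast().
y_pred = model.predict(batch_images)

y_pred_decoded = decode_detections_fast(y_pred,
                                        confidence_thresh=0.5,
                                        iou_threshold=0.45,
                                        top_k='all',
                                        input_coords='centroids',
                                        normalize_coords=True,
                                        img_height=300,
                                        img_width=300)

# Each entry is an array of positive predictions in the format
# [class_id, confidence, xmin, ymin, xmax, ymax].
print(len(y_pred_decoded), y_pred_decoded[0].shape)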
コード例 #14
0
    def generate_anchor_boxes_for_layer(self,
                                        feature_map_size,
                                        aspect_ratios,
                                        this_scale,
                                        next_scale,
                                        this_steps=None,
                                        this_offsets=None):
        '''
        Arguments:
            feature_map_size (tuple): [feature_map_height, feature_map_width]
            aspect_ratios (list): The aspect ratios for which anchor boxes are to be generated
            this_scale (float), next_scale (float): A float in [0, 1]
        Returns:
            (feature_map_height, feature_map_width, n_boxes_per_cell, 4), where the last dimension holds the 4 box coordinates
        '''

        size = min(self.img_height, self.img_width)
        # Compute the box width and height for each aspect ratio
        wh_list = []
        for ar in aspect_ratios:
            if (ar == 1):
                box_height = box_width = this_scale * size
                wh_list.append((box_width, box_height))
                if self.two_boxes_for_ar1:
                    box_height = box_width = np.sqrt(
                        this_scale * next_scale) * size
                    wh_list.append((box_width, box_height))
            else:
                box_width = this_scale * size * np.sqrt(ar)
                box_height = this_scale * size / np.sqrt(ar)
                wh_list.append((box_width, box_height))
        wh_list = np.array(wh_list)
        n_boxes = len(wh_list)  # The number of boxes per cell

        # Compute the box center points
        if (this_steps is None):
            step_height = self.img_height / feature_map_size[0]
            step_width = self.img_width / feature_map_size[1]
        else:
            if isinstance(this_steps,
                          (list, tuple)) and (len(this_steps) == 2):
                step_height = this_steps[0]
                step_width = this_steps[1]
            elif isinstance(this_steps, (int, float)):
                step_height = this_steps
                step_width = this_steps

        # this_offsets: the pixel offsets of the first anchor box center from the top-left corner of the image
        if (this_offsets is None):
            offset_height = 0.5
            offset_width = 0.5
        else:
            if isinstance(this_offsets,
                          (list, tuple)) and (len(this_offsets) == 2):
                offset_height = this_offsets[0]
                offset_width = this_offsets[1]
            elif isinstance(this_offsets, (int, float)):
                offset_height = this_offsets
                offset_width = this_offsets
        # Compute the default box center coordinates
        cy = np.linspace(offset_height * step_height,
                         (offset_height + feature_map_size[0] - 1) *
                         step_height, feature_map_size[0])
        cx = np.linspace(offset_width * step_width,
                         (offset_width + feature_map_size[1] - 1) * step_width,
                         feature_map_size[1])
        cx_grid, cy_grid = np.meshgrid(cx, cy)  # Generate the grid of center points
        cx_grid = np.expand_dims(cx_grid, -1)  # Needed for np.tile() below
        cy_grid = np.expand_dims(cy_grid, -1)

        # (feature_map_height, feature_map_width, n_boxes, 4); the last dimension holds (cx, cy, w, h)
        boxes_tensor = np.zeros(
            (feature_map_size[0], feature_map_size[1], n_boxes, 4))

        boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, n_boxes))  # cx
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, n_boxes))  # cy
        boxes_tensor[:, :, :, 2] = wh_list[:, 0]  # w
        boxes_tensor[:, :, :, 3] = wh_list[:, 1]  # h

        # Convert (cx, cy, w, h) into the (xmin, ymin, xmax, ymax) format
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='centroids2corners')

        # Clip boxes that extend beyond the image boundaries
        if self.clip_boxes:
            x_coords = boxes_tensor[:, :, :, [0, 2]]
            x_coords[x_coords >= self.img_width] = self.img_width - 1
            x_coords[x_coords < 0] = 0
            boxes_tensor[:, :, :, [0, 2]] = x_coords
            y_coords = boxes_tensor[:, :, :, [1, 3]]
            y_coords[y_coords >= self.img_height] = self.img_height - 1
            y_coords[y_coords < 0] = 0
            boxes_tensor[:, :, :, [1, 3]] = y_coords

        # Normalize the coordinates to [0, 1]
        if self.normalize_coords:
            boxes_tensor[:, :, :, [0, 2]] /= self.img_width
            boxes_tensor[:, :, :, [1, 3]] /= self.img_height

        if self.coords == 'centroids':
            # (xmin, ymin, xmax, ymax)->(cx, cy, w, h)
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2centroids',
                                               border_pixels='half')
        elif self.coords == 'minmax':
            # (xmin, ymin, xmax, ymax)->(xmin, xmax, ymin, ymax).
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2minmax',
                                               border_pixels='half')
        return boxes_tensor
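
The center-point grid above is built with `np.linspace` and `np.meshgrid`. Below is a minimal Numpy-only sketch with made-up numbers (a hypothetical 300x300 input, a 3x3 feature map, offset 0.5) showing that each center lands in the middle of its cell, and how `np.expand_dims` prepares the grid for tiling along the box axis:

import numpy as np

img_height = img_width = 300        # hypothetical input size
feature_map_size = (3, 3)           # hypothetical feature map size
offset = 0.5
n_boxes = 4                         # hypothetical number of boxes per cell

step_height = img_height / feature_map_size[0]  # 100.0
step_width = img_width / feature_map_size[1]    # 100.0

cy = np.linspace(offset * step_height, (offset + feature_map_size[0] - 1) * step_height, feature_map_size[0])
cx = np.linspace(offset * step_width, (offset + feature_map_size[1] - 1) * step_width, feature_map_size[1])
print(cy)  # [ 50. 150. 250.] -> one center per row of cells, in the middle of each cell
print(cx)  # [ 50. 150. 250.]

cx_grid, cy_grid = np.meshgrid(cx, cy)              # both have shape (3, 3)
cx_grid = np.expand_dims(cx_grid, -1)               # shape (3, 3, 1), so np.tile can repeat along the box axis
print(np.tile(cx_grid, (1, 1, n_boxes)).shape)      # (3, 3, 4)
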
コード例 #15
0
    def generate_anchor_boxes_for_layer(self,
                                        feature_map_size,
                                        aspect_ratios,
                                        this_scale,
                                        next_scale,
                                        this_steps=None,
                                        this_offsets=None,
                                        diagnostics=False):
        '''
        Computes an array of the spatial positions and sizes of the anchor boxes for one predictor layer
        of size `feature_map_size == [feature_map_height, feature_map_width]`.

        Arguments:
            feature_map_size (tuple): A list or tuple `[feature_map_height, feature_map_width]` with the spatial
                dimensions of the feature map for which to generate the anchor boxes.
            aspect_ratios (list): A list of floats, the aspect ratios for which anchor boxes are to be generated.
                All list elements must be unique.
            this_scale (float): A float in [0, 1], the scaling factor for the size of the generated anchor boxes
                as a fraction of the shorter side of the input image.
            next_scale (float): A float in [0, 1], the next larger scaling factor. Only relevant if
                `self.two_boxes_for_ar1 == True`.
            diagnostics (bool, optional): If true, the following additional outputs will be returned:
                1) A list of the center point `x` and `y` coordinates for each spatial location.
                2) A list containing `(width, height)` for each box aspect ratio.
                3) A tuple containing `(step_height, step_width)`
                4) A tuple containing `(offset_height, offset_width)`
                This information can be useful to understand in just a few numbers what the generated grid of
                anchor boxes actually looks like, i.e. how large the different boxes are and how dense
                their spatial distribution is, in order to determine whether the box grid covers the input images
                appropriately and whether the box sizes are appropriate to fit the sizes of the objects
                to be detected.

        Returns:
            A 4D Numpy tensor of shape `(feature_map_height, feature_map_width, n_boxes_per_cell, 4)` where the
            last dimension contains `(xmin, xmax, ymin, ymax)` for each anchor box in each cell of the feature map.
        '''
        # Compute box width and height for each aspect ratio.

        # The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`.
        size = min(self.img_height, self.img_width)
        # Compute the box widths and heights for all aspect ratios
        wh_list = []
        for ar in aspect_ratios:
            if (ar == 1):
                # Compute the regular anchor box for aspect ratio 1.
                box_height = box_width = this_scale * size
                wh_list.append((box_width, box_height))
                if self.two_boxes_for_ar1:
                    # Compute one slightly larger version using the geometric mean of this scale value and the next.
                    box_height = box_width = np.sqrt(
                        this_scale * next_scale) * size
                    wh_list.append((box_width, box_height))
            else:
                box_width = this_scale * size * np.sqrt(ar)
                box_height = this_scale * size / np.sqrt(ar)
                wh_list.append((box_width, box_height))
        wh_list = np.array(wh_list)
        n_boxes = len(wh_list)

        # Compute the grid of box center points. They are identical for all aspect ratios.

        # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally.
        if (this_steps is None):
            step_height = self.img_height / feature_map_size[0]
            step_width = self.img_width / feature_map_size[1]
        else:
            if isinstance(this_steps,
                          (list, tuple)) and (len(this_steps) == 2):
                step_height = this_steps[0]
                step_width = this_steps[1]
            elif isinstance(this_steps, (int, float)):
                step_height = this_steps
                step_width = this_steps
        # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image.
        if (this_offsets is None):
            offset_height = 0.5
            offset_width = 0.5
        else:
            if isinstance(this_offsets,
                          (list, tuple)) and (len(this_offsets) == 2):
                offset_height = this_offsets[0]
                offset_width = this_offsets[1]
            elif isinstance(this_offsets, (int, float)):
                offset_height = this_offsets
                offset_width = this_offsets
        # Now that we have the offsets and step sizes, compute the grid of anchor box center points.
        cy = np.linspace(offset_height * step_height,
                         (offset_height + feature_map_size[0] - 1) *
                         step_height, feature_map_size[0])
        cx = np.linspace(offset_width * step_width,
                         (offset_width + feature_map_size[1] - 1) * step_width,
                         feature_map_size[1])
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        cx_grid = np.expand_dims(
            cx_grid, -1
        )  # This is necessary for np.tile() to do what we want further down
        cy_grid = np.expand_dims(
            cy_grid, -1
        )  # This is necessary for np.tile() to do what we want further down

        # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        # where the last dimension will contain `(cx, cy, w, h)`
        boxes_tensor = np.zeros(
            (feature_map_size[0], feature_map_size[1], n_boxes, 4))

        boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, n_boxes))  # Set cx
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, n_boxes))  # Set cy
        boxes_tensor[:, :, :, 2] = wh_list[:, 0]  # Set w
        boxes_tensor[:, :, :, 3] = wh_list[:, 1]  # Set h

        # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='centroids2corners')

        # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries
        if self.clip_boxes:
            x_coords = boxes_tensor[:, :, :, [0, 2]]
            x_coords[x_coords >= self.img_width] = self.img_width - 1
            x_coords[x_coords < 0] = 0
            boxes_tensor[:, :, :, [0, 2]] = x_coords
            y_coords = boxes_tensor[:, :, :, [1, 3]]
            y_coords[y_coords >= self.img_height] = self.img_height - 1
            y_coords[y_coords < 0] = 0
            boxes_tensor[:, :, :, [1, 3]] = y_coords

        # If `normalize_coords` is enabled, normalize the coordinates to be within [0,1]
        if self.normalize_coords:
            boxes_tensor[:, :, :, [0, 2]] /= self.img_width
            boxes_tensor[:, :, :, [1, 3]] /= self.img_height

        # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth.
        if self.coords == 'centroids':
            # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2centroids',
                                               border_pixels='half')
        elif self.coords == 'minmax':
            # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2minmax',
                                               border_pixels='half')

        if diagnostics:
            return boxes_tensor, (cy,
                                  cx), wh_list, (step_height,
                                                 step_width), (offset_height,
                                                               offset_width)
        else:
            return boxes_tensor
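
For reference, here is a rough numerical sketch of what the `centroids2corners` conversion used above does, reduced to the underlying arithmetic on one hypothetical box (this is not the actual `convert_coordinates` implementation and it ignores the `border_pixels` handling):

import numpy as np

box_centroids = np.array([150.0, 150.0, 120.0, 60.0])   # hypothetical (cx, cy, w, h)
cx, cy, w, h = box_centroids
box_corners = np.array([cx - w / 2.0,    # xmin =  90
                        cy - h / 2.0,    # ymin = 120
                        cx + w / 2.0,    # xmax = 210
                        cy + h / 2.0])   # ymax = 180
print(box_corners)  # [ 90. 120. 210. 180.]
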
コード例 #16
0
    def call(self, x, mask=None):
        '''
        Computes the anchor box tensor based on the shape of the input tensor.

        The logic here is the same as in the module `ssd_box_encode_decode_utils.py`.

        Note that these tensors do not participate in the backpropagation of the error; their values are constants.
        Therefore, all of the logic is implemented with Numpy, and the result is only converted to a Keras tensor
        at the very end before being output.

        Arguments:
            x (tensor): A 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'`,
                or `(batch, height, width, channels)` if `dim_ordering = 'tf'`.
        '''

        # Compute the anchor box width and height for each aspect ratio
        # The shorter side of the input image is used together with `scale` and `aspect_ratios` for this computation
        size = min(self.img_height, self.img_width)
        wh_list = []
        for ar in self.aspect_ratios:
            if (ar == 1):
                # Compute the regular anchor box for aspect ratio 1.
                box_height = box_width = self.this_scale * size
                wh_list.append((box_width, box_height))
                if self.two_boxes_for_ar1:
                    # Compute a slightly larger anchor box using the geometric mean of this layer's scale and the next layer's scale
                    box_height = box_width = np.sqrt(
                        self.this_scale * self.next_scale) * size
                    wh_list.append((box_width, box_height))
            else:
                box_height = self.this_scale * size / np.sqrt(ar)
                box_width = self.this_scale * size * np.sqrt(ar)
                wh_list.append((box_width, box_height))
        wh_list = np.array(wh_list)

        # Get the shape of the input tensor
        if K.image_dim_ordering() == 'tf':
            batch_size, feature_map_height, feature_map_width, feature_map_channels = x._keras_shape
        else:  # Since only TensorFlow is supported, this branch is not strictly needed, but it does no harm
            batch_size, feature_map_channels, feature_map_height, feature_map_width = x._keras_shape

        # Compute the grid of anchor box center points. They are identical for all aspect ratios.

        # Compute the step sizes, i.e. the vertical and horizontal distance between the centers of two adjacent anchor boxes
        if (self.this_steps is None):
            step_height = self.img_height / feature_map_height
            step_width = self.img_width / feature_map_width
        else:
            if isinstance(self.this_steps,
                          (list, tuple)) and (len(self.this_steps) == 2):
                step_height = self.this_steps[0]
                step_width = self.this_steps[1]
            elif isinstance(self.this_steps, (int, float)):
                step_height = self.this_steps
                step_width = self.this_steps
        # Compute the offsets, i.e. the offset of the first anchor box center from the top-left corner of the input image
        if (self.this_offsets is None):
            offset_height = 0.5
            offset_width = 0.5
        else:
            if isinstance(self.this_offsets,
                          (list, tuple)) and (len(self.this_offsets) == 2):
                offset_height = self.this_offsets[0]
                offset_width = self.this_offsets[1]
            elif isinstance(self.this_offsets, (int, float)):
                offset_height = self.this_offsets
                offset_width = self.this_offsets
        # With the offsets and step sizes in hand, compute the anchor box center positions
        cy = np.linspace(offset_height * step_height,
                         (offset_height + feature_map_height - 1) *
                         step_height, feature_map_height)
        cx = np.linspace(offset_width * step_width,
                         (offset_width + feature_map_width - 1) * step_width,
                         feature_map_width)
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        cx_grid = np.expand_dims(cx_grid, -1)  # In preparation for np.tile() below
        cy_grid = np.expand_dims(cy_grid, -1)  # In preparation for np.tile() below

        # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        # where the last dimension holds `(cx, cy, w, h)`
        boxes_tensor = np.zeros(
            (feature_map_height, feature_map_width, self.n_boxes, 4))

        boxes_tensor[:, :, :, 0] = np.tile(cx_grid,
                                           (1, 1, self.n_boxes))  # Set cx
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid,
                                           (1, 1, self.n_boxes))  # Set cy
        boxes_tensor[:, :, :, 2] = wh_list[:, 0]  # Set w
        boxes_tensor[:, :, :, 3] = wh_list[:, 1]  # Set h

        # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='centroids2corners')

        # If `clip_boxes` is `True`, clip the coordinates to lie within the image boundaries
        if self.clip_boxes:
            x_coords = boxes_tensor[:, :, :, [0, 2]]
            x_coords[x_coords >= self.img_width] = self.img_width - 1
            x_coords[x_coords < 0] = 0
            boxes_tensor[:, :, :, [0, 2]] = x_coords
            y_coords = boxes_tensor[:, :, :, [1, 3]]
            y_coords[y_coords >= self.img_height] = self.img_height - 1
            y_coords[y_coords < 0] = 0
            boxes_tensor[:, :, :, [1, 3]] = y_coords

        # If `normalize_coords` is `True`, normalize the coordinates to lie within [0, 1]
        if self.normalize_coords:
            boxes_tensor[:, :, :, [0, 2]] /= self.img_width
            boxes_tensor[:, :, :, [1, 3]] /= self.img_height

        if self.coords == 'centroids':
            # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2centroids',
                                               border_pixels='half')
        elif self.coords == 'minmax':
            # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2minmax',
                                               border_pixels='half')

        # Create a tensor to hold the variances and append it to `boxes_tensor`. This tensor has the same shape as `boxes_tensor`
        variances_tensor = np.zeros_like(
            boxes_tensor
        )  # Has shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        variances_tensor += self.variances
        # After concatenation, `boxes_tensor` has shape `(feature_map_height, feature_map_width, n_boxes, 8)`
        boxes_tensor = np.concatenate((boxes_tensor, variances_tensor),
                                      axis=-1)

        # Prepend one dimension to `boxes_tensor` to account for the batch size
        # The result is a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)`
        boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
        boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'),
                              (K.shape(x)[0], 1, 1, 1, 1))

        return boxes_tensor
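
A small shape-only sketch (hypothetical sizes, Numpy only) of the last two steps above, i.e. appending the variances and prepending the batch dimension that `K.tile` later expands to the actual batch size:

import numpy as np

feature_map_height, feature_map_width, n_boxes = 3, 3, 4   # hypothetical sizes
variances = np.array([0.1, 0.1, 0.2, 0.2])

boxes_tensor = np.zeros((feature_map_height, feature_map_width, n_boxes, 4))
variances_tensor = np.zeros_like(boxes_tensor) + variances   # broadcast the 4 variance values to every box
boxes_tensor = np.concatenate((boxes_tensor, variances_tensor), axis=-1)
print(boxes_tensor.shape)   # (3, 3, 4, 8)

boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
print(boxes_tensor.shape)   # (1, 3, 3, 4, 8) -> tiled along the first axis to the batch size at graph time
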
コード例 #17
0
    def call(self, x, mask=None):
        '''
        Return an anchor box tensor based on the shape of the input tensor.

        The logic implemented here is identical to the logic in the module `ssd_box_encode_decode_utils.py`.

        Note that this tensor does not participate in any graph computations at runtime. It is being created
        as a constant once during graph creation and is just being output along with the rest of the model output
        during runtime. Because of this, all logic is implemented as Numpy array operations and it is sufficient
        to convert the resulting Numpy array into a Keras tensor at the very end before outputting it.

        Arguments:
            x (tensor): 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'`
                or `(batch, height, width, channels)` if `dim_ordering = 'tf'`. The input for this
                layer must be the output of the localization predictor layer.
        '''

        # Compute box width and height for each aspect ratio
        # The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`.
        size = min(self.img_height, self.img_width)
        # Compute the box widths and heights for all aspect ratios
        wh_list = []
        for ar in self.aspect_ratios:
            if (ar == 1):
                # Compute the regular anchor box for aspect ratio 1.
                box_height = box_width = self.this_scale * size
                wh_list.append((box_width, box_height))
                if self.two_boxes_for_ar1:
                    # Compute one slightly larger version using the geometric mean of this scale value and the next.
                    box_height = box_width = np.sqrt(
                        self.this_scale * self.next_scale) * size
                    wh_list.append((box_width, box_height))
            else:
                box_height = self.this_scale * size / np.sqrt(ar)
                box_width = self.this_scale * size * np.sqrt(ar)
                wh_list.append((box_width, box_height))
        wh_list = np.array(wh_list)

        # We need the shape of the input tensor
        if K.image_data_format() == 'channels_last':
            batch_size, feature_map_height, feature_map_width, feature_map_channels = x.shape.as_list(
            )
        else:  # Not yet relevant since TensorFlow is the only supported backend right now, but it can't harm to have this in here for the future
            batch_size, feature_map_channels, feature_map_height, feature_map_width = x.shape.as_list(
            )

        # Compute the grid of box center points. They are identical for all aspect ratios.

        # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally.
        if (self.this_steps is None):
            step_height = self.img_height / feature_map_height
            step_width = self.img_width / feature_map_width
        else:
            if isinstance(self.this_steps,
                          (list, tuple)) and (len(self.this_steps) == 2):
                step_height = self.this_steps[0]
                step_width = self.this_steps[1]
            elif isinstance(self.this_steps, (int, float)):
                step_height = self.this_steps
                step_width = self.this_steps
        # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image.
        if (self.this_offsets is None):
            offset_height = 0.5
            offset_width = 0.5
        else:
            if isinstance(self.this_offsets,
                          (list, tuple)) and (len(self.this_offsets) == 2):
                offset_height = self.this_offsets[0]
                offset_width = self.this_offsets[1]
            elif isinstance(self.this_offsets, (int, float)):
                offset_height = self.this_offsets
                offset_width = self.this_offsets
        # Now that we have the offsets and step sizes, compute the grid of anchor box center points.
        cy = np.linspace(offset_height * step_height,
                         (offset_height + feature_map_height - 1) *
                         step_height, feature_map_height)
        cx = np.linspace(offset_width * step_width,
                         (offset_width + feature_map_width - 1) * step_width,
                         feature_map_width)
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        cx_grid = np.expand_dims(
            cx_grid, -1
        )  # This is necessary for np.tile() to do what we want further down
        cy_grid = np.expand_dims(
            cy_grid, -1
        )  # This is necessary for np.tile() to do what we want further down

        # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        # where the last dimension will contain `(cx, cy, w, h)`
        boxes_tensor = np.zeros(
            (feature_map_height, feature_map_width, self.n_boxes, 4))

        boxes_tensor[:, :, :, 0] = np.tile(cx_grid,
                                           (1, 1, self.n_boxes))  # Set cx
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid,
                                           (1, 1, self.n_boxes))  # Set cy
        boxes_tensor[:, :, :, 2] = wh_list[:, 0]  # Set w
        boxes_tensor[:, :, :, 3] = wh_list[:, 1]  # Set h

        # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='centroids2corners')

        # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries
        if self.clip_boxes:
            x_coords = boxes_tensor[:, :, :, [0, 2]]
            x_coords[x_coords >= self.img_width] = self.img_width - 1
            x_coords[x_coords < 0] = 0
            boxes_tensor[:, :, :, [0, 2]] = x_coords
            y_coords = boxes_tensor[:, :, :, [1, 3]]
            y_coords[y_coords >= self.img_height] = self.img_height - 1
            y_coords[y_coords < 0] = 0
            boxes_tensor[:, :, :, [1, 3]] = y_coords

        # If `normalize_coords` is enabled, normalize the coordinates to be within [0,1]
        if self.normalize_coords:
            boxes_tensor[:, :, :, [0, 2]] /= self.img_width
            boxes_tensor[:, :, :, [1, 3]] /= self.img_height

        # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth.
        if self.coords == 'centroids':
            # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2centroids',
                                               border_pixels='half')
        elif self.coords == 'minmax':
            # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2minmax',
                                               border_pixels='half')

        # Create a tensor to contain the variances and append it to `boxes_tensor`. This tensor has the same shape
        # as `boxes_tensor` and simply contains the same 4 variance values for every position in the last axis.
        variances_tensor = np.zeros_like(
            boxes_tensor
        )  # Has shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        variances_tensor += self.variances  # Long live broadcasting
        # Now `boxes_tensor` becomes a tensor of shape `(feature_map_height, feature_map_width, n_boxes, 8)`
        boxes_tensor = np.concatenate((boxes_tensor, variances_tensor),
                                      axis=-1)

        # Now prepend one dimension to `boxes_tensor` to account for the batch size and tile it along
        # The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)`
        boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
        boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'),
                              (K.shape(x)[0], 1, 1, 1, 1))

        return boxes_tensor
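
The `clip_boxes` branch above limits corner coordinates to the image area. A minimal sketch of the same idea on a single hypothetical box (using `np.clip`, which is equivalent to the masking used in the code):

import numpy as np

img_height = img_width = 300                    # hypothetical image size
box = np.array([-20.0, 250.0, 80.0, 330.0])     # hypothetical (xmin, ymin, xmax, ymax) sticking out of the image

box[[0, 2]] = np.clip(box[[0, 2]], 0, img_width - 1)    # clip the x coordinates
box[[1, 3]] = np.clip(box[[1, 3]], 0, img_height - 1)   # clip the y coordinates
print(box)  # [  0. 250.  80. 299.]
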
コード例 #18
0
def build_priors(loc_tensor,
                 img_dim,
                 this_scale,
                 next_scale,
                 aspect_ratios=[0.5, 1.0, 2.0],
                 two_boxes_for_ar1=True,
                 clip_boxes=False,
                 variances=[0.1, 0.1, 0.2, 0.2],
                 coords='centroids',
                 normalize_coords=False):
    '''
    Returns a Numpy array containing anchor box coordinates and variances based on the
    input tensor and the passed arguments.

    A set of 2D anchor boxes of different aspect ratios is created for each
    spatial unit of the input tensor. The number of anchor boxes created per
    unit depends on the arguments `aspect_ratios` and `two_boxes_for_ar1`; in
    the default case it is 4. The boxes are parameterized by the coordinate
    tuple `(xmin, xmax, ymin, ymax)`.

    The logic implemented by this layer is identical to the logic in the module
    `ssd_box_encode_decode_utils.py`.

    The purpose of having this layer in the network is to make the model
    self-sufficient at inference time. Since the model is predicting offsets to
    the anchor boxes (rather than predicting absolute box coordinates directly),
    one needs to know the anchor box coordinates in order to construct the final
    prediction boxes from the predicted offsets. If the model's output tensor
    did not contain the anchor box coordinates, the necessary information to
    convert the predicted offsets back to absolute coordinates would be missing
    in the model output. The reason why it is necessary to predict offsets to
    the anchor boxes rather than to predict absolute box coordinates directly is
    explained in `README.md`.

    Input shape:
        4D tensor of shape `(batch, height, width, channels)`.

    Output shape:
        5D tensor of shape `(batch, height, width, n_boxes, 8)`. The last axis
        contains the four anchor box coordinates and the four variance values
        for each box.

    All arguments need to be set to the same values as in the box encoding
    process, otherwise the behavior is undefined. Some of these arguments
    are explained in more detail in the documentation of the `SSDBoxEncoder`
    class.

    Required Arguments:

        loc_tensor (tensor):
            4D tensor of shape `(batch, height, width, channels)`.
            The output of the localization predictor layer.

        img_dim (int):
            The side length of the input images.
            NOTE: it is assumed that the input images have the same dimension
                  for width and height.

        this_scale (float):
            A float in [0, 1], the scaling factor for the size of the
            generated anchor boxes as a fraction of the shorter side of the
            input image.

        next_scale (float):
            A float in [0, 1], the next larger scaling factor. Only relevant
            if `two_boxes_for_ar1 == True`.

    Optional Arguments:

        aspect_ratios (list):
            The list of aspect ratios for which default boxes are to be
            generated for this layer.

        two_boxes_for_ar1 (bool):
            Only relevant if `aspect_ratios` contains 1. If `True`, two
            default boxes will be generated for aspect ratio 1. The first
            will be generated using the scaling factor for the respective
            layer, the second one will be generated using geometric mean of
            said scaling factor and next bigger scaling factor.

        clip_boxes (bool):
            If `True`, clips the anchor box coordinates to stay within image
            boundaries.

        variances (list):
            A list of 4 floats >0. The anchor box offset for each coordinate
            will be divided by its respective variance value.

        coords (str):
            The box coordinate format to be used internally in the model
            (i.e. this is not the input format of the ground truth labels).
            Can be either 'centroids' for the format `(cx, cy, w, h)`
            (box center coordinates, width, and height), 'corners' for the
            format `(xmin, ymin, xmax,  ymax)`, or 'minmax' for the format
            `(xmin, xmax, ymin, ymax)`.

        normalize_coords (bool):
            Set to `True` if the model uses relative instead of absolute
            coordinates, i.e. if the model predicts box coordinates within
            [0,1] instead of absolute coordinates.
    '''

    # =========================== Perform Checks ===========================

    if this_scale < 0 or next_scale < 0 or this_scale > 1:
        raise ValueError(
            ('`this_scale` must be in [0, 1] and `next_scale` must be >0, '
             'but `this_scale` == {}, `next_scale` == {}').format(
                 this_scale, next_scale))

    if len(variances) != 4:
        raise ValueError(
            ('4 variance values must be passed, but {} values were '
             'received.').format(len(variances)))
    variances = np.array(variances)
    if np.any(variances <= 0):
        raise ValueError(
            ('All variances must be >0, but the variances given are '
             '{}'.format(variances)))

    # Compute the number of boxes per cell
    n_boxes = len(aspect_ratios)
    if 1 in aspect_ratios and two_boxes_for_ar1:
        n_boxes += 1

    # Compute the box [width, height] for all aspect ratios
    wh_list = []
    scaled_dim = this_scale * img_dim
    for ar in aspect_ratios:
        if ar == 1:
            # Compute the regular anchor box for aspect ratio 1.
            box_dim = scaled_dim
            wh_list.append([box_dim] * 2)
            if two_boxes_for_ar1:
                # Compute one slightly larger version using the geometric
                # mean of this scale value and the next.
                box_dim = np.sqrt(this_scale * next_scale) * img_dim
                wh_list.append([box_dim] * 2)
        else:
            wh_list.append(
                [scaled_dim * np.sqrt(ar), scaled_dim / np.sqrt(ar)])
    wh_list = np.array(wh_list)

    # We need the shape of the input tensor
    feature_map_dim = loc_tensor.shape[1]

    # Compute the grid of box center points. They are identical for all
    # aspect ratios.

    # Compute the step sizes, i.e. how far apart the anchor box center
    # points will be vertically and horizontally.
    step_size = img_dim / feature_map_dim

    # Compute the offsets, i.e. at what pixel values the first anchor box
    # center point will be from the top and from the left of the image.
    offset_size = 0.5

    # Now that we have the offsets and step sizes, compute the grid of
    # anchor box center points.
    centers = np.linspace(offset_size * step_size,
                          (offset_size + feature_map_dim - 1) * step_size,
                          feature_map_dim)
    cx_grid, cy_grid = np.meshgrid(centers, centers)

    # Create a 4D tensor template of shape
    #   `(feature_map_dim, feature_map_dim, n_boxes, 4)`
    # where the last dimension will contain `(cx, cy, w, h)`
    boxes_tensor = np.zeros((feature_map_dim, feature_map_dim, n_boxes, 4))

    boxes_tensor[:, :, :, 0] = np.expand_dims(cx_grid, -1)  # Set cx
    boxes_tensor[:, :, :, 1] = np.expand_dims(cy_grid, -1)  # Set cy
    boxes_tensor[:, :, :, 2] = wh_list[:, 0]  # Set w (broadcast)
    boxes_tensor[:, :, :, 3] = wh_list[:, 1]  # Set h (broadcast)

    # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
    boxes_tensor = convert_coordinates(boxes_tensor,
                                       start_index=0,
                                       conversion='centroids2corners')

    # If `clip_boxes` is enabled, clip the coordinates to lie within the
    # image boundaries
    if clip_boxes:
        x_coords = boxes_tensor[:, :, :, [0, 2]]
        x_coords[x_coords >= img_dim] = img_dim - 1
        x_coords[x_coords < 0] = 0
        boxes_tensor[:, :, :, [0, 2]] = x_coords
        y_coords = boxes_tensor[:, :, :, [1, 3]]
        y_coords[y_coords >= img_dim] = img_dim - 1
        y_coords[y_coords < 0] = 0
        boxes_tensor[:, :, :, [1, 3]] = y_coords

    # If `normalize_coords` is enabled, normalize the coordinates to be
    # within [0,1]
    if normalize_coords:
        boxes_tensor[:, :, :, [0, 2]] /= img_dim
        boxes_tensor[:, :, :, [1, 3]] /= img_dim

    # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we
    #       don't have to unnecessarily convert back and forth.
    if coords in ['centroids', 'minmax']:
        # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)` or to
        # `(xmin, xmax, ymin, ymax)`.
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='corners2' + coords,
                                           border_pixels='half')

    # Create a tensor to contain the variances and append it to
    # `boxes_tensor`. This tensor has the same shape as `boxes_tensor` and
    # simply contains the same 4 variance values for every position in the
    # last axis. Has shape
    #   `(feature_map_dim, feature_map_dim, n_boxes, 4)`
    variances_tensor = np.zeros_like(boxes_tensor)
    variances_tensor += variances  # Long live broadcasting

    # Now `boxes_tensor` becomes a tensor of shape
    #   `(feature_map_dim, feature_map_dim, n_boxes, 8)`
    return np.concatenate((boxes_tensor, variances_tensor), axis=-1)
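
A hypothetical usage sketch for `build_priors` (the dummy tensor and all numbers are made up; only the spatial size of `loc_tensor` matters here, since the function only reads `loc_tensor.shape[1]`, and `convert_coordinates` from the listings above must be in scope):

import numpy as np

dummy_loc_output = np.zeros((1, 10, 10, 16))   # stand-in for a localization predictor output
priors = build_priors(dummy_loc_output,
                      img_dim=300,
                      this_scale=0.2,
                      next_scale=0.34,
                      normalize_coords=True)
print(priors.shape)   # (10, 10, 4, 8): 4 boxes per cell, 4 coordinates + 4 variances each
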
コード例 #19
0
ファイル: ssd_output_decoder.py プロジェクト: jbutle55/ssd
def decode_detections_fast(y_pred,
                           confidence_thresh=0.5,
                           iou_threshold=0.45,
                           top_k='all',
                           input_coords='centroids',
                           normalize_coords=True,
                           img_height=None,
                           img_width=None,
                           border_pixels='half'):
    '''
    Convert model prediction output back to a format that contains only the positive box predictions
    (i.e. the same format that `encode_y()` takes as input).

    Optionally performs confidence thresholding and greedy non-maximum suppression after the decoding stage.

    Note that the decoding procedure used here is not the same as the procedure used in the original Caffe implementation.
    For each box, the procedure used here assigns the box's highest confidence as its predicted class. Then it removes
    all boxes for which the highest confidence is the background class. This results in less work for the subsequent
    non-maximum suppression, because the vast majority of the predictions will be filtered out just by the fact that
    their highest confidence is for the background class. It is much more efficient than the procedure of the original
    implementation, but the results may also differ.

    Arguments:
        y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
            of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
            boxes predicted by the model per image and the last axis contains
            `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in any positive
            class required for a given box to be considered a positive prediction. A lower value will result
            in better recall, while a higher value will result in better precision. Do not use this parameter with the
            goal to combat the inevitably many duplicates that an SSD will produce, the subsequent non-maximum suppression
            stage will take care of those.
        iou_threshold (float, optional): `None` or a float in [0,1]. If `None`, no non-maximum suppression will be
            performed. If not `None`, greedy NMS will be performed after the confidence thresholding stage, meaning
            all boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
            from the set of predictions, where 'maximal' refers to the box score.
        top_k (int, optional): 'all' or an integer with number of highest scoring predictions to be kept for each batch item
            after the non-maximum suppression stage. If 'all', all predictions left after the NMS stage will be kept.
        input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
            for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
            `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
            and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
            relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
            Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
            coordinates. Requires `img_height` and `img_width` if set to `True`.
        img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
        img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
            Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
            to the boxes. If 'exclude', the border pixels do not belong to the boxes.
            If 'half', then one of each of the two horizontal and vertical borders belong
            to the boxes, but not the other.

    Returns:
        A python list of length `batch_size` where each list element represents the predicted boxes
        for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
        a non-background class for the respective image in the format `[class_id, confidence, xmin, xmax, ymin, ymax]`.
    '''
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))

    # 1: Convert the classes from one-hot encoding to their class ID
    y_pred_converted = np.copy(y_pred[:,:,-14:-8]) # Slice out the four offset predictions plus two elements whereto we'll write the class IDs and confidences in the next step
    y_pred_converted[:,:,0] = np.argmax(y_pred[:,:,:-12], axis=-1) # The indices of the highest confidence values in the one-hot class vectors are the class ID
    y_pred_converted[:,:,1] = np.amax(y_pred[:,:,:-12], axis=-1) # Store the confidence values themselves, too

    # 2: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
    if input_coords == 'centroids':
        y_pred_converted[:,:,[4,5]] = np.exp(y_pred_converted[:,:,[4,5]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
        y_pred_converted[:,:,[4,5]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
        y_pred_converted[:,:,[2,3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
        y_pred_converted[:,:,[2,3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
        y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_converted[:,:,[2,3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_converted[:,:,[4,5]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
        y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='minmax2corners')
    elif input_coords == 'corners':
        y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_converted[:,:,[2,4]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_converted[:,:,[3,5]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
    else:
        raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.")

    # 3: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
    if normalize_coords:
        y_pred_converted[:,:,[2,4]] *= img_width # Convert xmin, xmax back to absolute coordinates
        y_pred_converted[:,:,[3,5]] *= img_height # Convert ymin, ymax back to absolute coordinates

    # 4: Decode our huge `(batch, #boxes, 6)` tensor into a list of length `batch` where each list entry is an array containing only the positive predictions
    y_pred_decoded = []
    for batch_item in y_pred_converted: # For each image in the batch...
        boxes = batch_item[np.nonzero(batch_item[:,0])] # ...get all boxes that don't belong to the background class,...
        boxes = boxes[boxes[:,1] >= confidence_thresh] # ...then filter out those positive boxes for which the prediction confidence is too low and after that...
        if iou_threshold: # ...if an IoU threshold is set...
            boxes = _greedy_nms2(boxes, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on the remaining boxes.
        if top_k != 'all' and boxes.shape[0] > top_k: # If we have more than `top_k` results left at this point...
            top_k_indices = np.argpartition(boxes[:,1], kth=boxes.shape[0]-top_k, axis=0)[boxes.shape[0]-top_k:] # ...get the indices of the `top_k` highest-scoring boxes...
            boxes = boxes[top_k_indices] # ...and keep only those boxes...
        y_pred_decoded.append(boxes) # ...and now that we're done, append the array of final predictions for this batch item to the output list

    return y_pred_decoded
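
The `top_k` filter above relies on `np.argpartition`. A standalone sketch with hypothetical scores (the confidence column of `boxes`) and `top_k = 3`:

import numpy as np

scores = np.array([0.2, 0.9, 0.1, 0.7, 0.4, 0.8])   # hypothetical confidences
top_k = 3
n = scores.shape[0]
# argpartition places the indices of the (n - top_k) lowest scores first; the last top_k entries
# are then the indices of the top_k highest scores (in no particular order).
top_k_indices = np.argpartition(scores, kth=n - top_k, axis=0)[n - top_k:]
print(np.sort(scores[top_k_indices]))   # [0.7 0.8 0.9]
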
コード例 #20
0
ファイル: AnchorBox.py プロジェクト: rekkit/ssd
    def call(self, x, mask=None):
        # the shorter side of the image is used to compute the width and height of the anchor boxes using the scale and
        # aspect ratios
        size = min(self.img_h, self.img_w)

        # compute the anchor box widths / heights for each of the aspect ratios
        wh_list = []
        for ar in self.aspect_ratios:
            height = size * self.this_scale / np.sqrt(ar)
            width = size * self.this_scale * np.sqrt(ar)
            wh_list.append([width, height])  # keep the (width, height) order expected by the tensor assignments below

        if (1 in self.aspect_ratios) and self.two_boxes_for_ar1:
            height = width = size * np.sqrt(self.this_scale * self.next_scale)
            wh_list.append([width, height])

        wh_list = np.array(wh_list)

        # get the shape of the input tensor
        batch_size, feature_map_h, feature_map_w, n_channels = x._keras_shape

        # compute the grid points
        step_h = None
        step_w = None
        if self.steps is None:
            step_h = self.img_h / feature_map_h
            step_w = self.img_w / feature_map_w
        else:
            # otherwise steps are specified. We have two acceptable inputs here:
            # 1. we are given both step_h and step_w
            # 2. we are given a single value which we apply to both step_h and step_w
            if isinstance(self.steps, (list, tuple)):
                if len(self.steps) == 2:
                    step_h, step_w = self.steps
                else:
                    raise ValueError(
                        "Expected two values to be contained in the input 'steps', received {}."
                        .format(len(self.steps)))
            elif isinstance(self.steps, (int, float)):
                step_h = step_w = self.steps

        offset_h = None
        offset_w = None
        if self.offsets is None:
            offset_h = offset_w = 0.5
        else:
            if isinstance(self.offsets, (list, tuple)):
                if len(self.offsets) == 2:
                    offset_h, offset_w = self.offsets
                else:
                    raise ValueError(
                        "Expected two values to be contained in the input 'offsets', received {}."
                        .format(len(self.offsets)))
            elif isinstance(self.offsets, (int, float)):
                offset_h = offset_w = self.offsets

        # we have the offsets and step sizes. Time to generate the grid.
        cy = np.linspace(start=offset_h * step_h,
                         stop=(offset_h + feature_map_h - 1) * step_h,
                         num=feature_map_h)
        cx = np.linspace(start=offset_w * step_w,
                         stop=(offset_w + feature_map_w - 1) * step_w,
                         num=feature_map_w)

        # create the grid and extend the dimensions of each of the axes so we can tile them to get the anchor boxes
        # tensor
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        cx_grid = np.expand_dims(cx_grid, -1)
        cy_grid = np.expand_dims(cy_grid, -1)

        # now create the array that's going to hold the anchor boxes
        boxes_tensor = np.zeros(
            (feature_map_h, feature_map_w, self.n_boxes, 4))

        boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, self.n_boxes))
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, self.n_boxes))
        boxes_tensor[:, :, :, 2] = wh_list[:, 0]
        boxes_tensor[:, :, :, 3] = wh_list[:, 1]

        # normalize coordinates if necessary
        if self.norm_coordinates:
            # convert to corners
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               conversion_type="%s2corners" %
                                               self.coordinates)

            # normalize
            boxes_tensor[:, :, :, [0, 2]] /= self.img_w
            boxes_tensor[:, :, :, [1, 3]] /= self.img_h

            # convert back to centroid format
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               conversion_type="corners2%s" %
                                               self.coordinates)

        # create a tensor to hold the variances
        variances_tensor = np.zeros_like(
            boxes_tensor)  # shape (fm_h, fm_w, n_boxes, 4)
        variances_tensor += self.variances

        boxes_tensor = np.concatenate((boxes_tensor, variances_tensor),
                                      axis=-1)

        # now add a dimension to boxes_tensor. We need to have a copy of the 'current' boxes_tensor for each element
        # in the batch
        boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
        boxes_tensor = K.tile(K.constant(boxes_tensor, dtype="float32"),
                              (K.shape(x)[0], 1, 1, 1, 1))

        return boxes_tensor
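
A small sketch (made-up numbers) of how the per-box `(width, height)` list broadcasts into the last two slots of the boxes tensor built above: every cell of the feature map receives the same set of box widths and heights.

import numpy as np

feature_map_h = feature_map_w = 2                 # hypothetical feature map size
wh_list = np.array([[60.0, 60.0],                 # hypothetical (width, height) per box
                    [84.9, 84.9],
                    [42.4, 84.9],
                    [84.9, 42.4]])
n_boxes = len(wh_list)

boxes_tensor = np.zeros((feature_map_h, feature_map_w, n_boxes, 4))
boxes_tensor[:, :, :, 2] = wh_list[:, 0]          # widths,  broadcast over all cells
boxes_tensor[:, :, :, 3] = wh_list[:, 1]          # heights, broadcast over all cells
print(boxes_tensor[0, 0, :, 2:])                  # the same 4 (w, h) pairs appear in every cell
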
コード例 #21
0
    def call(self, x, mask=None):
        '''
        Return: An anchor box tensor based on the shape of the input tensor.

        This tensor is designed as a constant and does not participate in the training computations.

        Arguments:
            x (tensor): A 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'`
                or `(batch, height, width, channels)` if `dim_ordering = 'tf'`. The input to this layer must be the output of the localization predictor layers.
        '''
        #####################################################
        # Step 1: Compute the box width and height for each aspect ratio
        #####################################################
        # The shorter side of the image is used to compute `w` and `h` using `scale` and `aspect_ratios`.
        size = min(self.img_height, self.img_width)
        # Compute the box widths and heights for all aspect ratios
        wh_list = []
        for ar in self.aspect_ratios:
            if (ar == 1):
                # Compute the regular anchor box for aspect ratio 1.
                box_height = box_width = self.this_scale * size
                wh_list.append((box_width, box_height))
                if self.two_boxes_for_ar1:
                    # Compute a slightly larger version of the anchor box using the geometric mean of this scale and the next scale.
                    box_height = box_width = np.sqrt(
                        self.this_scale * self.next_scale) * size
                    wh_list.append((box_width, box_height))
            else:
                # Trường hợp còn lại box_height = scale/sqrt(aspect ratio); box_width = scale*sqrt(aspect ratio)
                box_height = self.this_scale * size // np.sqrt(ar)
                box_width = int(self.this_scale * size * np.sqrt(ar))
                wh_list.append((box_width, box_height))
        # append vào width height list
        wh_list = np.array(wh_list)

        # Get the shape of the input tensor
        if K.image_data_format() == 'channels_first':
            batch_size, feature_map_channels, feature_map_height, feature_map_width = x.get_shape(
            ).as_list()
        else:
            batch_size, feature_map_height, feature_map_width, feature_map_channels = x.get_shape(
            ).as_list()

        # Compute the grid of box center points. They are identical for all aspect ratios.
        #####################################################
        # Step 2: Compute the step sizes, i.e. how far apart the anchor box center points are horizontally and vertically.
        #####################################################
        if (self.this_steps is None):
            step_height = self.img_height / feature_map_height
            step_width = self.img_width / feature_map_width
        else:
            if isinstance(self.this_steps,
                          (list, tuple)) and (len(self.this_steps) == 2):
                step_height = self.this_steps[0]
                step_width = self.this_steps[1]
            elif isinstance(self.this_steps, (int, float)):
                step_height = self.this_steps
                step_width = self.this_steps
        # Compute the offsets, i.e. where the first anchor box center point lies relative to the top-left corner of the image.
        if (self.this_offsets is None):
            offset_height = 0.5
            offset_width = 0.5
        else:
            if isinstance(self.this_offsets,
                          (list, tuple)) and (len(self.this_offsets) == 2):
                offset_height = self.this_offsets[0]
                offset_width = self.this_offsets[1]
            elif isinstance(self.this_offsets, (int, float)):
                offset_height = self.this_offsets
                offset_width = self.this_offsets
        #####################################################
        # Step 3: Compute the (cx, cy, w, h) coordinates relative to the original image.
        #####################################################
        # Now that we have the offsets and step sizes, compute the grid of anchor box center points.
        cy = np.linspace(offset_height * step_height,
                         (offset_height + feature_map_height - 1) *
                         step_height, feature_map_height)
        cx = np.linspace(offset_width * step_width,
                         (offset_width + feature_map_width - 1) * step_width,
                         feature_map_width)
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        cx_grid = np.expand_dims(cx_grid, -1)
        cy_grid = np.expand_dims(cy_grid, -1)

        # Create a 4D tensor of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        # where the last dimension will contain `(cx, cy, w, h)`
        boxes_tensor = np.zeros(
            (feature_map_height, feature_map_width, self.n_boxes, 4))

        boxes_tensor[:, :, :, 0] = np.tile(cx_grid,
                                           (1, 1, self.n_boxes))  # Set cx
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid,
                                           (1, 1, self.n_boxes))  # Set cy
        boxes_tensor[:, :, :, 2] = wh_list[:, 0]  # Set w
        boxes_tensor[:, :, :, 3] = wh_list[:, 1]  # Set h

        # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='centroids2corners')

        # If `clip_boxes` is True, clip the coordinates to lie within the image boundaries
        if self.clip_boxes:
            x_coords = boxes_tensor[:, :, :, [0, 2]]
            x_coords[x_coords >= self.img_width] = self.img_width - 1
            x_coords[x_coords < 0] = 0
            boxes_tensor[:, :, :, [0, 2]] = x_coords
            y_coords = boxes_tensor[:, :, :, [1, 3]]
            y_coords[y_coords >= self.img_height] = self.img_height - 1
            y_coords[y_coords < 0] = 0
            boxes_tensor[:, :, :, [1, 3]] = y_coords

        # If `normalize_coords` is True, normalize the coordinates to lie within [0, 1]
        if self.normalize_coords:
            boxes_tensor[:, :, :, [0, 2]] /= self.img_width
            boxes_tensor[:, :, :, [1, 3]] /= self.img_height

        if self.coords == 'centroids':
            # Convert `(xmin, ymin, xmax, ymax)` to `(cx, cy, w, h)`.
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2centroids',
                                               border_pixels='half')
        elif self.coords == 'minmax':
            # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
            boxes_tensor = convert_coordinates(boxes_tensor,
                                               start_index=0,
                                               conversion='corners2minmax',
                                               border_pixels='half')

        # Create a tensor to contain the variances and append it to `boxes_tensor`.
        variances_tensor = np.zeros_like(
            boxes_tensor
        )  # shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        variances_tensor += self.variances  # Broadcast the variance values
        # Now `boxes_tensor` becomes a tensor of shape `(feature_map_height, feature_map_width, n_boxes, 8)`
        boxes_tensor = np.concatenate((boxes_tensor, variances_tensor),
                                      axis=-1)

        # Now prepend one dimension to `boxes_tensor` to account for the batch size and tile copies along
        # that dimension (see `K.tile`), yielding a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)`
        boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
        boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'),
                              (K.shape(x)[0], 1, 1, 1, 1))

        return boxes_tensor
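
The snippet below is a minimal standalone sketch of the anchor-center grid construction above, using made-up numbers (a hypothetical 3x3 feature map, a 100-pixel step size, 0.5 offsets, and two anchor shapes; none of these values come from the layer itself):

import numpy as np

# Hypothetical feature map geometry and anchor shapes, for illustration only.
feature_map_height, feature_map_width = 3, 3
step_height, step_width = 100.0, 100.0
offset_height, offset_width = 0.5, 0.5
wh_list = np.array([[60.0, 60.0], [90.0, 45.0]])  # two (w, h) pairs
n_boxes = len(wh_list)

# Grid of anchor box center points, built the same way as in the layer above.
cy = np.linspace(offset_height * step_height,
                 (offset_height + feature_map_height - 1) * step_height,
                 feature_map_height)        # [ 50. 150. 250.]
cx = np.linspace(offset_width * step_width,
                 (offset_width + feature_map_width - 1) * step_width,
                 feature_map_width)         # [ 50. 150. 250.]
cx_grid, cy_grid = np.meshgrid(cx, cy)      # each of shape (3, 3)
cx_grid = np.expand_dims(cx_grid, -1)       # (3, 3, 1)
cy_grid = np.expand_dims(cy_grid, -1)

boxes = np.zeros((feature_map_height, feature_map_width, n_boxes, 4))
boxes[:, :, :, 0] = np.tile(cx_grid, (1, 1, n_boxes))  # cx, shared by all anchor shapes
boxes[:, :, :, 1] = np.tile(cy_grid, (1, 1, n_boxes))  # cy
boxes[:, :, :, 2] = wh_list[:, 0]                      # w, one per anchor shape
boxes[:, :, :, 3] = wh_list[:, 1]                      # h

print(boxes[0, 0])  # the two anchors centered at (50, 50): [[50. 50. 60. 60.] [50. 50. 90. 45.]]
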
コード例 #22
0
def decode_detections(y_pred,
                      confidence_thresh=0.01,
                      iou_threshold=0.45,
                      top_k=200,
                      input_coords='centroids',
                      normalize_coords=True,
                      img_height=None,
                      img_width=None,
                      border_pixels='half'):
    '''
    Convert the model's raw output into a format that contains only the positive (non-background)
    box predictions, i.e. the same format as the input to `SSDInputEncoder`.

    After decoding, two filtering stages are applied per class:
    1. confidence thresholding to discard boxes with low confidence, 2. non-maximum suppression.
    Once all classes have been processed, the results are concatenated and the `top_k` boxes with
    the highest confidence constitute the final result for each image.
    This implementation follows the same procedure as the original Caffe version. For a more
    optimized implementation, see `decode_detections_fast()`, which performs the steps over all
    classes at once rather than per class.

    Arguments:
        y_pred (array): The output of the SSD model, expected to be a Numpy array of shape
            `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of boxes the
            model predicts per image. The last axis contains
            `[one-hot encoded class scores, the 4 predicted relative box coordinates, the 4 anchor box coordinates, the 4 variances]`.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence a box
            must have in order to be kept for the non-maximum suppression stage. A lower value means more
            boxes enter the non-maximum suppression stage.
        iou_threshold (float, optional): A float in [0,1]. All boxes whose IoU with the locally highest-scoring
            box exceeds `iou_threshold` are removed.
        top_k (int, optional): The number of highest-scoring boxes to keep per image after the
            non-maximum suppression stage.
        input_coords (str, optional): The coordinate format the model outputs. Can be 'centroids'
            `(cx, cy, w, h)`, 'minmax' `(xmin, xmax, ymin, ymax)`, or 'corners' `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (values in
            [0,1]) and you want them converted back to absolute coordinates. Set to `False` if the model outputs
            relative coordinates and you do not want them converted, or if the model already outputs absolute
            coordinates. If set to `True`, `img_height` and `img_width` must also be set.
        img_height (int, optional): The height of the input images. Required if `normalize_coords` is `True`.
        img_width (int, optional): The width of the input images. Required if `normalize_coords` is `True`.
        border_pixels (str, optional): How to treat the border pixels of bounding boxes. Can be 'include',
            'exclude', or 'half'.

    Returns:
        A python list of length `batch_size` where each list element represents the predicted boxes
        for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
        a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
    '''
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError(
            "If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`"
            .format(img_height, img_width))

    # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates

    y_pred_decoded_raw = np.copy(
        y_pred[:, :, :-8]
    )  # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`

    if input_coords == 'centroids':
        y_pred_decoded_raw[:, :, [-2, -1]] = np.exp(
            y_pred_decoded_raw[:, :, [-2, -1]] * y_pred[:, :, [-2, -1]]
        )  # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
        y_pred_decoded_raw[:, :, [-2, -1]] *= y_pred[:, :, [
            -6, -5
        ]]  # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
        y_pred_decoded_raw[:, :, [
            -4, -3
        ]] *= y_pred[:, :, [-4, -3]] * y_pred[:, :, [
            -6, -5
        ]]  # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
        y_pred_decoded_raw[:, :, [-4, -3]] += y_pred[:, :, [
            -8, -7
        ]]  # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
        y_pred_decoded_raw = convert_coordinates(
            y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        y_pred_decoded_raw[:, :,
                           -4:] *= y_pred[:, :,
                                          -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:, :, [-4, -3]] *= np.expand_dims(
            y_pred[:, :, -7] - y_pred[:, :, -8], axis=-1
        )  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:, :, [-2, -1]] *= np.expand_dims(
            y_pred[:, :, -5] - y_pred[:, :, -6], axis=-1
        )  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:, :,
                           -4:] += y_pred[:, :, -8:
                                          -4]  # delta(pred) + anchor == pred for all four coordinates
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw,
                                                 start_index=-4,
                                                 conversion='minmax2corners')
    elif input_coords == 'corners':
        y_pred_decoded_raw[:, :,
                           -4:] *= y_pred[:, :,
                                          -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:, :, [-4, -2]] *= np.expand_dims(
            y_pred[:, :, -6] - y_pred[:, :, -8], axis=-1
        )  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:, :, [-3, -1]] *= np.expand_dims(
            y_pred[:, :, -5] - y_pred[:, :, -7], axis=-1
        )  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:, :,
                           -4:] += y_pred[:, :, -8:
                                          -4]  # delta(pred) + anchor == pred for all four coordinates
    else:
        raise ValueError(
            "Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'."
        )
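    # A worked numeric sketch of the 'centroids' branch above, with illustrative numbers that do not
    # come from any model: anchor `(cx, cy, w, h) = (0.5, 0.5, 0.2, 0.2)`, variances `(0.1, 0.1, 0.2, 0.2)`,
    # raw offsets `(dcx, dcy, dw, dh) = (0.5, -0.5, 0.0, 0.3466)`:
    #   w(pred)  = exp(0.0 * 0.2) * 0.2       = 0.2
    #   h(pred)  = exp(0.3466 * 0.2) * 0.2    ~ 0.2144
    #   cx(pred) = 0.5 * 0.1 * 0.2 + 0.5      = 0.51
    #   cy(pred) = -0.5 * 0.1 * 0.2 + 0.5     = 0.49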

    # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that

    if normalize_coords:
        y_pred_decoded_raw[:, :, [
            -4, -2
        ]] *= img_width  # Convert xmin, xmax back to absolute coordinates
        y_pred_decoded_raw[:, :, [
            -3, -1
        ]] *= img_height  # Convert ymin, ymax back to absolute coordinates

    # 3: Apply confidence thresholding and non-maximum suppression per class

    n_classes = y_pred_decoded_raw.shape[
        -1] - 4  # The number of classes is the length of the last axis minus the four box coordinates

    y_pred_decoded = []  # Store the final predictions in this list
    for batch_item in y_pred_decoded_raw:  # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`
        pred = []  # Store the final predictions for this batch item here
        for class_id in range(
                1, n_classes
        ):  # For each class except the background class (which has class ID 0)...
            single_class = batch_item[:, [
                class_id, -4, -3, -2, -1
            ]]  # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 5]` and...
            threshold_met = single_class[
                single_class[:, 0] >
                confidence_thresh]  # ...keep only those boxes with a confidence above the set threshold.
            if threshold_met.shape[0] > 0:  # If any boxes made the threshold...
                maxima = _greedy_nms(
                    threshold_met,
                    iou_threshold=iou_threshold,
                    coords='corners',
                    border_pixels=border_pixels)  # ...perform NMS on them.
                maxima_output = np.zeros(
                    (maxima.shape[0], maxima.shape[1] + 1)
                )  # Expand the last dimension by one element to have room for the class ID. This is now an arrray of shape `[n_boxes, 6]`
                maxima_output[:,
                              0] = class_id  # Write the class ID to the first column...
                maxima_output[:,
                              1:] = maxima  # ...and write the maxima to the other columns...
                pred.append(
                    maxima_output
                )  # ...and append the maxima for this class to the list of maxima for this batch item.
        # Once we're through with all classes, keep only the `top_k` maxima with the highest scores
        if pred:  # If there are any predictions left after confidence-thresholding...
            pred = np.concatenate(pred, axis=0)
            if top_k != 'all' and pred.shape[
                    0] > top_k:  # If we have more than `top_k` results left at this point, otherwise there is nothing to filter,...
                top_k_indices = np.argpartition(
                    pred[:, 1], kth=pred.shape[0] - top_k, axis=0
                )[pred.shape[0] -
                  top_k:]  # ...get the indices of the `top_k` highest-score maxima...
                pred = pred[
                    top_k_indices]  # ...and keep only those entries of `pred`...
        else:
            pred = np.array(
                pred)  # Even if empty, `pred` must become a Numpy array.
        y_pred_decoded.append(
            pred
        )  # ...and now that we're done, append the array of final predictions for this batch item to the output list

    return y_pred_decoded
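
A minimal usage sketch of `decode_detections` follows; the trained model `model`, the preprocessed image batch `X`, and the 300x300 input size are assumptions made for illustration, not part of the function above:

# Minimal usage sketch; `model`, `X`, and the 300x300 input size are illustrative assumptions.
y_pred = model.predict(X)  # shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`

y_pred_decoded = decode_detections(y_pred,
                                   confidence_thresh=0.5,
                                   iou_threshold=0.45,
                                   top_k=200,
                                   input_coords='centroids',
                                   normalize_coords=True,
                                   img_height=300,
                                   img_width=300)

# One array of shape `(n_kept_boxes, 6)` per image: `[class_id, confidence, xmin, ymin, xmax, ymax]`
for boxes in y_pred_decoded:
    for class_id, confidence, xmin, ymin, xmax, ymax in boxes:
        print(int(class_id), round(float(confidence), 2), xmin, ymin, xmax, ymax)

A higher `confidence_thresh` such as 0.5 is typical for visualization; the low 0.01 default keeps far more boxes, which mainly matters for mAP-style evaluation.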