def _crop_and_resize_masks(masks, crop_boxes, num_instances, crop_mask_size):
    """Crops one box out of each instance mask and resizes it to a square.

    Args:
      masks: a tensor of instance masks, one per box, without channel axis.
      crop_boxes: the box corners used as crop windows, one row per mask.
      num_instances: a scalar int tensor, the number of masks/boxes.
      crop_mask_size: an integer, the side length of the output patches.

    Returns:
      The cropped masks, with the temporary channel axis removed again.
    """
    masks_with_channel = tf.expand_dims(masks, axis=-1)
    cropped = tf.image.crop_and_resize(
        image=masks_with_channel,
        boxes=crop_boxes,
        box_indices=tf.range(num_instances, dtype=tf.int32),
        crop_size=[crop_mask_size, crop_mask_size],
        method='bilinear')
    return tf.squeeze(cropped, axis=-1)


def resize_crop_pad(image,
                    desired_output_size,
                    stride,
                    aug_scale_min=1.0,
                    aug_scale_max=1.0,
                    boxes=None,
                    classes=None,
                    masks=None,
                    crop_mask_size=112):
    """Resizes, optionally jitter-crops, and pads an image and its labels.

    The processing steps are:
      1. Compute the aspect-preserving scale
         `min(desired_height / height, desired_width / width)` and the
         resulting "desired scaled size".
      2. If either `aug_scale_min` or `aug_scale_max` differs from 1.0
         (scale jittering), multiply the scale by a random factor drawn
         between the two, resize the image to that jittered size, and cut
         a randomly positioned window of the desired scaled size out of
         it. Otherwise simply resize to the desired scaled size.
      3. Pad the result (anchored at the top-left corner) so each side
         equals `desired_output_size` rounded up to a multiple of `stride`.

    Args:
      image: an image tensor of shape [original_height, original_width, 3].
      desired_output_size: a tuple of two integers (height, width). The
        actual output size is this rounded up to a multiple of `stride`.
      stride: the stride of the backbone network; each output side is a
        multiple of it.
      aug_scale_min: lower bound of the random jittering factor.
      aug_scale_max: upper bound of the random jittering factor.
      boxes: (Optional) a tensor of shape [num_boxes, 4] holding box
        corners in normalized coordinates. Requires `classes`.
      classes: (Optional) a tensor of shape [num_boxes] with box classes.
      masks: (Optional) a tensor of shape
        [num_boxes, image_height, image_width] with instance masks. Masks
        are only processed when `boxes` is given as well.
      crop_mask_size: an integer, the side length of the cropped masks.

    Returns:
      image: the resized (and cropped) image, padded to
        [padded_height, padded_width, 3].
      image_info: a tensor of shape [5] holding
        [desired_scaled_height, desired_scaled_width, 1.0 / scale,
        input_height, input_width], where `scale` includes the jittering
        factor when jittering is active.
      boxes: None, or the boxes in absolute coordinates w.r.t. the scaled
        image. With jittering they are additionally shifted by the crop
        offset, clipped to the crop window, and empty boxes are dropped.
      classes: None, or the classes (filtered like `boxes` when
        jittering).
      masks: None, or the masks. When `boxes` is given they are cropped to
        their boxes and resized to [crop_mask_size, crop_mask_size].
    """
    if boxes is not None:
        assert classes is not None

    image_shape = tf.shape(image)
    input_height = tf.cast(image_shape[0], dtype=tf.float32)
    input_width = tf.cast(image_shape[1], dtype=tf.float32)
    desired_height, desired_width = desired_output_size

    # Largest scale for which the scaled image still fits inside the
    # desired output rectangle.
    height_scale = desired_height / input_height
    width_scale = desired_width / input_width
    scale = tf.minimum(height_scale, width_scale)
    fitted_height = scale * input_height
    fitted_width = scale * input_width
    desired_scaled_size = tf.stack([fitted_height, fitted_width], axis=0)

    random_jittering = not (aug_scale_min == 1.0 and aug_scale_max == 1.0)
    if random_jittering:
        random_scale = tf.random_uniform([], aug_scale_min, aug_scale_max)
        scale = random_scale * scale
        scaled_size = tf.round(random_scale * desired_scaled_size)
    else:
        scaled_size = desired_scaled_size
    scaled_size_int = tf.cast(scaled_size, dtype=tf.int32)
    desired_scaled_size_int = tf.cast(desired_scaled_size, dtype=tf.int32)

    image = tf.image.resize_images(
        image, scaled_size_int, method=tf.image.ResizeMethod.BILINEAR)

    if boxes is not None:
        normalized_boxes = boxes
        # Normalized coordinates -> absolute coordinates in the scaled image.
        boxes = boxes * tf.tile(tf.expand_dims(scaled_size, axis=0), [1, 2])

        if masks is not None and not random_jittering:
            masks = _crop_and_resize_masks(
                masks, normalized_boxes, tf.shape(boxes)[0], crop_mask_size)

    if random_jittering:
        # The crop window can only move when the jittered image is larger
        # than the window; negative slack is treated as zero.
        slack = scaled_size - desired_scaled_size
        slack = tf.where(tf.less(slack, 0), tf.zeros_like(slack), slack)
        offset = tf.cast(
            slack * tf.random_uniform((2,), 0, 1), dtype=tf.int32)
        top = offset[0]
        left = offset[1]
        image = image[
            top:top + desired_scaled_size_int[0],
            left:left + desired_scaled_size_int[1],
            :]

        if boxes is not None:
            box_offsets = tf.cast(
                tf.tile(tf.expand_dims(offset, axis=0), [1, 2]),
                dtype=tf.float32)
            boxes -= box_offsets
            boxes = box_utils.clip_boxes(
                boxes, desired_scaled_size_int[0], desired_scaled_size_int[1])

            # Keep only boxes that still have positive height and width.
            has_height = tf.greater(boxes[:, 2] - boxes[:, 0], 0)
            has_width = tf.greater(boxes[:, 3] - boxes[:, 1], 0)
            indices = tf.where(tf.logical_and(has_height, has_width))[:, 0]
            boxes = tf.gather(boxes, indices)
            classes = tf.gather(classes, indices)

            if masks is not None:
                masks = tf.gather(masks, indices)
                # Undo the offset and the scaling so the clipped boxes are
                # normalized w.r.t. the original image again; the masks
                # are still at the original resolution.
                cropped_boxes = boxes + box_offsets
                cropped_boxes /= tf.tile(
                    tf.expand_dims(scaled_size, axis=0), [1, 2])
                masks = _crop_and_resize_masks(
                    masks, cropped_boxes, tf.shape(boxes)[0], crop_mask_size)

    # Pad so that both sides become the desired size rounded up to the
    # next multiple of the stride.
    padded_height = int(math.ceil(desired_height * 1.0 / stride) * stride)
    padded_width = int(math.ceil(desired_width * 1.0 / stride) * stride)
    image = tf.image.pad_to_bounding_box(
        image, 0, 0, padded_height, padded_width)
    image.set_shape([padded_height, padded_width, 3])

    # desired_scaled_size is reported as the actual image size; pixels
    # beyond it come from padding.
    image_info = tf.stack([
        desired_scaled_size[0],
        desired_scaled_size[1],
        1.0 / scale,
        input_height,
        input_width,
    ])
    return image, image_info, boxes, classes, masks
def generate_detections_gpu(class_outputs,
                            box_outputs,
                            anchor_boxes,
                            image_info,
                            pre_nms_num_detections=1000,
                            post_nms_num_detections=100,
                            nms_threshold=0.3,
                            bbox_reg_weights=(10., 10., 5., 5.)):
    """Turns batched model outputs into final detections (GPU version).

    The class outputs are passed through softmax, the background column
    (index 0) is dropped, the class-specific box regressions are decoded
    against the anchors and clipped to the image, and class-wise NMS is run
    with `tf.image.combined_non_max_suppression`.

    Args:
      class_outputs: a tensor with shape [batch_size, N, num_classes]
        holding the raw class scores for all N anchors. The shape must be
        statically known.
      box_outputs: a tensor that is reshaped to
        [batch_size, N, num_classes, 4], i.e. class-specific box
        regression outputs.
      anchor_boxes: a tensor with shape [batch_size, N, 4] holding the
        anchors.
      image_info: a tensor of shape [batch_size, 5]; column 0 is used as
        the image height and column 1 as the image width.
      pre_nms_num_detections: an integer, passed to NMS as the maximum
        output size per class.
      post_nms_num_detections: an integer, passed to NMS as the maximum
        total number of detections.
      nms_threshold: a float, the IOU threshold of NMS.
      bbox_reg_weights: 4 float scalars, the weights on (dx, dy, dw, dh)
        used when decoding the boxes.

    Returns:
      A tuple `(num_valid_boxes, boxes, classes, scores)` as produced by
      NMS: boxes are converted back to absolute coordinates and classes are
      shifted by one (to account for the removed background) and cast to
      float.
    """
    with tf.name_scope('generate_detections'):
        batch_size, num_boxes, num_classes = (
            class_outputs.get_shape().as_list())
        num_foreground = num_classes - 1
        num_detections = num_boxes * num_foreground

        # Probabilities and regressions without the background entry.
        probabilities = tf.nn.softmax(class_outputs)
        scores = tf.slice(probabilities, [0, 0, 1], [-1, -1, -1])
        class_specific_boxes = tf.reshape(
            box_outputs, [batch_size, num_boxes, num_classes, 4])
        boxes = tf.slice(
            class_specific_boxes, [0, 0, 1, 0], [-1, -1, -1, -1])

        # Repeat every anchor once per foreground class via broadcasting.
        repeated_anchors = (
            tf.expand_dims(anchor_boxes, axis=2) *
            tf.ones([1, 1, num_foreground, 1]))

        boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
        scores = tf.reshape(scores, [batch_size, num_detections, 1])
        repeated_anchors = tf.reshape(
            repeated_anchors, [batch_size, num_detections, 4])

        # Decode the regressions against the anchors.
        boxes = box_utils.decode_boxes(
            boxes, repeated_anchors, bbox_reg_weights)

        # Clip to the image; height/width get shape [batch_size, 1, 1].
        height = tf.expand_dims(image_info[:, 0:1], axis=-1)
        width = tf.expand_dims(image_info[:, 1:2], axis=-1)
        boxes = box_utils.clip_boxes(boxes, height, width)

        # NMS operates on normalized coordinates, one box set per class.
        nms_boxes = box_utils.to_normalized_coordinates(boxes, height, width)
        nms_boxes = tf.reshape(
            nms_boxes, [batch_size, num_boxes, num_foreground, 4])
        nms_scores = tf.reshape(
            scores, [batch_size, num_boxes, num_foreground])
        nms_result = tf.image.combined_non_max_suppression(
            nms_boxes,
            nms_scores,
            max_output_size_per_class=pre_nms_num_detections,
            max_total_size=post_nms_num_detections,
            iou_threshold=nms_threshold,
            score_threshold=0.0,
            pad_per_class=False)
        (post_nms_boxes, post_nms_scores, post_nms_classes,
         post_nms_num_valid_boxes) = nms_result

        # Class 0 of the NMS output is class 1 of the model.
        post_nms_classes = post_nms_classes + 1
        post_nms_boxes = box_utils.to_absolute_coordinates(
            post_nms_boxes, height, width)
        return (post_nms_num_valid_boxes, post_nms_boxes,
                tf.to_float(post_nms_classes), post_nms_scores)
def generate_detections_per_image_tpu(cls_outputs,
                                      box_outputs,
                                      anchor_boxes,
                                      image_info,
                                      pre_nms_num_detections=1000,
                                      post_nms_num_detections=100,
                                      nms_threshold=0.3,
                                      bbox_reg_weights=(10., 10., 5., 5.)):
    """Generates the final detections for one image from the model outputs.

    The background column is removed, the highest scoring
    (anchor, class) candidates are decoded and clipped, and padded NMS is
    run once per foreground class. The per-class results are merged and the
    best `post_nms_num_detections` are returned.

    Args:
      cls_outputs: a tensor with shape [N, num_classes] holding the class
        scores of all N anchors. The scores must lie in [0, 1] (e.g. the
        output of softmax), because candidates of other classes are masked
        by multiplying their score with zero. The shape must be static.
      box_outputs: a tensor that is reshaped to [N, num_classes, 4], i.e.
        class-specific box regression outputs.
      anchor_boxes: a tensor with shape [N, 4] holding the anchors.
      image_info: a tensor of shape [5]; element 0 is used as the image
        height and element 1 as the image width when clipping.
      pre_nms_num_detections: an integer, the number of candidates kept
        before NMS. It is capped at the number of available candidates.
      post_nms_num_detections: an integer, the number of detections kept
        after NMS (per class and in the merged result).
      nms_threshold: a float, the IOU threshold of NMS.
      bbox_reg_weights: 4 float scalars, the weights on (dx, dy, dw, dh)
        used when decoding the boxes.

    Returns:
      A tuple `(num_valid_boxes, boxes, classes, scores)`:
        num_valid_boxes is the number of returned entries with a positive
        score, boxes/classes/scores hold `post_nms_num_detections` entries
        sorted by descending score, and classes are 1-based (0 being the
        removed background) and cast to float.
    """
    num_boxes, num_classes = cls_outputs.get_shape().as_list()
    num_foreground = num_classes - 1

    # Remove background class scores.
    cls_outputs = cls_outputs[:, 1:num_classes]

    # tf.nn.top_k requires k <= number of elements, so cap it.
    num_candidates = min(pre_nms_num_detections, num_boxes * num_foreground)
    top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
        tf.reshape(cls_outputs, [-1]), k=num_candidates, sorted=False)

    # The flattened index encodes (anchor index, foreground class index).
    classes = tf.mod(top_k_indices_with_classes, num_foreground)
    top_k_indices = tf.floordiv(top_k_indices_with_classes, num_foreground)

    anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
    box_outputs = tf.reshape(
        box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
    box_outputs = tf.gather_nd(
        box_outputs, tf.stack([top_k_indices, classes], axis=1))

    # Apply bounding box regression to the anchors.
    boxes = box_utils.decode_boxes(box_outputs, anchor_boxes, bbox_reg_weights)
    boxes = box_utils.clip_boxes(boxes, image_info[0], image_info[1])

    list_of_all_boxes = []
    list_of_all_scores = []
    list_of_all_classes = []
    # `classes` only takes the values 0 .. num_classes - 2 because the
    # background column was removed above, so iterate over the foreground
    # classes only. Iterating over range(num_classes) would run one NMS
    # for a class that cannot occur and label its padding `num_classes`.
    for class_i in range(num_foreground):
        # Zero out the scores of candidates that belong to other classes.
        # This works because score is in [0, 1].
        class_i_bitmask = tf.cast(
            tf.equal(classes, class_i), top_k_scores.dtype)
        class_i_scores = top_k_scores * class_i_bitmask

        # The TPU and CPU have different behaviors for
        # tf.image.non_max_suppression_padded (b/116754376).
        (class_i_post_nms_indices,
         class_i_nms_num_valid) = tf.image.non_max_suppression_padded(
             tf.to_float(boxes),
             tf.to_float(class_i_scores),
             post_nms_num_detections,
             iou_threshold=nms_threshold,
             score_threshold=0.05,
             pad_to_max_output_size=True,
             name='nms_detections_' + str(class_i))
        class_i_post_nms_boxes = tf.gather(boxes, class_i_post_nms_indices)
        class_i_post_nms_scores = tf.gather(
            class_i_scores, class_i_post_nms_indices)

        # Force the score of the padded (invalid) entries to zero.
        mask = tf.less(
            tf.range(post_nms_num_detections), [class_i_nms_num_valid])
        class_i_post_nms_scores = tf.where(
            mask, class_i_post_nms_scores,
            tf.zeros_like(class_i_post_nms_scores))
        class_i_classes = tf.fill(
            tf.shape(class_i_post_nms_scores), class_i + 1)

        list_of_all_boxes.append(class_i_post_nms_boxes)
        list_of_all_scores.append(class_i_post_nms_scores)
        list_of_all_classes.append(class_i_classes)

    post_nms_boxes = tf.concat(list_of_all_boxes, axis=0)
    post_nms_scores = tf.concat(list_of_all_scores, axis=0)
    post_nms_classes = tf.concat(list_of_all_classes, axis=0)

    # Sort all results and keep the best ones across classes.
    post_nms_scores, sorted_indices = tf.nn.top_k(
        tf.to_float(post_nms_scores),
        k=post_nms_num_detections,
        sorted=True)
    post_nms_boxes = tf.gather(post_nms_boxes, sorted_indices)
    post_nms_classes = tf.gather(post_nms_classes, sorted_indices)

    # Entries with a zero score are padding and do not count as valid.
    valid_mask = tf.where(
        tf.greater(post_nms_scores, 0),
        tf.ones_like(post_nms_scores),
        tf.zeros_like(post_nms_scores))
    num_valid_boxes = tf.reduce_sum(valid_mask, axis=-1)
    box_classes = tf.to_float(post_nms_classes)
    return num_valid_boxes, post_nms_boxes, box_classes, post_nms_scores
def _propose_rois_gpu(scores,
                      boxes,
                      anchor_boxes,
                      height,
                      width,
                      scale,
                      rpn_pre_nms_topn,
                      rpn_post_nms_topn,
                      rpn_nms_threshold,
                      rpn_min_size,
                      bbox_reg_weights):
    """Proposes RoIs given a group of candidates (GPU version).

    All candidates are decoded and clipped, optionally filtered by size,
    and then reduced either with `tf.image.combined_non_max_suppression`
    (when `rpn_nms_threshold > 0`) or with a plain top-k selection.

    Args:
      scores: a tensor with a static shape of [batch_size, num_boxes].
      boxes: a tensor with a shape of [batch_size, num_boxes, 4], in the
        encoded form.
      anchor_boxes: the anchors the boxes are decoded against, with a shape
        of [batch_size, num_boxes, 4].
      height: a tensor representing the image height, used for clipping,
        filtering and coordinate normalization.
      width: a tensor representing the image width, used like `height`.
      scale: a tensor representing the image scale, forwarded to
        `box_utils.filter_boxes`.
      rpn_pre_nms_topn: an integer; together with `num_boxes` it bounds the
        number of boxes NMS may output per class.
      rpn_post_nms_topn: an integer, upper bound on the number of returned
        proposals.
      rpn_nms_threshold: a float IOU threshold; NMS is skipped when it is
        not greater than 0.
      rpn_min_size: minimum proposal size; the size filter is skipped when
        it is not greater than 0.
      bbox_reg_weights: the weights forwarded to `box_utils.decode_boxes`.

    Returns:
      scores: the scores of the selected proposals.
      boxes: the selected proposals. After NMS they are converted back to
        absolute coordinates.
    """
    batch_size, num_boxes = scores.get_shape().as_list()
    pre_nms_limit = (
        rpn_pre_nms_topn if rpn_pre_nms_topn < num_boxes else num_boxes)

    decoded = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)
    boxes = box_utils.clip_boxes(decoded, height, width)

    if rpn_min_size > 0.0:
        # filter_boxes is fed scores with a trailing unit axis, which is
        # removed again afterwards.
        boxes, scores = box_utils.filter_boxes(
            boxes, tf.expand_dims(scores, axis=-1), rpn_min_size, height,
            width, scale)
        scores = tf.squeeze(scores, axis=-1)

    post_nms_limit = min(rpn_post_nms_topn, pre_nms_limit)

    if not rpn_nms_threshold > 0:
        # No NMS requested: just keep the best scoring boxes.
        scores, top_boxes = box_utils.top_k(
            scores, k=post_nms_limit, boxes_list=[boxes])
        return scores, top_boxes[0]

    # Normalize coordinates as combined_non_max_suppression currently
    # only support normalized coordinates. All boxes form a single class.
    normalized = box_utils.to_normalized_coordinates(boxes, height, width)
    nms_boxes = tf.reshape(normalized, [batch_size, num_boxes, 1, 4])
    nms_scores = tf.reshape(scores, [batch_size, num_boxes, 1])
    boxes, scores, _, _ = tf.image.combined_non_max_suppression(
        nms_boxes,
        nms_scores,
        max_output_size_per_class=pre_nms_limit,
        max_total_size=post_nms_limit,
        iou_threshold=rpn_nms_threshold,
        score_threshold=0.0,
        pad_per_class=False)
    boxes = box_utils.to_absolute_coordinates(boxes, height, width)
    return scores, boxes
def _propose_rois_tpu(scores,
                      boxes,
                      anchor_boxes,
                      height,
                      width,
                      scale,
                      rpn_pre_nms_topn,
                      rpn_post_nms_topn,
                      rpn_nms_threshold,
                      rpn_min_size,
                      bbox_reg_weights):
    """Proposes RoIs given a group of candidates (TPU version).

    The best scoring candidates are selected first, then decoded, clipped
    and size-filtered, optionally reduced with
    `box_utils.sorted_non_max_suppression_padded`, and finally cut down to
    the best proposals with another top-k selection.

    Args:
      scores: a tensor with a static shape of [batch_size, num_boxes].
      boxes: a tensor with a shape of [batch_size, num_boxes, 4], in the
        encoded form.
      anchor_boxes: the anchors the boxes are decoded against, with a shape
        of [batch_size, num_boxes, 4].
      height: a tensor representing the image height, used for clipping
        and filtering.
      width: a tensor representing the image width, used like `height`.
      scale: a tensor representing the image scale, forwarded to
        `box_utils.filter_boxes`.
      rpn_pre_nms_topn: an integer, the number of top scoring candidates
        kept before NMS (capped at `num_boxes`).
      rpn_post_nms_topn: an integer, upper bound on the number of returned
        proposals.
      rpn_nms_threshold: a float IOU threshold; NMS is skipped when it is
        not greater than 0.
      rpn_min_size: the minimum proposal size forwarded to
        `box_utils.filter_boxes`.
      bbox_reg_weights: the weights forwarded to `box_utils.decode_boxes`.

    Returns:
      scores: the scores of the selected proposals.
      boxes: the selected proposals, decoded and clipped to the image.
    """
    _, num_boxes = scores.get_shape().as_list()
    pre_nms_limit = min(rpn_pre_nms_topn, num_boxes)
    post_nms_limit = min(rpn_post_nms_topn, pre_nms_limit)

    # Keep the best candidates together with their anchors.
    scores, selected = box_utils.top_k(
        scores, k=pre_nms_limit, boxes_list=[boxes, anchor_boxes])
    encoded_boxes = selected[0]
    selected_anchors = selected[1]

    # Decode boxes w.r.t. anchors and transform to the absolute coordinates.
    boxes = box_utils.decode_boxes(
        encoded_boxes, selected_anchors, bbox_reg_weights)

    # Clip boxes that exceed the boundary.
    boxes = box_utils.clip_boxes(boxes, height, width)

    # Filter boxes that one side is less than rpn_min_size threshold.
    # filter_boxes is fed scores with a trailing unit axis, which is
    # removed again afterwards.
    boxes, scores = box_utils.filter_boxes(
        boxes, tf.expand_dims(scores, axis=-1), rpn_min_size, height, width,
        scale)
    scores = tf.squeeze(scores, axis=-1)

    # NMS.
    if rpn_nms_threshold > 0:
        scores, boxes = box_utils.sorted_non_max_suppression_padded(
            scores,
            boxes,
            max_output_size=post_nms_limit,
            iou_threshold=rpn_nms_threshold)

    # Pick top-K post NMS'ed boxes.
    scores, top_boxes = box_utils.top_k(
        scores, k=post_nms_limit, boxes_list=[boxes])
    return scores, top_boxes[0]
def generate_detections_per_image_op(cls_outputs,
                                     box_outputs,
                                     anchor_boxes,
                                     image_id,
                                     image_info,
                                     num_detections=100,
                                     pre_nms_num_detections=1000,
                                     nms_threshold=0.3,
                                     bbox_reg_weights=(10., 10., 5., 5.)):
    """Generates detections for one image with model outputs and anchors.

    The background column is removed, the highest scoring
    (anchor, class) candidates are decoded and clipped, and padded NMS is
    run once per foreground class. The per-class results are merged, the
    best `num_detections` are kept and packed into one tensor.

    Args:
      cls_outputs: a Tensor with shape [N, num_classes] holding the class
        scores of all N anchors. The scores must lie in [0, 1] (e.g. the
        output of softmax), because candidates of other classes are masked
        by multiplying their score with zero. The shape must be static.
      box_outputs: a Tensor with shape [N, 4] or [N, num_classes*4] holding
        the box regression outputs. The second form is treated as
        class-specific box regression.
      anchor_boxes: a Tensor with shape [N, 4] holding the anchors.
      image_id: the image id, either a Python int or a tensor that can be
        reshaped to a scalar.
      image_info: a tensor of shape [5]; its first two elements are passed
        to `box_utils.clip_boxes` as the clipping bounds.
      num_detections: number of detections kept after NMS (per class and
        in the merged result).
      pre_nms_num_detections: number of candidates kept before NMS. It is
        capped at the number of available candidates.
      nms_threshold: a float, the IOU threshold of NMS.
      bbox_reg_weights: 4 float scalars, the weights on (dx, dy, dw, dh)
        used when decoding the boxes.

    Returns:
      detections: a tensor with `num_detections` rows sorted by descending
        score, each row being
        [image_id, box[0], box[1], box[2], box[3], score, class]. Classes
        are 1-based (0 being the removed background); padded rows have a
        score of zero.
    """
    num_boxes, num_classes = cls_outputs.get_shape().as_list()
    _, num_box_predictions = box_outputs.get_shape().as_list()
    num_foreground = num_classes - 1
    use_class_specific_box_regression = (
        num_box_predictions == num_classes * 4)

    # Remove background class scores.
    cls_outputs = cls_outputs[:, 1:num_classes]

    # tf.nn.top_k requires k <= number of elements, so cap it.
    num_candidates = min(pre_nms_num_detections, num_boxes * num_foreground)
    top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
        tf.reshape(cls_outputs, [-1]), k=num_candidates, sorted=False)

    # The flattened index encodes (anchor index, foreground class index).
    classes = tf.mod(top_k_indices_with_classes, num_foreground)
    top_k_indices = tf.floordiv(top_k_indices_with_classes, num_foreground)

    anchor_boxes = tf.gather(anchor_boxes, top_k_indices)

    if use_class_specific_box_regression:
        box_outputs = tf.reshape(
            box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
        class_indices = classes
    else:
        # A single class-agnostic regression per anchor.
        box_outputs = tf.reshape(box_outputs, [num_boxes, 1, 4])
        class_indices = tf.zeros_like(top_k_indices)
    box_outputs = tf.gather_nd(
        box_outputs, tf.stack([top_k_indices, class_indices], axis=1))

    # Apply bounding box regression to the anchors. The box_utils ops used
    # here are called with a leading batch axis, so add and strip one.
    boxes = box_utils.batch_decode_box_outputs_op(
        tf.expand_dims(anchor_boxes, axis=0),
        tf.expand_dims(box_outputs, axis=0),
        bbox_reg_weights)[0]
    boxes = box_utils.clip_boxes(
        tf.expand_dims(boxes, axis=0),
        tf.expand_dims(image_info[:2], axis=0))[0]

    list_of_all_boxes = []
    list_of_all_scores = []
    list_of_all_classes = []
    # `classes` only takes the values 0 .. num_classes - 2 because the
    # background column was removed above, so iterate over the foreground
    # classes only. Iterating over range(num_classes) would run one NMS
    # for a class that cannot occur and label its padding `num_classes`.
    for class_i in range(num_foreground):
        # Zero out the scores of candidates that belong to other classes.
        # This works because score is in [0, 1].
        class_i_bitmask = tf.cast(
            tf.equal(classes, class_i), top_k_scores.dtype)
        class_i_scores = top_k_scores * class_i_bitmask

        # The TPU and CPU have different behaviors for
        # tf.image.non_max_suppression_padded (b/116754376).
        (class_i_post_nms_indices,
         class_i_nms_num_valid) = tf.image.non_max_suppression_padded(
             tf.to_float(boxes),
             tf.to_float(class_i_scores),
             num_detections,
             iou_threshold=nms_threshold,
             score_threshold=0.05,
             pad_to_max_output_size=True,
             name='nms_detections_' + str(class_i))
        class_i_post_nms_boxes = tf.gather(boxes, class_i_post_nms_indices)
        class_i_post_nms_scores = tf.gather(
            class_i_scores, class_i_post_nms_indices)

        # Force the score of the padded (invalid) entries to zero.
        mask = tf.less(tf.range(num_detections), [class_i_nms_num_valid])
        class_i_post_nms_scores = tf.where(
            mask, class_i_post_nms_scores,
            tf.zeros_like(class_i_post_nms_scores))
        class_i_classes = tf.fill(
            tf.shape(class_i_post_nms_scores), class_i + 1)

        list_of_all_boxes.append(class_i_post_nms_boxes)
        list_of_all_scores.append(class_i_post_nms_scores)
        list_of_all_classes.append(class_i_classes)

    post_nms_boxes = tf.concat(list_of_all_boxes, axis=0)
    post_nms_scores = tf.concat(list_of_all_scores, axis=0)
    post_nms_classes = tf.concat(list_of_all_classes, axis=0)

    # Sort all results and keep the best ones across classes.
    post_nms_scores, sorted_indices = tf.nn.top_k(
        tf.to_float(post_nms_scores), k=num_detections, sorted=True)
    post_nms_boxes = tf.gather(post_nms_boxes, sorted_indices)
    post_nms_classes = tf.gather(post_nms_classes, sorted_indices)

    if isinstance(image_id, int):
        image_id = tf.constant(image_id)
    image_id = tf.reshape(image_id, [])

    detections_result = tf.stack(
        [
            tf.to_float(tf.fill(tf.shape(post_nms_scores), image_id)),
            post_nms_boxes[:, 0],
            post_nms_boxes[:, 1],
            post_nms_boxes[:, 2],
            post_nms_boxes[:, 3],
            post_nms_scores,
            tf.to_float(post_nms_classes),
        ],
        axis=1)
    return detections_result
def _proposal_op_per_level(scores,
                           boxes,
                           anchor_boxes,
                           image_info,
                           rpn_pre_nms_topn,
                           rpn_post_nms_topn,
                           rpn_nms_threshold,
                           rpn_min_size,
                           level):
    """Proposes RoIs of one feature level for the second stage nets.

    The op performs the following steps, in this order:
      1. flatten the per-location scores, boxes and anchors and map the
         scores to [0, 1] with a sigmoid
      2. keep the top `rpn_pre_nms_topn` candidates by score
      3. decode the kept box outputs against their anchors
      4. clip the decoded boxes to the image
      5. filter the boxes with `rpn_min_size`
      6. apply NMS when `rpn_nms_threshold > 0`
      7. keep the top proposals after NMS and return them

    Reference:
    https://github.com/facebookresearch/Detectron/blob/master/detectron/ops/generate_proposals.py  # pylint: disable=line-too-long

    Args:
      scores: a tensor with a static shape of
        [batch_size, height, width, num_anchors].
      boxes: a tensor that is reshaped to [batch_size, -1, 4], holding the
        boxes in the encoded form.
      anchor_boxes: the anchors, reshaped to [batch_size, -1, 4].
      image_info: a tensor of shape [batch_size, 5]. Its first two columns
        are passed to `box_utils.clip_boxes` as the clipping bounds and the
        whole tensor is passed to `box_utils.filter_boxes`.
      rpn_pre_nms_topn: an integer, the number of top scoring candidates
        kept before NMS (capped at height * width * num_anchors).
      rpn_post_nms_topn: an integer, upper bound on the number of returned
        proposals.
      rpn_nms_threshold: a float IOU threshold; NMS is skipped when it is
        not greater than 0.
      rpn_min_size: the minimum proposal size forwarded to
        `box_utils.filter_boxes`.
      level: an integer, the level the function operates on; only used to
        name the op scope.

    Returns:
      scores: the scores of the selected proposals.
      boxes: the selected proposals.
    """
    with tf.name_scope('proposal-l%d' % level):
        batch_size, h, w, num_anchors = scores.get_shape().as_list()
        num_candidates = h * w * num_anchors
        pre_nms_limit = min(rpn_pre_nms_topn, num_candidates)
        post_nms_limit = min(rpn_post_nms_topn, pre_nms_limit)

        flat_scores = tf.reshape(scores, [batch_size, -1])
        flat_boxes = tf.reshape(boxes, [batch_size, -1, 4])
        # Map scores to [0, 1] for convenience of setting min score.
        flat_scores = tf.sigmoid(flat_scores)
        flat_anchors = tf.reshape(anchor_boxes, [batch_size, -1, 4])

        # Sort by score and keep the best candidates with their anchors.
        scores, selected = box_utils.top_k(
            flat_scores, k=pre_nms_limit,
            boxes_list=[flat_boxes, flat_anchors])
        encoded_boxes = selected[0]
        selected_anchors = selected[1]

        # Transforms anchors into proposals via bbox transformations.
        boxes = box_utils.batch_decode_box_outputs_op(
            selected_anchors, encoded_boxes)

        # Clip proposals to image (may result in proposals with zero area
        # that will be removed in the next step).
        boxes = box_utils.clip_boxes(boxes, image_info[:, :2])

        # Remove predicted boxes with either height or width < min_size.
        scores, boxes = box_utils.filter_boxes(
            scores, boxes, rpn_min_size, image_info)

        # Apply loose nms (e.g. threshold = 0.7).
        if rpn_nms_threshold > 0:
            scores, boxes = box_utils.sorted_non_max_suppression_padded(
                scores,
                boxes,
                max_output_size=post_nms_limit,
                iou_threshold=rpn_nms_threshold)

        # Take the top proposals after NMS (-> RoIs top).
        scores, top_boxes = box_utils.top_k(
            scores, k=post_nms_limit, boxes_list=[boxes])
        boxes = top_boxes[0]
        return scores, boxes
def _forward_test(self, input):
    """Runs the RPN on CNN features and returns the proposals kept by NMS.

    The RPN boxes are optionally clipped to the image, scored with the
    probability of the positive objectness class, and reduced with
    `box_utils.nms`.

    Args:
        input: the CNN features that are fed to `self.rpn.forward`.

    Returns:
        The RPN boxes (first batch element, in the RPN's own box format)
        selected by NMS, one box per row.

    Raises:
        AssertionError: if the image size is not set or this forward pass
            was already run for the current image size.
    """
    cnn_features = input
    arg = easydict.EasyDict({
        'clip_boxes': self.test_clip_boxes,
        'nms_thresh': self.test_nms_thresh,
        'max_proposals': self.test_max_proposals
    })

    # Make sure that setImageSize has been called
    assert self.image_height and self.image_width and not self._called_forward_size, \
        'Must call setImageSize before each forward pass'
    self._called_forward_size = True

    rpn_out, _ = self.rpn.forward(cnn_features)
    rpn_boxes, rpn_anchors, rpn_trans, rpn_scores = rpn_out
    num_boxes = rpn_boxes.size(1)

    # Maybe clip boxes to image boundary
    if arg.clip_boxes:
        bounds = {
            'x_min': 1,
            'y_min': 1,
            'x_max': self.image_width,
            'y_max': self.image_height
        }
        rpn_boxes, valid = box_utils.clip_boxes(rpn_boxes, bounds, 'xcycwh')

        # Clamp parallel arrays only to valid boxes (not oob of the image)
        rpn_boxes = self.clamp_data(rpn_boxes, valid)
        rpn_anchors = self.clamp_data(rpn_anchors, valid)
        rpn_trans = self.clamp_data(rpn_trans, valid)
        rpn_scores = self.clamp_data(rpn_scores, valid)
        num_boxes = rpn_boxes.size(1)

    # Convert rpn boxes from (xc, yc, w, h) format to (x1, y1, x2, y2)
    rpn_boxes_x1y1x2y2 = box_utils.xcycwh_to_x1y1x2y2(rpn_boxes[0])

    # Convert objectness positive / negative scores to probabilities:
    # exp(pos) / (exp(pos) + exp(neg)).
    rpn_scores_exp = torch.exp(rpn_scores)
    pos_exp = rpn_scores_exp[0, :, 0]
    neg_exp = rpn_scores_exp[0, :, 1]
    scores = (pos_exp + neg_exp).pow(-1) * pos_exp

    # Flip to True locally for debug output.
    verbose = False
    if verbose:
        print('in LocalizationLayer forward_test')
        print('Before NMS there are %d boxes' % num_boxes)
        print('Using NMS threshold %f' % arg.nms_thresh)

    # Run NMS and sort by objectness score. A max_proposals of -1 means
    # "no limit", so the limit argument is omitted in that case.
    boxes_scores = torch.cat((rpn_boxes_x1y1x2y2, scores.view(-1, 1)), dim=1)
    if arg.max_proposals == -1:
        idx = box_utils.nms(boxes_scores.data, arg.nms_thresh)
    else:
        idx = box_utils.nms(boxes_scores.data, arg.nms_thresh,
                            arg.max_proposals)

    # Index the first batch element explicitly. torch.squeeze() would drop
    # every size-1 dimension, so with a single remaining box the box
    # dimension itself would disappear and `idx` would select coordinates
    # instead of box rows.
    rpn_boxes_nms = rpn_boxes[0][idx]

    if verbose:
        print('After NMS there are %d boxes' % rpn_boxes_nms.size(0))

    output = rpn_boxes_nms
    return output
def generate_detections_gpu(class_outputs, box_outputs, anchor_boxes, image_id,
                            image_info, pre_nms_num_detections=1000,
                            post_nms_num_detections=100, nms_threshold=0.3,
                            bbox_reg_weights=(10., 10., 5., 5.)):
    """Generate the final detections given the model outputs (GPU version).

    The static shape of `class_outputs` must be fully defined (including the
    batch dimension) because it is used to build the reshape targets below.

    Args:
        class_outputs: a tensor with shape [batch_size, N, num_classes], which
            stacks class logit outputs on all feature levels. The N is the
            number of total anchors on all levels. The num_classes is the
            number of classes predicted by the model. Note that the
            class_outputs here is the raw score.
        box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
            stacks box regression outputs on all feature levels. The N is the
            number of total anchors on all levels.
        anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks
            anchors on all feature levels. The N is the number of total
            anchors on all levels.
        image_id: a tensor with shape [batch_size] which specifies the image
            id of each image in the batch.
        image_info: a tensor of shape [batch_size, 5] which encodes each
            image's [height, width, scale, original_height, original_width].
            Only the height and width columns are read here.
        pre_nms_num_detections: an integer that specifies the number of
            candidates kept per class by NMS.
        post_nms_num_detections: an integer that specifies the number of
            candidates after NMS.
        nms_threshold: a float number to specify the IOU threshold of NMS.
        bbox_reg_weights: a list of 4 float scalars, which are default weights
            on (dx, dy, dw, dh) for normalizing bbox regression targets.

    Returns:
        detections: a float tensor of [batch_size, post_nms_num_detections, 7],
            which stacks `post_nms_num_detections` number of detection results
            for each image in the batch. Each detection is stored in the
            format of [image_id, ymin, xmin, ymax, xmax, score, class] in the
            last dimension. Rows beyond the number of valid detections are the
            zero padding produced by combined NMS, so their class column reads
            1 after the background offset is added.
    """
    batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
    num_fg_classes = num_classes - 1
    softmax_class_outputs = tf.nn.softmax(class_outputs)

    # Remove background (class index 0) from both scores and box deltas.
    scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
    boxes = tf.slice(
        tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
        [0, 0, 1, 0], [-1, -1, -1, -1])

    # Repeat every anchor once per foreground class so that anchors line up
    # with the per-class box deltas.
    anchor_boxes = tf.tile(
        tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_fg_classes, 1])

    num_detections = num_boxes * num_fg_classes
    boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
    scores = tf.reshape(scores, [batch_size, num_detections, 1])
    anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])

    # Decode
    boxes = box_utils.decode_boxes(boxes, anchor_boxes, bbox_reg_weights)

    # Clip boxes
    height, width = tf.split(
        image_info[:, :2], num_or_size_splits=2, axis=-1)
    height = tf.expand_dims(height, axis=-1)
    width = tf.expand_dims(width, axis=-1)
    boxes = box_utils.clip_boxes(boxes, height, width)

    pre_nms_boxes = tf.reshape(
        boxes, [batch_size, num_boxes, num_fg_classes, 4])
    pre_nms_scores = tf.reshape(
        scores, [batch_size, num_boxes, num_fg_classes])

    # NMS runs on normalized coordinates; boxes are converted back afterwards.
    pre_nms_boxes = box_utils.to_normalized_coordinates(
        pre_nms_boxes, height, width)
    post_nms_boxes, post_nms_scores, post_nms_classes, _ = (
        tf.image.combined_non_max_suppression(
            pre_nms_boxes,
            pre_nms_scores,
            max_output_size_per_class=pre_nms_num_detections,
            max_total_size=post_nms_num_detections,
            iou_threshold=nms_threshold,
            score_threshold=0.0,
            pad_per_class=False))
    # Shift class ids by one to account for the removed background class.
    post_nms_classes = post_nms_classes + 1
    post_nms_boxes = box_utils.to_absolute_coordinates(
        post_nms_boxes, height, width)

    # Assemble [image_id, ymin, xmin, ymax, xmax, score, class] for the whole
    # batch at once. The explicit reshapes pin the static output shape.
    image_ids = tf.tile(
        tf.reshape(tf.cast(image_id, tf.float32), [batch_size, 1, 1]),
        [1, post_nms_num_detections, 1])
    detections = tf.concat(
        [
            image_ids,
            tf.reshape(post_nms_boxes,
                       [batch_size, post_nms_num_detections, 4]),
            tf.reshape(post_nms_scores,
                       [batch_size, post_nms_num_detections, 1]),
            tf.reshape(tf.cast(post_nms_classes, tf.float32),
                       [batch_size, post_nms_num_detections, 1]),
        ],
        axis=-1)
    return detections
def generate_detections_per_image_op(
        cls_outputs, box_outputs, anchor_boxes, image_id, image_info,
        num_detections=100, pre_nms_num_detections=1000, nms_threshold=0.3,
        bbox_reg_weights=(10., 10., 5., 5.), score_threshold=0.05):
    """Generates detections with model outputs and anchors.

    The static shape of `cls_outputs` must be fully defined, and
    N * (num_classes - 1) must be at least `pre_nms_num_detections` because
    `tf.nn.top_k` is asked for that many entries of the flattened scores.

    Args:
        cls_outputs: a Tensor with shape [N, num_classes], which stacks class
            logit outputs on all feature levels. The N is the number of total
            anchors on all levels. The num_classes is the number of classes
            predicted by the model. Note that the cls_outputs should be the
            output of softmax().
        box_outputs: a Tensor with shape [N, num_classes*4], which stacks box
            regression outputs on all feature levels. The N is the number of
            total anchors on all levels.
        anchor_boxes: a Tensor with shape [N, 4], which stacks anchors on all
            feature levels. The N is the number of total anchors on all
            levels.
        image_id: an integer number to specify the image id. Either a Python
            int or a tensor that can be reshaped to a scalar.
        image_info: a tensor of shape [5] which encodes the input image's
            [height, width, scale, original_height, original_width]. Only the
            first two entries are read here.
        num_detections: Number of detections after NMS.
        pre_nms_num_detections: Number of candidates before NMS.
        nms_threshold: a float number to specify the threshold of NMS.
        bbox_reg_weights: a list of 4 float scalars, which are default weights
            on (dx, dy, dw, dh) for normalizing bbox regression targets.
        score_threshold: a float; candidate scores that are not strictly
            greater than this value are zeroed before NMS.

    Returns:
        detections: detection results in a float tensor of shape
            [num_detections, 7] with each row representing
            [image_id, ymin, xmin, ymax, xmax, score, class]
    """
    num_boxes, num_classes = cls_outputs.get_shape().as_list()
    num_fg_classes = num_classes - 1

    # Removes background class scores.
    cls_outputs = cls_outputs[:, 1:num_classes]
    top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
        tf.reshape(cls_outputs, [-1]),
        k=pre_nms_num_detections,
        sorted=True)
    # A flattened index equals anchor * num_fg_classes + class; split it back
    # into its class and anchor components.
    classes = tf.mod(top_k_indices_with_classes, num_fg_classes)
    top_k_indices = tf.floordiv(top_k_indices_with_classes, num_fg_classes)

    # Pick the anchor and the class-specific box deltas of each candidate.
    anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
    box_outputs = tf.reshape(
        box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
    box_outputs = tf.gather_nd(
        box_outputs, tf.stack([top_k_indices, classes], axis=1))

    # Applies bounding box regression to anchors.
    boxes = box_utils.batch_decode_box_outputs_op(
        tf.expand_dims(anchor_boxes, axis=0),
        tf.expand_dims(box_outputs, axis=0),
        bbox_reg_weights)[0]
    boxes = box_utils.clip_boxes(
        tf.expand_dims(boxes, axis=0),
        tf.expand_dims(image_info[:2], axis=0))[0]

    # Replicate the candidates once per foreground class so that every row
    # can be restricted to a single class below.
    classes = tf.tile(
        tf.reshape(classes, [1, pre_nms_num_detections]),
        [num_fg_classes, 1])
    scores = tf.tile(
        tf.reshape(top_k_scores, [1, pre_nms_num_detections]),
        [num_fg_classes, 1])
    boxes = tf.tile(
        tf.reshape(boxes, [1, pre_nms_num_detections, 4]),
        [num_fg_classes, 1, 1])

    # Row c keeps only the scores of candidates whose class is c.
    class_bitmask = tf.tile(
        tf.reshape(tf.range(num_fg_classes), [num_fg_classes, 1]),
        [1, pre_nms_num_detections])
    scores = tf.where(
        tf.equal(classes, class_bitmask), scores, tf.zeros_like(scores))
    # Drop low-confidence candidates.
    scores = tf.where(
        tf.greater(scores, score_threshold), scores, tf.zeros_like(scores))

    # Reshape classes to be compatible with the top_k function.
    classes = tf.reshape(classes, [num_fg_classes, pre_nms_num_detections, 1])
    scores, sorted_tensors = box_utils.top_k(
        scores, k=pre_nms_num_detections, tensors=[boxes, classes])
    boxes = sorted_tensors[0]
    classes = tf.reshape(
        sorted_tensors[1], [num_fg_classes, pre_nms_num_detections])

    (post_nms_scores, post_nms_boxes, idx) = (
        non_max_suppression.non_max_suppression_padded(
            scores,
            boxes,
            max_output_size=num_detections,
            iou_threshold=nms_threshold,
            level=0))

    # Sorts all results.
    sorted_scores, sorted_indices = tf.nn.top_k(
        tf.cast(tf.reshape(post_nms_scores, [-1]), tf.float32),
        k=num_detections,
        sorted=True)
    post_nms_boxes = tf.gather(
        tf.reshape(post_nms_boxes, [-1, 4]), sorted_indices)
    classes = tf.batch_gather(classes, idx)
    # Shift class ids by one to account for the removed background class.
    post_nms_classes = tf.gather(
        tf.reshape(classes, [-1]), sorted_indices) + 1

    if isinstance(image_id, int):
        image_id = tf.constant(image_id)
    image_id = tf.reshape(image_id, [])
    detections_result = tf.stack(
        [
            tf.cast(tf.fill(tf.shape(sorted_scores), image_id), tf.float32),
            post_nms_boxes[:, 0],
            post_nms_boxes[:, 1],
            post_nms_boxes[:, 2],
            post_nms_boxes[:, 3],
            sorted_scores,
            tf.cast(post_nms_classes, tf.float32),
        ],
        axis=1)
    return detections_result