def heatmaps_to_keypoints(maps, rois):
    """Extract predicted keypoint locations from heatmaps.

    Returns a tensor of shape (#rois, #keypoints, 3) holding (x, y, visibility)
    for each keypoint, together with a (#rois, #keypoints) tensor of scores.
    """
    # This function converts a discrete image coordinate in a HEATMAP_SIZE x
    # HEATMAP_SIZE image to a continuous keypoint coordinate. We maintain
    # consistency with keypoints_to_heatmap_labels by using the conversion from
    # Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a
    # continuous coordinate.
    offset_x = rois[:, 0]
    offset_y = rois[:, 1]

    widths = rois[:, 2] - rois[:, 0]
    heights = rois[:, 3] - rois[:, 1]
    widths = widths.clamp(min=1)
    heights = heights.clamp(min=1)
    widths_ceil = widths.ceil()
    heights_ceil = heights.ceil()

    num_keypoints = maps.shape[1]

    if torchvision._is_tracing():
        xy_preds, end_scores = _onnx_heatmaps_to_keypoints_loop(
            maps, rois, widths_ceil, heights_ceil, widths, heights,
            offset_x, offset_y,
            torch.scalar_tensor(num_keypoints, dtype=torch.int64))
        return xy_preds.permute(0, 2, 1), end_scores

    xy_preds = torch.zeros((len(rois), 3, num_keypoints),
                           dtype=torch.float32, device=maps.device)
    end_scores = torch.zeros((len(rois), num_keypoints),
                             dtype=torch.float32, device=maps.device)
    for i in range(len(rois)):
        roi_map_width = int(widths_ceil[i].item())
        roi_map_height = int(heights_ceil[i].item())
        width_correction = widths[i] / roi_map_width
        height_correction = heights[i] / roi_map_height
        roi_map = torch.nn.functional.interpolate(
            maps[i][None], size=(roi_map_height, roi_map_width),
            mode='bicubic', align_corners=False)[0]
        # roi_map_probs = scores_to_probs(roi_map.copy())
        w = roi_map.shape[2]
        pos = roi_map.reshape(num_keypoints, -1).argmax(dim=1)

        x_int = pos % w
        y_int = (pos - x_int) // w
        # assert (roi_map_probs[k, y_int, x_int] ==
        #         roi_map_probs[k, :, :].max())
        x = (x_int.float() + 0.5) * width_correction
        y = (y_int.float() + 0.5) * height_correction
        xy_preds[i, 0, :] = x + offset_x[i]
        xy_preds[i, 1, :] = y + offset_y[i]
        xy_preds[i, 2, :] = 1
        end_scores[i, :] = roi_map[torch.arange(num_keypoints), y_int, x_int]

    return xy_preds.permute(0, 2, 1), end_scores
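# A shape sketch added for illustration (not part of the snippet above). It assumes
# torch/torchvision are imported and heatmaps_to_keypoints is in scope, and uses
# made-up values: two RoIs with 17 COCO-style keypoints on 56x56 heatmaps.
maps = torch.rand(2, 17, 56, 56)
rois = torch.tensor([[ 10.,  20.,  90., 180.],
                     [100.,  50., 200., 250.]])
xy_preds, end_scores = heatmaps_to_keypoints(maps, rois)
# xy_preds.shape   == torch.Size([2, 17, 3])   # (x, y, visibility) per keypoint
# end_scores.shape == torch.Size([2, 17])      # heatmap value at each argmax location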
def batched_nms(
    boxes: Tensor,
    scores: Tensor,
    idxs: Tensor,
    iou_threshold: float,
) -> Tensor:
    """
    Performs non-maximum suppression in a batched fashion.

    Each index value corresponds to a category, and NMS
    will not be applied between elements of different categories.

    Args:
        boxes (Tensor[N, 4]): boxes where NMS will be performed. They
            are expected to be in ``(x1, y1, x2, y2)`` format with
            ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
        scores (Tensor[N]): scores for each one of the boxes
        idxs (Tensor[N]): indices of the categories for each one of the boxes.
        iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold

    Returns:
        Tensor: int64 tensor with the indices of the elements that have been kept by NMS,
        sorted in decreasing order of scores
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(batched_nms)
    # Benchmarks that drove the following thresholds are at
    # https://github.com/pytorch/vision/issues/1311#issuecomment-781329339
    if boxes.numel() > (4000 if boxes.device.type == "cpu" else 20000) and not torchvision._is_tracing():
        return _batched_nms_vanilla(boxes, scores, idxs, iou_threshold)
    else:
        return _batched_nms_coordinate_trick(boxes, scores, idxs, iou_threshold)
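# A usage sketch added for illustration (not part of the library code above). It
# assumes torch and torchvision.ops.batched_nms are importable; the box values and
# threshold are made up.
import torch
from torchvision.ops import batched_nms

boxes = torch.tensor([[ 0.,  0., 10., 10.],
                      [ 1.,  1., 11., 11.],
                      [20., 20., 30., 30.]])
scores = torch.tensor([0.9, 0.8, 0.7])
idxs = torch.tensor([0, 0, 1])  # the first two boxes share a category, the third does not
keep = batched_nms(boxes, scores, idxs, iou_threshold=0.5)
# keep == tensor([0, 2]): box 1 overlaps box 0 (IoU ~0.68) within category 0 and is suppressed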
def grid_anchors(self, grid_sizes, strides):
    # type: (List[List[int]], List[List[int]])
    anchors = []
    cell_anchors = self.cell_anchors
    assert cell_anchors is not None

    for size, stride, base_anchors in zip(grid_sizes, strides, cell_anchors):
        grid_height, grid_width = size
        stride_height, stride_width = stride
        if torchvision._is_tracing():
            # required in ONNX export for mult operation with float32
            stride_width = torch.tensor(stride_width, dtype=torch.float32)
            stride_height = torch.tensor(stride_height, dtype=torch.float32)
        device = base_anchors.device
        shifts_x = torch.arange(
            0, grid_width, dtype=torch.float32, device=device
        ) * stride_width
        shifts_y = torch.arange(
            0, grid_height, dtype=torch.float32, device=device
        ) * stride_height
        shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
        shift_x = shift_x.reshape(-1)
        shift_y = shift_y.reshape(-1)
        shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)

        anchors.append(
            (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4)
        )

    return anchors
def clip_boxes_to_image(boxes, size):
    # type: (Tensor, Tuple[int, int])
    """
    Clip boxes so that they lie inside an image of size `size`.

    Arguments:
        boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format
        size (Tuple[height, width]): size of the image

    Returns:
        clipped_boxes (Tensor[N, 4])
    """
    dim = boxes.dim()
    boxes_x = boxes[..., 0::2]
    boxes_y = boxes[..., 1::2]
    height, width = size

    if torchvision._is_tracing():
        boxes_x = torch.max(
            boxes_x, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
        boxes_x = torch.min(
            boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
        boxes_y = torch.max(
            boxes_y, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
        boxes_y = torch.min(
            boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
    else:
        boxes_x = boxes_x.clamp(min=0, max=width)
        boxes_y = boxes_y.clamp(min=0, max=height)

    clipped_boxes = torch.stack((boxes_x, boxes_y), dim=dim)
    return clipped_boxes.reshape(boxes.shape)
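# A usage sketch added for illustration (assumes torch and torchvision.ops are
# importable; values are made up): clipping boxes against a 100x200 (height x width) image.
import torch
from torchvision.ops import clip_boxes_to_image

boxes = torch.tensor([[ -5.,  10.,  50., 120.],
                      [150.,  50., 250.,  90.]])
clipped = clip_boxes_to_image(boxes, (100, 200))  # size is (height, width)
# clipped == tensor([[  0.,  10.,  50., 100.],
#                    [150.,  50., 200.,  90.]])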
def resize(self, image, target):
    # type: (Tensor, Optional[Dict[str, Tensor]]) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]
    h, w = image.shape[-2:]
    if self.training:
        size = float(self.torch_choice(self.min_size))
    else:
        # FIXME assume for now that testing uses the largest scale
        size = float(self.min_size[-1])

    if torchvision._is_tracing():
        image, target = _resize_image_and_masks_onnx(image, size, float(self.max_size), target)
    else:
        image, target = _resize_image_and_masks(image, size, float(self.max_size), target)

    if target is None:
        return image, target

    bbox = target["boxes"]
    bbox = resize_boxes(bbox, (h, w), image.shape[-2:])
    target["boxes"] = bbox

    if "keypoints" in target:
        keypoints = target["keypoints"]
        keypoints = resize_keypoints(keypoints, (h, w), image.shape[-2:])
        target["keypoints"] = keypoints
    return image, target
def _get_top_n_idx(self, objectness, num_anchors_per_level):
    # type: (Tensor, List[int]) -> Tensor
    """
    Get the indices of the anchors whose predicted objectness ranks in the top
    pre_nms_top_n on each prediction feature map.

    Args:
        objectness: Tensor (the predicted objectness scores for each image)
        num_anchors_per_level: List (the number of anchors predicted on each
            prediction feature level)

    Returns:
        Tensor with the selected anchor indices, concatenated over all levels
    """
    r = []  # indices of the top pre_nms_top_n objectness scores on each feature level
    offset = 0
    # iterate over the predicted objectness of each prediction feature level
    for ob in objectness.split(num_anchors_per_level, 1):
        if torchvision._is_tracing():
            num_anchors, pre_nms_top_n = _onnx_get_num_anchors_and_pre_nms_top_n(ob, self.pre_nms_top_n())
        else:
            num_anchors = ob.shape[1]  # number of anchors predicted on this feature level
            pre_nms_top_n = min(self.pre_nms_top_n(), num_anchors)  # self.pre_nms_top_n() = 1000

        # Returns the k largest elements of the given input tensor along a given dimension
        _, top_n_idx = ob.topk(pre_nms_top_n, dim=1)
        r.append(top_n_idx + offset)
        offset += num_anchors
    return torch.cat(r, dim=1)
def _resize_image_and_masks(
    image: Tensor,
    self_min_size: float,
    self_max_size: float,
    target: Optional[Dict[str, Tensor]] = None,
    fixed_size: Optional[Tuple[int, int]] = None,
) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
    if torchvision._is_tracing():
        im_shape = _get_shape_onnx(image)
    else:
        im_shape = torch.tensor(image.shape[-2:])

    size: Optional[List[int]] = None
    scale_factor: Optional[float] = None
    recompute_scale_factor: Optional[bool] = None
    if fixed_size is not None:
        size = [fixed_size[1], fixed_size[0]]
    else:
        min_size = torch.min(im_shape).to(dtype=torch.float32)
        max_size = torch.max(im_shape).to(dtype=torch.float32)
        scale = torch.min(self_min_size / min_size, self_max_size / max_size)

        if torchvision._is_tracing():
            scale_factor = _fake_cast_onnx(scale)
        else:
            scale_factor = scale.item()
        recompute_scale_factor = True

    image = torch.nn.functional.interpolate(
        image[None],
        size=size,
        scale_factor=scale_factor,
        mode="bilinear",
        recompute_scale_factor=recompute_scale_factor,
        align_corners=False,
    )[0]

    if target is None:
        return image, target

    if "masks" in target:
        mask = target["masks"]
        mask = torch.nn.functional.interpolate(
            mask[:, None].float(), size=size, scale_factor=scale_factor,
            recompute_scale_factor=recompute_scale_factor
        )[:, 0].byte()
        target["masks"] = mask
    return image, target
def expand_masks(mask, padding):
    M = mask.shape[-1]
    if torchvision._is_tracing():
        scale = (M + 2 * padding).to(torch.float32) / M.to(torch.float32)
    else:
        scale = float(M + 2 * padding) / M
    padded_mask = torch.nn.functional.pad(mask, (padding, ) * 4)
    return padded_mask, scale
def forward(
    self,
    inputs: List[Tensor],
    targets: Optional[List[Dict[str, Tensor]]] = None,
) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]:
    # get the original image sizes
    original_image_sizes: List[Tuple[int, int]] = []
    if not self.training:
        for img in inputs:
            val = img.shape[-2:]
            assert len(val) == 2
            original_image_sizes.append((val[0], val[1]))

    # Transform the input
    samples, targets = self.transform(inputs, targets)
    # Compute the detections
    outputs = self.model(samples.tensors, targets=targets)

    losses = {}
    detections: List[Dict[str, Tensor]] = []

    if self.training:
        # compute the losses
        if torch.jit.is_scripting():
            losses = outputs[0]
        else:
            losses = outputs
    else:
        # Rescale coordinates
        if torch.jit.is_scripting():
            result = outputs[1]
        else:
            result = outputs

        if torchvision._is_tracing():
            im_shape = _get_shape_onnx(samples.tensors)
        else:
            im_shape = torch.tensor(samples.tensors.shape[-2:])

        detections = self.transform.postprocess(result, im_shape, original_image_sizes)

    if torch.jit.is_scripting():
        if not self._has_warned:
            warnings.warn(
                "YOLOv5 always returns a (Losses, Detections) tuple in scripting."
            )
            self._has_warned = True
        return losses, detections
    else:
        return self.eager_outputs(losses, detections)
def _get_top_n_idx(self, objectness, num_anchors_per_level):
    r = []
    offset = 0
    for ob in objectness.split(num_anchors_per_level, 1):
        if torchvision._is_tracing():
            num_anchors, pre_nms_top_n = _onnx_get_num_anchors_and_pre_nms_top_n(
                ob, self.pre_nms_top_n)
        else:
            num_anchors = ob.shape[1]
            pre_nms_top_n = min(self.pre_nms_top_n, num_anchors)

        _, top_n_idx = ob.topk(pre_nms_top_n, dim=1)
        r.append(top_n_idx + offset)
        offset += num_anchors
    return torch.cat(r, dim=1)
def expand_boxes(boxes, scale):
    if torchvision._is_tracing():
        return _onnx_expand_boxes(boxes, scale)
    w_half = (boxes[:, 2] - boxes[:, 0]) * .5
    h_half = (boxes[:, 3] - boxes[:, 1]) * .5
    x_c = (boxes[:, 2] + boxes[:, 0]) * .5
    y_c = (boxes[:, 3] + boxes[:, 1]) * .5

    w_half *= scale
    h_half *= scale

    boxes_exp = torch.zeros_like(boxes)
    boxes_exp[:, 0] = x_c - w_half
    boxes_exp[:, 2] = x_c + w_half
    boxes_exp[:, 1] = y_c - h_half
    boxes_exp[:, 3] = y_c + h_half
    return boxes_exp
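# A worked example added for illustration (assumes torch/torchvision are imported
# and the expand_boxes above is in scope; the box value is made up): scaling by 1.5
# grows the box around its center.
boxes = torch.tensor([[10., 10., 30., 50.]])   # center (20, 30), half sizes (10, 20)
expanded = expand_boxes(boxes, 1.5)
# expanded == tensor([[ 5.,  0., 35., 60.]])   # half sizes (15, 30), same center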
def filter_proposals(self, proposals, objectness, image_shapes, num_anchors_per_level):
    # type: (Tensor, Tensor, List[Tuple[int, int]], List[int])
    num_images = proposals.shape[0]
    device = proposals.device
    # do not backprop through objectness
    objectness = objectness.detach()
    objectness = objectness.reshape(num_images, -1)

    levels = [
        torch.full((n, ), idx, dtype=torch.int64, device=device)
        for idx, n in enumerate(num_anchors_per_level)
    ]
    levels = torch.cat(levels, 0)
    levels = levels.reshape(1, -1).expand_as(objectness)

    # select top_n boxes independently per level before applying nms
    top_n_idx = self._get_top_n_idx(objectness, num_anchors_per_level)

    image_range = torch.arange(num_images, device=device)
    batch_idx = image_range[:, None]

    objectness = objectness[batch_idx, top_n_idx]
    levels = levels[batch_idx, top_n_idx]
    proposals = proposals[batch_idx, top_n_idx]

    final_boxes = []
    final_scores = []
    for boxes, scores, lvl, img_shape in zip(proposals, objectness, levels, image_shapes):
        # For onnx export, Clip's min/max can not be traced as tensors.
        if torchvision._is_tracing():
            boxes = _onnx_clip_boxes_to_image(boxes, img_shape)
        else:
            boxes = box_ops.clip_boxes_to_image(boxes, img_shape)
        keep = box_ops.remove_small_boxes(boxes, self.min_size)
        boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]
        # non-maximum suppression, independently done per level
        keep = box_ops.batched_nms(boxes, scores, lvl, self.nms_thresh)
        # keep only topk scoring predictions
        keep = keep[:self.post_nms_top_n()]
        boxes, scores = boxes[keep], scores[keep]
        final_boxes.append(boxes)
        final_scores.append(scores)
    return final_boxes, final_scores
def paste_masks_in_image(masks, boxes, img_shape, padding=1):
    # type: (Tensor, Tensor, Tuple[int, int], int)
    masks, scale = expand_masks(masks, padding=padding)
    boxes = expand_boxes(boxes, scale).to(dtype=torch.int64)
    im_h, im_w = img_shape

    if torchvision._is_tracing():
        return _onnx_paste_masks_in_image_loop(
            masks, boxes,
            torch.scalar_tensor(im_h, dtype=torch.int64),
            torch.scalar_tensor(im_w, dtype=torch.int64))[:, None]
    res = [
        paste_mask_in_image(m[0], b, im_h, im_w)
        for m, b in zip(masks, boxes)
    ]
    if len(res) > 0:
        ret = torch.stack(res, dim=0)[:, None]
    else:
        ret = masks.new_empty((0, 1, im_h, im_w))
    return ret
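# A shape sketch added for illustration (assumes torch is imported and the
# paste_masks_in_image above together with its helpers is in scope; values are
# made up): two 28x28 mask probability maps are pasted into a 480x640 image,
# yielding one full-size mask per box.
masks = torch.rand(2, 1, 28, 28)
boxes = torch.tensor([[ 10.,  20., 110., 220.],
                      [300., 100., 400., 300.]])
pasted = paste_masks_in_image(masks, boxes, (480, 640), padding=1)
# pasted.shape == torch.Size([2, 1, 480, 640])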
def batch_images(self, images, size_divisible=32):
    # type: (List[Tensor], int) -> Tensor
    """
    Pack a batch of images into a single batched tensor (note that every tensor
    in the batch ends up with the same shape). The images are also brought to a
    common size: the maximum height and maximum width within the batch are used
    as the reference size, and the regions where the other images fall short are
    padded with zeros. This keeps the original aspect ratio of each image
    (the zeros do not affect object recognition).

    Args:
        images: the input batch of images
        size_divisible: round the image height and width up to a multiple of this value

    Returns:
        batched_imgs: the batched tensor containing the packed images
    """
    if torchvision._is_tracing():
        # batch_images() does not export well to ONNX
        # call _onnx_batch_images() instead
        return self._onnx_batch_images(images, size_divisible)

    # compute the maximum channel, height and width over all images in the batch
    max_size = self.max_by_axis([list(img.shape) for img in images])

    stride = float(size_divisible)
    # max_size = list(max_size)
    # round the height up to a multiple of stride
    max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride)
    # round the width up to a multiple of stride
    max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride)

    # [batch, channel, height, width]
    batch_shape = [len(images)] + max_size

    # create a zero-filled tensor of shape batch_shape
    batched_imgs = images[0].new_full(batch_shape, 0)
    for img, pad_img in zip(images, batched_imgs):
        # copy each input image into the corresponding slice of batched_imgs,
        # aligned to the top-left corner so that the bbox coordinates stay valid;
        # this guarantees that every image in the batch has the same shape
        # copy_: Copies the elements from src into self tensor and returns self
        pad_img[:img.shape[0], :img.shape[1], :img.shape[2]].copy_(img)

    return batched_imgs
def batch_images(self, images, size_divisible=32):
    # type: (List[Tensor], int) -> Tensor
    if torchvision._is_tracing():
        # batch_images() does not export well to ONNX
        # call _onnx_batch_images() instead
        return self._onnx_batch_images(images, size_divisible)

    max_size = self.max_by_axis([list(img.shape) for img in images])
    stride = float(size_divisible)
    max_size = list(max_size)
    max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride)
    max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride)

    batch_shape = [len(images)] + max_size
    batched_imgs = images[0].new_full(batch_shape, 0)
    for img, pad_img in zip(images, batched_imgs):
        pad_img[:img.shape[0], :img.shape[1], :img.shape[2]].copy_(img)

    return batched_imgs
def batch_images(self, images, size_divisible=32):
    if torchvision._is_tracing():
        # batch_images() does not export well to ONNX
        # call _onnx_batch_images() instead
        return self._onnx_batch_images(images, size_divisible)

    max_size = tuple(max(s) for s in zip(*[img.shape for img in images]))
    stride = size_divisible
    max_size = list(max_size)
    max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride)
    max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride)
    max_size = tuple(max_size)

    batch_shape = (len(images), ) + max_size
    batched_imgs = images[0].new(*batch_shape).zero_()
    for img, pad_img in zip(images, batched_imgs):
        pad_img[:img.shape[0], :img.shape[1], :img.shape[2]].copy_(img)

    return batched_imgs
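# A worked example of the rounding added for illustration (plain arithmetic, no
# framework calls; the image sizes are made up): with size_divisible=32, images of
# shape (3, 100, 150) and (3, 120, 90) give max_size = (3, 120, 150), and the
# height and width are rounded up to the next multiple of 32.
import math
stride = 32.0
print(int(math.ceil(120 / stride) * stride))  # 128
print(int(math.ceil(150 / stride) * stride))  # 160
# so the batched tensor has shape (2, 3, 128, 160), zero-padded at the bottom/right.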
def batch_images(self, images, size_divisible=32):
    # type: (List[Tensor], int)
    """
    Pack a batch of images into a single batched tensor (note that every tensor
    in the batch ends up with the same shape).

    Args:
        images: the input batch of images
        size_divisible: round the image height and width up to a multiple of this value

    Returns:
        batched_imgs: the batched tensor containing the packed images
    """
    if torchvision._is_tracing():
        # batch_images() does not export well to ONNX
        # call _onnx_batch_images() instead
        return self._onnx_batch_images(images, size_divisible)

    # compute the maximum height and width over all images in the batch
    max_size = self.max_by_axis([list(img.shape) for img in images])

    stride = float(size_divisible)
    # max_size = list(max_size)
    # round the height up to a multiple of stride
    max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride)
    # round the width up to a multiple of stride
    max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride)

    # [batch, channel, height, width]
    batch_shape = [len(images)] + max_size

    # create a zero-filled tensor of shape batch_shape;
    # images[0] is just a tensor, used so we can call its new_full method,
    # which returns an all-zero tensor of shape batch_shape
    batched_imgs = images[0].new_full(batch_shape, 0)
    for img, pad_img in zip(images, batched_imgs):
        # copy the pixel values of img into the same positions of pad_img,
        # aligned to the top-left corner so that the bbox coordinates stay valid;
        # this guarantees that every image in the batch has the same shape
        # copy_: Copies the elements from src into self tensor and returns self
        pad_img[:img.shape[0], :img.shape[1], :img.shape[2]].copy_(img)

    return batched_imgs
def nested_tensor_from_tensor_list(self, tensor_list):
    def max_by_axis(the_list):
        maxes = the_list[0]
        for sublist in the_list[1:]:
            for index, item in enumerate(sublist):
                maxes[index] = max(maxes[index], item)
        return maxes

    assert tensor_list[0].ndim == 3
    if torchvision._is_tracing():
        return self.onnx_nested_tensor_from_tensor_list(tensor_list)

    max_size = max_by_axis([list(img.shape) for img in tensor_list])
    batch_shape = [len(tensor_list)] + max_size
    b, c, h, w = batch_shape
    dtype, device = tensor_list[0].dtype, tensor_list[0].device
    tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
    mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
    for img, pad_img, m in zip(tensor_list, tensor, mask):
        pad_img[:img.shape[0], :img.shape[1], :img.shape[2]].copy_(img)
        m[:img.shape[1], :img.shape[2]] = False
    return NestedTensor(tensor, mask)
def anchor_generator_forward_patch(self, image_list_tensors, image_list_image_sizes, feature_maps):
    if torchvision._is_tracing():
        from torch.onnx import operators
        grid_sizes = list([
            operators.shape_as_tensor(feature_map)[-2:]
            for feature_map in feature_maps
        ])
        image_size = operators.shape_as_tensor(image_list_tensors)[-2:]
        strides = [image_size / g for g in grid_sizes]
    else:
        grid_sizes = list(
            [feature_map.shape[-2:] for feature_map in feature_maps])
        image_size = image_list_tensors.shape[-2:]
        strides = [[int(image_size[0] / g[0]), int(image_size[1] / g[1])]
                   for g in grid_sizes]
        # TracerWarning: Converting a tensor to a Python integer

    dtype, device = feature_maps[0].dtype, feature_maps[0].device
    self.set_cell_anchors(dtype, device)
    # return self.cell_anchors

    # Ignore the cache here: when exporting, we only run a single batch anyway.
    # anchors_over_all_feature_maps = self.cached_grid_anchors(grid_sizes, strides)
    anchors_over_all_feature_maps = self.grid_anchors(grid_sizes, strides)
    # return anchors_over_all_feature_maps

    anchors = torch.jit.annotate(List[List[torch.Tensor]], [])
    # for i, (image_height, image_width) in enumerate(image_list.image_sizes):
    # The number of images is looped over here, so N cannot be a dynamic dimension.
    for hw in image_list_image_sizes:
        anchors_in_image = []
        for anchors_per_feature_map in anchors_over_all_feature_maps:
            anchors_in_image.append(anchors_per_feature_map)
        anchors.append(anchors_in_image)
    anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors]
    return anchors
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
    # TODO make this more general
    if tensor_list[0].ndim == 3:
        if torchvision._is_tracing():
            # nested_tensor_from_tensor_list() does not export well to ONNX
            # call _onnx_nested_tensor_from_tensor_list() instead
            return _onnx_nested_tensor_from_tensor_list(tensor_list)

        # TODO make it support different-sized images
        max_size = _max_by_axis([list(img.shape) for img in tensor_list])
        # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
        batch_shape = [len(tensor_list)] + max_size
        b, c, h, w = batch_shape
        dtype = tensor_list[0].dtype
        device = tensor_list[0].device
        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
        mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
        for img, pad_img, m in zip(tensor_list, tensor, mask):
            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
            m[: img.shape[1], :img.shape[2]] = False
    else:
        raise ValueError('not supported')
    return NestedTensor(tensor, mask)
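# A shape sketch added for illustration (assumes torch is imported and the
# nested_tensor_from_tensor_list above plus its helpers _max_by_axis and
# NestedTensor are in scope; image sizes are made up): two differently sized
# 3-channel images are padded to a common shape, and the mask marks the padding.
imgs = [torch.rand(3, 100, 150), torch.rand(3, 120, 90)]
nt = nested_tensor_from_tensor_list(imgs)
# nt.tensors.shape == torch.Size([2, 3, 120, 150])
# nt.mask.shape    == torch.Size([2, 120, 150])  # False over real pixels, True over padding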
def nested_tensor_from_tensor_list(tensor_list: List[Tensor], size_divisible: int = 32):
    # TODO make this more general
    if tensor_list[0].ndim == 3:
        if torchvision._is_tracing():
            # nested_tensor_from_tensor_list() does not export well to ONNX
            # call _onnx_nested_tensor_from_tensor_list() instead
            return _onnx_nested_tensor_from_tensor_list(
                tensor_list, size_divisible)

        max_size = _max_by_axis([list(img.shape) for img in tensor_list])
        stride = float(size_divisible)
        max_size = list(max_size)
        max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride)
        max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride)

        batch_shape = [len(tensor_list)] + max_size
        tensor_batched = tensor_list[0].new_full(batch_shape, 0)
        for img, pad_img in zip(tensor_list, tensor_batched):
            pad_img[:img.shape[0], :img.shape[1], :img.shape[2]].copy_(img)
    else:
        raise ValueError('not supported')
    return tensor_batched
def resize(self, image, target):
    # type: (Tensor, Optional[Dict[str, Tensor]]) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]
    """
    Resize the image into the configured size range and rescale the bboxes accordingly.

    Args:
        image: the input image
        target: the annotations for the input image (including the bboxes)

    Returns:
        image: the resized image
        target: the annotations with the bboxes rescaled accordingly
    """
    # image shape is [channel, height, width]
    h, w = image.shape[-2:]

    if self.training:
        # the target minimum side length of the input image;
        # note that this is self.min_size, not min_size
        size = float(self.torch_choice(self.min_size))
    else:
        # FIXME assume for now that testing uses the largest scale
        size = float(self.min_size[-1])

    if torchvision._is_tracing():
        image = _resize_image_onnx(image, size, float(self.max_size))
    else:
        image = _resize_image(image, size, float(self.max_size))

    if target is None:
        return image, target

    bbox = target["boxes"]
    # rescale the bboxes by the same ratio as the image
    bbox = resize_boxes(bbox, [h, w], image.shape[-2:])
    target["boxes"] = bbox

    return image, target
def clip_boxes_to_image(boxes: Tensor, size: Tuple[int, int]) -> Tensor:
    """
    Clip boxes so that they lie inside an image of size `size`.

    Args:
        boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
            with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
        size (Tuple[height, width]): size of the image

    Returns:
        Tensor[N, 4]: clipped boxes
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(clip_boxes_to_image)
    dim = boxes.dim()
    boxes_x = boxes[..., 0::2]
    boxes_y = boxes[..., 1::2]
    height, width = size

    if torchvision._is_tracing():
        boxes_x = torch.max(
            boxes_x, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
        boxes_x = torch.min(
            boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
        boxes_y = torch.max(
            boxes_y, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
        boxes_y = torch.min(
            boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
    else:
        boxes_x = boxes_x.clamp(min=0, max=width)
        boxes_y = boxes_y.clamp(min=0, max=height)

    clipped_boxes = torch.stack((boxes_x, boxes_y), dim=dim)
    return clipped_boxes.reshape(boxes.shape)
def grid_anchors(self, grid_sizes, strides):
    # type: (List[List[int]], List[List[int]])
    anchors = []
    cell_anchors = self.cell_anchors
    assert cell_anchors is not None

    for size, stride, base_anchors in zip(grid_sizes, strides, cell_anchors):
        grid_height, grid_width = size
        stride_height, stride_width = stride
        if torchvision._is_tracing():
            # required in ONNX export for mult operation with float32
            stride_width = torch.tensor(stride_width, dtype=torch.float32)
            stride_height = torch.tensor(stride_height, dtype=torch.float32)
        device = base_anchors.device

        # For output anchor, compute [x_center, y_center, x_center, y_center]
        shifts_x = torch.arange(
            0, grid_width, dtype=torch.float32, device=device) * stride_width
        shifts_y = torch.arange(
            0, grid_height, dtype=torch.float32, device=device) * stride_height
        shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
        shift_x = shift_x.reshape(-1)
        shift_y = shift_y.reshape(-1)
        shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)

        # For every (base anchor, output anchor) pair,
        # offset each zero-centered base anchor by the center of the output anchor.
        anchors.append(
            (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4))

    return anchors
def forward(self, x, boxes, image_shapes):
    # type: (Dict[str, Tensor], List[Tensor], List[Tuple[int, int]]) -> Tensor
    """
    Arguments:
        x (OrderedDict[Tensor]): feature maps for each level. They are assumed to
            have all the same number of channels, but they can have different sizes.
        boxes (List[Tensor[N, 4]]): boxes to be used to perform the pooling operation, in
            (x1, y1, x2, y2) format and in the image reference size, not the feature map
            reference.
        image_shapes (List[Tuple[height, width]]): the sizes of each image before they
            have been fed to a CNN to obtain feature maps. This allows us to infer the
            scale factor for each one of the levels to be pooled.
    Returns:
        result (Tensor)
    """
    x_filtered = []
    for k, v in x.items():
        if k in self.featmap_names:
            x_filtered.append(v)
    num_levels = len(x_filtered)
    rois = self.convert_to_roi_format(boxes)
    if self.scales is None:
        self.setup_scales(x_filtered, image_shapes)

    scales = self.scales
    assert scales is not None

    if num_levels == 1:
        return roi_align(x_filtered[0], rois,
                         output_size=self.output_size,
                         spatial_scale=scales[0],
                         sampling_ratio=self.sampling_ratio)

    mapper = self.map_levels
    assert mapper is not None

    levels = mapper(boxes)

    num_rois = len(rois)
    num_channels = x_filtered[0].shape[1]

    dtype, device = x_filtered[0].dtype, x_filtered[0].device
    result = torch.zeros(
        (num_rois, num_channels,) + self.output_size,
        dtype=dtype,
        device=device,
    )

    tracing_results = []
    for level, (per_level_feature, scale) in enumerate(zip(x_filtered, scales)):
        idx_in_level = torch.nonzero(levels == level).squeeze(1)
        rois_per_level = rois[idx_in_level]

        result_idx_in_level = roi_align(per_level_feature, rois_per_level,
                                        output_size=self.output_size,
                                        spatial_scale=scale,
                                        sampling_ratio=self.sampling_ratio)

        if torchvision._is_tracing():
            tracing_results.append(result_idx_in_level.to(dtype))
        else:
            result[idx_in_level] = result_idx_in_level

    if torchvision._is_tracing():
        result = _onnx_merge_levels(levels, tracing_results)

    return result
def forward(
    self,
    x: Dict[str, Tensor],
    boxes: List[Tensor],
    image_shapes: List[Tuple[int, int]],
) -> Tensor:
    """
    Args:
        x (OrderedDict[Tensor]): feature maps for each level. They are assumed to
            have all the same number of channels, but they can have different sizes.
        boxes (List[Tensor[N, 4]]): boxes to be used to perform the pooling operation, in
            (x1, y1, x2, y2) format and in the image reference size, not the feature map
            reference. The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
        image_shapes (List[Tuple[height, width]]): the sizes of each image before they
            have been fed to a CNN to obtain feature maps. This allows us to infer the
            scale factor for each one of the levels to be pooled.
    Returns:
        result (Tensor)
    """
    x_filtered = []
    for k, v in x.items():
        if k in self.featmap_names:
            x_filtered.append(v)
    num_levels = len(x_filtered)
    rois = self.convert_to_roi_format(boxes)
    if self.scales is None:
        self.setup_scales(x_filtered, image_shapes)

    scales = self.scales
    assert scales is not None

    if num_levels == 1:
        return roi_align(x_filtered[0], rois,
                         output_size=self.output_size,
                         spatial_scale=scales[0],
                         sampling_ratio=self.sampling_ratio)

    mapper = self.map_levels
    assert mapper is not None

    levels = mapper(boxes)

    num_rois = len(rois)
    num_channels = x_filtered[0].shape[1]

    dtype, device = x_filtered[0].dtype, x_filtered[0].device
    result = torch.zeros(
        (num_rois, num_channels,) + self.output_size,
        dtype=dtype,
        device=device,
    )

    tracing_results = []
    for level, (per_level_feature, scale) in enumerate(zip(x_filtered, scales)):
        idx_in_level = torch.where(levels == level)[0]
        rois_per_level = rois[idx_in_level]

        result_idx_in_level = roi_align(per_level_feature, rois_per_level,
                                        output_size=self.output_size,
                                        spatial_scale=scale,
                                        sampling_ratio=self.sampling_ratio)

        if torchvision._is_tracing():
            tracing_results.append(result_idx_in_level.to(dtype))
        else:
            # result and result_idx_in_level's dtypes are based on dtypes of different
            # elements in x_filtered. x_filtered contains tensors output by different
            # layers. When autocast is active, it may choose different dtypes for
            # different layers' outputs. Therefore, we defensively match result's dtype
            # before copying elements from result_idx_in_level in the following op.
            # We need to cast manually (can't rely on autocast to cast for us) because
            # the op acts on result in-place, and autocast only affects out-of-place ops.
            result[idx_in_level] = result_idx_in_level.to(result.dtype)

    if torchvision._is_tracing():
        result = _onnx_merge_levels(levels, tracing_results)

    return result
def _multiscale_roi_align(
    x_filtered: List[Tensor],
    boxes: List[Tensor],
    output_size: List[int],
    sampling_ratio: int,
    scales: Optional[List[float]],
    mapper: Optional[LevelMapper],
) -> Tensor:
    """
    Args:
        x_filtered (List[Tensor]): List of input tensors.
        boxes (List[Tensor[N, 4]]): boxes to be used to perform the pooling operation, in
            (x1, y1, x2, y2) format and in the image reference size, not the feature map
            reference. The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
        output_size (Union[List[Tuple[int, int]], List[int]]): size of the output
        sampling_ratio (int): sampling ratio for ROIAlign
        scales (Optional[List[float]]): If None, scales will be automatically inferred.
            Default value is None.
        mapper (Optional[LevelMapper]): If None, mapper will be automatically inferred.
            Default value is None.
    Returns:
        result (Tensor)
    """
    assert scales is not None
    assert mapper is not None

    num_levels = len(x_filtered)
    rois = _convert_to_roi_format(boxes)

    if num_levels == 1:
        return roi_align(
            x_filtered[0],
            rois,
            output_size=output_size,
            spatial_scale=scales[0],
            sampling_ratio=sampling_ratio,
        )

    levels = mapper(boxes)

    num_rois = len(rois)
    num_channels = x_filtered[0].shape[1]

    dtype, device = x_filtered[0].dtype, x_filtered[0].device
    result = torch.zeros(
        (num_rois, num_channels,) + output_size,
        dtype=dtype,
        device=device,
    )

    tracing_results = []
    for level, (per_level_feature, scale) in enumerate(zip(x_filtered, scales)):
        idx_in_level = torch.where(levels == level)[0]
        rois_per_level = rois[idx_in_level]

        result_idx_in_level = roi_align(
            per_level_feature,
            rois_per_level,
            output_size=output_size,
            spatial_scale=scale,
            sampling_ratio=sampling_ratio,
        )

        if torchvision._is_tracing():
            tracing_results.append(result_idx_in_level.to(dtype))
        else:
            # result and result_idx_in_level's dtypes are based on dtypes of different
            # elements in x_filtered. x_filtered contains tensors output by different
            # layers. When autocast is active, it may choose different dtypes for
            # different layers' outputs. Therefore, we defensively match result's dtype
            # before copying elements from result_idx_in_level in the following op.
            # We need to cast manually (can't rely on autocast to cast for us) because
            # the op acts on result in-place, and autocast only affects out-of-place ops.
            result[idx_in_level] = result_idx_in_level.to(result.dtype)

    if torchvision._is_tracing():
        result = _onnx_merge_levels(levels, tracing_results)

    return result
def forward(self, images, features, targets=None):
    # type: (ImageList, Dict[str, Tensor], Optional[List[Dict[str, Tensor]]])
    """
    Arguments:
        images (ImageList): images for which we want to compute the predictions
        features (List[Tensor]): features computed from the images that are
            used for computing the predictions. Each tensor in the list
            corresponds to a different feature level
        targets (List[Dict[Tensor]]): ground-truth boxes present in the image (optional).
            If provided, each element in the dict should contain a field `boxes`,
            with the locations of the ground-truth boxes.

    Returns:
        boxes (List[Tensor]): the predicted boxes from the RPN, one Tensor per
            image.
        losses (Dict[Tensor]): the losses for the model during training. During
            testing, it is an empty dict.
    """
    # RPN uses all feature maps that are available
    features = list(features.values())
    objectness, pred_bbox_deltas = self.head(features)
    anchors = self.anchor_generator(images, features)

    num_images = len(anchors)

    if torchvision._is_tracing():
        # For onnx export (Split in _get_top_n_idx)
        from torch.onnx.operators import shape_as_tensor
        num_anchors_per_level_shape_tensors = [
            shape_as_tensor(o[0]) for o in objectness
        ]
        num_anchors_per_level = [
            s[0] * s[1] * s[2] for s in num_anchors_per_level_shape_tensors
        ]
        # tensor.prod() => ReduceProd. ReduceProd can not be run by the current runtime,
        # so the explicit multiplication above is a naive workaround (WAR).
    else:
        num_anchors_per_level = [o[0].numel() for o in objectness]

    objectness, pred_bbox_deltas = \
        concat_box_prediction_layers(objectness, pred_bbox_deltas)

    # apply pred_bbox_deltas to anchors to obtain the decoded proposals
    # note that we detach the deltas because Faster R-CNN does not backprop through
    # the proposals
    proposals = self.box_coder.decode(pred_bbox_deltas.detach(), anchors)
    proposals = proposals.view(num_images, -1, 4)

    boxes, scores = self.filter_proposals(proposals, objectness,
                                          images.image_sizes,
                                          num_anchors_per_level)

    losses = {}
    if self.training:
        assert targets is not None
        labels, matched_gt_boxes = self.assign_targets_to_anchors(anchors, targets)
        regression_targets = self.box_coder.encode(matched_gt_boxes, anchors)
        loss_objectness, loss_rpn_box_reg = self.compute_loss(
            objectness, pred_bbox_deltas, labels, regression_targets)
        losses = {
            "loss_objectness": loss_objectness,
            "loss_rpn_box_reg": loss_rpn_box_reg,
        }
    return boxes, losses
def forward(self, x, boxes, image_shapes):
    # type: (Dict[str, Tensor], List[Tensor], List[Tuple[int, int]])
    """
    Arguments:
        x (OrderedDict[Tensor]): feature maps for each level. They are assumed to
            have all the same number of channels, but they can have different sizes.
        boxes (List[Tensor[N, 4]]): boxes to be used to perform the pooling operation, in
            (x1, y1, x2, y2) format and in the image reference size, not the feature map
            reference.
        image_shapes (List[Tuple[height, width]]): the sizes of each image before they
            have been fed to a CNN to obtain feature maps. This allows us to infer the
            scale factor for each one of the levels to be pooled.
    Returns:
        result (Tensor)
    """
    x_filtered = []
    for k, v in x.items():
        if k in self.featmap_names:
            x_filtered.append(v)
    num_levels = len(x_filtered)
    rois = self.convert_to_roi_format(boxes)
    if self.scales is None:
        self.setup_scales(x_filtered, image_shapes)

    scales = self.scales
    assert scales is not None

    if num_levels == 1:
        # return roi_align(
        #     x_filtered[0], rois,
        #     output_size=self.output_size,
        #     spatial_scale=scales[0],
        #     sampling_ratio=self.sampling_ratio
        # )
        feature_num = x_filtered[0].size(1) // 6
        total_roi_feature = torch.zeros(
            rois.size(0), feature_num,
            self.output_size[0], self.output_size[1]).to(rois.device)

        # Change rois from world coordinates to image coordinates
        image_center = 200.
        points, points_index = get_view_point(rois[:, 1:].cpu())
        for view in points.keys():
            if len(points[view]) != 0:
                min_x, max_x, min_y, max_y = world_to_image(
                    torch.stack(points[view], dim=0), view,
                    image_center=image_center)
                min_x = (min_x * 400 / 306).clip(min=0, max=400)
                max_x = (max_x * 400 / 306).clip(min=0, max=400)
                min_y = (min_y * 400 / 256).clip(min=0, max=400)
                max_y = (max_y * 400 / 256).clip(min=0, max=400)
                rois = torch.stack(
                    (torch.zeros(len(min_x)).float().cuda(),
                     torch.from_numpy(min_x).float().cuda(),
                     torch.from_numpy(min_y).float().cuda(),
                     torch.from_numpy(max_x).float().cuda(),
                     torch.from_numpy(max_y).float().cuda()),
                    dim=1)
                total_roi_feature[points_index[view]] = roi_align(
                    x_filtered[0][:, img_index[view] * feature_num:(img_index[view] + 1) * feature_num],
                    rois,
                    output_size=self.output_size,
                    spatial_scale=scales[0],
                    sampling_ratio=self.sampling_ratio)
        return total_roi_feature

    mapper = self.map_levels
    assert mapper is not None

    levels = mapper(boxes)

    num_rois = len(rois)
    num_channels = x_filtered[0].shape[1]

    dtype, device = x_filtered[0].dtype, x_filtered[0].device
    result = torch.zeros(
        (num_rois, num_channels,) + self.output_size,
        dtype=dtype,
        device=device,
    )

    tracing_results = []
    for level, (per_level_feature, scale) in enumerate(zip(x_filtered, scales)):
        idx_in_level = torch.nonzero(levels == level).squeeze(1)
        rois_per_level = rois[idx_in_level]

        result_idx_in_level = roi_align(per_level_feature, rois_per_level,
                                        output_size=self.output_size,
                                        spatial_scale=scale,
                                        sampling_ratio=self.sampling_ratio)

        if torchvision._is_tracing():
            tracing_results.append(result_idx_in_level.to(dtype))
        else:
            result[idx_in_level] = result_idx_in_level

    if torchvision._is_tracing():
        result = _onnx_merge_levels(levels, tracing_results)

    return result