def __getitem__(self, idx):
    """Load sample ``idx`` and build its detection target.

    Crowd annotations are discarded; the remaining annotations supply the
    boxes (converted from ``xywh`` to ``xyxy``), the contiguous class labels
    and the segmentation masks stored on the returned ``BoxList``.

    Returns:
        tuple: ``(img, target, idx)``; ``self.transforms`` is applied to
        ``img`` and ``target`` when it is not ``None``.
    """
    img, anno = super(COCODataset, self).__getitem__(idx)

    # filter crowd annotations
    # TODO might be better to add an extra field
    kept = [obj for obj in anno if obj["iscrowd"] == 0]

    # reshape guards against an image without any box
    xywh = torch.as_tensor([obj["bbox"] for obj in kept]).reshape(-1, 4)
    target = BoxList(xywh, img.size, mode="xywh").convert("xyxy")

    category_ids = [obj["category_id"] for obj in kept]
    labels = torch.tensor(
        [self.json_category_id_to_contiguous_id[c] for c in category_ids]
    )
    target.add_field("labels", labels)

    segmentations = [obj["segmentation"] for obj in kept]
    target.add_field("masks", SegmentationMask(segmentations, img.size))

    target = target.clip_to_image(remove_empty=True)

    if self.transforms is None:
        return img, target, idx
    img, target = self.transforms(img, target)
    return img, target, idx
def update(self, dets: BoxList):
    """Advance all trackers by one frame and associate them with ``dets``.

    Steps, in order: predict every tracker, drop trackers whose prediction
    contains NaN, associate detections with the predictions, update matched
    trackers, create a tracker for every unmatched detection, then collect
    the state of trackers that are recent enough and have enough hits while
    removing trackers not updated for more than ``self.max_age`` frames.

    Args:
        dets: detections in ``xyxy`` mode (asserted).

    Returns:
        A ``BoxList`` of tracked boxes carrying an ``index`` field (tracker
        id) plus the tracked extra fields, or ``None`` when no tracker
        qualifies for output.
    """
    W, H = dets.size
    assert dets.mode == 'xyxy'

    # Predicted boxes of the existing trackers, one row per tracker.
    predicted = np.zeros((len(self.trackers), 4), dtype=np.float32)
    invalid = []
    for row_idx, row in enumerate(predicted):
        pos = self.trackers[row_idx].predict()
        row[:] = pos
        if np.any(np.isnan(pos)):
            invalid.append(row_idx)
    predicted = np.ma.compress_rows(np.ma.masked_invalid(predicted))
    # Pop from the back so the earlier indices stay valid.
    for row_idx in reversed(invalid):
        self.trackers.pop(row_idx)

    matched, unmatched_dets, unmatched_trks = associate(
        dets.bbox.numpy(), predicted)

    det_boxes = dets.bbox
    det_fields = dets.extra_fields
    det_meta = []
    for det_idx in range(len(dets)):
        det_meta.append(
            {name: values[det_idx] for name, values in det_fields.items()})

    # update matched trackers with assigned detections
    for trk_idx, tracker in enumerate(self.trackers):
        if trk_idx in unmatched_trks:
            continue
        det_idx = matched[np.where(matched[:, 1] == trk_idx)[0], 0][0]
        tracker.update(det_boxes[det_idx], det_meta[det_idx])

    # create and initialise new trackers for unmatched detections
    for det_idx in unmatched_dets:
        self.trackers.append(
            KalmanTracker(det_boxes[det_idx], det_meta[det_idx],
                          int(self.max_age * 0.5), W, H))

    detections = {"bbox": [], "index": []}
    detections.update({name: [] for name in dets.extra_fields.keys()})

    # Walk backwards so removing the current tracker does not shift the
    # positions still to be visited.
    for pos in range(len(self.trackers) - 1, -1, -1):
        tracker = self.trackers[pos]
        recent = tracker.time_since_update <= self.max_age
        if recent and tracker.hits >= self.min_hits:
            bbox, meta = tracker.get_state()
            meta['index'] = tracker.id
            detections["bbox"].append(bbox)
            for name, value in meta.items():
                detections[name].append(value)
        # remove dead tracklet
        if tracker.time_since_update > self.max_age:
            self.trackers.pop(pos)

    if not detections['bbox']:
        return None

    detections['bbox'] = torch.tensor(detections['bbox'], dtype=torch.float32)
    box_list = BoxList(detections['bbox'], (W, H))
    for name, values in detections.items():
        if name == 'bbox':
            continue
        first = values[0]
        if isinstance(first, torch.Tensor) and first.dim() != 0:
            box_list.add_field(name, torch.cat(values))
        else:
            box_list.add_field(name, torch.tensor(values))
    return box_list
def evaluate_box_proposals(
    predictions, dataset, thresholds=None, area="all", limit=None
):
    """Evaluate detection proposal recall metrics.

    This function is a much faster alternative to the official COCO API
    recall evaluation code. However, it produces slightly different results.

    Args:
        predictions: sequence of per-image proposal ``BoxList`` objects that
            carry an ``objectness`` field; position ``i`` belongs to dataset
            image ``i``.
        dataset: object exposing ``id_to_img_map``, ``get_img_info`` and a
            ``coco`` API handle.
        thresholds: IoU thresholds at which recall is computed; defaults to
            0.50:0.05:0.95.
        area: name of the ground-truth area range to evaluate.
        limit: if given, only the ``limit`` highest scoring proposals of each
            image are used.

    Returns:
        dict with keys ``ar``, ``recalls``, ``thresholds``, ``gt_overlaps``
        and ``num_pos``. When no image contributes any overlap,
        ``gt_overlaps`` is an empty float tensor instead of the call failing.
        When ``num_pos`` is 0 the recalls (and ``ar``) are NaN, because the
        recall is a division by ``num_pos``.
    """
    # Record max overlap value for each gt box
    # Return vector of overlap values
    areas = {
        "all": 0,
        "small": 1,
        "medium": 2,
        "large": 3,
        "96-128": 4,
        "128-256": 5,
        "256-512": 6,
        "512-inf": 7,
    }
    area_ranges = [
        [0 ** 2, 1e5 ** 2],  # all
        [0 ** 2, 32 ** 2],  # small
        [32 ** 2, 96 ** 2],  # medium
        [96 ** 2, 1e5 ** 2],  # large
        [96 ** 2, 128 ** 2],  # 96-128
        [128 ** 2, 256 ** 2],  # 128-256
        [256 ** 2, 512 ** 2],  # 256-512
        [512 ** 2, 1e5 ** 2],  # 512-inf
    ]
    assert area in areas, "Unknown area range: {}".format(area)
    area_range = area_ranges[areas[area]]
    gt_overlaps = []
    num_pos = 0

    for image_id, prediction in enumerate(predictions):
        original_id = dataset.id_to_img_map[image_id]

        img_info = dataset.get_img_info(image_id)
        image_width = img_info["width"]
        image_height = img_info["height"]
        prediction = prediction.resize((image_width, image_height))

        # sort predictions in descending order
        # TODO maybe remove this and make it explicit in the documentation
        inds = prediction.get_field("objectness").sort(descending=True)[1]
        prediction = prediction[inds]

        ann_ids = dataset.coco.getAnnIds(imgIds=original_id)
        anno = dataset.coco.loadAnns(ann_ids)
        gt_boxes = [obj["bbox"] for obj in anno if obj["iscrowd"] == 0]
        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
        gt_boxes = BoxList(gt_boxes, (image_width, image_height), mode="xywh").convert(
            "xyxy"
        )
        gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0])

        if len(gt_boxes) == 0:
            continue

        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
        gt_boxes = gt_boxes[valid_gt_inds]

        num_pos += len(gt_boxes)

        if len(gt_boxes) == 0:
            continue

        if len(prediction) == 0:
            continue

        if limit is not None and len(prediction) > limit:
            prediction = prediction[:limit]

        overlaps = boxlist_iou(prediction, gt_boxes)

        _gt_overlaps = torch.zeros(len(gt_boxes))
        for j in range(min(len(prediction), len(gt_boxes))):
            # find which proposal box maximally covers each gt box
            # and get the iou amount of coverage for each gt box
            max_overlaps, argmax_overlaps = overlaps.max(dim=0)

            # find which gt box is 'best' covered (i.e. 'best' = most iou)
            gt_ovr, gt_ind = max_overlaps.max(dim=0)
            assert gt_ovr >= 0
            # find the proposal box that covers the best covered gt box
            box_ind = argmax_overlaps[gt_ind]
            # record the iou coverage of this gt box
            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
            assert _gt_overlaps[j] == gt_ovr
            # mark the proposal box and the gt box as used
            overlaps[box_ind, :] = -1
            overlaps[:, gt_ind] = -1

        # append recorded iou coverage level
        gt_overlaps.append(_gt_overlaps)

    # torch.cat rejects an empty list, which happens whenever no image had
    # both ground truth in the requested area range and proposals.
    if gt_overlaps:
        gt_overlaps = torch.cat(gt_overlaps, dim=0)
    else:
        gt_overlaps = torch.zeros(0, dtype=torch.float32)
    gt_overlaps, _ = torch.sort(gt_overlaps)

    if thresholds is None:
        step = 0.05
        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
    recalls = torch.zeros_like(thresholds)
    # compute recall for each iou threshold
    for i, t in enumerate(thresholds):
        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
    # ar = 2 * np.trapz(recalls, thresholds)
    ar = recalls.mean()
    return {
        "ar": ar,
        "recalls": recalls,
        "thresholds": thresholds,
        "gt_overlaps": gt_overlaps,
        "num_pos": num_pos,
    }
def __getitem__(self, idx):
    """Load sample ``idx`` and build its ``BoxList`` target.

    Crowd annotations are discarded. The target always carries ``labels`` and
    ``masks``; ``kes`` and ``mty`` are added only when the first remaining
    annotation has ``keypoints`` / ``match_type`` entries.

    When the module-level ``DEBUG`` flag is set, the length and first element
    of each annotation list are printed. The first element is printed as
    ``None`` for an image without annotations, so debugging no longer crashes
    with ``IndexError`` on such images.

    Returns:
        tuple: ``(img, target, idx)``; ``self.transforms`` is applied to
        ``img`` and ``target`` when it is not ``None``.
    """
    img, anno = super(WordDataset, self).__getitem__(idx)

    # filter crowd annotations
    # TODO might be better to add an extra field
    anno = [obj for obj in anno if obj["iscrowd"] == 0]

    boxes = [obj["bbox"] for obj in anno]
    if DEBUG:
        print('len(boxes)', len(boxes), boxes[0] if boxes else None)
    boxes = torch.as_tensor(boxes).reshape(-1, 4)  # guard against no boxes
    target = BoxList(boxes, img.size, mode="xywh").convert("xyxy")

    classes = [obj["category_id"] for obj in anno]
    if DEBUG:
        print('len(classes)', len(classes), classes[0] if classes else None)
    classes = [self.json_category_id_to_contiguous_id[c] for c in classes]
    classes = torch.tensor(classes)
    target.add_field("labels", classes)

    masks = [obj["segmentation"] for obj in anno]
    if DEBUG:
        print('len(masks)', len(masks), masks[0] if masks else None)
    masks = SegmentationMask(masks, img.size)
    target.add_field("masks", masks)

    if anno and 'keypoints' in anno[0]:
        kes = [obj["keypoints"] for obj in anno]
        kes = self.kes_gen(kes)
        if DEBUG:
            print('len(kes)', len(kes), kes[0])
        kes = textKES(kes, img.size)
        target.add_field("kes", kes)

    if anno and 'match_type' in anno[0]:
        mty = [obj["match_type"] for obj in anno]
        mty = MTY(mty, img.size)
        target.add_field("mty", mty)

    target = target.clip_to_image(remove_empty=True)

    if self.transforms is not None:
        img, target = self.transforms(img, target)

    return img, target, idx
def forward_for_single_feature_map(self, anchors, box_cls, box_regression):
    """Turn the raw outputs of one feature level into per-image detections.

    Arguments:
        anchors: list[BoxList]
        box_cls: tensor of size N, A * C, H, W
        box_regression: tensor of size N, A * 4, H, W

    Returns:
        list[BoxList]: one entry per image with ``labels`` and ``scores``
        fields, clipped to the image and filtered by ``self.min_size``.
    """
    N, _, H, W = box_cls.shape
    A = box_regression.size(1) // 4
    C = box_cls.size(1) // A

    # put in the same format as anchors
    scores = permute_and_flatten(box_cls, N, A, C, H, W).sigmoid()
    deltas = permute_and_flatten(box_regression, N, A, 4, H, W)
    deltas = deltas.reshape(N, -1, 4)

    candidate_inds = scores > self.pre_nms_thresh
    # number of candidates kept per image, capped at self.pre_nms_top_n
    top_n_per_image = candidate_inds.view(N, -1).sum(1)
    top_n_per_image = top_n_per_image.clamp(max=self.pre_nms_top_n)

    results = []
    per_image = zip(scores, deltas, top_n_per_image, candidate_inds, anchors)
    for img_scores, img_deltas, img_top_n, img_mask, img_anchors in per_image:
        # Sort and select TopN
        # TODO most of this can be made out of the loop for
        # all images.
        # TODO:Yang: Not easy to do. Because the numbers of detections are
        # different in each image. Therefore, this part needs to be done
        # per image.
        kept_scores, top_k = img_scores[img_mask].topk(img_top_n, sorted=False)

        # each row is a (location, class) pair of a selected candidate
        hits = img_mask.nonzero()[top_k, :]
        locations = hits[:, 0]
        class_ids = hits[:, 1]
        class_ids += 1

        decoded = self.box_coder.decode(
            img_deltas[locations, :].view(-1, 4),
            img_anchors.bbox[locations, :].view(-1, 4),
        )

        boxlist = BoxList(decoded, img_anchors.size, mode="xyxy")
        boxlist.add_field("labels", class_ids)
        boxlist.add_field("scores", kept_scores)
        boxlist = boxlist.clip_to_image(remove_empty=False)
        results.append(remove_small_boxes(boxlist, self.min_size))

    return results
def get_groundtruth(self, idx):
    """Build the ground-truth ``BoxList`` for the image at position ``idx``.

    Boxes come from ``self.detail.getBboxes`` (``xywh``, converted to
    ``xyxy`` and clipped to the image). The extra fields ``kpts``,
    ``class_mask``, ``instance_mask``, ``bounds`` and ``occl`` are filled
    from the matching ``self.detail`` getters.
    """
    img = self.idx_to_img[idx]

    # example of what getBboxes returns:
    # [{'bbox': [250, 209, 241, 149], 'category': 'motorbike'},
    #  {'bbox': [312, 139, 109, 191], 'category': 'person'}]
    annotations = self.detail.getBboxes(img)
    # TODO the category of the enclosed object is lost here
    xywh = [entry['bbox'] for entry in annotations]
    xywh = torch.as_tensor(xywh).reshape(-1, 4)  # guard against no boxes

    target = BoxList(xywh, self._img_size(img), mode="xywh").convert("xyxy")
    target = target.clip_to_image(remove_empty=True)

    skeletons = self.detail.getKpts(img)
    # TODO keypoints - the bbox information is lost here
    keypoints = [skeleton['keypoints'] for skeleton in skeletons]
    target.add_field("kpts", Keypoints(keypoints, self._img_size(img)))

    # target.add_field("mask", SegmentationMask(self.detail.getMask(img).tolist(), size=self._img_size(img)))
    # TODO getMask returns a matrix of size (img.height, img.width) in which
    # TODO every pixel holds the id of the class it belongs to. SegmentationMask

    # from getMask() doc:
    # If semantic segmentation of an image is requested (cat=instance=superpart=part=None),
    # the result is an image whose pixel values are the class IDs for that image.
    # If instance-level segmentation for one category of an image is requested (img and cat provided),
    # the result is an image whose pixel values are the instance IDs for that class and 0 everywhere else.
    target.add_field("class_mask", self.detail.getMask(img))
    target.add_field("instance_mask", self.detail.getMask(img, cat='person'))

    target.add_field("bounds", self.detail.getBounds(img))
    target.add_field("occl", self.detail.getOccl(img))

    # TODO human parts?

    return target
def __getitem__(self, item): im_name = os.path.basename(self.image_lists[item]) # print(self.image_lists[item]) img = Image.open(self.image_lists[item]).convert("RGB") width, height = img.size if self.gts_dir is not None: gt_path = os.path.join(self.gts_dir, im_name + ".txt") words, boxes, charsbbs, segmentations, labels = self.load_gt_from_txt( gt_path, height, width) if words[0] == "": use_char_ann = False else: use_char_ann = True if not self.use_charann: use_char_ann = False target = BoxList(boxes[:, :4], img.size, mode="xyxy", use_char_ann=use_char_ann) if self.ignore_difficult: labels = torch.from_numpy(np.array(labels)) else: labels = torch.ones(len(boxes)) target.add_field("labels", labels) masks = SegmentationMask(segmentations, img.size) target.add_field("masks", masks) char_masks = SegmentationCharMask(charsbbs, words=words, use_char_ann=use_char_ann, size=img.size) target.add_field("char_masks", char_masks) else: target = None if self.transforms is not None: img, target = self.transforms(img, target) if self.vis: new_im = img.numpy().copy().transpose([1, 2, 0]) + [ 102.9801, 115.9465, 122.7717, ] new_im = Image.fromarray(new_im.astype(np.uint8)).convert("RGB") mask = target.extra_fields["masks"].polygons[0].convert("mask") mask = Image.fromarray( (mask.numpy() * 255).astype(np.uint8)).convert("RGB") if self.use_charann: m, _ = (target.extra_fields["char_masks"].chars_boxes[0]. convert("char_mask")) color = self.creat_color_map(37, 255) color_map = color[m.numpy().astype(np.uint8)] char = Image.fromarray(color_map.astype( np.uint8)).convert("RGB") char = Image.blend(char, new_im, 0.5) else: char = new_im new = Image.blend(char, mask, 0.5) img_draw = ImageDraw.Draw(new) for box in target.bbox.numpy(): box = list(box) box = box[:2] + [box[2], box[1]] + box[2:] + [box[0], box[3] ] + box[:2] img_draw.line(box, fill=(255, 0, 0), width=2) new.save("./vis/char_" + im_name) return img, target, self.image_lists[item]
def select_over_all_levels(self, boxlists):
    """Run per-class NMS on every image and cap the number of detections.

    With ``cfg.ROTATE`` set, boxes are treated as 8-value ``xy8`` boxes and
    filtered with ``boxlist_rnms``; otherwise they are 4-value ``xyxy`` boxes
    filtered with ``boxlist_nms``. Afterwards, if ``self.fpn_post_nms_top_n``
    is positive and exceeded, only detections scoring at least the
    corresponding k-th value are kept.

    Args:
        boxlists: per-image BoxLists with ``scores`` and ``labels`` fields.

    Returns:
        list[BoxList]: one filtered BoxList per image.
    """
    rotated = cfg.ROTATE
    if rotated:
        values_per_box, box_mode = 8, "xy8"
    else:
        values_per_box, box_mode = 4, "xyxy"

    results = []
    for image_idx in range(len(boxlists)):
        boxlist = boxlists[image_idx]
        scores = boxlist.get_field("scores")
        labels = boxlist.get_field("labels")
        boxes = boxlist.bbox

        per_class = []
        # skip the background
        for cls_id in range(1, self.num_classes):
            inds = (labels == cls_id).nonzero().view(-1)
            cls_scores = scores[inds]
            cls_boxes = boxes[inds, :].view(-1, values_per_box)

            cls_boxlist = BoxList(cls_boxes, boxlist.size, mode=box_mode)
            cls_boxlist.add_field("scores", cls_scores)
            if rotated:
                cls_boxlist = boxlist_rnms(cls_boxlist,
                                           self.nms_thresh,
                                           score_field="scores")
            else:
                cls_boxlist = boxlist_nms(cls_boxlist,
                                          self.nms_thresh,
                                          score_field="scores")
            cls_labels = torch.full((len(cls_boxlist), ),
                                    cls_id,
                                    dtype=torch.int64,
                                    device=scores.device)
            cls_boxlist.add_field("labels", cls_labels)
            per_class.append(cls_boxlist)

        merged = cat_boxlist(per_class)
        total = len(merged)

        # Limit to max_per_image detections **over all classes**
        if total > self.fpn_post_nms_top_n > 0:
            all_scores = merged.get_field("scores")
            score_floor, _ = torch.kthvalue(
                all_scores.cpu(), total - self.fpn_post_nms_top_n + 1)
            keep = torch.nonzero(all_scores >= score_floor.item()).squeeze(1)
            merged = merged[keep]
        results.append(merged)
    return results
def forward_for_single_feature_map(self, anchors, box_cls, box_regression):
    """Turn the raw outputs of one feature level into per-image detections.

    Arguments:
        anchors: list[BoxList]
        box_cls: tensor of size N, A * C, H, W
        box_regression: tensor of size N, A * 4, H, W

    Returns:
        list of length N. If no score in the whole batch exceeds
        ``self.pre_nms_thresh`` every entry is an empty list; otherwise every
        entry is a BoxList with the fields ``labels``, ``scores``,
        ``sparse_off`` (index of the spatial cell, integer),
        ``sparse_anchor_idx`` (anchor index within the cell),
        ``sparse_anchors`` (the selected anchor boxes) and ``sparse_batch``
        (image index inside the batch).

    The flattened candidate index is ``(h * W + w) * A + a``, so the cell and
    anchor indices are recovered with ``// A`` and ``% A``. The previous code
    used a hard-coded ``9`` (identical when ``A == 9``) and ``/``, which is
    true division on integer tensors in current PyTorch and produced
    fractional float offsets.
    """
    N, _, H, W = box_cls.shape
    A = box_regression.size(1) // 4
    C = box_cls.size(1) // A

    # put in the same format as anchors
    box_cls = box_cls.view(N, -1, C, H, W).permute(0, 3, 4, 1, 2)
    box_cls = box_cls.reshape(N, -1, C)
    box_cls = box_cls.sigmoid()

    box_regression = box_regression.view(N, -1, 4, H, W)
    box_regression = box_regression.permute(0, 3, 4, 1, 2)
    box_regression = box_regression.reshape(N, -1, 4)

    results = [[] for _ in range(N)]
    candidate_inds = box_cls > self.pre_nms_thresh
    if candidate_inds.sum().item() == 0:
        return results

    pre_nms_top_n = candidate_inds.view(N, -1).sum(1)
    pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n)

    for batch_idx, (per_box_cls, per_box_regression, per_pre_nms_top_n,
                    per_candidate_inds, per_anchors) in enumerate(zip(
                        box_cls, box_regression, pre_nms_top_n,
                        candidate_inds, anchors)):
        # Sort and select TopN
        per_box_cls = per_box_cls[per_candidate_inds]

        per_candidate_nonzeros = per_candidate_inds.nonzero()
        per_box_loc = per_candidate_nonzeros[:, 0]
        per_class = per_candidate_nonzeros[:, 1]
        per_class += 1

        # topk is only needed when there are more candidates than the cap
        if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
            per_box_cls, top_k_indices = \
                per_box_cls.topk(per_pre_nms_top_n, sorted=False)
            per_box_loc = per_box_loc[top_k_indices]
            per_class = per_class[top_k_indices]

        detections = self.box_coder.decode(
            per_box_regression[per_box_loc, :].view(-1, 4),
            per_anchors.bbox[per_box_loc, :].view(-1, 4))

        boxlist = BoxList(detections, per_anchors.size, mode="xyxy")
        boxlist.add_field("labels", per_class)
        boxlist.add_field("scores", per_box_cls)
        # integer floor division keeps the offsets usable as indices
        boxlist.add_field("sparse_off", per_box_loc // A)
        boxlist.add_field("sparse_anchor_idx", per_box_loc % A)
        boxlist.add_field("sparse_anchors",
                          per_anchors.bbox[per_box_loc, :].view(-1, 4))
        boxlist.add_field("sparse_batch",
                          per_box_loc.clone().fill_(batch_idx))

        boxlist = boxlist.clip_to_image(remove_empty=False)
        boxlist = remove_small_boxes(boxlist, self.min_size)
        results[batch_idx] = boxlist

    return results
def __getitem__(self, idx):
    """Load sample ``idx`` and build a target with part-level masks.

    Crowd annotations are discarded. When the first remaining annotation has
    a ``segmentation`` entry, each object's part polygons are rasterised and
    OR-merged into one binary mask per part category; the per-object results
    are stored in the ``masks`` field and the matching part labels in
    ``partlabels``. A ``keypoints`` field is added when the first annotation
    has keypoints.

    Returns:
        tuple: ``(img, target, idx)``; ``self._transforms`` is applied to
        ``img`` and ``target`` when it is not ``None``.
    """
    img, anno = super(COCODataset, self).__getitem__(idx)

    # filter crowd annotations
    # TODO might be better to add an extra field
    anno = [obj for obj in anno if obj["iscrowd"] == 0]

    xywh = torch.as_tensor([obj["bbox"] for obj in anno])
    xywh = xywh.reshape(-1, 4)  # guard against no boxes
    target = BoxList(xywh, img.size, mode="xywh").convert("xyxy")

    category_ids = [obj["category_id"] for obj in anno]
    labels = torch.tensor(
        [self.json_category_id_to_contiguous_id[c] for c in category_ids])
    target.add_field("labels", labels)

    if anno and "segmentation" in anno[0]:
        part_map = self.json_partcategory_id_to_contiguous_id
        merged_masks = []
        # each object's segmentation is a list of part dictionaries
        for obj in anno:
            parts = obj["segmentation"]
            part_ids = torch.tensor([part_map[part['class']] for part in parts])
            poly_mask = SegmentationMask(
                [part["segment"] for part in parts], img.size, mode='poly')

            # Merge all masks belonging to the same part class
            segments = poly_mask.get_mask_tensor()
            merged = torch.zeros(
                (len(self.partcategories), segments.size()[1],
                 segments.size(2)),
                dtype=torch.uint8)
            for poly_idx in range(part_ids.size()[0]):
                # part class k is accumulated into channel k - 1
                channel = int(part_ids[poly_idx]) - 1
                if 0 <= channel < len(self.partcategories):
                    merged[channel, :, :] = (
                        merged[channel, :, :] | segments[poly_idx, :, :])
            merged_masks.append(
                SegmentationMask(merged, img.size, mode='mask'))

        all_part_labels = torch.tensor([
            part_map[part_id]
            for part_id in range(1, len(self.partcategories) + 1)
        ])
        # every object gets the same full list of part labels
        target.add_field('partlabels', [all_part_labels] * len(merged_masks))
        target.add_field('masks', merged_masks)

    if anno and "keypoints" in anno[0]:
        keypoints = CarKeypoints(
            [obj["keypoints"] for obj in anno], img.size)
        target.add_field("keypoints", keypoints)

    target = target.clip_to_image(remove_empty=True)

    if self._transforms is None:
        return img, target, idx
    img, target = self._transforms(img, target)
    return img, target, idx
def agmap_coarse(gt_boxlist,
                 l_boxlist,
                 class_independ=False,
                 keep_small=True,
                 verbose=False):
    """Build an agmap from the ground truth and the low-resolution detections.

    Channel 0 accumulates, inside every ground-truth box, ``1 - score`` of the
    best overlapping detection when its IoU exceeds ``iou_thrs`` and ``1``
    otherwise, divided by the box area. Channel 1 accumulates, inside every
    detection box that has the wrong class or an IoU below
    ``iou_miss_thrs``, the detection score divided by the box area. The map
    is then resized to ``agmap_size`` with bilinear interpolation.

    :param gt_boxlist (BoxList): ground-truth boxes, must be in xyxy mode
    :param l_boxlist (BoxList): low-resolution detections, must be in xyxy mode
    :param class_independ (bool): ignore classes and only consider the IoU
        between boxes
    :param keep_small (bool): only use small objects (< 96x96) for the agmap
    :param verbose (bool): unused
    :return: agmap (np.ndarray)
    """
    # Optionally drop large objects: only boxes below 96x96 are kept. When
    # nothing is left, a single placeholder box stands in.
    if keep_small:
        small_limit = np.square(96)
        gt_keep = gt_boxlist.area() < small_limit
        l_keep = l_boxlist.area() < small_limit

        if gt_keep.any():
            gt_boxlist = gt_boxlist[gt_keep]
        else:
            gt_boxlist = BoxList([[0, 0, 0, 0]], gt_boxlist.size, mode="xyxy")
            gt_boxlist.add_field("labels",
                                 torch.as_tensor([0], dtype=torch.int64))

        if l_keep.any():
            l_boxlist = l_boxlist[l_keep]
        else:
            l_boxlist = BoxList([[1, 1, 1, 1]], l_boxlist.size, mode="xyxy")
            l_boxlist.add_field("labels",
                                torch.as_tensor([0], dtype=torch.int64))
            l_boxlist.add_field("scores",
                                torch.as_tensor([0], dtype=torch.float32))

    # initialise the agmap
    map_w, map_h = gt_boxlist.size
    agmap = np.zeros((2, map_h, map_w), np.float32)

    for gt_idx in range(len(gt_boxlist)):
        gt_single = gt_boxlist[gt_idx]
        gt_label = gt_single.get_field("labels").item()

        if class_independ:  # class-agnostic matching
            candidates = l_boxlist
        else:
            # only detections of the ground-truth class can recall it
            same_class = l_boxlist.get_field("labels") == gt_label
            candidates = l_boxlist[same_class]
        if len(candidates) == 0:
            candidates = BoxList([[1, 1, 1, 1]], candidates.size, mode="xyxy")
            candidates.add_field("scores",
                                 torch.as_tensor([0], dtype=torch.float32))

        cand_scores = candidates.get_field("scores").cpu().numpy()
        best_iou, best_idx = boxlist_iou(gt_single, candidates).max(dim=1)
        # gt_single holds exactly one box
        best_iou, best_idx = best_iou.item(), best_idx.item()

        gt_box = gt_single.bbox[0, :].cpu().numpy()
        gt_box = np.round(gt_box).astype(np.int64)  # integers for indexing
        x1, y1, x2, y2 = gt_box[:4]
        gt_box_area = (y2 - y1) * (x2 - x1)
        if gt_box_area == 0:
            continue

        if best_iou > iou_thrs:
            agmap[0, y1:y2, x1:x2] += (1 - cand_scores[best_idx]) / gt_box_area
        else:
            agmap[0, y1:y2, x1:x2] += 1. / gt_box_area

    det_scores = l_boxlist.get_field("scores").cpu().numpy()
    det_labels = l_boxlist.get_field("labels").cpu().numpy()
    gt_labels = gt_boxlist.get_field("labels").cpu().numpy()
    # best matching ground-truth box of every detection
    det_iou, det_gt_idx = boxlist_iou(gt_boxlist, l_boxlist).max(dim=0)
    det_iou, det_gt_idx = det_iou.cpu().numpy(), det_gt_idx.cpu().numpy()

    for det_idx in range(len(l_boxlist)):
        det_box = l_boxlist.bbox[det_idx, :].cpu().numpy()
        det_box = np.round(det_box).astype(np.int64)  # integers for indexing
        x1, y1, x2, y2 = det_box[:4]
        det_area = (y2 - y1) * (x2 - x1)

        wrong_class = (not class_independ
                       and gt_labels[det_gt_idx[det_idx]] != det_labels[det_idx])
        if det_area != 0 and (wrong_class
                              or det_iou[det_idx] < iou_miss_thrs):
            # gain for a low-resolution false detection
            agmap[1, y1:y2, x1:x2] += det_scores[det_idx] / det_area

    agmap = torch.from_numpy(agmap).unsqueeze(dim=0)
    with torch.no_grad():
        # agmap = agmap_avgpool(agmap)
        agmap = F.interpolate(agmap,
                              size=agmap_size,
                              mode='bilinear',
                              align_corners=False)
    return np.squeeze(agmap.cpu().numpy())
def inference_with_agmap(image_dir,
                         ldet_dir,
                         c5_dir,
                         config_file,
                         weight_file,
                         gt=None):
    """Run agmap-guided high-resolution inference over a folder of images.

    For every image: read the coarse detections and the stored c5 features,
    predict the agmap, segment it into sub-windows, run the detector on each
    sub-window of the resized image, merge those detections with the coarse
    detections not covered by a sub-window, record the result and display
    several debug windows.

    The checkpoint is loaded with ``map_location`` set to the selected device
    so that a checkpoint saved on a GPU can also be used on a CPU-only
    machine.

    :param image_dir (str): folder with the original images
    :param ldet_dir (str): folder with the coarse-resolution detections
        (spire-format json)
    :param c5_dir (str): folder with the c5 features of the original-image
        detection network (npy format)
    :param config_file: detector configuration passed to ``SpireDetector``
    :param weight_file: detector weights passed to ``SpireDetector``
    :param gt: if not None, passed to ``spire_anno.cocoapi_eval`` once all
        images are processed
    :return: None
    """
    fns = sorted(os.listdir(image_dir))

    # load the CR model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    checkpoint = torch.load(cr_saving_fn, map_location=device)
    model = CRNet(input_channels=2048).to(device)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    from spicv.spire_anno import SpireAnno
    from spicv.detection.structures.boxlist_ops import cat_boxlist
    from spicv.detection.structures.boxlist_ops import ignored_regions_iop

    spire_anno = SpireAnno(dataset='coco')
    detector = SpireDetector(config_file, weight_file, origin_size=True)

    for f in fns:
        if not f.endswith(('.jpg', '.jpeg', '.png')):
            continue

        ldet_fn = os.path.join(ldet_dir, f) + '.json'
        ldet_boxlist = spire_anno.to_boxlist(ldet_fn)

        # read the pre-computed c5 features straight from disk
        c5_fn = os.path.join(c5_dir, f) + '.npy'
        c5_torch = torch.from_numpy(np.load(c5_fn)).to(device)
        with torch.no_grad():
            c5_torch = F.interpolate(c5_torch,
                                     size=c5_size,
                                     mode='bilinear',
                                     align_corners=False)
            output = model(c5_torch)
        predicted_agmap = np.squeeze(output.cpu().numpy())
        sub_wins, h_size = agmap_segmentation(predicted_agmap,
                                              ldet_boxlist.size, 800)

        image_fn = os.path.join(image_dir, f)
        image = cv2.imread(image_fn)
        image_h = image.copy()
        image_l = image.copy()
        image = cv2.resize(image, h_size)

        prediction_list = []
        for win in sub_wins:
            cv2.rectangle(image, (win[0], win[1]),
                          (win[0] + win[2], win[1] + win[3]), (0, 210, 0), 2)
            image_win = image[win[1]:win[1] + win[3],
                              win[0]:win[0] + win[2], :]
            prediction = detector.detect(image_win)
            # shift window-relative boxes back into full-image coordinates
            prediction.bbox[:, 0] += win[0]
            prediction.bbox[:, 1] += win[1]
            prediction.bbox[:, 2] += win[0]
            prediction.bbox[:, 3] += win[1]
            prediction.size = h_size
            prediction = prediction.resize(ldet_boxlist.size)
            prediction_list.append(prediction)

        if len(sub_wins) > 0:
            sub_wins = BoxList(sub_wins, h_size, mode='xywh').resize(
                ldet_boxlist.size).convert(mode='xyxy')
            iou = ignored_regions_iop(sub_wins, ldet_boxlist, use_bbox=True)
            # keep only the coarse detections whose best overlap with any
            # sub-window is below 0.5
            ldet_val, ldet_id = iou.max(dim=0)
            ldet_boxlist = ldet_boxlist[ldet_val < 0.5]

        prediction_list.append(ldet_boxlist)
        predictions = cat_boxlist(prediction_list)
        spire_anno.from_maskrcnn_benchmark(predictions, f, image.shape)
        image_show = spire_anno.visualize_boxlist(image,
                                                  predictions,
                                                  score_th=0.01)

        # everything below is for display only
        hdet_fn = os.path.join(h_det_dir, f) + '.json'
        hdet_boxlist = spire_anno.to_boxlist(hdet_fn)
        image_h = spire_anno.visualize_boxlist(image_h,
                                               hdet_boxlist,
                                               score_th=0.01)
        cv2.imshow('image_h', image_h)

        ldet_fn = os.path.join(l_det_dir, f) + '.json'
        ldet_boxlist = spire_anno.to_boxlist(ldet_fn)
        image_l = spire_anno.visualize_boxlist(image_l,
                                               ldet_boxlist,
                                               score_th=0.01)
        cv2.imshow('image_l', image_l)

        agmap_fn = os.path.join(agmap_saving_dir_train, f) + '.npy'
        agmap = cv2.resize(np.load(agmap_fn), ldet_boxlist.size)
        agmap_color = agmap * alpha
        agmap_color = (agmap_color + 1) * 127
        agmap_color[agmap_color > 255] = 255
        agmap_color[agmap_color < 0] = 0
        agmap_color = cv2.applyColorMap(agmap_color.astype(np.uint8),
                                        cv2.COLORMAP_HOT)
        cv2.imshow("agmap", agmap_color)

        agmap_fn = os.path.join(predicted_agmap_dir, f) + '.npy'
        agmap = cv2.resize(np.load(agmap_fn), ldet_boxlist.size)
        agmap_color = (agmap + 1) * 127
        agmap_color[agmap_color > 255] = 255
        agmap_color[agmap_color < 0] = 0
        agmap_color = cv2.applyColorMap(agmap_color.astype(np.uint8),
                                        cv2.COLORMAP_HOT)
        cv2.imshow("prediected_agmap", agmap_color)

        cv2.imshow('image', image_show)
        cv2.waitKey(100)
        print(f)

    if gt is not None:
        spire_anno.cocoapi_eval(gt)
    print('inference done!')
def agmap_total(gt_boxlist,
                l_boxlist,
                h_boxlist,
                class_independ=False,
                keep_small=True,
                reward=False,
                verbose=False):
    """
    Build the agmap from the ground truth and both detection results.

    :param gt_boxlist (BoxList): ground-truth boxes, must be in xyxy mode
    :param l_boxlist (BoxList): low-resolution detections, must be in xyxy mode
    :param h_boxlist (BoxList): high-resolution detections, must be in xyxy mode
    :param class_independ (bool): ignore classes and only consider the IoU
        between boxes
    :param keep_small (bool): only use small objects (< 96x96) for the agmap
    :param reward (bool): return the accumulated scalar gain instead of the maps
    :param verbose (bool): show the agmap as a colour map in an OpenCV window
    :return: ``agval`` (float) when ``reward`` is true, otherwise the tuple
        ``(agmap, agmap_split)`` of np.ndarray resized to ``agmap_size``
    """
    # Optionally drop large objects and compute the agmap only for objects
    # below 96x96. A list left empty is replaced by one placeholder box.
    if keep_small:
        gt_area = gt_boxlist.area()
        l_area = l_boxlist.area()
        h_area = h_boxlist.area()
        gt_keep, l_keep, h_keep = gt_area < np.square(96), l_area < np.square(
            96), h_area < np.square(96)
        if torch.sum(gt_keep) == 0:
            gt_boxlist = BoxList([[0, 0, 0, 0]], gt_boxlist.size, mode="xyxy")
            gt_boxlist.add_field("labels",
                                 torch.as_tensor([0], dtype=torch.int64))
        else:
            gt_boxlist = gt_boxlist[gt_keep]
        if torch.sum(l_keep) == 0:
            l_boxlist = BoxList([[0, 0, 0, 0]], l_boxlist.size, mode="xyxy")
            l_boxlist.add_field("labels",
                                torch.as_tensor([0], dtype=torch.int64))
            l_boxlist.add_field("scores",
                                torch.as_tensor([0], dtype=torch.float32))
        else:
            l_boxlist = l_boxlist[l_keep]
        if torch.sum(h_keep) == 0:
            h_boxlist = BoxList([[0, 0, 0, 0]], h_boxlist.size, mode="xyxy")
            h_boxlist.add_field("labels",
                                torch.as_tensor([0], dtype=torch.int64))
            h_boxlist.add_field("scores",
                                torch.as_tensor([0], dtype=torch.float32))
        else:
            h_boxlist = h_boxlist[h_keep]

    # gt_boxlist.size is (image_width, image_height); transposing yields the
    # (height, width) layout used for indexing below
    agmap = np.zeros(gt_boxlist.size, np.float32).T
    # separate map for the split of the gain into false detections and misses;
    # only the "recalled by h but not by l" case below writes to it
    agmap_split = np.zeros((1, gt_boxlist.size[1], gt_boxlist.size[0]),
                           np.float32)
    # scalar gain used for the reward evaluation
    agval = 0.
    # draw the gain into the ground-truth box, or into the l_det box
    use_gt_bbox = True

    for i in range(len(gt_boxlist)):
        g_bbox_i = gt_boxlist[i]
        g_label = g_bbox_i.get_field("labels").item()
        if class_independ:
            l_boxlist_sel = l_boxlist
            h_boxlist_sel = h_boxlist
        else:
            # only detections of the ground-truth class can recall it
            l_boxlist_sel = l_boxlist[l_boxlist.get_field("labels") ==
                                      g_label]
            h_boxlist_sel = h_boxlist[h_boxlist.get_field("labels") ==
                                      g_label]
        # An empty selection is replaced by one zero-score placeholder box so
        # the IoU / max below still work.
        # NOTE(review): the nesting of these two checks was reconstructed from
        # a whitespace-mangled source - confirm they sit at loop level.
        if len(l_boxlist_sel) == 0:
            l_boxlist_sel = BoxList([[0, 0, 0, 0]],
                                    l_boxlist_sel.size,
                                    mode="xyxy")
            l_boxlist_sel.add_field(
                "scores", torch.as_tensor([0], dtype=torch.float32))
        if len(h_boxlist_sel) == 0:
            h_boxlist_sel = BoxList([[0, 0, 0, 0]],
                                    h_boxlist_sel.size,
                                    mode="xyxy")
            h_boxlist_sel.add_field(
                "scores", torch.as_tensor([0], dtype=torch.float32))

        l_score = l_boxlist_sel.get_field("scores").cpu().numpy()
        h_score = h_boxlist_sel.get_field("scores").cpu().numpy()
        iou_l = boxlist_iou(g_bbox_i, l_boxlist_sel)
        iou_h = boxlist_iou(g_bbox_i, h_boxlist_sel)
        l_val, l_id = iou_l.max(dim=1)
        l_val, l_id = l_val.item(), l_id.item()  # g_bbox_i holds a single box
        h_val, h_id = iou_h.max(dim=1)
        h_val, h_id = h_val.item(), h_id.item()  # g_bbox_i holds a single box

        # First score the agmap from the ground truth. Three cases: both l and
        # h recall the object, only l recalls it, only h recalls it.
        # g_bbox = gt_boxlist.bbox[i, :].cpu().numpy()
        g_bbox = g_bbox_i.bbox[0, :].cpu().numpy()
        g_bbox = np.round(g_bbox).astype(np.int64)  # round so it can index
        g_area = (g_bbox[3] - g_bbox[1]) * (g_bbox[2] - g_bbox[0])
        l_bbox = l_boxlist_sel.bbox[l_id, :].cpu().numpy()
        l_bbox = np.round(l_bbox).astype(np.int64)
        l_area = (l_bbox[3] - l_bbox[1]) * (l_bbox[2] - l_bbox[0])

        if l_val > iou_thrs and h_val > iou_thrs:
            ag = h_score[h_id] - l_score[l_id]
        elif l_val > iou_thrs:  # gain for a high-resolution miss
            ag = -l_score[l_id]
        elif h_val > iou_thrs:  # gain for a low-resolution miss
            ag = h_score[h_id]
            if g_area != 0:
                agmap_split[0, g_bbox[1]:g_bbox[3],
                            g_bbox[0]:g_bbox[2]] += ag / g_area
        else:
            ag = 0

        agval += ag
        if use_gt_bbox and g_area != 0:
            # spread the gain over the ground-truth box
            agmap[g_bbox[1]:g_bbox[3], g_bbox[0]:g_bbox[2]] += ag / g_area
        elif l_area != 0:
            agmap[l_bbox[1]:l_bbox[3], l_bbox[0]:l_bbox[2]] += ag / l_area

    iou_l = boxlist_iou(gt_boxlist, l_boxlist)
    iou_h = boxlist_iou(gt_boxlist, h_boxlist)
    l_score = l_boxlist.get_field("scores").cpu().numpy()
    h_score = h_boxlist.get_field("scores").cpu().numpy()
    l_label = l_boxlist.get_field("labels").cpu().numpy()
    h_label = h_boxlist.get_field("labels").cpu().numpy()
    g_label = gt_boxlist.get_field("labels").cpu().numpy()
    # best matching ground-truth box (IoU and index) of every detection
    l_val, l_id = iou_l.max(dim=0)
    l_val, l_id = l_val.cpu().numpy(), l_id.cpu().numpy()
    h_val, h_id = iou_h.max(dim=0)
    h_val, h_id = h_val.cpu().numpy(), h_id.cpu().numpy()

    for i in range(len(l_boxlist)):
        l_bbox = l_boxlist.bbox[i, :].cpu().numpy()
        l_bbox = np.round(l_bbox).astype(np.int64)  # round so it can index
        area = (l_bbox[3] - l_bbox[1]) * (l_bbox[2] - l_bbox[0])
        # false detection: wrong class (unless class-agnostic) or IoU too low
        if ((g_label[l_id[i]] != l_label[i] and not class_independ)
                or l_val[i] < iou_thrs) and area != 0:
            agval += l_score[i]
            agmap[l_bbox[1]:l_bbox[3],
                  l_bbox[0]:l_bbox[2]] += l_score[i] / area  # gain for a low-resolution false detection

    for i in range(len(h_boxlist)):
        h_bbox = h_boxlist.bbox[i, :].cpu().numpy()
        h_bbox = np.round(h_bbox).astype(np.int64)  # round so it can index
        area = (h_bbox[3] - h_bbox[1]) * (h_bbox[2] - h_bbox[0])
        if ((g_label[h_id[i]] != h_label[i] and not class_independ)
                or h_val[i] < iou_thrs) and area != 0:
            agval -= h_score[i]
            agmap[h_bbox[1]:h_bbox[3],
                  h_bbox[0]:h_bbox[2]] -= h_score[i] / area  # gain for a high-resolution false detection

    # add the batch / channel dimensions expected by F.interpolate
    agmap = torch.from_numpy(agmap).unsqueeze(dim=0).unsqueeze(dim=0)
    agmap_split = torch.from_numpy(agmap_split).unsqueeze(dim=0)
    with torch.no_grad():
        # agmap = agmap_avgpool(agmap)
        agmap = F.interpolate(agmap,
                              size=agmap_size,
                              mode='bilinear',
                              align_corners=False)
        agmap_split = F.interpolate(agmap_split,
                                    size=agmap_size,
                                    mode='bilinear',
                                    align_corners=False)
    agmap = np.squeeze(agmap.cpu().numpy())
    agmap_split = np.squeeze(agmap_split.cpu().numpy())

    if verbose:
        # map from [-1, 1] to [0, 255] for colour-map visualisation
        agmap_color = agmap * alpha
        agmap_color = cv2.resize(agmap_color, gt_boxlist.size)
        agmap_color = (agmap_color + 1) * 127
        agmap_color[agmap_color > 255] = 255
        agmap_color[agmap_color < 0] = 0
        agmap_color = cv2.applyColorMap(agmap_color.astype(np.uint8),
                                        cv2.COLORMAP_HOT)
        cv2.imshow("agmap", agmap_color)
        cv2.waitKey(200)

    if reward:
        return agval
    else:
        return agmap, agmap_split
def __getitem__(self, idx):
    """Load one sample, optionally with dense centerness/box targets.

    When ``self.use_mask`` is set, RLE instance masks are decoded, the image
    is read with OpenCV, optional augmentation is applied, boxes are
    re-derived from the (possibly transformed) masks and three dense maps at
    quarter resolution are built.  Otherwise the sample is produced from the
    parent dataset's image and annotations.

    Arguments:
        idx (int): index of the sample in ``self.ids``.

    Returns:
        ``(img, target, idx, (centerness, gt_bbox, anchor_bbox))`` when
        ``self.use_mask`` is true, ``(img, target, idx)`` otherwise.
    """
    if self.use_mask:
        coco = self.coco
        img_id = self.ids[idx]
        ann_ids = coco.getAnnIds(imgIds=img_id)
        anno = coco.loadAnns(ann_ids)
        path = coco.loadImgs(img_id)[0]['file_name']

        # Crowd annotations are kept here; they are flagged through the
        # "is_crowd" field instead of being dropped.
        masks = [obj["segmentation"] for obj in anno]
        # RLE interpretation
        rle_sizes = [tuple(inst["size"]) for inst in masks]
        assert rle_sizes.count(rle_sizes[0]) == len(rle_sizes), (
            "All the sizes must be the same size: %s" % rle_sizes)
        # in RLE, height comes first in "size"
        rle_height, rle_width = rle_sizes[0]
        masks = mask_utils.decode(masks)  # [h, w, n]
        image = cv2.cvtColor(cv2.imread(os.path.join(self.root, path)),
                             cv2.COLOR_BGR2RGB)

        if self.data_aug:
            image, window, scale, padding, crop = self.resize_image(
                image,
                min_dim=512,
                max_dim=512,
                min_scale=False,
                mode='crop',
                aspect_ratio=1.3,  # 1.5
                zoom=1.5,  # 1.7
                min_enlarge=1.2,  # 1.5
            )
            masks = self.resize_mask(masks, scale, padding, crop)

            # Random horizontal / vertical flips, applied to image and masks.
            if random.randint(0, 1):
                image = np.ascontiguousarray(np.fliplr(image))
                masks = np.ascontiguousarray(np.fliplr(masks))
            if random.randint(0, 1):
                image = np.ascontiguousarray(np.flipud(image))
                masks = np.ascontiguousarray(np.flipud(masks))

            # Random rotation by a multiple of 90 degrees.
            coin = np.random.random()
            if coin < 0.25:
                k = 1
            elif coin < 0.5:
                k = 2
            elif coin < 0.75:
                k = 3
            else:
                k = 0
            image = np.rot90(image, k=k, axes=(0, 1))
            masks = np.rot90(masks, k=k, axes=(0, 1))

            rot_range = 10.  # 22.5
            channel_shift_range = 15  # 20
            # Small free-angle rotation, applied half of the time.
            if np.random.uniform(0, 1) > 0.5:
                image, masks = self.img_rot(
                    image, masks,
                    angle=np.random.uniform(-rot_range, rot_range))
            # NOTE(review): the original nesting is not recoverable from the
            # flattened source; the channel shift is assumed unconditional.
            image = self.random_channel_shift(image, channel_shift_range, 2)

            # Some masks may be empty after cropping; drop those instances.
            _idx = np.sum(masks, axis=(0, 1)) > 0
            masks = masks[:, :, _idx]

            # NOTE(review): an older comment described extract_bboxes as
            # returning (y1, x1, y2, x2); the boxes are consumed as xyxy
            # below -- confirm against extract_bboxes.
            boxes = self.extract_bboxes(masks)

            img = Image.fromarray(image)
            target = BoxList(torch.as_tensor(boxes), img.size, mode="xyxy")

            classes = [obj["category_id"] for obj in anno]
            classes = np.array([
                self.json_category_id_to_contiguous_id[c] for c in classes
            ])[_idx]
            classes = torch.as_tensor(classes)
            target.add_field("labels", classes)

            is_crowd = np.array([obj["iscrowd"] for obj in anno])[_idx]
            is_crowd = torch.as_tensor(is_crowd)
            target.add_field("is_crowd", is_crowd)

            non_crowd = np.array(
                [obj["iscrowd"] == 0 for obj in anno])[_idx]
            centerness, gt_bbox, anchor_bbox = self._build_center_targets(
                masks, boxes, is_crowd, non_crowd)

            masks = torch.tensor(masks).permute(2, 0, 1)  # [n, h, w]
            assert masks.shape[1] == img.size[1]
            assert masks.shape[2] == img.size[0]
            masks = SegmentationMask(masks, img.size, mode='mask')
            target.add_field("masks", masks)

            if self._transforms is not None:
                img, target = self._transforms(img, target)
        else:
            if self.is_train:
                if random.randint(0, 1):
                    image = np.ascontiguousarray(np.fliplr(image))
                    masks = np.ascontiguousarray(np.fliplr(masks))
                if random.randint(0, 1):
                    image = np.ascontiguousarray(np.flipud(image))
                    masks = np.ascontiguousarray(np.flipud(masks))

            boxes = self.extract_bboxes(masks)

            img = Image.fromarray(image)
            target = BoxList(torch.as_tensor(boxes), img.size, mode="xyxy")

            classes = [obj["category_id"] for obj in anno]
            classes = [
                self.json_category_id_to_contiguous_id[c] for c in classes
            ]
            classes = torch.tensor(classes)
            target.add_field("labels", classes)

            is_crowd = [obj["iscrowd"] > 0 for obj in anno]
            is_crowd = torch.as_tensor(is_crowd)
            target.add_field("is_crowd", is_crowd)

            non_crowd = np.array([obj["iscrowd"] == 0 for obj in anno])
            # The helper also covers the "every instance is crowd" case,
            # which previously raised here on a zero-size reduction.
            centerness, gt_bbox, anchor_bbox = self._build_center_targets(
                masks, boxes, is_crowd, non_crowd)

            masks = torch.tensor(masks).permute(2, 0, 1)  # [n, h, w]
            assert masks.shape[1] == rle_height == img.size[1]
            assert masks.shape[2] == rle_width == img.size[0]
            masks = SegmentationMask(masks, img.size, mode='mask')
            target.add_field("masks", masks)

            target = target.clip_to_image(remove_empty=True)

            if self._transforms is not None:
                img, target = self._transforms(img, target)

        return img, target, idx, \
            (torch.as_tensor(centerness), torch.as_tensor(gt_bbox),
             torch.as_tensor(anchor_bbox))

    img, anno = super(COCODataset, self).__getitem__(idx)

    # filter crowd annotations
    # TODO might be better to add an extra field
    anno = [obj for obj in anno if obj["iscrowd"] == 0]

    boxes = [obj["bbox"] for obj in anno]
    boxes = torch.as_tensor(boxes).reshape(-1, 4)  # guard against no boxes
    target = BoxList(boxes, img.size, mode="xywh").convert("xyxy")

    classes = [obj["category_id"] for obj in anno]
    classes = [self.json_category_id_to_contiguous_id[c] for c in classes]
    classes = torch.tensor(classes)
    target.add_field("labels", classes)

    if anno and "segmentation" in anno[0]:
        masks = [obj["segmentation"] for obj in anno]
        masks = SegmentationMask(masks, img.size, mode='poly')
        target.add_field("masks", masks)

    if anno and "keypoints" in anno[0]:
        keypoints = [obj["keypoints"] for obj in anno]
        keypoints = PersonKeypoints(keypoints, img.size)
        target.add_field("keypoints", keypoints)

    target = target.clip_to_image(remove_empty=True)

    if self._transforms is not None:
        img, target = self._transforms(img, target)

    return img, target, idx


def _build_center_targets(self, masks, boxes, is_crowd, non_crowd):
    """Build the quarter-resolution centerness / box / anchor maps.

    Arguments:
        masks (ndarray): instance masks indexed as [h, w, n].
        boxes: per-instance boxes, each unpacked as (x, y, xe, ye).
        is_crowd: per-instance flags; truthy entries are skipped.
        non_crowd (ndarray): index selecting the non-crowd instances
            along the last axis of ``masks``.

    Returns:
        tuple: ``(centerness, gt_bbox, anchor_bbox)`` numpy arrays.
        ``centerness`` is -1 outside every non-crowd mask, 0 inside a mask
        and 1 inside the shrunken centre window of each non-crowd box;
        ``gt_bbox`` stores the owning box at those centre cells;
        ``anchor_bbox`` stores, for every cell, a window of +/-16 pixels
        around ``4 * cell``, clamped to the mask extent.
    """
    mask_h, mask_w = masks.shape[0], masks.shape[1]

    non_crowd_masks = masks[:, :, non_crowd]
    if non_crowd_masks.size == 0:
        # Nothing to reduce over: use a single all-zero mask instead.
        non_crowd_masks = np.zeros(shape=(mask_h, mask_w, 1))

    centerness = scipy.ndimage.zoom(non_crowd_masks.max(axis=2),
                                    zoom=[0.25, 0.25], order=0)
    centerness = (centerness > 0).astype(np.float32)
    # Order matters: background becomes -1 first, then foreground becomes 0.
    centerness[centerness == 0] = -1.
    centerness[centerness > 0] = 0.

    center_scale = 0.3
    gt_bbox = np.zeros(shape=(centerness.shape[0], centerness.shape[1], 4))
    anchor_bbox = np.zeros(shape=gt_bbox.shape)

    # Vectorised equivalent of filling every (yy, xx) cell individually.
    ys = np.arange(centerness.shape[0]) * 4
    xs = np.arange(centerness.shape[1]) * 4
    anchor_bbox[:, :, 0] = np.maximum(0.0, xs - 16)[np.newaxis, :]
    anchor_bbox[:, :, 1] = np.maximum(0.0, ys - 16)[:, np.newaxis]
    anchor_bbox[:, :, 2] = np.minimum(xs + 16, mask_w)[np.newaxis, :]
    anchor_bbox[:, :, 3] = np.minimum(ys + 16, mask_h)[:, np.newaxis]

    for bi, box in enumerate(boxes):
        if is_crowd[bi]:
            continue
        x, y, xe, ye = box
        w = xe - x
        h = ye - y
        # Box centre and shrunken half-extent, in quarter-resolution cells.
        ctr_x = x * 0.25 + w * 0.25 * 0.5
        ctr_y = y * 0.25 + h * 0.25 * 0.5
        hw = w * 0.25 * 0.5 * center_scale
        hh = h * 0.25 * 0.5 * center_scale
        sx = math.floor(ctr_x - hw)
        sy = math.floor(ctr_y - hh)
        # Always mark at least one cell per box.
        ex = max(sx + 1, math.ceil(ctr_x + hw))
        ey = max(sy + 1, math.ceil(ctr_y + hh))
        centerness[sy:ey, sx:ex] = 1.
        gt_bbox[sy:ey, sx:ex, :] = [x, y, xe, ye]

    return centerness, gt_bbox, anchor_bbox
def filter_results(self, boxlist_left, boxlist_right, num_classes):
    """Threshold scores and run per-class NMS on paired left/right boxes.

    NMS is computed on the union of each left/right box pair; the surviving
    indices are then applied to both the left and the right boxes, so the
    two returned lists stay aligned.

    Arguments:
        boxlist_left (BoxList): left-view boxes with a "scores" field.
        boxlist_right (BoxList): right-view boxes.
        num_classes (int): number of classes, background included.

    Returns:
        tuple[BoxList, BoxList]: filtered left and right detections, each
        with "scores" and "labels" fields.
    """
    # Work on raw tensors; there is no multi-class NMS on BoxList.
    row_width = num_classes * 4
    left_all = boxlist_left.bbox.reshape(-1, row_width)
    right_all = boxlist_right.bbox.reshape(-1, row_width)
    union_all = boxlist_union(boxlist_left, boxlist_right).bbox.reshape(
        -1, row_width)
    scores = boxlist_left.get_field("scores").reshape(-1, num_classes)
    device = scores.device

    kept_left = []
    kept_right = []
    passes_thresh = scores > self.score_thresh
    # Class 0 is background and is skipped.
    for cls in range(1, num_classes):
        rows = passes_thresh[:, cls].nonzero().squeeze(1)
        cols = slice(cls * 4, (cls + 1) * 4)
        cls_scores = scores[rows, cls]

        union_cls = BoxList(union_all[rows, cols], boxlist_left.size,
                            mode="xyxy")
        left_cls = BoxList(left_all[rows, cols], boxlist_left.size,
                           mode="xyxy")
        right_cls = BoxList(right_all[rows, cols], boxlist_left.size,
                            mode="xyxy")
        union_cls.add_field("scores", cls_scores)
        left_cls.add_field("scores", cls_scores)
        right_cls.add_field("scores", cls_scores)

        # Suppress on the union boxes, then select the same rows in both views.
        survivors = boxlist_nms_idx(union_cls, self.nms)
        left_cls = left_cls[survivors]
        right_cls = right_cls[survivors]

        count = len(left_cls)
        left_cls.add_field(
            "labels",
            torch.full((count, ), cls, dtype=torch.int64, device=device))
        right_cls.add_field(
            "labels",
            torch.full((count, ), cls, dtype=torch.int64, device=device))
        kept_left.append(left_cls)
        kept_right.append(right_cls)

    result_left = cat_boxlist(kept_left)
    result_right = cat_boxlist(kept_right)
    total = len(result_left)

    # Limit to max_per_image detections **over all classes**
    if total > self.detections_per_img > 0:
        all_scores = result_left.get_field("scores")
        cutoff, _ = torch.kthvalue(
            all_scores.cpu(), total - self.detections_per_img + 1)
        selected = torch.nonzero(all_scores >= cutoff.item()).squeeze(1)
        result_left = result_left[selected]
        result_right = result_right[selected]
    return result_left, result_right
def depth_evaluation(
        dataset,
        predictions,
        output_folder,
        box_only,
        iou_types,
        expected_results,
        expected_results_sigma_tol,
        score_threshold=0.05,
        bbox_iou_threshold=0.5,
        height_to_depth=False,
):
    """Match predicted boxes to ground truth and evaluate their depths.

    For every image, predictions above ``score_threshold`` are greedily
    matched to ground-truth boxes by IoU (best pair first, each box used at
    most once, pairs below ``bbox_iou_threshold`` left unmatched).  One
    record per kept prediction is collected and the whole table is passed
    to ``evaluate_results``.

    Arguments:
        dataset: dataset exposing ``coco``, ``id_to_img_map``,
            ``get_img_info`` and the category-id mappings.  Optional
            attributes ``remove_truncated``, ``class_filter_list``,
            ``depth_key``, ``input_depth_mode``, ``output_depth_mode``,
            ``min_value`` and ``max_value`` are honoured when present.
        predictions: per-image predictions, indexed like the dataset.
        output_folder, box_only, iou_types, expected_results,
        expected_results_sigma_tol: accepted but not used by this function.
        score_threshold (float): predictions must score above this.
        bbox_iou_threshold (float): minimum IoU for a prediction/gt match.
        height_to_depth (bool): derive depths from the "height_rw"
            annotation / predicted heights via ``_height_to_depth``.

    Returns:
        The value returned by ``evaluate_results``.
    """
    logger = logging.getLogger("maskrcnn_benchmark.inference")
    logger.info("Preparing results for Depth Evaluation")

    # Optional dataset settings do not change per image; read them once.
    remove_truncated = getattr(dataset, 'remove_truncated', False)
    class_filter_list = getattr(dataset, 'class_filter_list', ())
    depth_key = getattr(dataset, 'depth_key', "depth")
    input_depth_mode = getattr(dataset, 'input_depth_mode', depth_key)
    # Read for parity with the dataset configuration; currently unused.
    output_depth_mode = getattr(dataset, 'output_depth_mode', "depth")
    min_value = getattr(dataset, 'min_value', 0.1)
    max_value = getattr(dataset, 'max_value', 100)

    # result table "file_name" : result
    depth_results = {}
    gt_box_num = 0
    for image_id, prediction in enumerate(predictions):
        original_id = dataset.id_to_img_map[image_id]
        if len(prediction) == 0:
            continue

        img_info = dataset.get_img_info(image_id)
        image_width = img_info["width"]
        image_height = img_info["height"]
        image_size = (image_width, image_height)
        file_name = img_info["file_name"]
        camera = img_info["camera_params"] \
            if "camera_params" in img_info else None

        # ---- ground truth -------------------------------------------------
        ann_ids = dataset.coco.getAnnIds(imgIds=original_id)
        anno = dataset.coco.loadAnns(ann_ids)
        if remove_truncated:
            anno = [obj for obj in anno if obj["truncated"] == 0]
        if len(class_filter_list) > 0:
            anno = [
                obj for obj in anno
                if obj["category_id"] in class_filter_list
            ]

        boxes = [obj["bbox"] for obj in anno]
        boxes = torch.as_tensor(boxes).reshape(-1, 4)  # guard against no boxes
        target = BoxList(boxes, image_size, mode="xywh").convert("xyxy")

        classes = [
            dataset.json_category_id_to_contiguous_id[obj["category_id"]]
            for obj in anno
        ]
        target.add_field("labels", torch.tensor(classes))

        if height_to_depth:
            height = torch.tensor([obj["height_rw"] for obj in anno])
            target.add_field("depths", height)
            target = _height_to_depth(target, img_info)
        elif anno and depth_key in anno[0]:
            depth = PointDepth(
                [obj[depth_key] for obj in anno],
                image_size,
                focal_length=camera["intrinsic"]["fx"],
                baseline=camera["extrinsic"]["baseline"],
                min_value=min_value,
                max_value=max_value,
                mode=input_depth_mode)
            target.add_field("depths", depth)

        gt = target.resize(image_size)
        gt_boxes = gt.bbox.tolist()
        if len(gt_boxes) == 0:
            continue
        gt_box_num += len(gt_boxes)
        gt_labels = gt.get_field("labels").tolist()
        gt_depths = gt.get_field('depths').convert("depth").depths.tolist()
        gt_mapped_labels = [
            dataset.contiguous_category_id_to_json_id[i] for i in gt_labels
        ]

        # ---- predictions --------------------------------------------------
        prediction = prediction.resize(image_size)
        prediction = prediction.convert("xyxy")

        scores = prediction.get_field("scores")
        positive_indices = scores > score_threshold
        # Filter the scores with the same mask as the boxes so that index k
        # below refers to the same detection in every per-detection list.
        scores = scores[positive_indices].tolist()
        boxes = prediction.bbox[positive_indices].tolist()
        if len(boxes) == 0:
            continue
        labels = prediction.get_field("labels")[positive_indices].tolist()

        if height_to_depth:
            prediction = _height_to_depth(prediction, img_info)
        depths = prediction.get_field('depths')[positive_indices]
        if not isinstance(depths, PointDepth):
            depths = PointDepth(
                depths,
                image_size,
                focal_length=camera["intrinsic"]["fx"],
                baseline=camera["extrinsic"]["baseline"],
                min_value=min_value,
                max_value=max_value,
                mode="depth")
        depths = depths.convert("depth")
        depths = depths.depths.tolist()
        mapped_labels = [
            dataset.contiguous_category_id_to_json_id[i] for i in labels
        ]

        # ---- greedy one-to-one matching -----------------------------------
        overlaps = boxlist_iou(prediction[positive_indices], gt)
        gt_overlaps = torch.zeros(len(gt_boxes))
        dt_matches = [-1] * len(boxes)
        for j in range(min(len(boxes), len(gt_boxes))):
            # best remaining prediction for every gt box ...
            max_overlaps, argmax_overlaps = overlaps.max(dim=0)
            # ... and the gt box that is currently covered best
            gt_ovr, gt_ind = max_overlaps.max(dim=0)
            if gt_ovr < bbox_iou_threshold:
                # Nothing changes once the best pair is below the
                # threshold, so no later iteration can match either.
                break
            assert gt_ovr >= 0
            box_ind = int(argmax_overlaps[gt_ind])
            gt_ind = int(gt_ind)
            dt_matches[box_ind] = gt_ind
            # record the iou coverage of this gt box
            gt_overlaps[j] = overlaps[box_ind, gt_ind]
            assert gt_overlaps[j] == gt_ovr
            # mark the prediction and the gt box as used
            overlaps[box_ind, :] = -1
            overlaps[:, gt_ind] = -1

        records = []
        for k in range(len(boxes)):
            match = dt_matches[k]
            matched = match >= 0
            records.append({
                'image_id': original_id,
                'category_id': mapped_labels[k],
                'bbox': boxes[k],
                'depth': depths[k][0],
                'gt_category_id': gt_mapped_labels[match] if matched else None,
                'gt_bbox': gt_boxes[match] if matched else None,
                'gt_depth': gt_depths[match] if matched else None,
                'score': scores[k],
            })
        depth_results[file_name] = records

    logger.info("Evaluating predictions")
    logger.info("Ground Truth boxes %d" % gt_box_num)
    results = evaluate_results(depth_results)
    import json
    logger.info(json.dumps(results, sort_keys=True, indent=4))
    return results
def __getitem__(self, idx):
    """Load one image with boxes, labels and optional mask/keypoint/depth.

    Arguments:
        idx (int): index of the sample in ``self.ids``.

    Returns:
        tuple: ``(img, target, idx)`` where ``target`` is a ``BoxList`` in
        xyxy mode clipped to the image.
    """
    img_id = self.ids[idx]
    annotations = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))
    img_info = self.coco.loadImgs(img_id)[0]

    img = Image.open(
        os.path.join(self.root, img_info['file_name'])).convert('RGB')
    if self.transform is not None:
        img = self.transform(img)
    if self.target_transform is not None:
        annotations = self.target_transform(annotations)

    # Drop crowd annotations, then apply the optional category whitelist.
    # TODO might be better to add an extra field
    annotations = [a for a in annotations if a["iscrowd"] == 0]
    if len(self.class_filter_list) > 0:
        annotations = [
            a for a in annotations
            if a["category_id"] in self.class_filter_list
        ]

    # reshape guards against the no-box case
    xywh = torch.as_tensor(
        [a["bbox"] for a in annotations]).reshape(-1, 4)
    target = BoxList(xywh, img.size, mode="xywh").convert("xyxy")

    labels = torch.tensor([
        self.json_category_id_to_contiguous_id[a["category_id"]]
        for a in annotations
    ])
    target.add_field("labels", labels)

    first = annotations[0] if annotations else None

    if annotations and "segmentation" in first:
        polygons = [a["segmentation"] for a in annotations]
        target.add_field(
            "masks", SegmentationMask(polygons, img.size, mode='poly'))

    if annotations and "keypoints" in first:
        kps = [a["keypoints"] for a in annotations]
        target.add_field("keypoints", PersonKeypoints(kps, img.size))

    if annotations and self.depth_key in first:
        camera = img_info["camera_params"]
        depth = PointDepth(
            [a[self.depth_key] for a in annotations],
            img.size,
            focal_length=camera["intrinsic"]["fx"],
            baseline=camera["extrinsic"]["baseline"],
            min_value=self.depth_range[0],
            max_value=self.depth_range[1],
            mode=self.depth_key)
        # Store depths in the representation requested for the model output.
        target.add_field("depths", depth.convert(self.output_depth_mode))

    target = target.clip_to_image(remove_empty=True)

    if self._transforms is not None:
        img, target = self._transforms(img, target)

    return img, target, idx
def __getitem__(self, idx):
    """Load one image with quadrilateral annotations.

    The image is rotated upright when its record carries a non-zero
    "angle".  Each annotation's 8-value "keypoint" (four x/y pairs) gives
    an axis-aligned bounding box and is also stored as a polygon under the
    "keypoints" field.

    Arguments:
        idx (int): index of the sample in ``self.ids``.

    Returns:
        tuple: ``(img, target, idx)``; ``target`` is an xyxy ``BoxList``
        with "keypoints" and "labels" fields.

    Raises:
        ValueError: if the image "angle" is non-zero and not one of
            90, 180 or 270.
    """
    coco = self.coco
    img_id = self.ids[idx]
    ann_ids = coco.getAnnIds(imgIds=img_id)
    anno = coco.loadAnns(ann_ids)
    loaded_img = coco.loadImgs(img_id)[0]
    path = loaded_img['file_name']

    img = Image.open(os.path.join(self.root, path)).convert('RGB')

    # Compare by value: ``is not 0`` tests object identity, which is
    # unreliable for numbers (e.g. 0.0) and a SyntaxWarning on Python 3.8+.
    if 'angle' in loaded_img and loaded_img["angle"] != 0:
        angle = loaded_img["angle"]
        if angle == 90:
            img = img.rotate(270, expand=True)
        elif angle == 180:
            img = img.rotate(180, expand=True)
        elif angle == 270:
            img = img.rotate(90, expand=True)
        else:
            raise ValueError(
                "unsupported image angle: {!r}".format(angle))

    # filter crowd annotations
    # TODO might be better to add an extra field
    anno = [
        obj for obj in anno if obj["iscrowd"] == 0 and obj["ignore"] == 0
    ]

    keypoints = np.array([obj["keypoint"] for obj in anno],
                         dtype=np.float32).reshape((-1, 8))

    # Axis-aligned extent of every quadrilateral, clamped to [1, 1024].
    # NOTE(review): 1024 is hard-coded and presumably the image side
    # length -- confirm against the dataset.
    xs = keypoints[:, ::2]
    ys = keypoints[:, 1::2]
    xmins = np.maximum(np.min(xs, axis=1), 1)
    ymins = np.maximum(np.min(ys, axis=1), 1)
    xmaxs = np.minimum(np.max(xs, axis=1), 1024)
    ymaxs = np.minimum(np.max(ys, axis=1), 1024)

    xyxy = np.vstack([xmins, ymins, xmaxs, ymaxs]).transpose()
    boxes = torch.from_numpy(xyxy).reshape(-1, 4)  # guard against no boxes
    target = BoxList(boxes, img.size, mode="xyxy")

    # The raw quadrilaterals travel with the target as one-polygon masks.
    keypoints = SegmentationMask(
        keypoints.reshape((-1, 1, 8)).tolist(), img.size, mode='poly')
    target.add_field("keypoints", keypoints)

    classes = [obj["category_id"] for obj in anno]
    classes = [self.json_category_id_to_contiguous_id[c] for c in classes]
    classes = torch.tensor(classes)
    target.add_field("labels", classes)

    # NOTE Qimeng: clipping is left off to keep the angle (alpha) correct
    # target = target.clip_to_image(remove_empty=True)

    if self._transforms is not None:
        img, target = self._transforms(img, target)

    return img, target, idx
def forward(self, features_, proposals, targets=None, query=False):
    """
    Arguments:
        features_ (list[Tensor]): feature-maps from possibly several levels
        proposals (list[BoxList]): proposal boxes
        targets (list[BoxList], optional): the ground-truth targets.
        query (bool): at test time, treat ``targets`` as the boxes to
            describe and copy the proposals' fields onto them.

    Returns:
        x (Tensor): the result of the feature extractor
        proposals (list[BoxList]): during training, the boxes the features
            were pooled from. During testing, the post-processed boxlists.
        losses (dict[Tensor]): During training, returns the losses for the
            head. During testing, returns an empty dict.
    """
    # Zero-pad every map at the bottom by its own height (doubling it).
    features = [
        F.pad(fmap, (0, 0, 0, fmap.shape[2]), "constant", value=0.0)
        for fmap in features_
    ]

    if self.training:
        targets = self.ratio_estimator.get_ratio(targets, self.training)
        proposals = targets
    elif query:
        for gt, prop in zip(targets, proposals):
            gt.add_field("embeds", prop.get_field("embeds"))
        if self.padreg:
            for gt, prop in zip(targets, proposals):
                gt.add_field("reg_vals", prop.get_field("reg_vals"))
            if self.query_by_gt:
                ratio_fn = self.ratio_estimator.get_ratio
            else:
                ratio_fn = self.ratio_estimator.get_ratio_by_est
            proposals = ratio_fn(targets, self.training)
        else:
            proposals = targets
    elif self.padreg:
        # Gallery images: stretch each box downwards by its decoded
        # regression value before pooling.
        old_proposals = proposals
        device_ = proposals[0].bbox.device
        stretched_lists = []
        for prop in proposals:
            boxes = 1.0 * prop.bbox
            count = boxes.shape[0]
            reg_vals = est_decode(prop.get_field("reg_vals"))
            stretched = []
            for row in range(count):
                box = boxes[row, :]
                height = box[3] - box[1]
                box[3] = box[1] + height * (1. / (1. - reg_vals[row]))
                stretched.append(box.tolist())
            if count == 0:
                stretched = torch.tensor([]).view(0, 4)
            stretched_list = BoxList(stretched, prop.size, mode="xyxy")
            stretched_list._copy_extra_fields(prop)
            stretched_lists.append(stretched_list)
        proposals = [bl.to(device_) for bl in stretched_lists]
    # otherwise: keep the proposals as they are

    x = self.feature_extractor(features, proposals)
    # final classifier that converts the features into predictions
    part_feat = self.predictor(x)

    if self.training:
        loss_part_oim = self.loss_evaluator(part_feat, targets)
        loss_names = [
            "loss_reid_p" + str(i)
            for i in range(1, len(loss_part_oim) + 1)
        ]
        return (
            x,
            proposals,
            dict(zip(loss_names, loss_part_oim)),
        )

    # Inference. For query, proposals are ground truth; for gallery they
    # are detections, so hand the boxes to exchange_box together with the
    # original ones before post-processing.
    if not query and self.padreg:
        proposals = self.exchange_box(old_proposals, proposals)
    result = self.post_processor(part_feat, proposals)
    return x, result, {}
def forward_for_single_feature_map(self, anchors, objectness,
                                   box_regression):
    """
    Arguments:
        anchors: list[BoxList], each carrying a "rrects" field
        objectness: tensor of size N, A, H, W
        box_regression: tensor of size N, A * REGRESSION_CN, H, W

    Returns:
        list[BoxList]: one xyxy boxlist per image with "rrects" and
        "objectness" fields.
    """
    device = objectness.device
    N, A, H, W = objectness.shape

    # put in the same format as anchors
    objectness = permute_and_flatten(objectness, N, A, 1, H, W)
    objectness = objectness.view(N, -1).sigmoid()
    box_regression = permute_and_flatten(box_regression, N, A,
                                         REGRESSION_CN, H, W)

    # Keep at most pre_nms_top_n candidates per image, best first.
    top_n = min(self.pre_nms_top_n, A * H * W)
    objectness, topk_idx = objectness.topk(top_n, dim=1, sorted=True)

    batch_idx = torch.arange(N, device=device)[:, None]
    box_regression = box_regression[batch_idx, topk_idx]

    image_shapes = [box.size for box in anchors]
    rrect_anchors = torch.cat(
        [a.get_field("rrects") for a in anchors], dim=0)
    rrect_anchors = rrect_anchors.reshape(
        N, -1, REGRESSION_CN)[batch_idx, topk_idx]

    decoded = self.box_coder.decode(
        box_regression.view(-1, REGRESSION_CN),
        rrect_anchors.view(-1, REGRESSION_CN))
    decoded = decoded.view(N, -1, REGRESSION_CN)

    result = []
    for rrects, scores, im_shape in zip(decoded, objectness, image_shapes):
        # filter small boxes
        if self.min_size > 0:
            big_enough = remove_small_boxes(rrects, self.min_size)
            rrects = rrects[big_enough]
            scores = scores[big_enough]

        # perform rotated nms
        survivors = self.nms_rotate(rrects, scores)
        rrects = rrects[survivors]
        scores = scores[survivors]

        # convert the rotated rects to bboxes for the BoxList
        boxlist = BoxList(convert_rects_to_bboxes(rrects, torch),
                          im_shape, mode="xyxy")
        boxlist.add_field("rrects", rrects)
        boxlist.add_field("objectness", scores)
        result.append(boxlist.clip_to_image(remove_empty=False))
    return result
def forward_for_single_feature_map(self, anchors, box_cls, box_regression,
                                   pre_nms_thresh=0.05):
    """
    Arguments:
        anchors: list[BoxList]
        box_cls: tensor of size N, A * C, H, W
        box_regression: tensor of size N, A * 4, H, W
        pre_nms_thresh: sigmoid score a (location, class) pair must exceed
            to become a candidate

    Returns:
        list[BoxList]: one xyxy boxlist per image with "labels" (1-based
        class index) and "scores" fields.
    """
    device = box_cls.device
    N, _, H, W = box_cls.shape
    A = box_regression.size(1) // 4  # anchors per location
    C = box_cls.size(1) // A  # number of classes

    # put in the same format as anchors: (N, H*W*A, C) and (N, H*W*A, 4)
    box_cls = box_cls.view(N, -1, C, H, W).permute(0, 3, 4, 1, 2)
    box_cls = box_cls.reshape(N, -1, C).sigmoid()
    box_regression = box_regression.view(N, -1, 4, H, W)
    box_regression = box_regression.permute(0, 3, 4, 1, 2)
    box_regression = box_regression.reshape(N, -1, 4)

    candidate_inds = box_cls > pre_nms_thresh
    if candidate_inds.sum().item() == 0:
        # Nothing passes the threshold: one empty boxlist per image.
        empty_boxlists = []
        for a in anchors:
            empty = BoxList(torch.Tensor(0, 4).to(device), a.size)
            empty.add_field("labels", torch.LongTensor([]).to(device))
            empty.add_field("scores", torch.Tensor([]).to(device))
            empty_boxlists.append(empty)
        return empty_boxlists

    # Per-image candidate budget, capped at self.pre_nms_top_n.
    pre_nms_top_n = candidate_inds.view(N, -1).sum(1)
    pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n)

    results = [[] for _ in range(N)]
    per_image = zip(box_cls, box_regression, pre_nms_top_n,
                    candidate_inds, anchors)
    for batch_idx, (cls_i, reg_i, top_n_i, cand_i, anchors_i) in \
            enumerate(per_image):
        # Scores and (location, class) coordinates of the candidates.
        cand_scores = cls_i[cand_i]
        cand_coords = cand_i.nonzero()
        cand_loc = cand_coords[:, 0]
        cand_class = cand_coords[:, 1]
        cand_class += 1  # class 0 is reserved for background

        # Select the top-N when there are more candidates than the budget.
        if cand_i.sum().item() > top_n_i.item():
            cand_scores, top_k_indices = \
                cand_scores.topk(top_n_i, sorted=False)
            cand_loc = cand_loc[top_k_indices]
            cand_class = cand_class[top_k_indices]

        detections = self.box_coder.decode(
            reg_i[cand_loc, :].view(-1, 4),
            anchors_i.bbox[cand_loc, :].view(-1, 4))

        boxlist = BoxList(detections, anchors_i.size, mode="xyxy")
        boxlist.add_field("labels", cand_class)
        boxlist.add_field("scores", cand_scores)
        boxlist = boxlist.clip_to_image(remove_empty=False)
        boxlist = remove_small_boxes(boxlist, self.min_size)
        results[batch_idx] = boxlist

    return results
def filter_results(self, boxlist, num_classes, new_thresh=None,
                   min_detections=10):
    """Returns bounding-box detection results by thresholding on scores and
    applying non-maximum suppression (NMS).

    If fewer than ``min_detections`` boxes survive, the score threshold is
    halved and the filtering repeated, as long as lowering the threshold
    can still admit additional candidates.

    Arguments:
        boxlist (BoxList): boxes laid out as ``num_classes * 4`` values per
            row, with a "scores" field of ``num_classes`` values per row.
        num_classes (int): number of classes, background included.
        new_thresh (float, optional): score threshold; a falsy value
            selects ``self.score_thresh``.
        min_detections (int): number of detections below which the
            threshold is relaxed.

    Returns:
        tuple: ``(result, idx)`` where ``result`` is a BoxList with
        "scores" and "labels" fields and ``idx`` is an int64 tensor holding,
        for every kept detection, its row index in the reshaped input.
    """
    # unwrap the boxlist to avoid additional overhead.
    # if we had multi-class NMS, we could perform this directly on the boxlist
    boxes = boxlist.bbox.reshape(-1, num_classes * 4)
    scores = boxlist.get_field("scores").reshape(-1, num_classes)
    device = scores.device

    if not new_thresh:
        new_thresh = self.score_thresh
    inds_all = scores > new_thresh

    result = []
    idx = []  # row indices of the selected candidates
    # Skip j = 0, because it's the background class
    for j in range(1, num_classes):
        inds = inds_all[:, j].nonzero().squeeze(1)
        scores_j = scores[inds, j]
        boxes_j = boxes[inds, j * 4:(j + 1) * 4]
        boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy")
        boxlist_for_class.add_field("scores", scores_j)
        boxlist_for_class, keep = boxlist_nms(boxlist_for_class, self.nms)
        num_labels = len(boxlist_for_class)
        if len(inds) > 0:
            idx.append(inds[keep])
        boxlist_for_class.add_field(
            "labels",
            torch.full((num_labels,), j, dtype=torch.int64, device=device))
        result.append(boxlist_for_class)

    # Always hand back a tensor, even when nothing was selected.
    if len(idx) > 0:
        idx = torch.cat(idx)
    else:
        idx = torch.empty(0, dtype=torch.int64, device=device)

    result = cat_boxlist(result)
    number_of_detections = len(result)

    if number_of_detections < min_detections:
        # Relax the threshold only while that can change the outcome: some
        # foreground score must lie in (0, new_thresh].  Without this check
        # the halving never terminates when too few candidates exist, or
        # when detections_per_img is smaller than min_detections.
        fg_scores = scores[:, 1:]
        relaxable = (fg_scores > 0) & (fg_scores <= new_thresh)
        lower_thresh = new_thresh / 2
        if lower_thresh > 0 and bool(relaxable.any()):
            return self.filter_results(
                boxlist, num_classes,
                new_thresh=lower_thresh,
                min_detections=min_detections)

    # Limit to max_per_image detections **over all classes**
    if number_of_detections > self.detections_per_img > 0:
        cls_scores = result.get_field("scores")
        image_thresh, _ = torch.kthvalue(
            cls_scores.cpu(),
            number_of_detections - self.detections_per_img + 1)
        keep = cls_scores >= image_thresh.item()
        keep = torch.nonzero(keep).squeeze(1)
        result = result[keep]
        idx = idx[keep]
    return result, idx
def __getitem__(self, idx, raw_image=False):
    """Load sample ``idx`` and turn its annotations into BoxList targets.

    Arguments:
        idx (int): sample index.
        raw_image (bool): when true, only the untransformed image is returned.

    Returns:
        ``img`` alone when ``raw_image`` is set; otherwise
        ``(img, target, idx)`` or, when ``self.is_ignore`` is set,
        ``(img, target, idx, target_ignore)``.
    """
    img, anno = super(COCODataset, self).__getitem__(idx)
    if raw_image:
        return img

    def build_target(annotations, with_keypoints):
        # filter crowd annotations
        # TODO might be better to add an extra field
        kept = [obj for obj in annotations if obj["iscrowd"] == 0]

        # reshape guards against the no-box case
        xywh = torch.as_tensor([obj["bbox"] for obj in kept]).reshape(-1, 4)
        built = BoxList(xywh, img.size, mode="xywh").convert("xyxy")

        label_ids = [
            self.json_category_id_to_contiguous_id[obj["category_id"]]
            for obj in kept
        ]
        built.add_field("labels", torch.tensor(label_ids))

        if kept and "segmentation" in kept[0]:
            polygons = [obj["segmentation"] for obj in kept]
            built.add_field(
                "masks", SegmentationMask(polygons, img.size, mode='poly')
            )

        if with_keypoints and kept and "keypoints" in kept[0]:
            points = [obj["keypoints"] for obj in kept]
            built.add_field("keypoints", PersonKeypoints(points, img.size))

        return built.clip_to_image(remove_empty=True)

    # The ignore target is built first and never carries keypoints.
    if self.is_ignore:
        _, anno_ignore = self.COCODataset_ignore.__getitem__(idx)
        target_ignore = build_target(anno_ignore, with_keypoints=False)

    target = build_target(anno, with_keypoints=True)

    if self._transforms is not None:
        if self.is_ignore:
            img, target, target_ignore = self._transforms(
                img, target, target_ignore)
        else:
            img, target = self._transforms(img, target)

    if self.is_ignore:
        return img, target, idx, target_ignore
    return img, target, idx
def __getitem__(self, k):
    """Load image ``k`` with its pickled boxes and build both network inputs.

    Returns a 13-tuple:
        ``(im_yannickTransform, im_maskrcnnTransform, W, H, yc_estCam,
        padded_bboxes, num_bboxes, v0, f_pixels_yannick, short_name,
        img_path, target_maskrcnnTransform, labels)``.
    ``yc_estCam``, ``v0`` and ``f_pixels_yannick`` are -1 when
    ``self.train`` is false.
    """
    im_ori_RGB = Image.open(self.img_files[k]).convert('RGB')  # .size: (W, H)
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use with annotation files from a trusted source.
    with open(self.pickle_files[k], 'rb') as filehandle:
        data = pickle.load(filehandle)

    bboxes = data['bboxes'].astype(np.float32)  # [xywh]
    assert len(bboxes.shape) == 2 and bboxes.shape[1] == 4
    num_bboxes_ori = bboxes.shape[0]
    if 'label' in data:
        labels = data['label']  # ['car', 'person', 'person']
    else:
        labels = ['person'] * num_bboxes_ori

    # Keep at most GOOD_NUM boxes (and their labels).
    good_num = self.cfg.DATA.COCO.GOOD_NUM
    if bboxes.shape[0] > good_num:
        bboxes = bboxes[:good_num, :]
        labels = labels[:good_num]

    target_boxes = torch.as_tensor(bboxes).reshape(-1, 4)  # guard against no boxes
    target = BoxList(target_boxes, im_ori_RGB.size, mode="xywh").convert("xyxy")
    num_boxes = target.bbox.shape[0]

    if self.opt.est_kps:
        if 'kps' in data:
            kps_gt = data['kps'].astype(int)  # [N, 51]
            if num_bboxes_ori > good_num:
                kps_gt = kps_gt[:good_num, :]
            kps_gt = kps_gt.tolist()  # [[51]]
        else:
            kps_gt = [[0] * 51 for _ in range(num_boxes)]
        target_keypoints = PersonKeypoints(kps_gt, im_ori_RGB.size)
        target.add_field("keypoints", target_keypoints)

    target = target.clip_to_image(remove_empty=True)

    # Clipping may drop empty boxes, so per-box fields added afterwards must
    # use the post-clip count (previously "labels" used the pre-clip count
    # while "scores" used the post-clip one).
    num_kept = target.bbox.shape[0]
    # !!!!! all person (1) for now...
    classes = [self.json_category_id_to_contiguous_id[1]] * num_kept
    target.add_field("labels", torch.tensor(classes))
    target.add_field("scores", torch.tensor([1.] * num_kept))

    W, H = im_ori_RGB.size[:2]
    if self.train:
        yannick_results = loadmat(self.yannick_mat_files[k])
        horizon_visible = yannick_results['horizon_visible'][0][0].astype(
            np.float32)
        assert horizon_visible == 1
        horizon = yannick_results['pitch'][0][0].astype(np.float32)
        horizon_pixels_yannick = H * horizon
        v0 = H - horizon_pixels_yannick
        vfov = yannick_results['vfov'][0][0].astype(np.float32)
        f_pixels_yannick = H / 2. / (np.tan(vfov / 2.))
    else:
        f_pixels_yannick = -1
        v0 = -1

    im_yannickTransform = self.transforms_yannick(im_ori_RGB)  # [0., 1.] by default
    im_maskrcnnTransform, target_maskrcnnTransform = self.transforms_maskrcnn(
        im_ori_RGB, target)  # [0., 1.] by default

    if self.train and self.opt.est_kps:
        target_maskrcnnTransform.add_field("keypoints_ori", target_keypoints)
    # NOTE(review): the source's indentation was lost; these two fields are
    # attached unconditionally here -- confirm they were not meant to be
    # nested under the condition above.
    target_maskrcnnTransform.add_field("boxlist_ori", target)
    target_maskrcnnTransform.add_field('img_files',
                                       [self.img_files[k]] * num_boxes)

    if self.train:
        y_person = 1.75
        vc = H / 2.
        yc_list = []
        for bbox in bboxes:
            # Box top/bottom measured from the bottom edge of the image.
            vt = H - bbox[1]
            vb = H - (bbox[1] + bbox[3])
            # v0_single = yc * (vt - vb) / y_person + vb
            yc_single = y_person * (v0 - vb) / (vt - vb) / (
                1. + (vc - v0) * (vc - vt) / f_pixels_yannick**2)
            yc_list.append(yc_single)
        yc_estCam = np.median(np.asarray(yc_list))
    else:
        yc_estCam = -1

    assert len(labels) == bboxes.shape[0]

    return im_yannickTransform, im_maskrcnnTransform, W, H, \
        float(yc_estCam), \
        self.pad_bbox(bboxes, self.GOOD_NUM).astype(np.float32), bboxes.shape[0], float(v0), float(f_pixels_yannick), \
        os.path.basename(self.img_files[k])[:12], self.img_files[k], target_maskrcnnTransform, labels
# Driver fragment: builds a dummy image batch plus rotated / axis-aligned
# targets and runs the two post-processor smoke tests on them.
# NOTE(review): N, C and H are used below but are not assigned in this
# fragment -- presumably they are defined just above; confirm.
W = 240
device = 'cpu'
image = torch.zeros(N, C, H, W, device=device)
# Five values per row -- presumably (cx, cy, w, h, angle) rotated rects; confirm.
targets = np.array([[50, 50, 100, 100, 0], [50, 50, 50, 50, -90]], dtype=np.float32)
# Four values per row for the non-rotated test.
bbox_targets = np.array([[0, 0, 100, 100], [25, 25, 75, 75]], dtype=np.float32)
# Every image in the batch shares the same target array.
targets = [targets for ix in range(N)]
bbox_targets = [bbox_targets for ix in range(N)]
test_rpn_post_processor(image, bbox_targets)
test_rrpn_post_processor(image, targets)
from maskrcnn_benchmark.modeling.rrpn.utils import get_segmentation_mask_rotated_rect_tensor
tt = []
for ix, td in enumerate(targets):
    rect_pts = convert_rect_to_pts2(td)  #.reshape((len(td), 8))
    nn = len(rect_pts)
    # Axis-aligned bounds of each rectangle: min / max over its corner points.
    bboxes = np.zeros((nn, 4), dtype=np.float32)
    bboxes[:, :2] = np.min(rect_pts, axis=1)
    bboxes[:, 2:] = np.max(rect_pts, axis=1)
    boxlist = BoxList(bboxes, (W, H), mode="xyxy")
    # Each rectangle becomes a single 8-value polygon.
    mm = SegmentationMask(rect_pts.reshape(nn, 1, 8).tolist(), (W, H), mode='poly')
    boxlist.add_field("masks", mm)
    tt.append(boxlist)
# NOTE(review): the source's indentation was lost; this call is placed after
# the loop (so it sees the last ``mm``) -- confirm it was not inside the loop.
rrect_tensor = get_segmentation_mask_rotated_rect_tensor(mm)
def calc_detection_voc_prec_rec(gt_boxlists, pred_boxlists, iou_thresh=0.5):
    """Calculate precision and recall based on evaluation code of PASCAL VOC.

    This function calculates precision and recall of predicted bounding boxes
    obtained from a dataset which has :math:`N` images. The code is based on
    the evaluation code used in PASCAL VOC Challenge.

    Arguments:
        gt_boxlists: iterable of ground-truth BoxLists carrying "labels" and
            "difficult" fields.
        pred_boxlists: iterable of predicted BoxLists carrying "labels" and
            "scores" fields, paired with ``gt_boxlists`` by position.
        iou_thresh (float): a prediction whose best IoU is below this value
            is treated as unmatched.

    Returns:
        tuple: ``(prec, rec)``, two lists indexed by class id. Entries of
        classes never seen stay ``None``; ``rec[l]`` is also ``None`` when the
        class has no non-difficult ground truth. Both lists are empty when no
        class was seen at all (e.g. empty inputs).
    """
    n_pos = defaultdict(int)
    score = defaultdict(list)
    match = defaultdict(list)
    for gt_boxlist, pred_boxlist in zip(gt_boxlists, pred_boxlists):
        pred_bbox = pred_boxlist.bbox.numpy()
        pred_label = pred_boxlist.get_field("labels").numpy()
        pred_score = pred_boxlist.get_field("scores").numpy()
        gt_bbox = gt_boxlist.bbox.numpy()
        gt_label = gt_boxlist.get_field("labels").numpy()
        gt_difficult = gt_boxlist.get_field("difficult").numpy()

        for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)):
            pred_mask_l = pred_label == l
            pred_bbox_l = pred_bbox[pred_mask_l]
            pred_score_l = pred_score[pred_mask_l]
            # sort by score
            order = pred_score_l.argsort()[::-1]
            pred_bbox_l = pred_bbox_l[order]
            pred_score_l = pred_score_l[order]

            gt_mask_l = gt_label == l
            gt_bbox_l = gt_bbox[gt_mask_l]
            gt_difficult_l = gt_difficult[gt_mask_l]

            n_pos[l] += np.logical_not(gt_difficult_l).sum()
            score[l].extend(pred_score_l)

            if len(pred_bbox_l) == 0:
                continue
            if len(gt_bbox_l) == 0:
                # Nothing to match against: every prediction is a false positive.
                match[l].extend((0, ) * pred_bbox_l.shape[0])
                continue

            # VOC evaluation follows integer typed bounding boxes.
            pred_bbox_l = pred_bbox_l.copy()
            pred_bbox_l[:, 2:] += 1
            gt_bbox_l = gt_bbox_l.copy()
            gt_bbox_l[:, 2:] += 1
            iou = boxlist_iou(
                BoxList(pred_bbox_l, gt_boxlist.size),
                BoxList(gt_bbox_l, gt_boxlist.size),
            ).numpy()
            gt_index = iou.argmax(axis=1)
            # set -1 if there is no matching ground truth
            gt_index[iou.max(axis=1) < iou_thresh] = -1
            del iou

            # match codes: 1 = true positive, 0 = false positive,
            # -1 = matched a "difficult" ground truth (ignored later).
            selec = np.zeros(gt_bbox_l.shape[0], dtype=bool)
            for gt_idx in gt_index:
                if gt_idx < 0:
                    match[l].append(0)
                    continue
                if gt_difficult_l[gt_idx]:
                    match[l].append(-1)
                elif not selec[gt_idx]:
                    match[l].append(1)
                else:
                    # ground truth already claimed by a higher-scored box
                    match[l].append(0)
                selec[gt_idx] = True

    # No class observed (empty inputs): max() below would raise ValueError.
    if not n_pos:
        return [], []

    n_fg_class = max(n_pos.keys()) + 1
    prec = [None] * n_fg_class
    rec = [None] * n_fg_class

    for l in n_pos.keys():
        score_l = np.array(score[l])
        match_l = np.array(match[l], dtype=np.int8)

        order = score_l.argsort()[::-1]
        match_l = match_l[order]

        tp = np.cumsum(match_l == 1)
        fp = np.cumsum(match_l == 0)

        # If an element of fp + tp is 0,
        # the corresponding element of prec[l] is nan.
        prec[l] = tp / (fp + tp)
        # If n_pos[l] is 0, rec[l] is None.
        if n_pos[l] > 0:
            rec[l] = tp / n_pos[l]

    return prec, rec
def test_rrpn_post_processor(image_tensor, targets_data):
    """Smoke-test the rotated RPN loss evaluator and post-processor.

    Builds BoxList targets from ``targets_data``, overrides several RPN
    settings on the global ``cfg`` (the overrides are not restored), feeds
    random objectness / regression maps through the loss evaluator and runs
    the test-mode post-processor. Nothing is asserted or returned.

    Arguments:
        image_tensor: 4-D tensor; its shape is unpacked as ``N, C, H, W``.
        targets_data: iterable with one array of rotated rects per image,
            each passed to ``convert_rect_to_pts2``.
    """
    from maskrcnn_benchmark.modeling.rrpn.inference import make_rpn_postprocessor, REGRESSION_CN
    from maskrcnn_benchmark.modeling.rrpn.loss import make_rpn_loss_evaluator
    N, C, H, W = image_tensor.shape
    targets = []
    for ix, td in enumerate(targets_data):
        rect_pts = convert_rect_to_pts2(td)  #.reshape((len(td), 8))
        nn = len(rect_pts)
        # Axis-aligned bounds of each rectangle: min / max over its corner points.
        bboxes = np.zeros((nn, 4), dtype=np.float32)
        bboxes[:, :2] = np.min(rect_pts, axis=1)
        bboxes[:, 2:] = np.max(rect_pts, axis=1)
        boxlist = BoxList(bboxes, (W, H), mode="xyxy")
        # Each rectangle becomes a single 8-value polygon.
        mm = SegmentationMask(rect_pts.reshape(nn, 1, 8).tolist(), (W, H), mode='poly')
        boxlist.add_field("masks", mm)
        targets.append(boxlist)
    device = image_tensor.device
    USE_FPN = False
    # The settings below mutate the shared global cfg.
    cfg.MODEL.ROTATED = True
    CFG_RPN = cfg.MODEL.RPN
    CFG_RPN.ANCHOR_ANGLES = (-90, -54, -18, 18, 54)
    CFG_RPN.ANCHOR_SIZES = (48, 84, 128, 224)
    CFG_RPN.ANCHOR_STRIDE = (16, )
    CFG_RPN.ASPECT_RATIOS = (1.0, 2.0)
    if USE_FPN:
        # One stride per anchor size.
        CFG_RPN.ANCHOR_STRIDE = tuple(np.array(CFG_RPN.ANCHOR_SIZES) // 8)
    # NOTE(review): the source's indentation was lost; this override is placed
    # outside the ``if USE_FPN`` block -- confirm.
    CFG_RPN.POST_NMS_TOP_N_TRAIN = 100
    image_list, feature_maps = get_image_list_and_feature_maps(
        image_tensor, CFG_RPN.ANCHOR_STRIDE)
    anchor_generator = make_rrpn_anchor_generator(cfg)
    num_anchors = anchor_generator.num_anchors_per_location()
    anchors = anchor_generator.forward(image_list, feature_maps)
    # Random network outputs, one pair of maps per feature level.
    objectness = []
    box_regression = []
    for ix, fm in enumerate(feature_maps):
        n_anchors = num_anchors[ix]
        N, _, h, w = fm.shape
        objectness.append(torch.rand(N, n_anchors, h, w, device=device))
        box_regression.append(
            torch.rand(N, n_anchors * REGRESSION_CN, h, w, device=device))
    # train mode
    postprocessor_train = make_rpn_postprocessor(cfg, rpn_box_coder=None, is_train=True)
    postprocessor_train.train()
    # result = postprocessor_train.forward(anchors, objectness, box_regression, targets=targets)
    # check loss
    loss_evaluator = make_rpn_loss_evaluator(cfg, postprocessor_train.box_coder)
    loss_objectness, loss_rpn_box_reg = loss_evaluator(anchors, objectness,
                                                       box_regression, targets)
    # test mode
    postprocessor_test = make_rpn_postprocessor(cfg, rpn_box_coder=None, is_train=False)
    postprocessor_test.eval()
    result = postprocessor_test.forward(anchors, objectness, box_regression)
def forward_for_single_feature_map(self, locations, box_cls,
                                   box_regression, centerness,
                                   image_sizes):
    """Turn the dense predictions of one feature level into BoxLists.

    Arguments:
        locations: tensor indexed by flattened location; columns 0 and 1 are
            combined with the regression offsets to form box corners.
        box_cls: tensor of size N, C, H, W (class logits)
        box_regression: tensor viewed as N, 4, H, W
        centerness: tensor viewed as N, 1, H, W (logits)
        image_sizes: per-image ``(h, w)`` pairs

    Returns:
        list[BoxList]: one "xyxy" BoxList per image with "labels" and
        "scores" fields, clipped to the image and without small boxes.
    """
    batch, num_cls, height, width = box_cls.shape

    # bring every map into (N, H*W, ...) layout, matching ``locations``
    cls_scores = box_cls.view(batch, num_cls, height, width).permute(0, 2, 3, 1)
    cls_scores = cls_scores.reshape(batch, -1, num_cls).sigmoid()
    offsets = box_regression.view(batch, 4, height, width).permute(0, 2, 3, 1)
    offsets = offsets.reshape(batch, -1, 4)
    ctr_scores = centerness.view(batch, 1, height, width).permute(0, 2, 3, 1)
    ctr_scores = ctr_scores.reshape(batch, -1).sigmoid()

    # candidates are chosen on the raw class score, before centerness weighting
    is_candidate = cls_scores > self.pre_nms_thresh
    keep_counts = is_candidate.view(batch, -1).sum(1)
    keep_counts = keep_counts.clamp(max=self.pre_nms_top_n)

    # multiply the classification scores with centerness scores
    cls_scores = cls_scores * ctr_scores[:, :, None]

    results = []
    for img_idx in range(batch):
        mask = is_candidate[img_idx]
        scores = cls_scores[img_idx][mask]

        hits = mask.nonzero()
        loc_idx = hits[:, 0]
        labels = hits[:, 1] + 1  # class ids start at 1

        deltas = offsets[img_idx][loc_idx]
        points = locations[loc_idx]

        limit = keep_counts[img_idx]
        if mask.sum().item() > limit.item():
            scores, chosen = scores.topk(limit, sorted=False)
            labels = labels[chosen]
            deltas = deltas[chosen]
            points = points[chosen]

        x1 = points[:, 0] - deltas[:, 0]
        y1 = points[:, 1] - deltas[:, 1]
        x2 = points[:, 0] + deltas[:, 2]
        y2 = points[:, 1] + deltas[:, 3]
        detections = torch.stack([x1, y1, x2, y2], dim=1)

        h, w = image_sizes[img_idx]
        boxlist = BoxList(detections, (int(w), int(h)), mode="xyxy")
        boxlist.add_field("labels", labels)
        boxlist.add_field("scores", scores)
        boxlist = boxlist.clip_to_image(remove_empty=False)
        results.append(remove_small_boxes(boxlist, self.min_size))

    return results
def forward_for_single_feature_map(self, anchors, objectness,
                                   box_regression_left,
                                   box_regression_right):
    """Decode paired left/right proposals for one feature level.

    Arguments:
        anchors: list[BoxList]
        objectness: tensor of size N, A, H, W
        box_regression_left: tensor of size N, A * 4, H, W
        box_regression_right: tensor of size N, A * 4, H, W

    Returns:
        tuple: ``(result, result_right)``, two lists with one BoxList per
        image. Only proposals kept by NMS on both sides are retained, and
        both BoxLists carry the shared "objectness" score.
    """
    device = objectness.device
    N, A, H, W = objectness.shape

    # put in the same format as anchors
    objectness = permute_and_flatten(objectness, N, A, 1, H, W).view(N, -1)
    objectness = objectness.sigmoid()

    box_regression_left = permute_and_flatten(box_regression_left, N, A, 4, H, W)
    box_regression_right = permute_and_flatten(box_regression_right, N, A, 4, H, W)

    num_anchors = A * H * W

    pre_nms_top_n = min(self.pre_nms_top_n, num_anchors)
    objectness, topk_idx = objectness.topk(pre_nms_top_n, dim=1, sorted=True)

    batch_idx = torch.arange(N, device=device)[:, None]
    box_regression_left = box_regression_left[batch_idx, topk_idx]
    box_regression_right = box_regression_right[batch_idx, topk_idx]

    image_shapes = [box.size for box in anchors]
    concat_anchors = torch.cat([a.bbox for a in anchors], dim=0)
    concat_anchors = concat_anchors.reshape(N, -1, 4)[batch_idx, topk_idx]

    # Both sides are decoded against the same anchors.
    flat_anchors = concat_anchors.view(-1, 4)
    proposals_left = self.box_coder.decode(
        box_regression_left.view(-1, 4), flat_anchors
    ).view(N, -1, 4)
    proposals_right = self.box_coder.decode(
        box_regression_right.view(-1, 4), flat_anchors
    ).view(N, -1, 4)

    result, result_right = [], []
    for proposal_left, proposal_right, score, im_shape in zip(
            proposals_left, proposals_right, objectness, image_shapes):
        boxlist_left = BoxList(proposal_left, im_shape, mode="xyxy")
        boxlist_left.add_field("objectness", score)
        boxlist_left = boxlist_left.clip_to_image(remove_empty=False)
        boxlist_left = remove_small_boxes(boxlist_left, self.min_size)
        # MAY CAUSE RuntimeError if training is unstable: copy_if failed to
        # synchronize: device-side assert triggered
        keep_idx_i_left = boxlist_nms_idx(
            boxlist_left,
            self.nms_thresh,
            max_proposals=self.post_nms_top_n,
            score_field="objectness",
        )

        boxlist_right = BoxList(proposal_right, im_shape, mode="xyxy")
        boxlist_right.add_field("objectness", score)
        boxlist_right = boxlist_right.clip_to_image(remove_empty=False)
        boxlist_right = remove_small_boxes(boxlist_right, self.min_size)
        # MAY CAUSE RuntimeError if training is unstable: copy_if failed to
        # synchronize: device-side assert triggered
        keep_idx_i_right = boxlist_nms_idx(
            boxlist_right,
            self.nms_thresh,
            max_proposals=self.post_nms_top_n,
            score_field="objectness",
        )

        # NOTE(review): the two index sets refer to boxlists that were
        # filtered independently by remove_small_boxes; intersecting them
        # assumes both sides kept the same rows -- confirm.
        # TODO: optimize this!
        common = np.intersect1d(keep_idx_i_left.cpu().numpy(),
                                keep_idx_i_right.cpu().numpy())
        # Move to the device of the inputs instead of forcing ``.cuda()``,
        # which fails on CPU-only runs and may pick a different GPU.
        keep_idx_i = torch.from_numpy(common).to(device)

        result.append(boxlist_left[keep_idx_i])
        result_right.append(boxlist_right[keep_idx_i])

    return result, result_right
def forward(self, anchors, objectness, box_regression, targets=None,
            centerness=None, rpn_center_box_regression=None,
            centerness_pack=None):
    """Post-process RPN outputs and optionally add centerness-based proposals.

    Arguments:
        anchors: list[list[BoxList]]
        objectness: list[tensor]
        box_regression: list[tensor]
        targets: optional ground-truth BoxLists, appended to the proposals
            while training.
        centerness: per-image centerness logits; only read when
            ``self.pred_targets`` is set.
        rpn_center_box_regression: per-image box regression maps paired with
            ``centerness``; only read when ``self.pred_targets`` is set.
        centerness_pack: unused, kept for interface compatibility.

    Returns:
        tuple: ``(boxlists, pred_targets)``. ``boxlists`` (list[BoxList]) are
        the post-processed anchors, after applying box decoding and NMS.
        ``pred_targets`` is ``None`` unless ``self.pred_targets`` is set, in
        which case it holds one BoxList (or ``None``) per image.
    """
    num_levels = len(objectness)
    anchors = list(zip(*anchors))
    sampled_boxes = [
        self.forward_for_single_feature_map(a, o, b)
        for a, o, b in zip(anchors, objectness, box_regression)
    ]

    boxlists = list(zip(*sampled_boxes))
    boxlists = [cat_boxlist(boxlist) for boxlist in boxlists]

    if num_levels > 1:
        boxlists = self.select_over_all_levels(boxlists)

    # append ground-truth bboxes to proposals
    if self.training and targets is not None:
        boxlists = self.add_gt_proposals(boxlists, targets)

    if not self.pred_targets:
        return boxlists, None

    # The regression-based decoder is the strategy that was hard-wired as
    # active; _pred_targets_from_center_masks is the retained alternative.
    pred_targets = self._pred_targets_from_center_regression(
        centerness, rpn_center_box_regression, boxlists)

    if not self.training:
        print('add', [
            len(pred_target) for pred_target in pred_targets if pred_target
        ], 'proposals')
    # NOTE(review): the source's indentation was lost; the proposals are added
    # in both training and inference here -- confirm.
    boxlists = self.add_pred_proposals(boxlists, pred_targets)
    return boxlists, pred_targets


def _pred_targets_from_center_regression(self, centerness,
                                         rpn_center_box_regression, boxlists):
    """Decode one box per connected high-centerness blob from the regression map."""
    pred_targets = []
    for img_centerness, center_box_reg in zip(centerness,
                                              rpn_center_box_regression):
        img_centerness = img_centerness[0, :, :]
        center_box_reg = center_box_reg.permute(1, 2, 0)
        height, width = center_box_reg.shape[0], center_box_reg.shape[1]

        # One anchor per cell: 32x32 box centred on (4 * x, 4 * y), clamped
        # to the size of the first image. Vectorised equivalent of the
        # former per-cell Python loop.
        img_w, img_h = boxlists[0].size
        xs = np.arange(width, dtype=np.float64) * 4
        ys = np.arange(height, dtype=np.float64) * 4
        anchor_bbox = np.zeros(shape=(height, width, 4))
        anchor_bbox[:, :, 0] = np.maximum(0.0, xs - 16)[None, :]
        anchor_bbox[:, :, 1] = np.maximum(0.0, ys - 16)[:, None]
        anchor_bbox[:, :, 2] = np.minimum(xs + 16, float(img_w))[None, :]
        anchor_bbox[:, :, 3] = np.minimum(ys + 16, float(img_h))[:, None]
        anchor_bbox = torch.as_tensor(anchor_bbox,
                                      device=center_box_reg.device)

        boxes = self.box_coder.decode(center_box_reg.reshape(-1, 4),
                                      anchor_bbox.view(-1, 4))

        pred_target = None
        pred_score = torch.sigmoid(img_centerness.detach()).cpu().numpy()
        imllabel, numlabel = scipy.ndimage.label(pred_score > 0.95)
        if numlabel > 0:
            # For every blob keep the box at its highest-scoring cell
            # (flat index into the H*W grid).
            box_inds = [
                np.argmax(pred_score * (imllabel == ano))
                for ano in range(1, numlabel + 1)
            ]
            boxes = boxes[box_inds, :]
            pred_target = BoxList(torch.as_tensor(boxes),
                                  boxlists[0].size,
                                  mode="xyxy")
            # return value intentionally unused, as in the original code
            pred_target.clip_to_image()
            pred_target = pred_target.to(img_centerness.device)
        pred_targets.append(pred_target)
    return pred_targets


def _pred_targets_from_center_masks(self, centerness, boxlists):
    """Alternative decoder: bounding boxes of the connected high-centerness blobs."""
    pred_targets = []
    for img_centerness in centerness:
        pred_target = None
        pred_mask = torch.sigmoid(
            img_centerness[0, :, :].detach()).cpu().numpy() > 0.95
        imllabel, numlabel = scipy.ndimage.label(pred_mask)
        if numlabel > 0:
            masks = np.zeros(shape=(pred_mask.shape[0], pred_mask.shape[1],
                                    numlabel),
                             dtype=np.uint8)
            for ano in range(1, numlabel + 1):
                masks[:, :, ano - 1] = imllabel == ano
            boxes = extract_bboxes(masks)
            pred_target = BoxList(torch.as_tensor(boxes),
                                  boxlists[0].size,
                                  mode="xyxy")
            pred_target.clip_to_image()
            pred_target = pred_target.to(img_centerness.device)
        pred_targets.append(pred_target)
    return pred_targets
def to_image_list_synthesize_4(transposed_info, size_divisible=0):
    """Tile every four consecutive samples into one 2x2 mosaic sample.

    Each image is resized (nearest neighbour) to half of the padded batch
    size and pasted into one quadrant (top-left, top-right, bottom-left,
    bottom-right); boxes, labels and the first polygon of every mask are
    resized and shifted accordingly.

    Arguments:
        transposed_info: sequence ``(tensors, targets, img_ids)``.
            ``tensors`` must be a tuple/list of image tensors whose length is
            a multiple of 4; ``targets`` are the matching BoxLists with
            "labels" and "masks" fields.
        size_divisible (int): when > 0, the padded height and width are
            rounded up to a multiple of this value.

    Returns:
        tuple: ``(ImageList, syn_targets, syn_img_ids)``; the ids are all -1
        because a mosaic has no single source image.

    Raises:
        TypeError: if ``tensors`` is not a tuple or list.
    """
    tensors = transposed_info[0]
    if not isinstance(tensors, (tuple, list)):
        raise TypeError("Unsupported type for to_image_list: {}".format(type(tensors)))

    targets = transposed_info[1]
    img_ids = transposed_info[2]
    # synthesize data:
    assert len(tensors) % 4 == 0, \
        'len(tensor) % 4 != 0, could not be synthesized ! uneven'

    max_size = tuple(max(s) for s in zip(*[img.shape for img in tensors]))
    # TODO Ideally, just remove this and let the model handle arbitrary
    # input sizes
    if size_divisible > 0:
        import math

        stride = size_divisible
        max_size = list(max_size)
        max_size[1] = int(math.ceil(max_size[1] / stride) * stride)
        max_size[2] = int(math.ceil(max_size[2] / stride) * stride)
        max_size = tuple(max_size)

    batch_shape = (len(tensors) // 4,) + max_size
    syn_batched_imgs = tensors[0].new(*batch_shape).zero_()

    # max_size is (C, H, W); every quadrant is (new_h, new_w).
    new_h, new_w = max_size[1] // 2, max_size[2] // 2
    # (x, y) origin of each quadrant, in source order:
    # top-left, top-right, bottom-left, bottom-right
    origins = ((0, 0), (new_w, 0), (0, new_h), (new_w, new_h))
    mosaic_size = (new_w * 2, new_h * 2)  # BoxList expects (w, h)

    syn_targets = []
    with torch.no_grad():
        for idx, pad_img in enumerate(syn_batched_imgs):
            # NOTE: interpolate api requires first h then w !
            resized_imgs = [
                torch.nn.functional.interpolate(
                    tensors[idx * 4 + q].unsqueeze(0),
                    size=(new_h, new_w),
                    mode='nearest',
                ).squeeze(0)
                for q in range(4)
            ]
            c = resized_imgs[0].shape[0]
            assert all(img.shape[0] == c for img in resized_imgs)
            for (off_x, off_y), img in zip(origins, resized_imgs):
                # Explicit upper bounds: with an odd padded size an open
                # slice (``new_w:``) is one pixel larger than the quadrant
                # and copy_ fails.
                pad_img[:c, off_y:off_y + new_h, off_x:off_x + new_w].copy_(img)

            # resize each of the four sub-targets to (new_h, new_w) scale
            # NOTE: resize api requires first w then h !
            quads = [
                targets[idx * 4 + q].resize((new_w, new_h)) for q in range(4)
            ]
            assert quads[0].mode == 'xyxy'

            shifted_boxes = []
            labels = []
            syn_mask = []
            for (off_x, off_y), quad in zip(origins, quads):
                # Build the offset on the boxes' own device.
                shift = torch.tensor([off_x, off_y, off_x, off_y],
                                     dtype=torch.float32,
                                     device=quad.bbox.device)
                shifted_boxes.append(quad.bbox + shift)
                labels.append(quad.extra_fields['labels'])
                # NOTE: only the first polygon of every instance is carried
                # over, as in the original code.
                for inst in quad.extra_fields['masks'].instances.polygons:
                    poly = inst.polygons[0]
                    # polygons are flat [x0, y0, x1, y1, ...] lists
                    poly_shift = np.array([off_x, off_y] * int(len(poly) / 2))
                    syn_mask.append([list(np.array(poly) + poly_shift)])

            # NOTE: BoxList initialization requires first w then h
            tmp_BoxList = BoxList(torch.cat(shifted_boxes, dim=0),
                                  mosaic_size, mode='xyxy')
            tmp_BoxList.add_field('labels', torch.cat(labels, dim=-1))
            tmp_BoxList.add_field(
                'masks', SegmentationMask(syn_mask, mosaic_size, mode='poly'))

            # append a four-to-one BoxList object
            syn_targets.append(tmp_BoxList)

    syn_targets = tuple(syn_targets)

    assert len(img_ids) % 4 == 0
    # since images are synthesized, id is meaningless, substitute with -1
    syn_img_ids = tuple([-1] * (len(syn_targets)))
    syn_image_sizes = [list(max_size)[-2:] for _ in range(batch_shape[0])]

    return ImageList(syn_batched_imgs, syn_image_sizes), syn_targets, syn_img_ids