def process_panoptic_prediction(panoptic_pred, num_stuff, idx, img_size, original_size):
    """Convert a panoptic prediction into COCO-style detection entries.

    Only "thing" segments are emitted: crowd segments, stuff categories
    (ids below ``num_stuff``) and the void label (255) are skipped.
    Boxes are rescaled from ``img_size`` to ``original_size`` and encoded
    as [y, x, h, w] floats; masks are resized and RLE-encoded.
    Returns a list of dicts, one per kept segment.
    """
    msk_pred, cat_pred, obj_pred, iscrowd_pred = panoptic_pred
    bbx_pred = extract_boxes(msk_pred, cat_pred.numel())

    # Rescale box coordinates to the original image resolution, clamping
    # each axis independently (axis 0 = rows, axis 1 = columns).
    for axis in (0, 1):
        cols = [axis, axis + 2]
        bbx_pred[:, cols] = (bbx_pred[:, cols] / img_size[axis] * original_size[axis]) \
            .clamp(min=0, max=original_size[axis])
    bbx_pred_size = bbx_pred[:, 2:] - bbx_pred[:, :2]

    results = []
    for seg_id, (obj_i, cat_i, bbx_i, iscrowd_i, size_i) in enumerate(
            zip(obj_pred, cat_pred, bbx_pred, iscrowd_pred, bbx_pred_size)):
        cat = cat_i.item()
        # Keep only non-crowd "thing" segments with a valid category.
        if iscrowd_i.item() == 1 or cat < num_stuff or cat == 255:
            continue

        entry = {
            "image_id": idx,
            "category_id": int(cat),
            "score": float(obj_i.item()),
        }
        # Note the (row, col) -> (x-ish first) swap: coordinates are stored
        # as [i, j] but emitted as [j-coord first? -- kept identical to the
        # original ordering: index 1 before index 0].
        entry["bbox"] = [
            float(bbx_i[1].item()),
            float(bbx_i[0].item()),
            float(size_i[1].item()),
            float(size_i[0].item()),
        ]

        # Binary mask for this segment id, resized to the original image
        # (PIL size is (width, height), hence the reversed tuple).
        seg_mask = msk_pred == seg_id
        seg_img = Image.fromarray(seg_mask.numpy()).resize(original_size[::-1], Image.NEAREST)
        rle = mask_encode(np.asfortranarray(np.array(seg_img)))
        rle["counts"] = str(rle["counts"], "utf-8")
        entry["segmentation"] = rle

        results.append(entry)
    return results
def im_post(boxes_all, masks_all, scores, pred_boxes, pred_masks, cls_inds, num_classes, im_info):
    """Paste predicted instance masks back into image space, per class.

    For every foreground class ``idx`` (classes 1..num_classes-1), selects the
    detections assigned to that class, resizes each (padded) mask logit map to
    its reference box, thresholds it at 0.5, pastes it into a full-image
    binary mask, and RLE-encodes it for the COCO evaluation API.

    Mutates ``boxes_all[idx]`` (appends an (N, 5) array of boxes+scores) and
    ``masks_all[idx]`` (appends the list of RLE dicts) in place; returns None.

    NOTE(review): assumes ``im_info`` is (height, width) in pixels and
    ``pred_masks`` has shape (N, C, M, M) — confirm against callers.
    """
    M = config.network.mask_size
    # Masks are padded by one pixel on each side before resizing; scale the
    # reference boxes accordingly so the padding maps outside the box.
    scale = (M + 2.0) / M
    ref_boxes = expand_boxes(pred_boxes, scale).astype(np.int32)
    padded_mask = np.zeros((M + 2, M + 2), dtype=np.float32)

    for idx in range(1, num_classes):
        # Compute the per-class selection mask once (it was previously
        # re-evaluated for every indexed array).
        keep = idx == cls_inds
        cls_boxes = np.hstack([
            pred_boxes[keep, :],
            scores.reshape(-1, 1)[keep]
        ])
        cls_pred_masks = pred_masks[keep]
        cls_ref_boxes = ref_boxes[keep]

        # Single-channel mask heads store everything in channel 0; multi-class
        # heads store one channel per category.
        channel = idx if pred_masks.shape[1] > 1 else 0

        segms = []
        for k in range(cls_boxes.shape[0]):
            padded_mask[1:-1, 1:-1] = cls_pred_masks[k, channel, :, :]

            ref_box = cls_ref_boxes[k, :]
            w = np.maximum(ref_box[2] - ref_box[0] + 1, 1)
            h = np.maximum(ref_box[3] - ref_box[1] + 1, 1)
            mask = cv2.resize(padded_mask, (w, h))
            mask = np.array(mask > 0.5, dtype=np.uint8)

            # Paste the box-sized mask into a full-image canvas, clipping the
            # box to the image bounds on both sides.
            im_mask = np.zeros((im_info[0], im_info[1]), dtype=np.uint8)
            x_0 = max(ref_box[0], 0)
            x_1 = min(ref_box[2] + 1, im_info[1])
            y_0 = max(ref_box[1], 0)
            y_1 = min(ref_box[3] + 1, im_info[0])
            im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - ref_box[1]):(y_1 - ref_box[1]),
                                             (x_0 - ref_box[0]):(x_1 - ref_box[0])]

            # Get RLE encoding used by the COCO evaluation API
            rle = mask_encode(np.array(im_mask[:, :, np.newaxis], order='F'))[0]
            rle['counts'] = rle['counts'].decode()
            segms.append(rle)

        boxes_all[idx].append(cls_boxes)
        masks_all[idx].append(segms)
def process_prediction(bbx_pred, cls_pred, obj_pred, msk_pred, img_size, idx, original_size):
    """Convert per-instance detections into COCO-style result dicts.

    Handles two mask formats: a dense ``torch.Tensor`` of ROI mask logits
    (resampled back into image space via ``roi_sampling``) or a
    ``PackedSequence`` of per-instance crops (thresholded and pasted at each
    box's top-left corner). ``msk_pred`` may also be None, in which case no
    segmentations are emitted.

    Boxes are rescaled from ``img_size`` to ``original_size`` in place and
    emitted as [y, x, h, w] floats (note the swapped coordinate order below).
    Returns a list of dicts with keys image_id / category_id / score / bbox
    and, when masks are present, an RLE "segmentation".
    """
    # Move everything to CPU
    bbx_pred, cls_pred, obj_pred = (t.cpu() for t in (bbx_pred, cls_pred, obj_pred))
    msk_pred = msk_pred.cpu() if msk_pred is not None else None

    if msk_pred is not None:
        if isinstance(msk_pred, torch.Tensor):
            # ROI-style prediction: invert the ROI transform so the fixed-size
            # mask logits can be sampled back onto the full image, then
            # binarize at 0.5 after a sigmoid.
            bbx_inv = invert_roi_bbx(bbx_pred, list(msk_pred.shape[-2:]), list(img_size))
            bbx_idx = torch.arange(0, msk_pred.size(0), dtype=torch.long)
            msk_pred = roi_sampling(msk_pred.unsqueeze(1).sigmoid(), bbx_inv, bbx_idx, list(img_size), padding="zero")
            msk_pred = msk_pred.squeeze(1) > 0.5
        elif isinstance(msk_pred, PackedSequence):
            # Seeds-style prediction: threshold in place, then paste each
            # variable-size crop into a zeroed full-image canvas at the
            # (row, col) given by the first two box coordinates.
            msk_pred.data = msk_pred.data > 0.5
            msk_pred_exp = msk_pred.data.new_zeros(len(msk_pred), img_size[0], img_size[1])
            for it, (msk_pred_i, bbx_pred_i) in enumerate(zip(msk_pred, bbx_pred)):
                i, j = int(bbx_pred_i[0].item()), int(bbx_pred_i[1].item())
                msk_pred_exp[it, i:i + msk_pred_i.size(0), j:j + msk_pred_i.size(1)] = msk_pred_i
            msk_pred = msk_pred_exp

    # Convert bbx and redo clamping. NOTE: this mutates bbx_pred in place and
    # must happen *after* the mask expansion above, which reads the
    # img_size-space coordinates.
    bbx_pred[:, [0, 2]] = (bbx_pred[:, [0, 2]] / img_size[0] * original_size[0]).clamp(min=0, max=original_size[0])
    bbx_pred[:, [1, 3]] = (bbx_pred[:, [1, 3]] / img_size[1] * original_size[1]).clamp(min=0, max=original_size[1])
    bbx_pred_size = bbx_pred[:, 2:] - bbx_pred[:, :2]

    outs = []
    for i, (bbx_pred_i, bbx_pred_size_i, cls_pred_i, obj_pred_i) in \
            enumerate(zip(bbx_pred, bbx_pred_size, cls_pred, obj_pred)):
        out = dict(image_id=idx, category_id=int(cls_pred_i.item()), score=float(obj_pred_i.item()))
        # Coordinates are stored as (row, col) but emitted with index 1 first.
        out["bbox"] = [
            float(bbx_pred_i[1].item()),
            float(bbx_pred_i[0].item()),
            float(bbx_pred_size_i[1].item()),
            float(bbx_pred_size_i[0].item()),
        ]

        # Expand and convert mask if present (PIL size is (width, height),
        # hence the reversed original_size).
        if msk_pred is not None:
            segmentation = Image.fromarray(msk_pred[i].numpy()).resize(original_size[::-1], Image.NEAREST)
            out["segmentation"] = mask_encode(np.asfortranarray(np.array(segmentation)))
            out["segmentation"]["counts"] = str(out["segmentation"]["counts"], "utf-8")

        outs.append(out)
    return outs