def _checkIgnore(self, dt, iregion):
    if iregion is None:
        return True

    bb = np.array(dt['bbox']).astype(int)  # np.int is removed in NumPy >= 1.24
    x1, y1, x2, y2 = bb[0], bb[1], bb[0] + bb[2], bb[1] + bb[3]
    x2 = min([x2, iregion.shape[1]])
    y2 = min([y2, iregion.shape[0]])

    if bb[2] * bb[3] == 0:
        return False

    crop_iregion = iregion[y1:y2, x1:x2]

    if crop_iregion.sum() == 0:
        return True

    if 'uv' not in dt.keys():  # filtering boxes
        return crop_iregion.sum() / bb[2] / bb[3] < self.ignoreThrBB

    # filtering UVs
    ignoremask = np.require(crop_iregion, requirements=['F'])
    uvmask = np.require(np.asarray(dt['uv'][0] > 0), dtype=np.uint8,
                        requirements=['F'])
    uvmask_ = maskUtils.encode(uvmask)
    ignoremask_ = maskUtils.encode(ignoremask)
    uviou = maskUtils.iou([uvmask_], [ignoremask_], [1])[0]
    return uviou < self.ignoreThrUV
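# A minimal sketch (assuming pycocotools is installed; the toy masks below are
# not from the original code) of why the call above passes iscrowd=[1]: with
# iscrowd set, pycocotools computes intersection / area(dt) instead of true
# IoU, which is the right measure of how much of a detection falls inside an
# ignore region.
import numpy as np
import pycocotools.mask as maskUtils

dt = np.zeros((8, 8), dtype=np.uint8, order='F'); dt[0:4, 0:4] = 1   # detection
ig = np.zeros((8, 8), dtype=np.uint8, order='F'); ig[0:2, 0:8] = 1   # ignore region
dt_rle, ig_rle = maskUtils.encode(dt), maskUtils.encode(ig)
# intersection = 8 px, area(dt) = 16 px -> prints 0.5
print(maskUtils.iou([dt_rle], [ig_rle], [1])[0])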
def build_coco_results(dataset, image_ids, rois, class_ids, scores, masks):
    """Arrange results to match COCO specs in http://cocodataset.org/#format
    """
    # If no results, return an empty list
    if rois is None:
        return []

    results = []
    for image_id in image_ids:
        # Loop through detections
        for i in range(rois.shape[0]):
            class_id = class_ids[i]
            score = scores[i]
            bbox = np.around(rois[i], 1)
            mask = masks[:, :, i]

            result = {
                "image_id": image_id,
                "category_id": dataset.get_source_class_id(class_id, "coco"),
                "bbox": [bbox[1], bbox[0], bbox[3] - bbox[1], bbox[2] - bbox[0]],
                "score": score,
                # note: the RLE 'counts' returned by encode() is a byte string;
                # decode it to str before dumping these results to JSON
                "segmentation": maskUtils.encode(np.asfortranarray(mask))
            }
            results.append(result)
    return results
def crop_mask(boxes, segmentations, flipped, imsize):
    assert boxes.shape[0] == len(segmentations)
    psegmentations = []
    for i in range(len(segmentations)):  # xrange -> range (Python 3)
        gts = segmentations[i]
        box = boxes[i, :]
        if type(gts) == list and gts:
            assert type(gts[0]) != dict
            prle = mask.frPyObjects(gts, imsize[1], imsize[0])
        elif type(gts) == dict and type(gts['counts']) == list:
            prle = mask.frPyObjects([gts], imsize[1], imsize[0])
        elif type(gts) == dict and \
                type(gts['counts']) in (str, bytes):  # fixed misplaced paren; unicode -> str/bytes
            prle = [gts]
        else:
            print('{} box has no segmentation'.format(i))
            psegmentations.append([])
            continue
        if len(prle) == 1:
            prle = prle[0]
        else:
            prle = mask.merge(prle)
        pmask = mask.decode([prle])
        if flipped:
            pmask = pmask[:, ::-1, :]
        pmask = np.copy(pmask[box[1]:box[3], box[0]:box[2], :], order='F')
        psegmentations.append(mask.encode(pmask))
    return psegmentations
def segm_results(cls_boxes, masks, ref_boxes, im_h, im_w,
                 num_classes=81,
                 M=14,  # cfg.MRCNN.RESOLUTION
                 cls_specific_mask=True,
                 thresh_binarize=0.5):
    cls_segms = [[] for _ in range(num_classes)]
    mask_ind = 0
    # To work around an issue with cv2.resize (it seems to automatically pad
    # with repeated border values), we manually zero-pad the masks by 1 pixel
    # prior to resizing back to the original image resolution. This prevents
    # "top hat" artifacts. We therefore need to expand the reference boxes by
    # an appropriate factor.
    scale = (M + 2.0) / M
    ref_boxes = box_utils.expand_boxes(ref_boxes, scale)
    ref_boxes = ref_boxes.astype(np.int32)
    padded_mask = np.zeros((M + 2, M + 2), dtype=np.float32)

    # skip j = 0, because it's the background class
    for j in range(1, num_classes):
        segms = []
        for _ in range(cls_boxes[j].shape[0]):
            if cls_specific_mask:
                padded_mask[1:-1, 1:-1] = masks[mask_ind, j, :, :]
            else:
                padded_mask[1:-1, 1:-1] = masks[mask_ind, 0, :, :]

            ref_box = ref_boxes[mask_ind, :]
            w = ref_box[2] - ref_box[0] + 1
            h = ref_box[3] - ref_box[1] + 1
            w = np.maximum(w, 1)
            h = np.maximum(h, 1)

            mask = cv2.resize(padded_mask, (w, h))
            mask = np.array(mask > thresh_binarize, dtype=np.uint8)
            im_mask = np.zeros((im_h, im_w), dtype=np.uint8)

            x_0 = max(ref_box[0], 0)
            x_1 = min(ref_box[2] + 1, im_w)
            y_0 = max(ref_box[1], 0)
            y_1 = min(ref_box[3] + 1, im_h)

            im_mask[y_0:y_1, x_0:x_1] = mask[
                (y_0 - ref_box[1]):(y_1 - ref_box[1]),
                (x_0 - ref_box[0]):(x_1 - ref_box[0])
            ]

            # Get RLE encoding used by the COCO evaluation API
            rle = mask_util.encode(
                np.array(im_mask[:, :, np.newaxis], order='F')
            )[0]
            # convert back to str so that it can be later saved to json
            rle['counts'] = rle['counts'].decode()
            segms.append(rle)

            mask_ind += 1
        cls_segms[j] = segms

    assert mask_ind == masks.shape[0]
    return cls_segms
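# For reference, a minimal sketch of the box expansion used above. The name
# box_utils.expand_boxes follows Detectron; treat this as an assumed
# reimplementation, not the verbatim library code. It scales each box about
# its center so the 1-pixel zero border added to the M x M mask maps back onto
# the image consistently.
import numpy as np

def expand_boxes(boxes, scale):
    """boxes: (N, 4) in xyxy; returns boxes scaled about their centers."""
    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5 * scale
    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5 * scale
    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5
    boxes_exp = np.zeros(boxes.shape)
    boxes_exp[:, 0] = x_c - w_half
    boxes_exp[:, 1] = y_c - h_half
    boxes_exp[:, 2] = x_c + w_half
    boxes_exp[:, 3] = y_c + h_half
    return boxes_exp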
def _flip_rle(rle, height, width):
    if 'counts' in rle and type(rle['counts']) == list:
        # Magic RLE format handling painfully discovered by looking at the
        # COCO API showAnns function.
        rle = mask_util.frPyObjects([rle], height, width)
    mask = mask_util.decode(rle)
    mask = mask[:, ::-1, :]
    rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
    return rle
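# A quick round-trip check (assuming pycocotools; the toy RLE is illustrative).
# COCO ground-truth crowd regions store uncompressed RLE with 'counts' as a
# list of ints, which is the branch the "magic format" comment refers to;
# flipping preserves the number of foreground pixels.
import pycocotools.mask as mask_util

rle = {'size': [4, 6], 'counts': [2, 4, 18]}  # uncompressed RLE, column-major
flipped = _flip_rle(rle, 4, 6)  # returns a list holding one compressed RLE
assert mask_util.decode(flipped).sum() == 4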
def segm_results(cls_boxes, masks, ref_boxes, im_h, im_w):
    num_classes = cfg.MODEL.NUM_CLASSES
    cls_segms = [[] for _ in range(num_classes)]
    mask_ind = 0
    # To work around an issue with cv2.resize (it seems to automatically pad
    # with repeated border values), we manually zero-pad the masks by 1 pixel
    # prior to resizing back to the original image resolution. This prevents
    # "top hat" artifacts. We therefore need to expand the reference boxes by
    # an appropriate factor.
    M = cfg.MRCNN.RESOLUTION
    scale = (M + 2.0) / M
    ref_boxes = box_utils.expand_boxes(ref_boxes, scale)
    ref_boxes = ref_boxes.astype(np.int32)
    padded_mask = np.zeros((M + 2, M + 2), dtype=np.float32)

    # skip j = 0, because it's the background class
    for j in range(1, num_classes):
        segms = []
        for _ in range(cls_boxes[j].shape[0]):
            if cfg.MRCNN.CLS_SPECIFIC_MASK:
                padded_mask[1:-1, 1:-1] = masks[mask_ind, j, :, :]
            else:
                padded_mask[1:-1, 1:-1] = masks[mask_ind, 0, :, :]

            ref_box = ref_boxes[mask_ind, :]
            w = (ref_box[2] - ref_box[0] + 1)
            h = (ref_box[3] - ref_box[1] + 1)
            w = np.maximum(w, 1)
            h = np.maximum(h, 1)

            mask = cv2.resize(padded_mask, (w, h))
            mask = np.array(mask > cfg.MRCNN.THRESH_BINARIZE, dtype=np.uint8)
            im_mask = np.zeros((im_h, im_w), dtype=np.uint8)

            x_0 = max(ref_box[0], 0)
            x_1 = min(ref_box[2] + 1, im_w)
            y_0 = max(ref_box[1], 0)
            y_1 = min(ref_box[3] + 1, im_h)

            im_mask[y_0:y_1, x_0:x_1] = mask[
                (y_0 - ref_box[1]):(y_1 - ref_box[1]),
                (x_0 - ref_box[0]):(x_1 - ref_box[0])]

            # Get RLE encoding used by the COCO evaluation API
            rle = mask_util.encode(
                np.array(im_mask[:, :, np.newaxis], order='F'))[0]
            # For dumping to json, need to decode the byte string.
            # https://github.com/cocodataset/cocoapi/issues/70
            rle['counts'] = rle['counts'].decode('ascii')
            segms.append(rle)

            mask_ind += 1
        cls_segms[j] = segms

    assert mask_ind == masks.shape[0]
    return cls_segms
def _RleCompress(masks):
    """Compresses mask using Run-length encoding provided by pycocotools.

    Args:
      masks: uint8 numpy array of shape [mask_height, mask_width] with values
        in {0, 1}.

    Returns:
      A pycocotools Run-length encoding of the mask.
    """
    return mask.encode(np.asfortranarray(masks))
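# Minimal usage sketch (assuming pycocotools imported as `mask`, as above):
# the RLE returned by encode() round-trips through decode() and also feeds the
# other mask utilities directly.
import numpy as np
from pycocotools import mask

m = np.zeros((10, 10), dtype=np.uint8); m[2:5, 3:8] = 1
rle = _RleCompress(m)
assert (mask.decode(rle) == m).all()
print(mask.area(rle))    # 15 foreground pixels
print(mask.toBbox(rle))  # [x, y, w, h] = [3., 2., 5., 3.]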
def convert(args):
    data_dict = json.load(open(args.imgCatIdsFile, 'r'))
    img2id = {x['file_name']: x['id'] for x in data_dict['images']}
    img2info = {x['file_name']: x for x in data_dict['images']}
    categories = data_dict['categories']

    images = []
    images_unique = set()
    annotations = []
    ann_id = 0

    # loop over annotation files
    files_ann = sorted(glob.glob(os.path.join(args.ann_dir, '*.png')))
    for i, file_ann in enumerate(files_ann):
        if i % 50 == 0:
            print('#files processed: {}'.format(i))

        file_name = os.path.basename(file_ann).replace('.png', '.jpg')
        img_id = img2id[file_name]
        if file_name not in images_unique:
            images_unique.add(file_name)
            images.append(img2info[file_name])

        ann_mask = imread(file_ann)
        Om = ann_mask[:, :, 0]
        Oi = ann_mask[:, :, 1]

        # loop over instances
        for instIdx in np.unique(Oi):
            if instIdx == 0:
                continue
            imask = (Oi == instIdx)
            cat_id = Om[imask][0]

            # RLE encoding
            rle = COCOmask.encode(np.asfortranarray(imask.astype(np.uint8)))
            # decode the byte string so json.dump below does not fail on bytes
            rle['counts'] = rle['counts'].decode('ascii')

            ann = {}
            ann['id'] = ann_id
            ann_id += 1
            ann['image_id'] = img_id
            ann['segmentation'] = rle
            ann['category_id'] = int(cat_id)
            ann['iscrowd'] = 0
            ann['area'] = int(np.sum(imask))  # cast for JSON serialization
            annotations.append(ann)

    # data_dict['annotations'] = annotations
    print('#files: {}, #instances: {}'.format(len(files_ann), len(annotations)))
    data_out = {'categories': categories,
                'images': images,
                'annotations': annotations}
    with open(args.output_json, 'w') as f:
        json.dump(data_out, f)
def prepare_for_coco_segmentation(predictions, dataset):
    import pycocotools.mask as mask_util
    import numpy as np

    masker = Masker(threshold=0.5, padding=1)
    # assert isinstance(dataset, COCODataset)
    coco_results = []
    for image_id, prediction in tqdm(enumerate(predictions)):
        original_id = dataset.id_to_img_map[image_id]
        if len(prediction) == 0:
            continue

        # TODO replace with get_img_info?
        image_width = dataset.coco.imgs[original_id]["width"]
        image_height = dataset.coco.imgs[original_id]["height"]
        prediction = prediction.resize((image_width, image_height))
        masks = prediction.get_field("mask")
        # t = time.time()
        masks = masker(masks, prediction)
        # logger.info('Time mask: {}'.format(time.time() - t))
        # prediction = prediction.convert('xywh')

        # boxes = prediction.bbox.tolist()
        scores = prediction.get_field("scores").tolist()
        labels = prediction.get_field("labels").tolist()

        # rles = prediction.get_field('mask')
        rles = [
            mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0]
            for mask in masks
        ]
        for rle in rles:
            rle["counts"] = rle["counts"].decode("utf-8")

        mapped_labels = [dataset.contiguous_category_id_to_json_id[i]
                         for i in labels]

        coco_results.extend(
            [
                {
                    "image_id": original_id,
                    "category_id": mapped_labels[k],
                    "segmentation": rle,
                    "score": scores[k],
                }
                for k, rle in enumerate(rles)
            ]
        )
    return coco_results
def eval_coco(df, detect_func, tqdm_bar=None):
    """
    Args:
        df: a DataFlow which produces (image, image_id)
        detect_func: a callable, takes [image] and returns [DetectionResult]
        tqdm_bar: a tqdm object to be shared among multiple evaluation
            instances. If None, will create a new one.

    Returns:
        list of dict, to be dumped to COCO json format
    """
    df.reset_state()
    all_results = []
    # tqdm is not quite thread-safe: https://github.com/tqdm/tqdm/issues/323
    with ExitStack() as stack:
        if tqdm_bar is None:
            tqdm_bar = stack.enter_context(
                tqdm.tqdm(total=df.size(), **get_tqdm_kwargs()))
        for img, img_id in df:
            results = detect_func(img)
            for r in results:
                box = r.box
                cat_id = COCOMeta.class_id_to_category_id[r.class_id]
                box[2] -= box[0]
                box[3] -= box[1]

                res = {
                    'image_id': img_id,
                    'category_id': cat_id,
                    'bbox': list(map(lambda x: round(float(x), 3), box)),
                    'score': round(float(r.score), 4),
                }

                # also append segmentation to results
                if r.mask is not None:
                    rle = cocomask.encode(
                        np.array(r.mask[:, :, None], order='F'))[0]
                    rle['counts'] = rle['counts'].decode('ascii')
                    res['segmentation'] = rle
                all_results.append(res)
            tqdm_bar.update(1)
    return all_results
def eval_coco(df, detect_func):
    """
    Args:
        df: a DataFlow which produces (image, image_id)
        detect_func: a callable, takes [image] and returns [DetectionResult]

    Returns:
        list of dict, to be dumped to COCO json format
    """
    df.reset_state()
    all_results = []
    with tqdm.tqdm(total=df.size(), **get_tqdm_kwargs()) as pbar:
        for img, img_id in df.get_data():
            results = detect_func(img)
            for r in results:
                box = r.box
                cat_id = COCOMeta.class_id_to_category_id[r.class_id]
                box[2] -= box[0]
                box[3] -= box[1]

                res = {
                    'image_id': img_id,
                    'category_id': cat_id,
                    'bbox': list(map(lambda x: float(round(x, 1)), box)),
                    'score': float(round(r.score, 2)),
                }

                # also append segmentation to results
                if r.mask is not None:
                    rle = cocomask.encode(
                        np.array(r.mask[:, :, None], order='F'))[0]
                    rle['counts'] = rle['counts'].decode('ascii')
                    res['segmentation'] = rle
                all_results.append(res)
            pbar.update(1)
    return all_results
def rle_mask_voting(
    top_masks, all_masks, all_dets, iou_thresh, binarize_thresh, method='AVG'
):
    """Returns new masks (in correspondence with `top_masks`) by combining
    multiple overlapping masks coming from the pool of `all_masks`. Two methods
    for combining masks are supported: 'AVG' uses a weighted average of
    overlapping mask pixels; 'UNION' takes the union of all mask pixels.
    """
    if len(top_masks) == 0:
        return

    all_not_crowd = [False] * len(all_masks)
    top_to_all_overlaps = mask_util.iou(top_masks, all_masks, all_not_crowd)
    decoded_all_masks = [
        np.array(mask_util.decode(rle), dtype=np.float32) for rle in all_masks
    ]
    decoded_top_masks = [
        np.array(mask_util.decode(rle), dtype=np.float32) for rle in top_masks
    ]
    all_boxes = all_dets[:, :4].astype(np.int32)
    all_scores = all_dets[:, 4]

    # Fill box support with weights
    mask_shape = decoded_all_masks[0].shape
    mask_weights = np.zeros((len(all_masks), mask_shape[0], mask_shape[1]))
    for k in range(len(all_masks)):
        ref_box = all_boxes[k]
        x_0 = max(ref_box[0], 0)
        x_1 = min(ref_box[2] + 1, mask_shape[1])
        y_0 = max(ref_box[1], 0)
        y_1 = min(ref_box[3] + 1, mask_shape[0])
        mask_weights[k, y_0:y_1, x_0:x_1] = all_scores[k]
    mask_weights = np.maximum(mask_weights, 1e-5)

    top_segms_out = []
    for k in range(len(top_masks)):
        # Corner case of empty mask
        if decoded_top_masks[k].sum() == 0:
            top_segms_out.append(top_masks[k])
            continue

        inds_to_vote = np.where(top_to_all_overlaps[k] >= iou_thresh)[0]
        # Only matches itself
        if len(inds_to_vote) == 1:
            top_segms_out.append(top_masks[k])
            continue

        masks_to_vote = [decoded_all_masks[i] for i in inds_to_vote]
        if method == 'AVG':
            ws = mask_weights[inds_to_vote]
            soft_mask = np.average(masks_to_vote, axis=0, weights=ws)
            mask = np.array(soft_mask > binarize_thresh, dtype=np.uint8)
        elif method == 'UNION':
            # Any pixel that's on joins the mask
            soft_mask = np.sum(masks_to_vote, axis=0)
            mask = np.array(soft_mask > 1e-5, dtype=np.uint8)
        else:
            raise NotImplementedError('Method {} is unknown'.format(method))
        rle = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F'))[0]
        top_segms_out.append(rle)

    return top_segms_out
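# Toy usage sketch (assuming pycocotools imported as mask_util and the
# function above; the masks and scores are illustrative, not from the source):
# two heavily overlapping detections vote on a final mask for the top one.
import numpy as np
import pycocotools.mask as mask_util

def _rle(arr):
    return mask_util.encode(np.asfortranarray(arr.astype(np.uint8)))

m1 = np.zeros((16, 16)); m1[2:10, 2:10] = 1
m2 = np.zeros((16, 16)); m2[3:11, 3:11] = 1
all_masks = [_rle(m1), _rle(m2)]
all_dets = np.array([[2, 2, 9, 9, 0.9], [3, 3, 10, 10, 0.6]], dtype=np.float32)
voted = rle_mask_voting(all_masks[:1], all_masks, all_dets,
                        iou_thresh=0.5, binarize_thresh=0.4, method='AVG')
print(mask_util.decode(voted[0]).sum())  # pixel count of the merged mask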
def get_bboxes_single(self,
                      cls_scores,
                      bbox_preds,
                      centernesses,
                      cof_preds,
                      feat_mask,
                      mlvl_points,
                      img_shape,
                      ori_shape,
                      scale_factor,
                      cfg,
                      rescale=False):
    assert len(cls_scores) == len(bbox_preds) == len(mlvl_points)
    mlvl_bboxes = []
    mlvl_scores = []
    mlvl_centerness = []
    mlvl_cofs = []
    for cls_score, bbox_pred, cof_pred, centerness, points in zip(
            cls_scores, bbox_preds, cof_preds, centernesses, mlvl_points):
        assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
        scores = cls_score.permute(1, 2, 0).reshape(
            -1, self.cls_out_channels).sigmoid()
        centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid()

        bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
        cof_pred = cof_pred.permute(1, 2, 0).reshape(-1, 32 * 4)

        nms_pre = cfg.get('nms_pre', -1)
        if nms_pre > 0 and scores.shape[0] > nms_pre:
            max_scores, _ = (scores * centerness[:, None]).max(dim=1)
            _, topk_inds = max_scores.topk(nms_pre)
            points = points[topk_inds, :]
            bbox_pred = bbox_pred[topk_inds, :]
            cof_pred = cof_pred[topk_inds, :]
            scores = scores[topk_inds, :]
            centerness = centerness[topk_inds]
        bboxes = distance2bbox(points, bbox_pred, max_shape=img_shape)
        mlvl_cofs.append(cof_pred)
        mlvl_bboxes.append(bboxes)
        mlvl_scores.append(scores)
        mlvl_centerness.append(centerness)
    mlvl_bboxes = torch.cat(mlvl_bboxes)
    mlvl_cofs = torch.cat(mlvl_cofs)
    if rescale:
        mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
    mlvl_scores = torch.cat(mlvl_scores)
    padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
    mlvl_scores = torch.cat([padding, mlvl_scores], dim=1)
    mlvl_centerness = torch.cat(mlvl_centerness)

    if self.ssd_flag is False:
        det_bboxes, det_labels, idxs_keep = multiclass_nms_idx(
            mlvl_bboxes,
            mlvl_scores,
            cfg.score_thr,
            cfg.nms,
            cfg.max_per_img,
            score_factors=mlvl_centerness)
    else:
        mlvl_scores = mlvl_scores * mlvl_centerness.view(-1, 1)
        det_bboxes, det_labels, det_cofs = self.fast_nms(
            mlvl_bboxes,
            mlvl_scores[:, 1:].transpose(1, 0).contiguous(),
            mlvl_cofs,
            iou_threshold=cfg.nms.iou_thr,
            score_thr=cfg.score_thr)

    cls_segms = [[] for _ in range(self.num_classes - 1)]
    mask_scores = [[] for _ in range(self.num_classes - 1)]
    if det_bboxes.shape[0] > 0:
        scale = 2
        if self.ssd_flag is False:
            det_cofs = mlvl_cofs[idxs_keep]

        #####spp########################
        img_mask1 = feat_mask.permute(1, 2, 0)
        pos_masks00 = torch.sigmoid(img_mask1 @ det_cofs[:, 0:32].t())
        pos_masks01 = torch.sigmoid(img_mask1 @ det_cofs[:, 32:64].t())
        pos_masks10 = torch.sigmoid(img_mask1 @ det_cofs[:, 64:96].t())
        pos_masks11 = torch.sigmoid(img_mask1 @ det_cofs[:, 96:128].t())
        pos_masks = torch.stack(
            [pos_masks00, pos_masks01, pos_masks10, pos_masks11], dim=0)
        if rescale is None:
            scale_factor = scale_factor * 0 + 1.0
        pos_masks = self.crop_cuda(
            pos_masks,
            det_bboxes[:, :4] * det_bboxes.new_tensor(scale_factor) / scale)
        # pos_masks = crop_split(pos_masks00, pos_masks01, pos_masks10,
        #                        pos_masks11,
        #                        det_bboxes * det_bboxes.new_tensor(scale_factor) / scale)
        pos_masks = pos_masks.permute(2, 0, 1)

        # masks = F.interpolate(pos_masks.unsqueeze(0),
        #                       scale_factor=scale / scale_factor,
        #                       mode='bilinear',
        #                       align_corners=False).squeeze(0)
        if self.ssd_flag:
            masks = F.interpolate(
                pos_masks.unsqueeze(0),
                scale_factor=scale / scale_factor[3:1:-1],
                mode='bilinear',
                align_corners=False).squeeze(0)
        else:
            masks = F.interpolate(
                pos_masks.unsqueeze(0),
                scale_factor=scale / scale_factor,
                mode='bilinear',
                align_corners=False).squeeze(0)
        masks.gt_(0.4)

        if self.rescoring_flag:
            pred_iou = pos_masks.unsqueeze(1)
            pred_iou = self.convs_scoring(pred_iou)
            pred_iou = self.relu(self.mask_scoring(pred_iou))
            pred_iou = F.max_pool2d(
                pred_iou,
                kernel_size=pred_iou.size()[2:]).squeeze(-1).squeeze(-1)
            pred_iou = pred_iou[range(pred_iou.size(0)), det_labels].squeeze()
            mask_scores = pred_iou * det_bboxes[:, -1]
            mask_scores = mask_scores.cpu().numpy()
            mask_scores = [
                mask_scores[det_labels.cpu().numpy() == i]
                for i in range(self.num_classes - 1)
            ]

        for i in range(det_bboxes.shape[0]):
            label = det_labels[i]
            mask = masks[i].cpu().numpy()
            if rescale:
                im_mask = np.zeros((ori_shape[0], ori_shape[1]),
                                   dtype=np.uint8)
                shape = np.minimum(mask.shape, ori_shape[0:2])
            else:
                im_mask = np.zeros((img_shape[0], img_shape[1]),
                                   dtype=np.uint8)
                shape = np.minimum(mask.shape, img_shape[0:2])
            im_mask[:shape[0], :shape[1]] = mask[:shape[0], :shape[1]]
            rle = mask_util.encode(
                np.array(im_mask[:, :, np.newaxis], order='F'))[0]
            cls_segms[label].append(rle)

    if self.rescoring_flag:
        return det_bboxes, det_labels, (cls_segms, mask_scores)
    else:
        return det_bboxes, det_labels, cls_segms
def convert_predictions_to_coco_annotations(predictions):
    """Converts a batch of predictions to annotations in COCO format.

    Args:
      predictions: a dictionary of lists of numpy arrays including the
        following fields. K below denotes the maximum number of instances per
        image.
        Required fields:
          - source_id: a list of numpy arrays of int or string of shape
            [batch_size].
          - num_detections: a list of numpy arrays of int of shape
            [batch_size].
          - detection_boxes: a list of numpy arrays of float of shape
            [batch_size, K, 4], where coordinates are in the original image
            space (not the scaled image space).
          - detection_classes: a list of numpy arrays of int of shape
            [batch_size, K].
          - detection_scores: a list of numpy arrays of float of shape
            [batch_size, K].
        Optional fields:
          - detection_masks: a list of numpy arrays of float of shape
            [batch_size, K, mask_height, mask_width].

    Returns:
      coco_predictions: prediction in COCO annotation format.
    """
    coco_predictions = []
    num_batches = len(predictions['source_id'])
    batch_size = predictions['source_id'][0].shape[0]
    max_num_detections = predictions['detection_classes'][0].shape[1]
    use_outer_box = 'detection_outer_boxes' in predictions
    for i in range(num_batches):
        predictions['detection_boxes'][i] = box_utils.yxyx_to_xywh(
            predictions['detection_boxes'][i])
        if use_outer_box:
            predictions['detection_outer_boxes'][i] = box_utils.yxyx_to_xywh(
                predictions['detection_outer_boxes'][i])
            mask_boxes = predictions['detection_outer_boxes']
        else:
            mask_boxes = predictions['detection_boxes']

        for j in range(batch_size):
            if 'detection_masks' in predictions:
                image_masks = mask_utils.paste_instance_masks(
                    predictions['detection_masks'][i][j],
                    mask_boxes[i][j],
                    int(predictions['image_info'][i][j, 0, 0]),
                    int(predictions['image_info'][i][j, 0, 1]))
                binary_masks = (image_masks > 0.0).astype(np.uint8)
                encoded_masks = [
                    mask_api.encode(np.asfortranarray(binary_mask))
                    for binary_mask in list(binary_masks)
                ]
            for k in range(max_num_detections):
                ann = {}
                ann['image_id'] = predictions['source_id'][i][j]
                ann['category_id'] = predictions['detection_classes'][i][j, k]
                ann['bbox'] = predictions['detection_boxes'][i][j, k]
                ann['score'] = predictions['detection_scores'][i][j, k]
                if 'detection_masks' in predictions:
                    ann['segmentation'] = encoded_masks[k]
                coco_predictions.append(ann)

    for i, ann in enumerate(coco_predictions):
        ann['id'] = i + 1

    return coco_predictions
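# For clarity, a minimal sketch of the conversion assumed behind
# box_utils.yxyx_to_xywh (the actual library helper may differ): boxes arrive
# as [ymin, xmin, ymax, xmax] and COCO expects [x, y, width, height].
import numpy as np

def yxyx_to_xywh(boxes):
    ymin, xmin, ymax, xmax = np.split(boxes, 4, axis=-1)
    return np.concatenate([xmin, ymin, xmax - xmin, ymax - ymin], axis=-1)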
_image = cv2.imread(sample.filepath)
image = func.to_tensor(_image).to(device)  # was to_tensor(image), an undefined name
c, h, w = image.shape

# Perform inference
preds = segmentor.predictor(_image)
instances = preds["instances"]
boxes = instances.pred_boxes.tensor.numpy()
boxes = boxes.tolist()
scores = instances.scores.tolist()
labels = instances.pred_classes.tolist()

has_mask = instances.has("pred_masks")
if has_mask:
    rles = [
        mask_util.encode(
            np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
        for mask in instances.pred_masks
    ]
    for rle in rles:
        rle["counts"] = rle["counts"].decode("utf-8")

# Convert detections to FiftyOne format
detections = []
for label, score, box in zip(labels, scores, boxes):
    # Convert to [top-left-x, top-left-y, width, height]
    # in relative coordinates in [0, 1] x [0, 1]
    x1, y1, x2, y2 = box
    rel_box = [x1 / w, y1 / h, (x2 - x1) / w, (y2 - y1) / h]

    detections.append(
        fo.Detection(label=classes[label],
                     # the snippet is truncated in the source; these fields are
                     # the assumed completion given rel_box computed above
                     bounding_box=rel_box,
                     confidence=score))
def test_net(net, imdb, weights_filename, max_per_image=100, thresh=0.):
    """Test a Fast R-CNN network on an image database."""
    np.random.seed(cfg.RNG_SEED)
    num_images = len(imdb.image_index)
    # all detections are collected into:
    #   all_boxes[cls][image] = N x 5 array of detections in
    #   (x1, y1, x2, y2, score)
    all_boxes = [[[] for _ in range(num_images)]
                 for _ in range(imdb.num_classes)]
    # all_rles[cls][image] = [rle] array of N rles
    all_rles = [[[] for _ in range(num_images)]
                for _ in range(imdb.num_classes)]

    output_dir = get_output_dir(imdb, weights_filename)
    # timers
    _t = {'im_detect': Timer(), 'misc': Timer()}

    for i in range(num_images):
        im = cv2.imread(imdb.image_path_at(i))

        _t['im_detect'].tic()
        # (n, 81), (n, 81*4), (n, 1024, H, W), float
        scores, boxes, net_conv, im_scale = im_detect(net, im)
        _t['im_detect'].toc()

        _t['misc'].tic()

        # skip j = 0, because it's the background class
        for j in range(1, imdb.num_classes):
            inds = np.where(scores[:, j] > thresh)[0]
            cls_scores = scores[inds, j]
            cls_boxes = boxes[inds, j * 4:(j + 1) * 4]
            cls_dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \
                .astype(np.float32, copy=False)
            keep = nms(torch.from_numpy(cls_dets),
                       cfg.TEST.NMS).numpy() if cls_dets.size > 0 else []
            cls_dets = cls_dets[keep, :]
            all_boxes[j][i] = cls_dets

        # Limit to max_per_image detections *over all classes*
        if max_per_image > 0:
            image_scores = np.hstack([all_boxes[j][i][:, -1]
                                      for j in range(1, imdb.num_classes)])
            if len(image_scores) > max_per_image:
                image_thresh = np.sort(image_scores)[-max_per_image]
                for j in range(1, imdb.num_classes):
                    keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0]
                    all_boxes[j][i] = all_boxes[j][i][keep, :]

        # run mask branch on all_boxes[:][i]
        accumulated_boxes = []
        accumulated_labels = []
        for j in range(1, imdb.num_classes):
            if all_boxes[j][i].shape[0] > 0:
                accumulated_boxes += [all_boxes[j][i][:, :4]]
                accumulated_labels += [j] * all_boxes[j][i].shape[0]
        # accumulate max_per_image boxes [xyxy]
        accumulated_boxes = np.vstack(accumulated_boxes)
        # n category labels
        accumulated_labels = np.array(accumulated_labels, dtype=np.uint8)
        mask_prob = net._predict_masks_from_boxes_and_labels(
            net_conv,
            accumulated_boxes * im_scale,  # scaled boxes [xyxy]
            accumulated_labels)  # (n, 14, 14)
        mask_prob = mask_prob.data.cpu().numpy()  # convert to numpy
        # (n, ih, iw) uint8 [0,1]
        masks = recover_masks(mask_prob, accumulated_boxes,
                              im.shape[0], im.shape[1])
        # (n, ih, iw) uint8 [0,1] original size
        masks = (masks > 122.).astype(np.uint8)

        # add to all_rles
        rles = [COCOmask.encode(np.asfortranarray(m)) for m in masks]
        ri = 0
        for j in range(1, imdb.num_classes):
            ri_next = ri + all_boxes[j][i].shape[0]
            all_rles[j][i] = rles[ri:ri_next]
            assert len(all_rles[j][i]) == all_boxes[j][i].shape[0]
            ri = ri_next

        _t['misc'].toc()

        print('im_detect: {:d}/{:d} {:.3f}s {:.3f}s'
              .format(i + 1, num_images, _t['im_detect'].average_time(),
                      _t['misc'].average_time()))

    det_file = os.path.join(output_dir, 'detections.pkl')
    with open(det_file, 'wb') as f:
        pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL)

    print('Evaluating detections')
    imdb.evaluate_detections(all_boxes, all_rles, output_dir)
def encode_mask(mask):
    """Convert mask to coco rle"""
    rle = cocomask.encode(np.array(mask[:, :, np.newaxis], order='F'))[0]
    rle['counts'] = rle['counts'].decode('ascii')
    return rle
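# Usage sketch (assuming pycocotools imported as cocomask, as above): decoding
# 'counts' to str is what makes the RLE JSON-serializable, since json cannot
# encode bytes.
import json
import numpy as np
import pycocotools.mask as cocomask

m = np.zeros((6, 6), dtype=np.uint8); m[1:4, 1:4] = 1
print(json.dumps(encode_mask(m)))  # raw encode() output would raise TypeError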
def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
    """Convert groundtruths to the dataset in COCO format.

    Args:
      groundtruths: a dictionary of numpy arrays including the fields below.
        See also different parsers under `../dataloader` for more details.
        Required fields:
          - source_id: a list of numpy arrays of int or string of shape
            [batch_size].
          - image_info: a list of numpy arrays of float of shape
            [batch_size, 4, 2].
          - num_detections: a list of numpy arrays of int of shape
            [batch_size].
          - boxes: a list of numpy arrays of float of shape [batch_size, K, 4].
          - classes: a list of numpy arrays of int of shape [batch_size, K].
        Optional fields:
          - is_crowds: a list of numpy arrays of int of shape [batch_size, K].
            If the field is absent, it is assumed that this instance is not
            crowd.
          - areas: a list of numpy arrays of float of shape [batch_size, K].
            If the field is absent, the area is calculated using either boxes
            or masks depending on which one is available.
          - masks: a list of numpy arrays of float of shape
            [batch_size, K, mask_height, mask_width].
      label_map: (optional) a dictionary that defines items from the category
        id to the category name. If `None`, collect the category mapping from
        the `groundtruths`.

    Returns:
      coco_groundtruths: the groundtruth dataset in COCO format.
    """
    image_size = np.concatenate(groundtruths['image_info'], axis=0)[:, 0, :]
    source_id = np.concatenate(groundtruths['source_id'], axis=0)
    gt_images = [{'id': i, 'height': h, 'width': w}
                 for i, h, w in zip(source_id,
                                    image_size[:, 0], image_size[:, 1])]

    for k in groundtruths:
        groundtruths[k] = np.stack(groundtruths[k], axis=0)
    num_batches = groundtruths['source_id'].shape[0]
    batch_size = groundtruths['source_id'].shape[1]

    boxes_ymin = groundtruths['boxes'][:, :, :, 0]
    boxes_xmin = groundtruths['boxes'][:, :, :, 1]
    boxes_width = (groundtruths['boxes'][:, :, :, 3] -
                   groundtruths['boxes'][:, :, :, 1])
    boxes_height = (groundtruths['boxes'][:, :, :, 2] -
                    groundtruths['boxes'][:, :, :, 0])
    groundtruths['boxes'] = np.stack(
        [boxes_xmin, boxes_ymin, boxes_width, boxes_height], axis=3)

    gt_annotations = []
    for b in range(num_batches):
        for k in range(batch_size):
            if 'masks' in groundtruths:
                encoded_mask = [
                    mask_utils.encode(
                        np.asfortranarray(instance_mask.astype(np.uint8)))
                    for instance_mask in list(groundtruths['masks'][b, k])]
            for i in range(groundtruths['num_detections'][b, k]):
                ann = {}
                ann['image_id'] = groundtruths['source_id'][b, k]
                if 'is_crowds' in groundtruths:
                    ann['iscrowd'] = groundtruths['is_crowds'][b, k, i]
                else:
                    ann['iscrowd'] = 0
                ann['category_id'] = groundtruths['classes'][b, k, i]
                ann['bbox'] = groundtruths['boxes'][b, k, i]
                if 'area' in groundtruths:
                    ann['area'] = groundtruths['areas'][b, k, i]
                else:
                    ann['area'] = (groundtruths['boxes'][b, k, i, 2] *
                                   groundtruths['boxes'][b, k, i, 3])
                if 'masks' in groundtruths:
                    ann['segmentation'] = encoded_mask[i]
                    if 'area' not in groundtruths:
                        ann['area'] = mask_utils.area(encoded_mask[i])
                gt_annotations.append(ann)

    for i, ann in enumerate(gt_annotations):
        ann['id'] = i + 1

    if label_map:
        gt_categories = [{'id': i, 'name': label_map[i]} for i in label_map]
    else:
        category_ids = [gt['category_id'] for gt in gt_annotations]
        gt_categories = [{'id': i} for i in set(category_ids)]

    gt_dataset = {
        'images': gt_images,
        'categories': gt_categories,
        'annotations': copy.deepcopy(gt_annotations),
    }
    return gt_dataset
def mask2out(results, clsid2catid, resolution, thresh_binarize=0.5):
    import pycocotools.mask as mask_util
    scale = (resolution + 2.0) / resolution

    segm_res = []

    # for each batch
    for t in results:
        bboxes = t['bbox'][0]
        lengths = t['bbox'][1][0]
        im_ids = np.array(t['im_id'][0])
        if bboxes is None or bboxes.shape == (1, 1):  # check None before .shape
            continue
        if len(bboxes.tolist()) == 0:
            continue

        masks = t['mask'][0]

        s = 0
        # for each sample
        for i in range(len(lengths)):
            num = lengths[i]
            im_id = int(im_ids[i][0])
            im_shape = t['im_shape'][0][i]

            bbox = bboxes[s:s + num][:, 2:]
            clsid_scores = bboxes[s:s + num][:, 0:2]
            mask = masks[s:s + num]
            s += num

            im_h = int(im_shape[0])
            im_w = int(im_shape[1])

            expand_bbox = expand_boxes(bbox, scale)
            expand_bbox = expand_bbox.astype(np.int32)

            padded_mask = np.zeros(
                (resolution + 2, resolution + 2), dtype=np.float32)

            for j in range(num):
                xmin, ymin, xmax, ymax = expand_bbox[j].tolist()
                clsid, score = clsid_scores[j].tolist()
                clsid = int(clsid)
                padded_mask[1:-1, 1:-1] = mask[j, clsid, :, :]

                catid = clsid2catid[clsid]
                w = xmax - xmin + 1
                h = ymax - ymin + 1
                w = np.maximum(w, 1)
                h = np.maximum(h, 1)

                resized_mask = cv2.resize(padded_mask, (w, h))
                resized_mask = np.array(
                    resized_mask > thresh_binarize, dtype=np.uint8)
                im_mask = np.zeros((im_h, im_w), dtype=np.uint8)

                x0 = min(max(xmin, 0), im_w)
                x1 = min(max(xmax + 1, 0), im_w)
                y0 = min(max(ymin, 0), im_h)
                y1 = min(max(ymax + 1, 0), im_h)

                im_mask[y0:y1, x0:x1] = resized_mask[(y0 - ymin):(y1 - ymin),
                                                     (x0 - xmin):(x1 - xmin)]
                segm = mask_util.encode(
                    np.array(im_mask[:, :, np.newaxis], order='F'))[0]
                segm['counts'] = segm['counts'].decode('utf8')
                coco_res = {
                    'image_id': im_id,
                    'category_id': catid,
                    'segmentation': segm,
                    'score': score
                }
                segm_res.append(coco_res)
    return segm_res
def convert_predictions_to_coco_annotations(predictions):
    """Convert predictions to annotations in COCO format.

    Args:
      predictions: a dictionary of lists of numpy arrays including the
        following fields. See different parsers under `../dataloader` for more
        details.
        Required fields:
          - source_id: a list of numpy arrays of int or string of shape
            [batch_size].
          - image_info: a list of numpy arrays of float of shape
            [batch_size, 4, 2].
          - num_detections: a list of numpy arrays of int of shape
            [batch_size].
          - detection_boxes: a list of numpy arrays of float of shape
            [batch_size, K, 4].
          - detection_classes: a list of numpy arrays of int of shape
            [batch_size, K].
          - detection_scores: a list of numpy arrays of float of shape
            [batch_size, K].
        Optional fields:
          - detection_masks: a list of numpy arrays of float of shape
            [batch_size, K, mask_height, mask_width].

    Returns:
      coco_predictions: prediction in COCO annotation format.
    """
    for k in predictions:
        predictions[k] = np.stack(predictions[k], axis=0)
    num_batches = predictions['source_id'].shape[0]
    batch_size = predictions['source_id'].shape[1]
    max_num_detections = predictions['detection_classes'].shape[2]

    image_scale = np.tile(predictions['image_info'][:, :, 2:3, :], (1, 1, 1, 2))
    predictions['detection_boxes'] = predictions['detection_boxes'] / image_scale

    boxes_ymin = predictions['detection_boxes'][:, :, :, 0]
    boxes_xmin = predictions['detection_boxes'][:, :, :, 1]
    boxes_width = (predictions['detection_boxes'][:, :, :, 3] -
                   predictions['detection_boxes'][:, :, :, 1])
    boxes_height = (predictions['detection_boxes'][:, :, :, 2] -
                    predictions['detection_boxes'][:, :, :, 0])
    predictions['detection_boxes'] = np.stack(
        [boxes_xmin, boxes_ymin, boxes_width, boxes_height], axis=3)

    coco_predictions = []
    for b in range(num_batches):
        for k in range(batch_size):
            if 'detection_masks' in predictions:
                image_masks = predictions['detection_masks'][b, k]
                encoded_mask = [
                    mask_utils.encode(
                        np.asfortranarray(image_mask.astype(np.uint8)))
                    for image_mask in list(image_masks)]
            for i in range(max_num_detections):
                ann = {}
                ann['iscrowd'] = 0
                ann['image_id'] = predictions['source_id'][b, k]
                ann['category_id'] = predictions['detection_classes'][b, k, i]
                ann['score'] = predictions['detection_scores'][b, k, i]
                ann['bbox'] = predictions['detection_boxes'][b, k, i]
                if 'detection_masks' in predictions:
                    ann['segmentation'] = encoded_mask[i]
                coco_predictions.append(ann)

    for i, ann in enumerate(coco_predictions):
        ann['id'] = i + 1

    return coco_predictions
def _ochuman_segm2json(self, results):
    """Convert instance segmentation results to COCO json style."""
    bbox_json_results = []
    segm_json_results = []
    for idx in range(len(self)):
        img_id = self.img_ids[idx]
        det, seg = results[idx]
        for label in range(len(det)):
            # bbox results
            bboxes = det[label]
            for i in range(bboxes.shape[0]):
                data = dict()
                data['image_id'] = img_id
                data['bbox'] = self.xyxy2xywh(bboxes[i])
                data['score'] = float(bboxes[i][4])
                data['category_id'] = self.cat_ids[label]
                bbox_json_results.append(data)

            # segm results
            # some detectors use different scores for bbox and mask
            if isinstance(seg, tuple):
                segms = seg[0][label]
                mask_score = seg[1][label]
            else:
                segms = seg[label]
                mask_score = [bbox[4] for bbox in bboxes]
            for i in range(bboxes.shape[0]):
                data = dict()
                data['image_id'] = img_id
                data['bbox'] = self.xyxy2xywh(bboxes[i])
                data['score'] = float(mask_score[i])
                data['category_id'] = self.cat_ids[label]
                maskencode = maskUtils.encode(np.asfortranarray(segms[i]))
                maskencode['counts'] = maskencode['counts'].decode('ascii')
                # store the encoded RLE (the original assigned the raw mask,
                # leaving the computed encoding unused)
                data['segmentation'] = maskencode
                segm_json_results.append(data)
    return bbox_json_results, segm_json_results

def results2json(self, results, outfile_prefix):
    """Dump the detection results to a COCO style json file.

    There are 3 types of results: proposals, bbox predictions, mask
    predictions, and they have different data types. This method will
    automatically recognize the type, and dump them to json files.

    Args:
        results (list[list | tuple | ndarray]): Testing results of the
            dataset.
        outfile_prefix (str): The filename prefix of the json files. If the
            prefix is "somepath/xxx", the json files will be named
            "somepath/xxx.bbox.json", "somepath/xxx.segm.json",
            "somepath/xxx.proposal.json".

    Returns:
        dict[str: str]: Possible keys are "bbox", "segm", "proposal", and \
            values are corresponding filenames.
    """
    result_files = dict()
    if isinstance(results[0], list):
        json_results = self._det2json(results)
        result_files['bbox'] = f'{outfile_prefix}.bbox.json'
        result_files['proposal'] = f'{outfile_prefix}.bbox.json'
        mmcv.dump(json_results, result_files['bbox'])
    elif isinstance(results[0], tuple):
        json_results = self._ochuman_segm2json(results)
        result_files['bbox'] = f'{outfile_prefix}.bbox.json'
        result_files['proposal'] = f'{outfile_prefix}.bbox.json'
        result_files['segm'] = f'{outfile_prefix}.segm.json'
        mmcv.dump(json_results[0], result_files['bbox'])
        mmcv.dump(json_results[1], result_files['segm'])
    elif isinstance(results[0], np.ndarray):
        json_results = self._proposal2json(results)
        result_files['proposal'] = f'{outfile_prefix}.proposal.json'
        mmcv.dump(json_results, result_files['proposal'])
    else:
        raise TypeError('invalid type of results')
    return result_files
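# For reference, a minimal sketch of the xyxy2xywh helper assumed above
# (mmdetection-style; the project's actual method may differ, e.g. some
# versions add +1 to width/height):
def xyxy2xywh(self, bbox):
    _bbox = bbox.tolist()
    return [_bbox[0], _bbox[1], _bbox[2] - _bbox[0], _bbox[3] - _bbox[1]]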
def instances_to_coco_json(instances, img_id):
    """
    Dump an "Instances" object to a COCO-format json that's used for evaluation.

    Args:
        instances (Instances):
        img_id (int): the image id

    Returns:
        list[dict]: list of json annotations in COCO format.
    """
    num_instance = len(instances)
    if num_instance == 0:
        return []

    boxes = instances.pred_boxes.tensor.numpy()
    boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
    boxes = boxes.tolist()
    scores = instances.scores.tolist()
    classes = instances.pred_classes.tolist()
    attr_classes = instances.attr_classes.tolist()
    attr_scores = instances.attr_scores.tolist()
    # print(len(scores), len(attr_scores), len(attr_classes))
    has_mask = instances.has("pred_masks")
    if has_mask:
        # use RLE to encode the masks, because they are too large and take
        # memory, since this evaluator stores outputs of the entire dataset
        rles = [
            mask_util.encode(
                np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
            for mask in instances.pred_masks
        ]
        for rle in rles:
            # "counts" is an array encoded by mask_util as a byte-stream. Python3's
            # json writer which always produces strings cannot serialize a bytestream
            # unless you decode it. Thankfully, utf-8 works out (which is also what
            # the pycocotools/_mask.pyx does).
            rle["counts"] = rle["counts"].decode("utf-8")

    has_keypoints = instances.has("pred_keypoints")
    if has_keypoints:
        keypoints = instances.pred_keypoints

    results = []
    for k in range(num_instance):
        result = {
            "image_id": img_id,
            "category_id": classes[k],
            "bbox": boxes[k],
            "score": scores[k],
            "attributes": attr_classes[k],
            "attr_scores": attr_scores[k],
        }
        if has_mask:
            result["segmentation"] = rles[k]
        if has_keypoints:
            # In COCO annotations,
            # keypoints coordinates are pixel indices.
            # However our predictions are floating point coordinates.
            # Therefore we subtract 0.5 to be consistent with the annotation format.
            # This is the inverse of data loading logic in `datasets/coco.py`.
            keypoints[k][:, :2] -= 0.5
            result["keypoints"] = keypoints[k].flatten().tolist()
        results.append(result)
    return results
def eval_sequence(self, data):
    """Returns J&F metrics for one sequence"""

    # Only loaded when run to reduce minimum requirements
    from pycocotools import mask as mask_utils

    num_timesteps = data['num_timesteps']
    num_tracker_ids = data['num_tracker_ids']
    num_gt_ids = data['num_gt_ids']
    gt_dets = data['gt_dets']
    tracker_dets = data['tracker_dets']
    gt_ids = data['gt_ids']
    tracker_ids = data['tracker_ids']

    # get shape of frames
    frame_shape = None
    if num_gt_ids > 0:
        for t in range(num_timesteps):
            if len(gt_ids[t]) > 0:
                frame_shape = gt_dets[t][0]['size']
                break
    elif num_tracker_ids > 0:
        for t in range(num_timesteps):
            if len(tracker_ids[t]) > 0:
                frame_shape = tracker_dets[t][0]['size']
                break

    if frame_shape:
        # append all zero masks for timesteps in which tracks do not have a
        # detection
        zero_padding = np.zeros((frame_shape), order='F').astype(np.uint8)
        padding_mask = mask_utils.encode(zero_padding)
        for t in range(num_timesteps):
            gt_id_det_mapping = {
                gt_ids[t][i]: gt_dets[t][i]
                for i in range(len(gt_ids[t]))
            }
            gt_dets[t] = [
                gt_id_det_mapping[index]
                if index in gt_ids[t] else padding_mask
                for index in range(num_gt_ids)
            ]
            tracker_id_det_mapping = {
                tracker_ids[t][i]: tracker_dets[t][i]
                for i in range(len(tracker_ids[t]))
            }
            tracker_dets[t] = [
                tracker_id_det_mapping[index]
                if index in tracker_ids[t] else padding_mask
                for index in range(num_tracker_ids)
            ]
        # also perform zero padding if number of tracker IDs < number of
        # ground truth IDs
        if num_tracker_ids < num_gt_ids:
            diff = num_gt_ids - num_tracker_ids
            for t in range(num_timesteps):
                tracker_dets[t] = tracker_dets[t] + [
                    padding_mask for _ in range(diff)
                ]
            num_tracker_ids += diff

    j = self._compute_j(gt_dets, tracker_dets, num_gt_ids, num_tracker_ids,
                        num_timesteps)

    # boundary threshold for F computation
    bound_th = 0.008

    # perform matching
    if self.optim_type == 'J&F':
        f = np.zeros_like(j)
        for k in range(num_tracker_ids):
            for i in range(num_gt_ids):
                f[k, i, :] = self._compute_f(gt_dets, tracker_dets, k, i,
                                             bound_th)
        optim_metrics = (np.mean(j, axis=2) + np.mean(f, axis=2)) / 2
        row_ind, col_ind = linear_sum_assignment(-optim_metrics)
        j_m = j[row_ind, col_ind, :]
        f_m = f[row_ind, col_ind, :]
    elif self.optim_type == 'J':
        optim_metrics = np.mean(j, axis=2)
        row_ind, col_ind = linear_sum_assignment(-optim_metrics)
        j_m = j[row_ind, col_ind, :]
        f_m = np.zeros_like(j_m)
        for i, (tr_ind, gt_ind) in enumerate(zip(row_ind, col_ind)):
            f_m[i] = self._compute_f(gt_dets, tracker_dets, tr_ind, gt_ind,
                                     bound_th)
    else:
        raise TrackEvalException(
            'Unsupported optimization type %s for J&F metric.'
            % self.optim_type)

    # append zeros for false negatives
    if j_m.shape[0] < data['num_gt_ids']:
        diff = data['num_gt_ids'] - j_m.shape[0]
        j_m = np.concatenate((j_m, np.zeros((diff, j_m.shape[1]))), axis=0)
        f_m = np.concatenate((f_m, np.zeros((diff, f_m.shape[1]))), axis=0)

    # compute the metrics for each ground truth track
    res = {
        'J-Mean': [np.nanmean(j_m[i, :]) for i in range(j_m.shape[0])],
        'J-Recall': [
            np.nanmean(j_m[i, :] > 0.5 + np.finfo('float').eps)
            for i in range(j_m.shape[0])
        ],
        'F-Mean': [np.nanmean(f_m[i, :]) for i in range(f_m.shape[0])],
        'F-Recall': [
            np.nanmean(f_m[i, :] > 0.5 + np.finfo('float').eps)
            for i in range(f_m.shape[0])
        ],
        'J-Decay': [],
        'F-Decay': []
    }
    n_bins = 4
    ids = np.round(
        np.linspace(1, data['num_timesteps'], n_bins + 1) + 1e-10) - 1
    ids = ids.astype(np.uint8)

    for k in range(j_m.shape[0]):
        d_bins_j = [
            j_m[k][ids[i]:ids[i + 1] + 1] for i in range(0, n_bins)
        ]
        res['J-Decay'].append(
            np.nanmean(d_bins_j[0]) - np.nanmean(d_bins_j[3]))
    for k in range(f_m.shape[0]):
        d_bins_f = [
            f_m[k][ids[i]:ids[i + 1] + 1] for i in range(0, n_bins)
        ]
        res['F-Decay'].append(
            np.nanmean(d_bins_f[0]) - np.nanmean(d_bins_f[3]))

    # count number of tracks for weighting of the result
    res['num_gt_tracks'] = len(res['J-Mean'])
    for field in [
            'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay'
    ]:
        res[field] = np.mean(res[field])
    res['J&F'] = (res['J-Mean'] + res['F-Mean']) / 2
    return res
def get_seg_masks(self, mask_pred, det_bboxes, det_labels, rcnn_test_cfg,
                  ori_shape, scale_factor, rescale):
    """Get segmentation masks from mask_pred and bboxes.

    Args:
        mask_pred (Tensor or ndarray): shape (n, #class+1, h, w). For
            single-scale testing, mask_pred is the direct output of model,
            whose type is Tensor, while for multi-scale testing, it will be
            converted to numpy array outside of this method.
        det_bboxes (Tensor): shape (n, 4/5)
        det_labels (Tensor): shape (n, )
        img_shape (Tensor): shape (3, )
        rcnn_test_cfg (dict): rcnn testing config
        ori_shape: original image size

    Returns:
        list[list]: encoded masks
    """
    if isinstance(mask_pred, torch.Tensor):
        mask_pred = mask_pred.sigmoid().cpu().numpy()
    assert isinstance(mask_pred, np.ndarray)

    cls_segms = [[] for _ in range(self.num_classes - 1)]
    bboxes = det_bboxes.cpu().numpy()[:, :4]
    labels = det_labels.cpu().numpy() + 1

    if rescale:
        # keep the img size as the ori size.
        img_h, img_w = ori_shape[:2]
    else:
        # if not rescale, that means the output bboxes fit to
        # the size of input images.
        img_h = np.round(ori_shape[0] * scale_factor).astype(np.int32)
        img_w = np.round(ori_shape[1] * scale_factor).astype(np.int32)
        scale_factor = 1.0

    for i in range(bboxes.shape[0]):
        # the bboxes are fit to the rescaled image.
        bbox = (bboxes[i, :] / scale_factor).astype(np.int32)
        label = labels[i]

        if not self.class_agnostic:
            mask_pred_ = mask_pred[i, label, :, :]
        else:
            mask_pred_ = mask_pred[i, 0, :, :]

        # here to add expand mask and expand bbox.
        paded_mask_, scale = self.expand_mask(mask_pred_)
        bbox = self.expand_bbox(bbox, scale)

        w = max(bbox[2] - bbox[0] + 1, 1)
        h = max(bbox[3] - bbox[1] + 1, 1)

        im_mask = np.zeros((img_h, img_w), dtype=np.uint8)
        # expand bbox before.
        bbox_mask = mmcv.imresize(paded_mask_, (w, h))
        bbox_mask = (bbox_mask > rcnn_test_cfg.mask_thr_binary).astype(
            np.uint8)

        # add according to maskrcnn benchmark
        x0 = max(bbox[0], 0)
        x1 = min(bbox[2] + 1, img_w)
        y0 = max(bbox[1], 0)
        y1 = min(bbox[3] + 1, img_h)

        # im_mask[bbox[1]:bbox[1] + h, bbox[0]:bbox[0] + w] = bbox_mask
        im_mask[y0:y1, x0:x1] = bbox_mask[(y0 - bbox[1]):(y1 - bbox[1]),
                                          (x0 - bbox[0]):(x1 - bbox[0])]
        rle = mask_util.encode(
            np.array(im_mask[:, :, np.newaxis], order='F'))[0]
        cls_segms[label - 1].append(rle)

    return cls_segms
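# A minimal sketch (assumed; mirrors the zero-padding trick described in the
# segm_results functions above) of the expand_mask/expand_bbox helpers used by
# get_seg_masks: pad the M x M mask with a 1-pixel zero border and grow the
# box by the matching (M + 2) / M factor.
import numpy as np

def expand_mask(mask_pred_):
    M = mask_pred_.shape[-1]
    padded = np.zeros((M + 2, M + 2), dtype=np.float32)
    padded[1:-1, 1:-1] = mask_pred_
    return padded, (M + 2.0) / M

def expand_bbox(bbox, scale):
    x_c, y_c = (bbox[0] + bbox[2]) * 0.5, (bbox[1] + bbox[3]) * 0.5
    w_half = (bbox[2] - bbox[0]) * 0.5 * scale
    h_half = (bbox[3] - bbox[1]) * 0.5 * scale
    return np.array([x_c - w_half, y_c - h_half,
                     x_c + w_half, y_c + h_half]).astype(np.int32)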
def create_coco_style(self, input_path, des):
    maxInt = sys.maxsize
    while True:
        try:
            csv.field_size_limit(maxInt)
            break
        except:
            maxInt = int(maxInt / 10)

    data_path = input_path + 'train.csv'
    # json_des = '/kaggle/working/label_descriptions.json'
    # with open(json_des, 'r') as f:
    #     des = json.load(f)
    info = des['info']
    categories = des['categories']
    attributes = des['attributes']

    # f = open(data_path, 'r')
    X = pd.read_csv(data_path)
    X_train, X_test = train_test_split(X, test_size=0.2)
    # X_train = X
    X_dtrain = X_train.to_dict('records', into=OrderedDict)
    X_dtest = X_test.to_dict('records', into=OrderedDict)
    # reader = csv.DictReader(f)  # fieldnames=('imageid', 'height', 'width', 'encodedpixels', 'classid')

    rows_train = []
    rows_test = []
    myorder = ['ImageId', 'Height', 'Width', 'EncodedPixels', 'ClassId']
    image_id = 1
    segmentation_id = 1

    coco_output = {
        "info": info,
        "licenses": "",
        "categories": categories,
        "images": [],
        "annotations": []
    }
    # deep-copy so the test output does not alias the train dict (a plain
    # assignment would make both names point at the same object and the test
    # json would also accumulate every training entry)
    coco_output_test = copy.deepcopy(coco_output)

    with open('{}train.txt'.format(input_path), 'w') as output_text_file:
        for row in X_dtrain:
            # Write training text
            output_text_file.write('{} '.format(row['ImageId']))

            # Ordered Dict
            ordered = OrderedDict((k, row[k]) for k in myorder)
            # ordered['EncodedPixels'] = list(map(int, ordered['EncodedPixels'].split(' ')))
            if len(ordered['ClassId']) > 2:
                classes = [list(map(int, ordered['ClassId'].split('_')))]
                ordered['ClassId'] = classes[0][0]
            else:
                ordered['ClassId'] = int(ordered['ClassId'])

            # COCO
            image_info = pycoco.create_image_info(
                image_id, input_path + row['ImageId'],
                (row['Width'], row['Height']))
            coco_output["images"].append(image_info)

            rle, binary_mask = rle_decode(row['EncodedPixels'],
                                          (row['Height'], row['Width']))
            fortran_binary_mask = np.asfortranarray(
                binary_mask.astype(np.uint8))
            binary_mask_encoded = mask.encode(fortran_binary_mask)
            # rle2 = pycoco.binary_mask_to_rle(fortran_binary_mask)
            area = mask.area(binary_mask_encoded)
            bounding_box = mask.toBbox(binary_mask_encoded)

            annotation_info = {
                "id": segmentation_id,
                "image_id": image_id,
                "category_id": ordered['ClassId'],
                "iscrowd": 1,
                "area": area.tolist(),
                "bbox": bounding_box.tolist(),
                "segmentation": rle,
                "width": row['Width'],
                "height": row['Height'],
            }
            coco_output["annotations"].append(annotation_info)
            segmentation_id += 1
            image_id += 1
            rows_train.append(ordered)

    with open('{}train.json'.format(input_path), 'w') as output_json_file:
        json.dump(coco_output, output_json_file)

    with open('{}test.txt'.format(input_path), 'w') as output_text_file:
        for row in X_dtest:
            # Write test text
            output_text_file.write('{} '.format(row['ImageId']))

            # Ordered Dict
            ordered = OrderedDict((k, row[k]) for k in myorder)
            # ordered['EncodedPixels'] = list(map(int, ordered['EncodedPixels'].split(' ')))
            if len(ordered['ClassId']) > 2:
                classes = [list(map(int, ordered['ClassId'].split('_')))]
                ordered['ClassId'] = classes[0][0]
            else:
                ordered['ClassId'] = int(ordered['ClassId'])

            # COCO
            image_info = pycoco.create_image_info(
                image_id, input_path + row['ImageId'],
                (row['Width'], row['Height']))
            coco_output_test["images"].append(image_info)

            rle, binary_mask = rle_decode(row['EncodedPixels'],
                                          (row['Height'], row['Width']))
            fortran_binary_mask = np.asfortranarray(
                binary_mask.astype(np.uint8))
            binary_mask_encoded = mask.encode(fortran_binary_mask)
            # rle2 = pycoco.binary_mask_to_rle(fortran_binary_mask)
            area = mask.area(binary_mask_encoded)
            bounding_box = mask.toBbox(binary_mask_encoded)

            annotation_info = {
                "id": segmentation_id,
                "image_id": image_id,
                "category_id": ordered['ClassId'],
                "iscrowd": 1,
                "area": area.tolist(),
                "bbox": bounding_box.tolist(),
                "segmentation": rle,
                "width": row['Width'],
                "height": row['Height'],
            }
            coco_output_test["annotations"].append(annotation_info)
            segmentation_id += 1
            image_id += 1
            rows_test.append(ordered)  # was rows_train in the test loop

    with open('{}test.json'.format(input_path), 'w') as output_json_file:
        json.dump(coco_output_test, output_json_file)
# save as json
pred = []
for j, (box, prob, label) in enumerate(
        zip(final_boxes, final_probs, final_labels)):
    box[2] -= box[0]
    box[3] -= box[1]  # produce x,y,w,h output

    cat_id = label
    cat_name = targetid2class[cat_id]

    # encode mask
    rle = None
    if args.add_mask:
        final_mask = final_masks[j]  # [14, 14]
        rle = cocomask.encode(
            np.array(final_mask[:, :, None], order="F"))[0]
        rle['counts'] = rle['counts'].decode("ascii")

    res = {
        "category_id": cat_id,  # [0-80]
        "cat_name": cat_name,
        "score": float(round(prob, 7)),
        "bbox": list(map(lambda x: float(round(x, 2)), box)),
        "segmentation": rle,
    }
    pred.append(res)

# predfile = os.path.join(args.out_dir, "%s_F_%08d.json" % (videoname, cur_frame))
if args.use_my_naming:
    predfile = os.path.join(
# person is first class in COCO
# so extract person bbox and mask from the entire result
result_person_only = ([result[0][0]], [result[1][0]])
result_bbox = [result[0][0]]
result_mask = [result[1][0]]

# gt.txt structure:
#   frameid, objid, clsid, img_h, img_w, rle_code
# clsid = 1; we set the clsid to objid for easier labeling
start = 0
pairs = sorted(zip(result_bbox[0], result_mask[0]), key=lambda x: x[0][0])
for bbox, mask in pairs:
    if bbox[-1] >= 0.9:
        rle_code = cocomask.encode(np.asfortranarray(mask))
        f.write('{} {} {} {} {} {}\n'.format(
            i + 1, start, 1, 720, 1280,
            rle_code['counts'].decode('ascii')))
        start += 1

model.show_result(frame, result, score_thr=1.1,
                  out_file='../msc/cam4/img/{:06d}.jpg'.format(i + 1))
model.show_result(frame, result_person_only, score_thr=0.9,
                  bbox_color='red', text_color='red',
                  out_file='../msc/cam4/det/{:06d}.jpg'.format(i + 1))
def prepare_for_coco_segmentation(predictions, dataset):
    import pycocotools.mask as mask_util
    import numpy as np

    masker = Masker(threshold=0.5, padding=1)
    # assert isinstance(dataset, COCODataset)
    coco_results = []
    # iterate over images by id
    for image_id, prediction in tqdm(enumerate(predictions)):
        original_id = dataset.id_to_img_map[image_id]
        if len(prediction) == 0:
            continue

        # get the original image info (width/height)
        img_info = dataset.get_img_info(image_id)
        image_width = img_info["width"]
        image_height = img_info["height"]
        prediction = prediction.resize((image_width, image_height))
        masks = prediction.get_field("mask")
        # t = time.time()
        # Masker is necessary only if masks haven't been already resized.
        if list(masks.shape[-2:]) != [image_height, image_width]:
            masks = masker(masks.expand(1, -1, -1, -1, -1), prediction)
            masks = masks[0]
        # logger.info('Time mask: {}'.format(time.time() - t))
        # prediction = prediction.convert('xywh')

        # boxes = prediction.bbox.tolist()
        scores = prediction.get_field("scores").tolist()
        labels = prediction.get_field("labels").tolist()
        # get the component categories
        component_scores = prediction.get_field("component_scores").tolist()
        components = prediction.get_field("components").tolist()

        # rles = prediction.get_field('mask')
        rles = [
            mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0]
            for mask in masks
        ]
        for rle in rles:
            rle["counts"] = rle["counts"].decode("utf-8")

        mapped_labels = [
            dataset.contiguous_category_id_to_json_id[i] for i in labels
        ]
        # mapping between predicted values and category ids
        mapped_components = [
            dataset.contiguous_component_id_to_json_id[i] for i in components
        ]

        coco_results.extend([{
            "image_id": original_id,
            "category_id": mapped_labels[k],
            "component_id": mapped_components[k],
            "segmentation": rle,
            "score": scores[k] * component_scores[k],
            "component_score": component_scores[k],
        } for k, rle in enumerate(rles)])
    return coco_results
def rle_from_binary(prediction):
    # pycocotools expects a Fortran-ordered uint8 array of {0, 1} values
    prediction = np.asfortranarray(prediction)
    return cocomask.encode(prediction)
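# Usage sketch: the caller is responsible for binarizing and casting to uint8
# before encoding (the probability map and 0.5 threshold below are
# illustrative assumptions, not from the source).
import numpy as np

prob_map = np.random.rand(32, 32)  # hypothetical model output
rle = rle_from_binary((prob_map > 0.5).astype(np.uint8))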
def maskify(im, crop_box, threshold, positive_histogram, negative_histogram):
    """
    For each annotation, create a COCO formatted segmentation

    Arguments:
    - im: The input image
    - crop_box: Tuple of coordinates isolating object of interest
    - threshold: Percentage value representing likelihood of a pixel belonging
      to the positive histogram class, or 'auto' to use the median
    - positive_histogram: Histogram representing pixels pertaining to an object
    - negative_histogram: Histogram representing non-example pixels pertaining
      to an object

    Return:
    Array of COCO styled segmentation annotations
    """
    # (the original signature listed `threshold` twice, which is a SyntaxError)
    # Get the size of the image
    original_rows, original_cols = im.size
    # Crop the image around the bounding box
    im = im.crop(crop_box)
    # Load pixel RGB data
    pix = im.load()
    # Get row and cols of cropped image
    cols, rows = im.size
    # Convert cropped image to numpy array
    im = np.array(im)
    # Get the height and width of the cropped image
    rows = np.shape(im)[0]
    cols = np.shape(im)[1]
    # Get histogram bins
    histogram_bins = np.shape(positive_histogram)[0]
    # Get the factor based on the histogram bins. Used to index into the histogram.
    factor = 256 / histogram_bins
    # Declare a results numpy array that contains only zeros
    result = np.zeros((rows, cols))

    # Determine the probability of water given RGB and histograms representing
    # water and non-water
    for row in range(rows):
        for col in range(cols):
            # Get each RGB value
            red = float(pix[col, row][0])
            green = float(pix[col, row][1])
            blue = float(pix[col, row][2])
            # Get the index into histograms based on RGB value and histogram
            # factor size (declared above)
            red_index = floor(red / factor)
            green_index = floor(green / factor)
            blue_index = floor(blue / factor)
            # Get positive and negative values from histograms
            positive = positive_histogram[red_index, green_index, blue_index]
            negative_value = negative_histogram[red_index, green_index, blue_index]
            total = positive + negative_value
            if total != 0:  # was `is not 0`, an identity (not value) check
                # assumed intent: the original referenced an undefined
                # `water_value`, presumably the positive-class value
                result[row, col] = positive / total

    # Set threshold equal to the median value of the resulting numpy array
    # when 'auto' is requested
    threshold = np.median(result) if threshold == 'auto' else threshold
    # The intuition here is that if our threshold is equal to the median value
    # of the resulting array, then there will be a largest connected component.
    # Any other value, and we're risking the possibility of no largest
    # connected component existing, which is a potential error that we have to
    # account for.
    if threshold != np.median(result):
        result_backup = np.copy(result)

    # Parse values of result given threshold
    for row in range(rows):
        for col in range(cols):
            if result[row, col] < threshold:
                result[row, col] = 1
            else:
                result[row, col] = 0

    # Retry if all values in result are 0 (ie - no largest connected component)
    if np.sum(result) == 0:
        result = result_backup
        for row in range(rows):
            for col in range(cols):
                if result[row, col] < np.median(result):
                    result[row, col] = 1
                else:
                    result[row, col] = 0

    # Get the largest connected component
    labels = label(result)
    assert labels.max() != 0  # assume at least 1 CC
    largestCC = labels == np.argmax(np.bincount(labels.flat)[1:]) + 1
    # Fill holes in the boat
    largestCC = binary_fill_holes(largestCC)
    # Dilate to expand the mask
    largestCC = binary_dilation(largestCC, iterations=4)
    plt.imshow(largestCC)

    # Create numpy zeros array the same size as the original image before cropping
    image_with_mask = np.zeros((original_cols, original_rows))
    # Overlay binary mask onto zeros array
    image_with_mask[crop_box[1]:crop_box[1] + rows,
                    crop_box[0]:crop_box[0] + cols] = largestCC

    # Convert the binary mask to COCO JSON format. Code referenced from:
    # https://github.com/cocodataset/cocoapi/issues/131#issuecomment-371250565
    image_with_mask = np.array(image_with_mask, dtype=np.uint8)
    fortran_ground_truth_binary_mask = np.asfortranarray(image_with_mask)
    encoded_ground_truth = mask.encode(fortran_ground_truth_binary_mask)
    ground_truth_area = mask.area(encoded_ground_truth)
    ground_truth_bounding_box = mask.toBbox(encoded_ground_truth)
    contours = measure.find_contours(image_with_mask, 0.5)

    segmentations = []
    for contour in contours:
        contour = np.flip(contour, axis=1)
        segmentation = contour.ravel().tolist()
        segmentations.append(segmentation)

    return segmentations
def evaluate_coco(generator, model, threshold=0.05):
    # start collecting results
    results = []
    image_ids = []
    for index in range(generator.size()):
        image = generator.load_image(index)
        image_shape = image.shape
        image = generator.preprocess_image(image)
        image, scale = generator.resize_image(image)

        # run network
        outputs = model.predict_on_batch(np.expand_dims(image, axis=0))
        boxes = outputs[-4]
        scores = outputs[-3]
        labels = outputs[-2]
        masks = outputs[-1]

        # correct boxes for image scale
        boxes /= scale

        # change to (x, y, w, h) (MS COCO standard)
        boxes[..., 2] -= boxes[..., 0]
        boxes[..., 3] -= boxes[..., 1]

        # compute predicted labels and scores
        for box, score, label, mask in zip(boxes[0], scores[0], labels[0], masks[0]):
            # scores are sorted by the network
            if score < threshold:
                break

            b = box.astype(int)  # box (x, y, w, h) as one int vector

            mask = cv2.resize(mask[:, :, label], (b[2], b[3]))
            mask = (mask > 0.5).astype(np.uint8)  # binarize for encoding as RLE

            segmentation = np.zeros((image_shape[0], image_shape[1]), dtype=np.uint8)
            segmentation[b[1]:b[1] + b[3], b[0]:b[0] + b[2]] = mask
            segmentation = mask_utils.encode(np.asfortranarray(segmentation))

            # append boxes for each positively labeled class
            image_result = {
                'image_id': generator.image_ids[index],
                'category_id': generator.label_to_coco_label(label),
                'score': float(score),
                'bbox': box.tolist(),
                'segmentation': segmentation
            }

            # convert bytes to str to write in json (in Python 3)
            if not isinstance(image_result['segmentation']['counts'], str):
                image_result['segmentation']['counts'] = image_result['segmentation']['counts'].decode()

            # append detection to results
            results.append(image_result)

        # append image to list of processed images
        image_ids.append(generator.image_ids[index])

        # print progress
        print('{}/{}'.format(index, generator.size()), end='\r')

    if not len(results):
        return

    # write output
    json.dump(results, open('{}_segm_results.json'.format(generator.set_name), 'w'), indent=4)
    json.dump(image_ids, open('{}_processed_image_ids.json'.format(generator.set_name), 'w'), indent=4)

    # load results in COCO evaluation tool
    coco_true = generator.coco
    coco_pred = coco_true.loadRes('{}_segm_results.json'.format(generator.set_name))

    # run COCO evaluation
    coco_eval = COCOeval(coco_true, coco_pred, 'segm')
    coco_eval.params.imgIds = image_ids
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
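# Why evaluate_coco decodes 'counts' above: in Python 3, pycocotools returns
# the RLE counts as bytes, which json cannot serialize. A self-contained check:
import json
import numpy as np
from pycocotools import mask as mask_utils

m = np.zeros((4, 4), dtype=np.uint8)
m[1:3, 1:3] = 1
rle = mask_utils.encode(np.asfortranarray(m))
rle['counts'] = rle['counts'].decode()  # bytes -> str, now json-serializable
print(json.dumps(rle))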
def bbox_merge(dets, segs, iou_thr, scr_thr, mask_thr):
    # dets: [[x1, y1, x2, y2, score], ... ]
    if dets.shape[0] <= 1:
        return dets, segs
    order = dets[:, -1].ravel().argsort()[::-1]
    dets = dets[order, :]
    scr_keep_inds = (np.where(dets[:, -1] > scr_thr))[0]
    dets = dets[scr_keep_inds, :]
    segs = [segs[ind] for ind in scr_keep_inds]
    dets_res = np.zeros([0, 5])
    segs_res = []
    imgHeight, imgWidth = 1024, 2048
    while dets.shape[0] > 0:
        num = dets.shape[0]
        # IoU
        area = (dets[:, 2] - dets[:, 0] + 1) * (dets[:, 3] - dets[:, 1] + 1)
        xx1 = np.maximum(dets[0, 0], dets[:, 0])
        yy1 = np.maximum(dets[0, 1], dets[:, 1])
        xx2 = np.minimum(dets[0, 2], dets[:, 2])
        yy2 = np.minimum(dets[0, 3], dets[:, 3])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        o = inter / (area[0] + area[:] - inter)

        # get the dets to merge and delete them from the remaining dets and segs
        merge_inds = np.where(o >= iou_thr)[0]
        dets_to_merge = dets[merge_inds, :]
        segs_to_merge = [segs[ind] for ind in merge_inds]
        dets = np.delete(dets, merge_inds, 0)
        segs = [segs[i] for i in range(num) if i not in merge_inds]

        if merge_inds.shape[0] <= 1:
            dets_res = np.row_stack((dets_res, dets_to_merge))
            segs_res += segs_to_merge
        else:
            # score-weighted average of the boxes; keep the max score
            scores = dets_to_merge[:, -1:]
            dets_to_merge[:, :-1] = dets_to_merge[:, :-1] * np.tile(scores, (1, 4))
            max_score = np.max(scores)
            det_merged = np.zeros((1, 5))
            det_merged[:, :-1] = np.sum(dets_to_merge[:, :-1], axis=0) / np.sum(scores)
            det_merged[:, -1] = max_score
            dets_res = np.row_stack((dets_res, det_merged))

            # score-weighted vote over the decoded masks, then re-binarize
            img = np.zeros((imgHeight, imgWidth))
            for i in range(merge_inds.shape[0]):
                mask = maskUtils.decode(segs_to_merge[i]).astype(bool)
                img[mask] += scores[i, -1]
            img = img / np.max(img)
            img[img >= mask_thr] = 1
            img[img < mask_thr] = 0
            img = img.astype(np.uint8)
            seg_merged = maskUtils.encode(
                np.array(img[:, :, np.newaxis], order='F'))[0]
            segs_res.append(seg_merged)
    return dets_res, segs_res
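# A toy invocation of bbox_merge, assuming pycocotools as maskUtils. The two
# overlapping detections and the thresholds are made up; the function expects
# Cityscapes-sized (1024x2048) masks, matching its hardcoded imgHeight/imgWidth.
import numpy as np
from pycocotools import mask as maskUtils

canvas = np.zeros((1024, 2048), dtype=np.uint8, order='F')
canvas[100:200, 100:300] = 1
seg = maskUtils.encode(canvas)
dets = np.array([[100., 100., 300., 200., 0.9],
                 [105., 102., 298., 205., 0.8]])
merged_dets, merged_segs = bbox_merge(dets, [seg, seg],
                                      iou_thr=0.5, scr_thr=0.1, mask_thr=0.5)
print(merged_dets)  # one score-weighted box, with the max score kept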
def _forward(self, step_num):
    video_tag = self.val_data.get_video_tag()
    time_step_id = self.val_data.get_object_idx_in_video()
    img_filename = self.val_data._curr_video_data[time_step_id][0][DataKeys.IMAGE_FILENAMES]
    timestep_name = img_filename.split('/')[-1].replace('.jpg', '')
    if self.print_per_object_stats:
        print("forwarding on", video_tag + ":" + str(time_step_id), "after step", step_num,
              "proposals:", len(self.val_data._curr_video_data[time_step_id]))
    measures = {}

    # Get proposals:
    proposals_dir = self.config.string("bb_input_dir", None)
    output_dir = self.config.string("output_dir", None)
    curr = video_tag + timestep_name.zfill(5) + ".json"
    in_dir = proposals_dir + curr
    out_dir = output_dir + curr
    with open(in_dir, "r") as f:
        proposals = json.load(f)

    for idx in range(self.val_data.n_examples_per_epoch()):
        feed_dict = self.val_data.get_feed_dict_for_next_step()
        step_res = self.trainer.validation_step(
            feed_dict=feed_dict,
            extraction_keys=[
                Extractions.SEGMENTATION_POSTERIORS_ORIGINAL_SIZE,
                Extractions.SEGMENTATION_MASK_ORIGINAL_SIZE,
                DataKeys.OBJ_TAGS
            ])
        extractions = step_res[Extractions.EXTRACTIONS]
        step_measures = step_res[Measures.MEASURES]
        accumulate_measures(measures, step_measures)

        def extract(key):
            if key not in extractions:
                return None
            val = extractions[key]
            # for now assume we only use 1 gpu for forwarding
            assert len(val) == 1, len(val)
            val = val[0]
            # for now assume we use a batch size of 1 for forwarding
            assert val.shape[0] == 1, val.shape[0]
            val = val[0]
            return val

        predicted_segmentation = extract(Extractions.SEGMENTATION_MASK_ORIGINAL_SIZE)
        obj_tag = extract(DataKeys.OBJ_TAGS)
        posteriors = extract(Extractions.SEGMENTATION_POSTERIORS_ORIGINAL_SIZE)

        # Insert the mask into the proposals so the json can be saved directly
        obj_tag = int(obj_tag.decode('utf-8'))
        mask = predicted_segmentation.astype("uint8") * 255
        encoded_mask = encode(np.asfortranarray(mask))
        encoded_mask['counts'] = encoded_mask['counts'].decode("utf-8")
        proposals[obj_tag]["segmentation"] = encoded_mask

        # confidence: posteriors mapped to [-1, 1], flipped where the mask is 0
        conf_scores = posteriors.copy()
        conf_scores[predicted_segmentation == 0] = 1 - posteriors[predicted_segmentation == 0]
        conf_scores = 2 * conf_scores - 1
        conf_score = conf_scores[:].mean()
        proposals[obj_tag]["conf_score"] = str(conf_score)

    create_out_dir = '/'.join(out_dir.split('/')[:-1])
    if not os.path.exists(create_out_dir):
        os.makedirs(create_out_dir)
    with open(out_dir, 'w') as f:
        json.dump(proposals, f)
def add_data_to_coco(self, mode, data_path, category_number):
    if mode == 'train':
        coco_dict = self.train_dict
        coco_images_path = self.coco_train_path
        coco_json_path = self.train_json_path
    elif mode == 'val':
        coco_dict = self.val_dict
        coco_images_path = self.coco_val_path
        coco_json_path = self.val_json_path
    else:
        raise NotImplementedError

    images_path = os.path.join(data_path, 'processed', 'images')
    masks_path = os.path.join(data_path, 'processed', 'image_masks')
    # TODO
    yaml_path = os.path.join(data_path, 'processed', 'door_lever_3_keypoint.yaml')
    with open(yaml_path, 'r') as f:
        dataset_yaml_map = yaml.safe_load(f.read())

    id_index = self.get_dataset_number(mode)
    train_mode = 'Door_' + mode
    for key in dataset_yaml_map.keys():
        # TODO
        origin_file_path = os.path.join(images_path, dataset_yaml_map[key]['rgb_image_filename'])
        target_file_name = train_mode + '_%06d.png' % id_index
        target_file_path = os.path.join(coco_images_path, target_file_name)
        shutil.copyfile(origin_file_path, target_file_path)
        img_dict = {'license': 3,
                    'file_name': target_file_name,
                    'coco_url': '',
                    'height': 480,
                    'width': 640,
                    'date_captured': '2013-11-14 11:18:45',
                    'flickr_url': '',
                    'id': id_index}

        x, y = dataset_yaml_map[key]['bbox_top_left_xy']
        x2, y2 = dataset_yaml_map[key]['bbox_bottom_right_xy']
        w = x2 - x
        h = y2 - y
        area = float(w * h)
        img_number = int(dataset_yaml_map[key]['rgb_image_filename'].split('_')[0])
        mask_file_path = os.path.join(masks_path, "%06d_mask.png" % img_number)

        # NOTE !!! UINT16 or UINT8
        # ground_truth_binary_mask = cv2.convertScaleAbs(cv2.imread(mask_file_path, cv2.IMREAD_UNCHANGED))
        ground_truth_binary_mask = cv2.imread(mask_file_path, cv2.IMREAD_UNCHANGED)
        fortran_ground_truth_binary_mask = np.asfortranarray(ground_truth_binary_mask)
        encoded_ground_truth = mask.encode(fortran_ground_truth_binary_mask)
        ground_truth_area = mask.area(encoded_ground_truth)
        ground_truth_bounding_box = mask.toBbox(encoded_ground_truth)
        contours = measure.find_contours(ground_truth_binary_mask, 0.5)

        annot_dict = {'segmentation': [],
                      'area': ground_truth_area.tolist(),
                      'iscrowd': 0,
                      'image_id': id_index,
                      'bbox': [x, y, w, h],
                      'category_id': category_number,
                      'id': id_index}
        for contour in contours:
            contour = np.flip(contour, axis=1)
            segmentation = contour.ravel().tolist()
            annot_dict["segmentation"].append(segmentation)

        coco_dict['images'].append(img_dict)
        coco_dict['annotations'].append(annot_dict)
        id_index += 1

    with open(coco_json_path, "w") as f:
        json.dump(coco_dict, f)
def postprocess_ytbvis(det_output, img_meta, interpolation_mode='bilinear',
                       display_mask=False, visualize_lincomb=False,
                       crop_masks=True, score_threshold=0,
                       img_ids=None, mask_det_file=None):
    """
    Postprocesses the output of Yolact on testing mode into a format that makes sense,
    accounting for all the possible configuration settings.

    Args:
        - det_output: The list of dicts that Detect outputs.
        - w: The real width of the image.
        - h: The real height of the image.
        - batch_idx: If you have multiple images for this batch, the image's index in the batch.
        - interpolation_mode: Can be 'nearest' | 'area' | 'bilinear' (see torch.nn.functional.interpolate)

    Returns 4 torch Tensors (in the following order):
        - classes [num_det]: The class idx for each detection.
        - scores  [num_det]: The confidence score for each detection.
        - boxes   [num_det, 4]: The bounding box for each detection in absolute point form.
        - masks   [num_det, h, w]: Full image masks for each detection.
    """
    net = det_output['net']
    detection = det_output['detection']
    dets = {}
    for k, v in detection.items():
        dets[k] = v.clone()

    ori_h, ori_w = img_meta['ori_shape'][:2]
    img_h, img_w = img_meta['img_shape'][:2]
    pad_h, pad_w = img_meta['pad_shape'][:2]
    s_w, s_h = (img_w / pad_w, img_h / pad_h)

    if dets['box'].nelement() == 0:
        dets['segm'] = torch.Tensor()
        return dets

    # double check
    if score_threshold > 0:
        keep = dets['score'] > score_threshold
        for k in dets:
            if k not in {'proto', 'bbox_idx', 'priors', 'embed_vectors', 'box_shift'} and dets[k] is not None:
                dets[k] = dets[k][keep]

    # Undo the padding introduced with preserve_aspect_ratio
    if cfg.preserve_aspect_ratio and dets['score'].nelement() != 0:
        # Get rid of any detections whose centers are outside the image
        boxes = dets['box']
        boxes = center_size(boxes)
        not_outside = ((boxes[:, 0] > s_w) + (boxes[:, 1] > s_h)) < 1  # not (a or b)
        for k in dets:
            if k not in {'proto', 'bbox_idx', 'priors', 'embed_vectors', 'box_shift'} and dets[k] is not None:
                dets[k] = dets[k][not_outside]
        if dets['score'].size(0) == 0:
            dets['segm'] = torch.Tensor()
            return dets

    # Actually extract everything from dets now
    boxes = dets['box']
    masks_coeff = dets['mask_coeff']
    masks = dets['mask']
    proto_data = dets['proto']

    if visualize_lincomb:
        display_lincomb(proto_data, masks_coeff, img_ids, mask_det_file)

    # Undo padding for masks
    masks = masks[:, :int(s_h * masks.size(1)), :int(s_w * masks.size(2))]
    # Scale masks up to the full image
    if cfg.preserve_aspect_ratio:
        masks = F.interpolate(masks.unsqueeze(0), (ori_h, ori_w),
                              mode=interpolation_mode, align_corners=False).squeeze(0)
    else:
        masks = F.interpolate(masks.unsqueeze(0), (img_h, img_w),
                              mode=interpolation_mode, align_corners=False).squeeze(0)
    # Binarize the masks
    masks.gt_(0.5)

    if display_mask:
        dets['segm'] = masks
    else:
        # segm annotation: png2rle
        masks_output_json = []
        for i in range(masks.size(0)):
            cur_mask = mask_util.encode(np.array(masks[i].cpu(), order='F', dtype='uint8'))
            masks_output_json.append(cur_mask)
        dets['segm'] = masks_output_json

    # Undo padding for bboxes
    boxes[:, 0::2] = boxes[:, 0::2] / s_w
    boxes[:, 1::2] = boxes[:, 1::2] / s_h
    if cfg.preserve_aspect_ratio:
        out_w = ori_w
        out_h = ori_h
    else:
        out_w = img_w
        out_h = img_h
    boxes[:, 0], boxes[:, 2] = sanitize_coordinates(boxes[:, 0], boxes[:, 2], out_w, cast=False)
    boxes[:, 1], boxes[:, 3] = sanitize_coordinates(boxes[:, 1], boxes[:, 3], out_h, cast=False)
    boxes = boxes.long()
    dets['box'] = boxes
    return dets
def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
    """Converts groundtruths to the dataset in COCO format.

    Args:
        groundtruths: a dictionary of numpy arrays including the fields below.
            Note that each element in the list represents the numbers for a single
            example without the batch dimension. K below denotes the actual number
            of instances for each image.
            Required fields:
                - source_id: a list of numpy arrays of int or string of shape [batch_size].
                - height: a list of numpy arrays of int of shape [batch_size].
                - width: a list of numpy arrays of int of shape [batch_size].
                - num_detections: a list of numpy arrays of int of shape [batch_size].
                - boxes: a list of numpy arrays of float of shape [batch_size, K, 4],
                    where coordinates are in the original image space (not the
                    normalized coordinates).
                - classes: a list of numpy arrays of int of shape [batch_size, K].
            Optional fields:
                - is_crowds: a list of numpy arrays of int of shape [batch_size, K]. If
                    the field is absent, it is assumed that this instance is not crowd.
                - areas: a list of numpy arrays of float of shape [batch_size, K]. If
                    the field is absent, the area is calculated using either boxes or
                    masks depending on which one is available.
                - masks: a list of numpy arrays of string of shape [batch_size, K].
        label_map: (optional) a dictionary that defines items from the category id
            to the category name. If `None`, collect the category mapping from the
            `groundtruths`.

    Returns:
        coco_groundtruths: the groundtruth dataset in COCO format.
    """
    source_ids = np.concatenate(groundtruths['source_id'], axis=0)
    heights = np.concatenate(groundtruths['height'], axis=0)
    widths = np.concatenate(groundtruths['width'], axis=0)
    gt_images = [{'id': int(i), 'height': int(h), 'width': int(w)}
                 for i, h, w in zip(source_ids, heights, widths)]

    gt_annotations = []
    num_batches = len(groundtruths['source_id'])
    batch_size = groundtruths['source_id'][0].shape[0]
    for i in range(num_batches):
        for j in range(batch_size):
            num_instances = groundtruths['num_detections'][i][j]
            for k in range(num_instances):
                ann = {}
                ann['image_id'] = int(groundtruths['source_id'][i][j])
                if 'is_crowds' in groundtruths:
                    ann['iscrowd'] = int(groundtruths['is_crowds'][i][j, k])
                else:
                    ann['iscrowd'] = 0
                ann['category_id'] = int(groundtruths['classes'][i][j, k])
                boxes = groundtruths['boxes'][i]
                ann['bbox'] = [float(boxes[j, k, 1]),
                               float(boxes[j, k, 0]),
                               float(boxes[j, k, 3] - boxes[j, k, 1]),
                               float(boxes[j, k, 2] - boxes[j, k, 0])]
                if 'areas' in groundtruths:
                    ann['area'] = float(groundtruths['areas'][i][j, k])
                else:
                    ann['area'] = float((boxes[j, k, 3] - boxes[j, k, 1]) *
                                        (boxes[j, k, 2] - boxes[j, k, 0]))
                if 'masks' in groundtruths:
                    mask = Image.open(io.BytesIO(groundtruths['masks'][i][j, k]))
                    width, height = mask.size
                    np_mask = (np.array(mask.getdata())
                               .reshape(height, width).astype(np.uint8))
                    np_mask[np_mask > 0] = 255
                    encoded_mask = mask_api.encode(np.asfortranarray(np_mask))
                    ann['segmentation'] = encoded_mask
                    if 'areas' not in groundtruths:
                        ann['area'] = mask_api.area(encoded_mask)
                gt_annotations.append(ann)

    for i, ann in enumerate(gt_annotations):
        ann['id'] = i + 1

    if label_map:
        gt_categories = [{'id': i, 'name': label_map[i]} for i in label_map]
    else:
        category_ids = [gt['category_id'] for gt in gt_annotations]
        gt_categories = [{'id': i} for i in set(category_ids)]

    gt_dataset = {
        'images': gt_images,
        'categories': gt_categories,
        'annotations': copy.deepcopy(gt_annotations),
    }
    return gt_dataset
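# A minimal, hypothetical call with one image and one box (batch size 1, no
# masks), just to show the expected nesting of the input lists; boxes are
# [ymin, xmin, ymax, xmax] in image space, per the bbox conversion above.
import numpy as np

groundtruths = {
    'source_id': [np.array([1])],
    'height': [np.array([480])],
    'width': [np.array([640])],
    'num_detections': [np.array([1])],
    'boxes': [np.array([[[50., 60., 150., 200.]]])],
    'classes': [np.array([[3]])],
}
coco_gt = convert_groundtruths_to_coco_dataset(groundtruths)
print(coco_gt['annotations'][0]['bbox'])  # [60.0, 50.0, 140.0, 100.0]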
# imgIds = coco_ann.getImgIds()
# img = coco_train.loadImgs(imgIds[9])[0]
# img = coco_ann.loadImgs(imgIds[np.random.randint(0, len(imgIds))])[0]
I = io.imread(img['coco_url'])
plt.axis('off')
plt.imshow(I)
plt.show()

# load and display instance annotations
plt.imshow(I)
plt.axis('off')
annIds = coco_ann.getAnnIds(imgIds=img['id'], catIds=catIds, iscrowd=None)
anns = coco_ann.loadAnns(annIds)
print(anns)
print(len(anns))
coco_ann.showAnns(anns, draw_bbox=False)
plt.show()

if len(anns) != 0:
    for i, ann in enumerate(anns):
        mask = coco_ann.annToMask(ann)
        rle = encode(np.asfortranarray(mask))  # encode expects a Fortran-order uint8 array
        print(rle)
        plt.imshow(mask)
        plt.savefig('1.jpg')
        plt.show()
def draw_panoptic_seg_predictions(self, frame, panoptic_seg, segments_info,
                                  area_threshold=None, alpha=0.5):
    frame_visualizer = Visualizer(frame, self.metadata)
    pred = _PanopticPrediction(panoptic_seg, segments_info)

    if self._instance_mode == ColorMode.IMAGE_BW:
        frame_visualizer.output.img = frame_visualizer._create_grayscale_image(
            pred.non_empty_mask())

    # draw mask for all semantic segments first, i.e. "stuff"
    for mask, sinfo in pred.semantic_masks():
        category_idx = sinfo["category_id"]
        try:
            mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
        except AttributeError:
            mask_color = None

        frame_visualizer.draw_binary_mask(
            mask,
            color=mask_color,
            text=self.metadata.stuff_classes[category_idx],
            alpha=alpha,
            area_threshold=area_threshold,
        )

    all_instances = list(pred.instance_masks())
    if len(all_instances) == 0:
        return frame_visualizer.output

    # draw mask for all instances second
    masks, sinfo = list(zip(*all_instances))
    num_instances = len(masks)
    masks_rles = mask_util.encode(
        np.asarray(np.asarray(masks).transpose(1, 2, 0), dtype=np.uint8, order="F"))
    assert len(masks_rles) == num_instances

    category_ids = [x["category_id"] for x in sinfo]
    detected = [
        _DetectedInstance(category_ids[i], bbox=None, mask_rle=masks_rles[i],
                          color=None, ttl=8)
        for i in range(num_instances)
    ]
    colors = self._assign_colors(detected)
    labels = [self.metadata.thing_classes[k] for k in category_ids]

    frame_visualizer.overlay_instances(
        boxes=None,
        masks=masks,
        labels=labels,
        keypoints=None,
        assigned_colors=colors,
        alpha=alpha,
    )
    return frame_visualizer.output
# Increment ann id
annId = annId + 1

# Add a comma and line break after each annotation
outfile.write(unicode(','))
outfile.write(unicode('\n'))

# Add stuff annotations
for i, labelIdx in enumerate(labelsStuff):
    # Create mask and encode it
    labelMask = np.zeros((h, w))
    labelMask[:, :] = S == labelIdx
    labelMask = np.expand_dims(labelMask, axis=2)
    labelMask = labelMask.astype('uint8')
    labelMask = np.asfortranarray(labelMask)
    Rs = mask.encode(labelMask)

    # Create annotation data
    anndata = {}
    anndata['id'] = annId
    anndata['image_id'] = imageIds[imageIdx]
    anndata['category_id'] = labelIdx - oldStuffStartIdx + newStuffStartIdx  # Stuff classes start from 92 in v. 1.1
    anndata['segmentation'] = Rs
    anndata['area'] = float(mask.area(Rs))
    anndata['bbox'] = mask.toBbox(Rs).tolist()
    anndata['iscrowd'] = 1

    # Write JSON
    str_ = json.dumps(anndata, indent=indent, sort_keys=True,
                      separators=separators, ensure_ascii=ensure_ascii)
    outfile.write(unicode(str_))
def test_uncompressed_RLE(self):
    mask = make_mask()
    rle = mask_util.encode(np.asarray(mask, order="F"))
    uncompressed = uncompressed_rle(mask)
    compressed = mask_util.frPyObjects(uncompressed, *rle["size"])
    self.assertEqual(rle, compressed)
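# For reference, the uncompressed RLE this test relies on is just a dict with a
# plain list of run lengths over the column-major flattened mask, starting with
# the count of leading zeros; frPyObjects compresses it. A tiny example:
import numpy as np
import pycocotools.mask as mask_util

m = np.zeros((2, 2), dtype=np.uint8)
m[0, 0] = 1  # column-major flat view: [1, 0, 0, 0]
uncompressed = {"counts": [0, 1, 3], "size": [2, 2]}  # 0 zeros, 1 one, 3 zeros
compressed = mask_util.frPyObjects(uncompressed, 2, 2)
assert (mask_util.decode(compressed) == m).all()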
image_name = image_name.replace("GT", "RGB")
image_json = {
    "height": np_im.shape[0],
    "width": np_im.shape[1],
    "id": img_id,
    "file_name": image_name
}
mainjson["images"].append(image_json)

ground_truth_binary_mask = np_im
fortran_ground_truth_binary_mask = np.asfortranarray(ground_truth_binary_mask)
encoded_ground_truth = mask.encode(fortran_ground_truth_binary_mask)
ground_truth_area = mask.area(encoded_ground_truth)
ground_truth_bounding_box = mask.toBbox(encoded_ground_truth)
contours = measure.find_contours(ground_truth_binary_mask, 0.5)

maskbits = np.array(im).astype(np.uint8)
# mask = resize(mask, (768, 1024), preserve_range=True)
maskbits[maskbits < 255] = 0
maskbits[maskbits == 255] = 1
obj_ids = np.unique(maskbits)
obj_ids = obj_ids[1:]  # drop the background id
masks = maskbits == obj_ids[:, None, None]
num_objs = len(obj_ids)
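# The broadcasting trick above (maskbits == obj_ids[:, None, None]) splits a
# label image into one boolean mask per object id; a tiny illustration with
# made-up values:
import numpy as np

label_img = np.array([[0, 1, 1],
                      [2, 2, 0]], dtype=np.uint8)
obj_ids = np.unique(label_img)[1:]           # drop background id 0 -> [1, 2]
masks = label_img == obj_ids[:, None, None]  # shape (2, 2, 3): one mask per id
print(masks[0].astype(int))                  # mask for object id 1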
def simple_test_mask(self, score_map, corner_offsets, img_meta, det_bboxes, rescale=False):
    '''
    :param score_map: semantic score map, h x w x 80
    :param corner_offsets: predicted per-pixel corner offsets
    :param img_meta:
    :param det_bboxes:
    :param rescale:
    :return:
    '''
    # TODO: solve hardcode
    semantic_map = (score_map > 0.4).astype('int')
    h, w, _ = semantic_map.shape
    instance_map = -np.ones_like(semantic_map)
    border_y, border_x = -img_meta['offset']
    ori_h, ori_w, _ = img_meta['ori_shape']
    _, img_h, img_w = img_meta['img_shape']

    for label, bboxes in enumerate(det_bboxes):
        if (len(bboxes) == 0) or (semantic_map[..., label].sum() == 0):
            continue
        centers = np.array(bboxes)[..., :4]
        centers[..., 0::2] += border_x
        centers[..., 1::2] += border_y
        pixels = semantic_map[..., label]
        if len(bboxes) == 1:
            instance_map[..., label] = pixels - 1
        else:
            for y in range(h):
                for x in range(w):
                    if pixels[y, x] == 0:
                        continue
                    tl_x = 4 * (x + corner_offsets[label, y, x]) - 1
                    tl_y = 4 * (y + corner_offsets[label + 80, y, x]) - 1
                    br_x = 4 * (x + corner_offsets[label + 160, y, x]) - 1
                    br_y = 4 * (y + corner_offsets[label + 240, y, x]) - 1
                    instance_map[y, x, label] = KNN_cluster(
                        centers, np.array([tl_x, tl_y, br_x, br_y]))

    cls_segms = [[] for _ in range(80)]
    for label in range(80):
        map_with_id = instance_map[..., label]
        if map_with_id.max() == -1:
            continue
        for ins_id in range(map_with_id.max() + 1):
            seg_map = (map_with_id == ins_id).astype('float32')
            seg_map *= score_map[..., label]
            seg_map = cv2.resize(seg_map, (img_w, img_h))
            seg_map = (seg_map > 0.4).astype('int')
            if seg_map.sum() == 0:
                continue
            seg_map = np.uint8(seg_map)
            rle = mask_util.encode(
                np.array(seg_map[:, :, np.newaxis], order='F'))[0]
            # rle['counts'].decode()
            # cls_segms[label].append(rle)
            cls_segms[label].append(seg_map)
    return cls_segms