def prepare(self, img):
    """Resize an image and subtract the mean before feature extraction.

    The shorter edge is scaled to :obj:`self.min_size`. If that scaling
    would make the longer edge exceed :obj:`self.max_size`, the scale is
    chosen so that the longer edge equals :obj:`self.max_size` instead.
    The mean value :obj:`self.mean` is subtracted from the resized image.

    Args:
        img (~numpy.ndarray): An image in CHW and RGB format whose values
            are in the range :math:`[0, 255]`.

    Returns:
        ~numpy.ndarray:
        The preprocessed image as ``float32``.

    """
    height, width = img.shape[1:]
    short_edge = min(height, width)
    long_edge = max(height, width)

    ratio = self.min_size / short_edge
    if ratio * long_edge > self.max_size:
        # Scaling by the short edge overshoots; fit the long edge instead.
        ratio = self.max_size / long_edge

    resized = resize(img, (int(height * ratio), int(width * ratio)))
    return (resized - self.mean).astype(np.float32, copy=False)
def _mask_aggregation(bbox, cmask_prob, cmask_weight, size, binary_thresh):
    """Merge weighted, binarized RoI masks into one mask on a shared canvas.

    Each mask probability map is resized to its (rounded) bounding box,
    binarized with ``binary_thresh``, multiplied by its weight and added
    onto a zero canvas of shape ``size``. The canvas is then thresholded
    with ``binary_thresh`` and cropped to the tight box that encloses all
    surviving pixels.

    Args:
        bbox (array): Boxes as ``(y_min, x_min, y_max, x_max)`` rows.
        cmask_prob (sequence of array): One probability map per box.
        cmask_weight (array): One weight per box.
        size (tuple of int): Shape of the canvas the masks are pasted onto.
        binary_thresh (float): Threshold used both to binarize each mask and
            to select pixels of the accumulated canvas.

    Returns:
        tuple: ``(mask, box)`` where ``mask`` is the cropped accumulated
        canvas with a leading axis of length one and ``box`` is a
        ``float32`` array of shape ``(1, 4)``. ``(None, None)`` is returned
        when no canvas pixel reaches ``binary_thresh``.

    """
    n_roi = bbox.shape[0]
    assert n_roi == len(cmask_prob)
    assert n_roi == cmask_weight.shape[0]

    canvas = np.zeros(size, dtype=np.float32)
    for box, prob, weight in zip(bbox, cmask_prob, cmask_weight):
        top, left, bottom, right = np.round(box).astype(np.int32)
        box_h = bottom - top
        box_w = right - left
        if box_h <= 0 or box_w <= 0:
            # Boxes that collapse after rounding contribute nothing.
            continue
        prob = resize(prob.astype(np.float32)[None], (box_h, box_w))
        binary = (prob >= binary_thresh).astype(np.float32)[0]
        canvas[top:bottom, left:right] += binary * weight

    rows, cols = np.where(canvas >= binary_thresh)
    if len(rows) == 0 or len(cols) == 0:
        return None, None

    top, bottom = rows.min(), rows.max() + 1
    left, right = cols.min(), cols.max() + 1
    merged_box = np.array([top, left, bottom, right], dtype=np.float32)
    merged_mask = canvas[top:bottom, left:right]
    return merged_mask[None], merged_box[None]
def prepare(self, img):
    """Rescale an image and remove the mean for feature extraction.

    The image is scaled so that its shorter edge becomes
    :obj:`self.min_size`, unless this would push the longer edge past
    :obj:`self.max_size`; in that case the longer edge is fitted to
    :obj:`self.max_size`. Afterwards :obj:`self.mean` is subtracted.

    Args:
        img (~numpy.ndarray): An image in CHW and RGB format. Its values
            are in the range :math:`[0, 255]`.

    Returns:
        ~numpy.ndarray:
        A preprocessed ``float32`` image.

    """
    _, src_h, src_w = img.shape
    longest = max(src_h, src_w)

    factor = self.min_size / min(src_h, src_w)
    if factor * longest > self.max_size:
        factor = self.max_size / longest

    out_size = (int(src_h * factor), int(src_w * factor))
    img = resize(img, out_size)
    img = img - self.mean
    return img.astype(np.float32, copy=False)
def prepare_img(self, img, infer=False):
    """Resize an image, subtract the mean and report the scale used.

    The target length of the shorter edge is taken from
    :obj:`self.min_sizes`: the last entry when ``infer`` is true, otherwise
    a randomly chosen entry. If scaling the shorter edge to that length
    would make the longer edge exceed :obj:`self.max_size`, the longer edge
    is fitted to :obj:`self.max_size` instead.

    Args:
        img (np.array): RGB image of shape ``[3, H, W]``.
        infer (bool): Use the last entry of :obj:`self.min_sizes` instead
            of a random one.

    Returns:
        tuple: The preprocessed ``float32`` image and the resize scale.

    """
    height, width = img.shape[1], img.shape[2]
    target = self.min_sizes[-1] if infer else random.choice(self.min_sizes)

    short_edge = min(height, width)
    long_edge = max(height, width)
    scale = target / short_edge
    if scale * long_edge > self.max_size:
        scale = self.max_size / long_edge

    resized = resize(img, (int(height * scale), int(width * scale)))
    normalized = (resized - self.mean).astype(np.float32, copy=False)
    return normalized, scale
def prepare(self, img):
    """Resize a three-dimensional image array and subtract the mean.

    The last two axes are treated as height and width. The shorter of the
    two is scaled to ``self.min_size`` unless that would make the longer
    one exceed ``self.max_size``, in which case the longer one is fitted
    to ``self.max_size``. ``self.mean`` is then subtracted and the result
    is returned as ``float32``.
    """
    height, width = img.shape[1:]
    smaller, larger = min(height, width), max(height, width)

    ratio = self.min_size / smaller
    if ratio * larger > self.max_size:
        ratio = self.max_size / larger

    new_size = (int(height * ratio), int(width * ratio))
    centered = resize(img, new_size) - self.mean
    return centered.astype(np.float32, copy=False)
def prepare(self, orig_img, min_size=600, max_size=1000):
    """Return a resized, mean-subtracted ``float32`` copy of an image.

    The input is copied, so ``orig_img`` itself is left untouched. The
    shorter edge is scaled to ``min_size`` unless the longer edge would
    then exceed ``max_size``; in that case the longer edge is fitted to
    ``max_size``. ``self.mean_bgr`` is reversed before being subtracted
    along the channel axis.

    Args:
        orig_img: Image array in ``(C, H, W)`` layout.
        min_size: Target length of the shorter edge.
        max_size: Upper bound for the longer edge.

    Returns:
        The preprocessed image in ``(C, H, W)`` layout.
    """
    work = orig_img.copy().astype(np.float32)
    _, height, width = work.shape

    ratio = min_size / min(height, width)
    if ratio * max(height, width) > max_size:
        ratio = max_size / max(height, width)
    work = resize(work, (int(height * ratio), int(width * ratio)))

    # Subtract in place on a channels-last view so the per-channel mean
    # broadcasts over the trailing axis, then view it as C, H, W again.
    channels_last = work.transpose((1, 2, 0))
    channels_last -= self.mean_bgr[::-1]
    return channels_last.transpose((2, 0, 1))
def prepare(self, img):
    """Resize a three-dimensional image array and divide it by 255.

    The last two axes are treated as height and width. The shorter of the
    two is scaled to ``self.min_size`` unless that would make the longer
    one exceed ``self.max_size``, in which case the longer one is fitted
    to ``self.max_size``. The result is returned as ``float32``.
    """
    height, width = img.shape[1:]
    long_edge = max(height, width)

    ratio = self.min_size / min(height, width)
    if ratio * long_edge > self.max_size:
        ratio = self.max_size / long_edge

    resized = resize(img, (int(height * ratio), int(width * ratio)))
    # The original code only subtracted the mean; this variant instead
    # divides by 255 to see whether that alone works well enough.
    return resized.astype(np.float32) / 255
def __call__(self, in_data):
    """Prepare one sample: resize image, boxes and mask, then randomly flip.

    Args:
        in_data (tuple): Either ``(img, label, bbox, mask, i)`` or
            ``(img, bbox, label, i)``. ``img`` is unpacked as a
            three-dimensional array whose last two axes are height and
            width. The trailing element ``i`` is ignored.

    Returns:
        tuple: ``(img, bbox, label, scale, mask)``. ``mask`` is ``None``
        when the input carried no mask. When ``bbox`` is empty the short
        tuple ``(img, [], [], 1)`` is returned instead; this arity is kept
        as-is for backward compatibility.

    Raises:
        ValueError: If ``in_data`` has neither four nor five elements.
    """
    n_items = len(in_data)
    if n_items == 5:
        img, label, bbox, mask, _ = in_data
    elif n_items == 4:
        img, bbox, label, _ = in_data
        # No mask is supplied in this layout; previously `mask` was left
        # unbound and the function failed as soon as it was used.
        mask = None
    else:
        raise ValueError(
            'in_data must have 4 or 5 elements, got {}'.format(n_items))

    _, H, W = img.shape
    img = self.net.prepare(img)
    _, o_H, o_W = img.shape
    scale = o_H / H

    if len(bbox) == 0:
        return img, [], [], 1

    bbox = resize_bbox(bbox, (H, W), (o_H, o_W))
    if mask is not None:
        mask = resize(mask, (o_H, o_W))

    # horizontal flip
    img, params = transforms.random_flip(
        img, x_random=True, return_param=True)
    bbox = transforms.flip_bbox(bbox, (o_H, o_W), x_flip=params['x_flip'])
    if mask is not None:
        mask = transforms.flip(mask, x_flip=params['x_flip'])
        # NOTE(review): this writes the first mask to the working directory
        # on every call and looks like a debugging leftover. It is kept so
        # side effects do not change; confirm before removing.
        cv2.imwrite("gt_roi.png", mask[0] * 255)

    return img, bbox, label, scale, mask
def prepare_img(conf, img, resolution=-1):
    """Resize an image, subtract the mean and report the scale used.

    The shorter edge is scaled to ``conf.min_sizes[resolution]``. If the
    longer edge would then exceed :obj:`conf.max_size`, the image is scaled
    so that the longer edge equals :obj:`conf.max_size` instead. Finally
    :obj:`conf.mean` is subtracted.

    Args:
        conf: Object providing ``min_sizes``, ``max_size`` and ``mean``.
        img (np.array): RGB image of shape ``[3, H, W]``.
        resolution (int): Index into ``conf.min_sizes``.

    Returns:
        tuple: The preprocessed ``float32`` image and the resize scale.

    """
    height, width = img.shape[1], img.shape[2]
    short_edge = min(height, width)
    long_edge = max(height, width)

    scale = conf.min_sizes[resolution] / short_edge
    if scale * long_edge > conf.max_size:
        scale = conf.max_size / long_edge

    resized = resize(img, (int(height * scale), int(width * scale)))
    normalized = (resized - conf.mean).astype(np.float32, copy=False)
    return normalized, scale
def __call__(self, in_data):
    """Prepare one sample: map labels, resize, and flip while training.

    Args:
        in_data (tuple): Either ``(img, label, bbox, mask, i)`` or
            ``(img, bbox, label, i)``. ``img`` is unpacked as a
            three-dimensional array whose last two axes are height and
            width. Every label must be present in ``self.labelids``.

    Returns:
        tuple: ``(img, bbox, label, scale, mask, i)``. ``label`` holds the
        one-based position of each input label in ``self.labelids``.
        ``mask`` is ``None`` when the input carried no mask. When ``bbox``
        is empty the short tuple ``(img, [], [], 1)`` is returned instead;
        this arity is kept as-is for backward compatibility.

    Raises:
        ValueError: If ``in_data`` has neither four nor five elements, or
            if a label is missing from ``self.labelids``.
    """
    n_items = len(in_data)
    if n_items == 5:
        img, label, bbox, mask, i = in_data
    elif n_items == 4:
        img, bbox, label, i = in_data
        # No mask is supplied in this layout; previously `mask` was left
        # unbound and the function failed as soon as it was used.
        mask = None
    else:
        raise ValueError(
            'in_data must have 4 or 5 elements, got {}'.format(n_items))

    label = [self.labelids.index(name) + 1 for name in label]

    _, H, W = img.shape
    # NOTE(review): the source was flattened onto one line; only the call to
    # `prepare` is assumed to be guarded by the train flag, because the
    # statements after it are needed in both modes. Confirm.
    if chainer.config.train:
        img = self.net.prepare(img)
    _, o_H, o_W = img.shape
    scale = o_H / H

    if len(bbox) == 0:
        return img, [], [], 1

    bbox = resize_bbox(bbox, (H, W), (o_H, o_W))
    if mask is not None:
        mask = resize(mask, (o_H, o_W))

    if chainer.config.train:
        # horizontal flip
        img, params = transforms.random_flip(
            img, x_random=True, return_param=True)
        bbox = transforms.flip_bbox(
            bbox, (o_H, o_W), x_flip=params['x_flip'])
        if mask is not None:
            mask = transforms.flip(mask, x_flip=params['x_flip'])

    return img, bbox, label, scale, mask, i
def __call__(
        self, roi, mask, label, bbox,
        loc_normalize_mean=(0., 0., 0., 0.),
        loc_normalize_std=(0.2, 0.2, 0.5, 0.5),
        mask_size=(21, 21),
):
    """Assigns ground truth to sampled proposals.

    This function samples a total of :obj:`self.n_sample` RoIs from the
    combination of :obj:`roi` and :obj:`bbox` (all candidates are used for
    the sampling budget when :obj:`self.n_sample` is :obj:`None`).
    The RoIs are assigned the ground truth class labels, bounding box
    offsets and scales that match the ground truth bounding boxes, and a
    ground truth mask cropped to each foreground RoI.
    At most :obj:`pos_ratio * n_sample` RoIs are sampled as foregrounds.

    Offsets and scales of bounding boxes are calculated using
    :func:`chainercv.links.model.faster_rcnn.bbox2loc`.
    Types of input arrays and output arrays are the same.

    Here are notations.

    * :math:`S` is the total number of sampled RoIs.
    * :math:`L` is the number of foreground object classes.
    * :math:`H` is the image height.
    * :math:`W` is the image width.
    * :math:`RH` is the mask height.
    * :math:`RW` is the mask width.

    Args:
        roi (array): Region of Interests (RoIs) from which we sample.
            Its shape is :math:`(R, 4)`.
        mask (array): Ground truth masks.
            Its shape is :math:`(R', H, W)`.
        label (array): Ground truth bounding box labels. Its shape
            is :math:`(R',)`. Its range is :math:`[0, L - 1]`.
        bbox (array): The coordinates of ground truth bounding boxes.
            Its shape is :math:`(R', 4)`.
        loc_normalize_mean (tuple of four floats): Mean values to normalize
            coordinates of bounding boxes.
        loc_normalize_std (tuple of four floats): Standard deviation of
            the coordinates of bounding boxes.
        mask_size (tuple of int or int): Generated mask size,
            which is equal to :math:`(RH, RW)`.

    Returns:
        (array, array, array, array):

        * **sample_roi**: Regions of interests that are sampled.
          Its shape is :math:`(S, 4)`.
        * **gt_roi_mask**: Masks assigned to sampled RoIs. Its shape is
          :math:`(S, RH, RW)`. Rows of background RoIs are filled
          with ``-1``.
        * **gt_roi_label**: Labels assigned to sampled RoIs. Its shape is
          :math:`(S,)`. Its range is :math:`[0, L]`. The label with
          value 0 is the background.
        * **gt_roi_loc**: Offsets and scales to match
          the sampled RoIs to the ground truth bounding boxes.
          Its shape is :math:`(S, 4)`.

    """
    xp = cuda.get_array_module(roi)
    roi = cuda.to_cpu(roi)
    mask = cuda.to_cpu(mask)
    label = cuda.to_cpu(label)
    bbox = cuda.to_cpu(bbox)
    if not isinstance(mask_size, tuple):
        mask_size = (mask_size, mask_size)

    # Ground truth boxes are appended so they can be sampled as RoIs too.
    roi = np.concatenate((roi, bbox), axis=0)

    if self.n_sample is None:
        n_sample = roi.shape[0]
    else:
        n_sample = self.n_sample

    pos_roi_per_image = np.round(n_sample * self.pos_ratio)
    iou = bbox_iou(roi, bbox)
    gt_assignment = iou.argmax(axis=1)
    max_iou = iou.max(axis=1)

    # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class].
    # The label with value 0 is the background.
    gt_roi_label = label[gt_assignment] + 1

    # Select foreground RoIs as those with >= pos_iou_thresh IoU.
    pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
    pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
    if pos_index.size > 0:
        pos_index = np.random.choice(
            pos_index, size=pos_roi_per_this_image, replace=False)

    # Select background RoIs as those within
    # [neg_iou_thresh_lo, neg_iou_thresh_hi).
    neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
                         (max_iou >= self.neg_iou_thresh_lo))[0]
    # Use the resolved local budget: self.n_sample may be None.
    neg_roi_per_this_image = n_sample - pos_roi_per_this_image
    neg_roi_per_this_image = int(
        min(neg_roi_per_this_image, neg_index.size))
    if neg_index.size > 0:
        neg_index = np.random.choice(
            neg_index, size=neg_roi_per_this_image, replace=False)

    # The indices that we're selecting (both foreground and background).
    keep_index = np.append(pos_index, neg_index)
    gt_roi_label = gt_roi_label[keep_index]
    gt_roi_label[pos_roi_per_this_image:] = 0  # negative labels --> 0
    sample_roi = roi[keep_index]

    # locs
    # Compute offsets and scales to match sampled RoIs to the GTs.
    loc_normalize_mean = np.array(loc_normalize_mean, np.float32)
    loc_normalize_std = np.array(loc_normalize_std, np.float32)
    gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
    gt_roi_loc = gt_roi_loc - loc_normalize_mean
    gt_roi_loc = gt_roi_loc / loc_normalize_std

    # masks
    # Background rows stay at -1; only foreground rows are overwritten.
    gt_roi_mask = -1 * np.ones(
        (len(keep_index), mask_size[0], mask_size[1]),
        dtype=np.int32)

    # Foreground RoIs occupy the first len(pos_index) rows of sample_roi.
    for i, pos_ind in enumerate(pos_index):
        # np.int was removed from NumPy; use an explicit fixed-width type.
        bb = np.round(sample_roi[i]).astype(np.int32)
        gt_msk = mask[gt_assignment[pos_ind]]
        gt_roi_msk = gt_msk[bb[0]:bb[2], bb[1]:bb[3]]
        gt_roi_msk = resize(
            gt_roi_msk.astype(np.float32)[None], mask_size)[0]
        gt_roi_msk = (gt_roi_msk >= self.binary_thresh).astype(np.int32)
        gt_roi_mask[i] = gt_roi_msk

    if xp is not np:
        sample_roi = cuda.to_gpu(sample_roi)
        gt_roi_mask = cuda.to_gpu(gt_roi_mask)
        gt_roi_label = cuda.to_gpu(gt_roi_label)
        gt_roi_loc = cuda.to_gpu(gt_roi_loc)

    return sample_roi, gt_roi_mask, gt_roi_label, gt_roi_loc
def predict(self, imgs):
    """Segment object instances from images.

    This method predicts instance-aware object regions for each image.

    Args:
        imgs (iterable of numpy.ndarray): Arrays holding images.
            All images are in CHW and RGB format
            and the range of their value is :math:`[0, 255]`.

    Returns:
        tuple of lists:
        This method returns a tuple of three lists,
        :obj:`(masks, labels, scores)`.

        * **masks**: A list of boolean arrays of shape :math:`(R, H, W)`,
          where :math:`R` is the number of masks in an image.
          Each pixel tells whether it is inside the object or not.
        * **labels**: A list of integer arrays of shape :math:`(R,)`.
          Each value indicates the class of the masks.
          Values are in range :math:`[0, L - 1]`, where :math:`L` is the
          number of the foreground classes.
        * **scores**: A list of float arrays of shape :math:`(R,)`.
          Each value indicates how confident the prediction is.

    """
    prepared_imgs = []
    sizes = []
    for img in imgs:
        size = img.shape[1:]
        img = self.prepare(img.astype(np.float32))
        prepared_imgs.append(img)
        sizes.append(size)

    masks = []
    labels = []
    scores = []

    for img, size in zip(prepared_imgs, sizes):
        with chainer.using_config('train', False), \
                chainer.function.no_backprop_mode():
            # inference
            img_var = chainer.Variable(self.xp.array(img[None]))
            scale = img_var.shape[3] / size[1]
            roi_ag_seg_scores, _, roi_cls_scores, bboxes, _ = \
                self.__call__(img_var, scale)

        # We are assuming that batch size is 1.
        roi_ag_seg_score = chainer.cuda.to_cpu(roi_ag_seg_scores.array)
        roi_cls_score = chainer.cuda.to_cpu(roi_cls_scores.array)
        bbox = chainer.cuda.to_cpu(bboxes)

        # filter bounding boxes with min_size
        height = bbox[:, 2] - bbox[:, 0]
        width = bbox[:, 3] - bbox[:, 1]
        keep_indices = np.where(
            (height >= self.min_drop_size) &
            (width >= self.min_drop_size))[0]
        roi_ag_seg_score = roi_ag_seg_score[keep_indices, :, :]
        roi_cls_score = roi_cls_score[keep_indices]
        bbox = bbox[keep_indices, :]

        # scale bbox back to the original image size
        bbox = bbox / scale

        # shape: (n_rois, 4)
        # bbox already lives on the CPU, so clip with NumPy rather than
        # self.xp (which is not NumPy when the model is on a GPU).
        bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, size[0])
        bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, size[1])

        # shape: (n_roi, roi_size, roi_size)
        roi_seg_prob = F.softmax(roi_ag_seg_score).array[:, 1]
        roi_cls_prob = F.softmax(roi_cls_score).array

        roi_seg_prob, bbox, label, roi_cls_prob = mask_voting(
            roi_seg_prob, bbox, roi_cls_prob, size,
            self.score_thresh, self.nms_thresh,
            self.mask_merge_thresh, self.binary_thresh,
            limit=self.limit, bg_label=0)

        # np.bool was removed from NumPy; the builtin bool is equivalent.
        mask = np.zeros(
            (len(roi_seg_prob), size[0], size[1]), dtype=bool)
        for i, (roi_seg_pb, bb) in enumerate(zip(roi_seg_prob, bbox)):
            bb = np.round(bb).astype(np.int32)
            y_min, x_min, y_max, x_max = bb
            # Paste the RoI-sized probability map back at box resolution.
            roi_msk_pb = resize(
                roi_seg_pb.astype(np.float32)[None],
                (y_max - y_min, x_max - x_min))
            roi_msk = (roi_msk_pb > self.binary_thresh)[0]
            mask[i, y_min:y_max, x_min:x_max] = roi_msk

        masks.append(mask)
        labels.append(label)
        scores.append(roi_cls_prob)

    return masks, labels, scores
def mask_voting(
        roi_cmask_prob, bbox, roi_cls_prob, size,
        score_thresh, nms_thresh,
        mask_merge_thresh, binary_thresh,
        limit=100, bg_label=0
):
    """Refine mask probabilities by merging multiple masks.

    First, this function discards invalid masks with non maximum
    suppression. Then, it merges masks with weights calculated from class
    probabilities and IoU. This function improves the mask qualities by
    merging overlapped masks predicted as the same object class.

    Here are notations used.

    * :math:`R` is the number of input RoIs.
    * :math:`N` is the number of merged masks that are returned.
    * :math:`L` is the number of classes excluding the background.
    * :math:`RH` is the height of pooled image.
    * :math:`RW` is the width of pooled image.

    Args:
        roi_cmask_prob (array): A mask probability array whose shape is
            :math:`(R, RH, RW)`.
        bbox (array): A bounding box array whose shape is
            :math:`(R, 4)`.
        roi_cls_prob (array): A class probability array whose shape is
            :math:`(R, L + 1)`.
        size (tuple of int): Original image size.
        score_thresh (float): A threshold value of the class score.
        nms_thresh (float): A threshold value of non maximum suppression.
        mask_merge_thresh (float): A threshold value of the bounding box
            IoU for mask merging.
        binary_thresh (float): A threshold value of mask score
            for mask merging.
        limit (int): The maximum number of outputs.
        bg_label (int): The id of the background label.

    Returns:
        array, array, array, array:

        * **v_cmask_prob**: Merged mask probability. Its shape is
          :math:`(N, RH, RW)`.
        * **v_bbox**: Bounding boxes for the merged masks. Its shape is
          :math:`(N, 4)`.
        * **v_label**: Class labels for the merged masks, counted over the
          foreground classes only. Its shape is :math:`(N, )`.
        * **v_score**: Class probabilities for the merged masks. Its shape
          is :math:`(N, )`.

        All four arrays have a first axis of length zero when there is
        nothing to return.

    """
    roi_cmask_size = roi_cmask_prob.shape[1:]
    n_class = roi_cls_prob.shape[1]
    # Foreground labels in ascending order. The position in this list is
    # the label that gets reported, which equals ``label - 1`` for the
    # default ``bg_label=0``.
    fg_labels = [label for label in range(n_class) if label != bg_label]

    def _empty_result():
        # Labels are integers on the non-empty path, so keep them integer
        # here as well.
        return (
            np.empty((0, roi_cmask_size[0], roi_cmask_size[1]),
                     dtype=np.float32),
            np.empty((0, 4), dtype=np.float32),
            np.empty((0, ), dtype=np.int32),
            np.empty((0, ), dtype=roi_cls_prob.dtype))

    # Nothing to vote on: no RoIs or no foreground class.
    if len(bbox) == 0 or not fg_labels:
        return _empty_result()

    cls_score = []
    cls_bbox = []
    for label in fg_labels:
        # non maximum suppression
        score_l = roi_cls_prob[:, label]
        keep_indices = non_maximum_suppression(bbox, nms_thresh, score_l)
        cls_bbox.append(bbox[keep_indices])
        cls_score.append(score_l[keep_indices])

    all_score = np.concatenate(cls_score)
    if all_score.size == 0:
        return _empty_result()

    # Raise the score threshold so that at most ``limit`` boxes pass it.
    sorted_score = np.sort(all_score)[::-1]
    n_keep = min(len(sorted_score), limit)
    score_thresh = max(sorted_score[n_keep - 1], score_thresh)

    v_cmask_prob = []
    v_bbox = []
    v_label = []
    v_cls_prob = []
    for fg_index, label in enumerate(fg_labels):
        bbox_l = cls_bbox[fg_index]
        score_l = cls_score[fg_index]
        keep_indices = np.where(score_l >= score_thresh)
        bbox_l = bbox_l[keep_indices]
        score_l = score_l[keep_indices]

        v_cmask_prob_l = []
        v_bbox_l = []
        v_score_l = []

        for i, bb in enumerate(bbox_l):
            # Every input RoI that overlaps this box enough votes for it,
            # weighted by its probability for the current class.
            iou = bbox_iou(bbox, bb[np.newaxis, :])
            keep_indices = np.where(iou >= mask_merge_thresh)[0]
            cmask_weight = roi_cls_prob[keep_indices, label]
            cmask_weight = cmask_weight / cmask_weight.sum()
            cmask_prob_i = roi_cmask_prob[keep_indices]
            bbox_i = bbox[keep_indices]
            m_cmask, m_bbox = _mask_aggregation(
                bbox_i, cmask_prob_i, cmask_weight, size, binary_thresh)
            if m_cmask is not None and m_bbox is not None:
                m_cmask = resize(
                    m_cmask.astype(np.float32),
                    roi_cmask_size)
                v_cmask_prob_l.append(m_cmask)
                v_bbox_l.append(m_bbox)
                v_score_l.append(score_l[i])

        if len(v_cmask_prob_l) > 0:
            v_cmask_prob_l = np.concatenate(v_cmask_prob_l)
            v_bbox_l = np.concatenate(v_bbox_l)
            v_score_l = np.array(v_score_l)

            v_label_l = np.repeat(fg_index, v_bbox_l.shape[0])
            v_label_l = v_label_l.astype(np.int32)
            v_cmask_prob.append(v_cmask_prob_l)
            v_bbox.append(v_bbox_l)
            v_label.append(v_label_l)
            v_cls_prob.append(v_score_l)

    if len(v_cmask_prob) == 0:
        return _empty_result()

    v_cmask_prob = np.concatenate(v_cmask_prob)
    v_bbox = np.concatenate(v_bbox)
    v_label = np.concatenate(v_label)
    v_cls_prob = np.concatenate(v_cls_prob)
    return v_cmask_prob, v_bbox, v_label, v_cls_prob