def after_nms(ids_p, class_p, box_p, coef_p, proto_p, img_h, img_w, cfg=None, img_name=None):
    """Turn post-NMS detections into binary masks and integer boxes.

    Args:
        ids_p: Per-detection class ids, or ``None`` when nothing survived NMS.
        class_p: Per-detection scores, compared against ``cfg.visual_thre``.
        box_p: Per-detection boxes. They are multiplied by ``max(img_h, img_w)``,
            so presumably normalised to the padded square image -- TODO confirm.
        coef_p: Per-detection mask coefficients; ``proto_p @ coef_p.t()`` must
            be a valid matmul.
        proto_p: Prototype masks, with the spatial dims first and the prototype
            channel last.
        img_h: Height of the original image in pixels.
        img_w: Width of the original image in pixels.
        cfg: Optional config providing ``visual_thre``, ``save_lincomb`` and
            ``no_crop``. When falsy, no score filtering is done and masks are
            always cropped.
        img_name: Forwarded to ``draw_lincomb`` when ``cfg.save_lincomb`` is set.

    Returns:
        ``(ids_p, class_p, box_p, masks)``, or four ``None`` values when there
        are no detections or none reaches the score threshold. ``box_p`` is an
        int tensor scaled by ``max(img_h, img_w)``; ``masks`` holds 0/1 values
        with shape ``(num_dets, img_h, img_w)``.

    The caller's ``box_p`` tensor is never modified.
    """
    if ids_p is None:
        return None, None, None, None

    if cfg and cfg.visual_thre > 0:
        keep = class_p >= cfg.visual_thre
        if not keep.any():
            return None, None, None, None

        ids_p = ids_p[keep]
        class_p = class_p[keep]
        box_p = box_p[keep]
        coef_p = coef_p[keep]

    if cfg and cfg.save_lincomb:
        draw_lincomb(proto_p, coef_p, img_name)

    # Mask assembly: linear combination of the prototypes, then sigmoid.
    masks = torch.sigmoid(torch.matmul(proto_p, coef_p.t()))

    if not cfg or not cfg.no_crop:
        # Crop masks by box_p.
        masks = crop(masks, box_p)

    # Move the detection axis first so interpolate sees it as channels.
    masks = masks.permute(2, 0, 1).contiguous()

    # Upsample to the square of the longer side, then cut back to the
    # original (img_h, img_w) extent.
    ori_size = max(img_h, img_w)
    # In OpenCV, cv2.resize is `align_corners=False`.
    masks = F.interpolate(masks.unsqueeze(0), (ori_size, ori_size),
                          mode='bilinear', align_corners=False).squeeze(0)
    masks.gt_(0.5)  # Binarize the masks because of interpolation.
    masks = masks[:, 0: img_h, :] if img_h < img_w else masks[:, :, 0: img_w]

    # Out-of-place scaling: an in-place `*=` would silently rescale the
    # caller's tensor whenever the score filter above did not copy it.
    box_p = (box_p * ori_size).int()

    return ids_p, class_p, box_p, masks
def lincomb_mask_loss(self, pos_bool, anchor_max_i, coef_p, proto_p, mask_gt, anchor_max_gt):
    """Compute the prototype linear-combination mask loss over a batch.

    Args:
        pos_bool: Boolean mask selecting the positive anchors of each image.
        anchor_max_i: For each anchor, the index of its matched GT object;
            used to index the last axis of the downsampled GT masks.
        coef_p: Predicted mask coefficients, e.g. shape (n, 19248, 32).
        proto_p: Prototype masks; dims 1 and 2 are taken as (proto_h, proto_w).
        mask_gt: Per-image GT masks, indexed by image; ``mask_gt[i]`` is
            resized to (proto_h, proto_w).
        anchor_max_gt: For each anchor, its matched GT box as four values
            whose columns 2-0 and 3-1 give width and height -- presumably
            (x1, y1, x2, y2), TODO confirm.

    Returns:
        The summed, area-normalised BCE mask loss scaled by
        ``self.cfg.mask_alpha`` and divided by ``proto_h * proto_w`` and the
        number of positive anchors. A tensor equal to 0 when the batch has no
        positive anchors.
    """
    proto_h, proto_w = proto_p.shape[1:3]
    # Clamp the divisor: with no positives the numerator is 0, and 0 / 0
    # would otherwise propagate NaN into the total loss.
    total_pos_num = pos_bool.sum().clamp(min=1)
    loss_m = 0

    for i in range(coef_p.size(0)):  # coef_p.shape: (n, 19248, 32)
        pos_anchor_i = anchor_max_i[i][pos_bool[i]]
        # Nothing to learn from this image; skip before resizing its masks.
        if pos_anchor_i.size(0) == 0:
            continue

        pos_anchor_box = anchor_max_gt[i][pos_bool[i]]
        pos_coef = coef_p[i][pos_bool[i]]

        # Downsample the gt mask to the size of 'proto_p'.
        downsampled_masks = F.interpolate(mask_gt[i].unsqueeze(0), (proto_h, proto_w),
                                          mode='bilinear', align_corners=False).squeeze(0)
        downsampled_masks = downsampled_masks.permute(1, 2, 0).contiguous()  # (138, 138, num_objects)
        # Binarize the gt mask because of the downsample operation.
        downsampled_masks = downsampled_masks.gt(0.5).float()

        # If exceeds the number of masks for training, select a random subset.
        old_num_pos = pos_coef.size(0)
        if old_num_pos > self.cfg.masks_to_train:
            perm = torch.randperm(pos_coef.size(0))
            select = perm[:self.cfg.masks_to_train]
            pos_coef = pos_coef[select]
            pos_anchor_i = pos_anchor_i[select]
            pos_anchor_box = pos_anchor_box[select]

        num_pos = pos_coef.size(0)
        pos_mask_gt = downsampled_masks[:, :, pos_anchor_i]

        # Mask assembly by linear combination; '@' is matrix multiplication.
        mask_p = torch.sigmoid(proto_p[i] @ pos_coef.t())  # mask_p.shape: (138, 138, num_pos)
        mask_p = crop(mask_p, pos_anchor_box)  # pos_anchor_box.shape: (num_pos, 4)
        # TODO: grad out of gt box is 0, should it be modified?
        # TODO: need an upsample before computing loss?
        mask_loss = F.binary_cross_entropy(torch.clamp(mask_p, 0, 1), pos_mask_gt, reduction='none')

        # Normalize the mask loss to emulate roi pooling's effect on loss.
        anchor_area = (pos_anchor_box[:, 2] - pos_anchor_box[:, 0]) * (
            pos_anchor_box[:, 3] - pos_anchor_box[:, 1])
        mask_loss = mask_loss.sum(dim=(0, 1)) / anchor_area

        # Re-weight so a subsampled image contributes as if all of its
        # positives had been used.
        if old_num_pos > num_pos:
            mask_loss *= old_num_pos / num_pos

        loss_m += torch.sum(mask_loss)

    return self.cfg.mask_alpha * loss_m / proto_h / proto_w / total_pos_num
def after_nms(nms_outs, img_h, img_w, show_lincomb=False, crop_masks=True, visual_thre=0, img_name=None):
    """Turn post-NMS detections into binary masks and integer boxes.

    Args:
        nms_outs: Dict with the keys 'class_ids', 'class', 'box', 'coef' and
            'proto', or ``None`` when nothing survived NMS.
        img_h: Target mask height in pixels.
        img_w: Target mask width in pixels.
        show_lincomb: When true, call ``draw_lincomb`` on the prototypes and
            coefficients.
        crop_masks: When true, crop the assembled masks by their boxes.
        visual_thre: When > 0, keep only detections whose 'class' score is
            strictly greater than this value.
        img_name: Forwarded to ``draw_lincomb``.

    Returns:
        ``(class_ids, classes, boxes, masks)``. When there are no detections
        or none passes the threshold, a list of four distinct empty tensors.

    Neither the ``nms_outs`` dict nor its 'box' tensor is modified.
    """
    if nms_outs is None:
        # Four separate tensors: `[t] * 4` would alias a single object.
        return [torch.Tensor() for _ in range(4)]

    if visual_thre > 0:
        keep = nms_outs['class'] > visual_thre

        # Filter into a new dict so the caller's dict stays intact. 'proto'
        # is not per-detection and is passed through unfiltered.
        nms_outs = {k: (v if k == 'proto' else v[keep]) for k, v in nms_outs.items()}

        if nms_outs['class'].size(0) == 0:
            return [torch.Tensor() for _ in range(4)]

    class_ids = nms_outs['class_ids']
    boxes = nms_outs['box']
    classes = nms_outs['class']
    coefs = nms_outs['coef']

    # At this point masks are only the coefficients.
    proto_data = nms_outs['proto']

    if show_lincomb:
        draw_lincomb(proto_data, coefs, img_name)

    masks = torch.sigmoid(torch.matmul(proto_data, coefs.t()))

    # Crop masks by boxes.
    if crop_masks:
        masks = crop(masks, boxes)

    # Move the detection axis first so interpolate sees it as channels.
    masks = masks.permute(2, 0, 1).contiguous()
    masks = F.interpolate(masks.unsqueeze(0), (img_h, img_w),
                          mode='bilinear', align_corners=False).squeeze(0)
    # Binarize the masks.
    masks.gt_(0.5)

    # Work on a copy: the column assignments below write in place and must
    # not leak into the caller's box tensor.
    boxes = boxes.clone()
    # NOTE(review): sanitize_coordinates presumably maps the coordinate pairs
    # to ordered pixel values within the image size -- confirm against helper.
    boxes[:, 0], boxes[:, 2] = sanitize_coordinates(boxes[:, 0], boxes[:, 2], img_w)
    boxes[:, 1], boxes[:, 3] = sanitize_coordinates(boxes[:, 1], boxes[:, 3], img_h)
    boxes = boxes.long()

    return class_ids, classes, boxes, masks
def lincomb_mask_loss(positive_bool, prior_max_index, coef_p, proto_p, mask_gt, prior_max_box):
    """Compute the prototype linear-combination mask loss over a batch.

    Reads ``masks_to_train`` and ``mask_alpha`` from the module-level ``cfg``.

    Args:
        positive_bool: Boolean mask selecting the positive priors of each image.
        prior_max_index: For each prior, the index of its matched GT object;
            used to index the last axis of the downsampled GT masks.
        coef_p: Predicted mask coefficients, e.g. shape (n, 19248, 32).
        proto_p: Prototype masks; dims 1 and 2 are (proto_h, proto_w),
            e.g. 138 x 138.
        mask_gt: Per-image GT masks, indexed by image; ``mask_gt[i]`` is
            resized to (proto_h, proto_w).
        prior_max_box: For each prior, its matched GT box (four values); it is
            passed to ``center_size`` to obtain width and height.

    Returns:
        The summed, size-normalised BCE mask loss scaled by
        ``cfg.mask_alpha / proto_h / proto_w``. When no image has a positive
        prior this is the plain Python number ``0.0`` rather than a tensor.
    """
    proto_h = proto_p.size(1)  # 138
    proto_w = proto_p.size(2)  # 138

    loss_m = 0
    for i in range(coef_p.size(0)):  # coef_p.shape: (n, 19248, 32)
        pos_prior_index = prior_max_index[i, positive_bool[i]]  # pos_prior_index.shape: [num_positives]
        # Nothing to learn from this image; skip before resizing its masks.
        if pos_prior_index.size(0) == 0:
            continue

        pos_prior_box = prior_max_box[i, positive_bool[i]]
        pos_coef = coef_p[i, positive_bool[i]]

        with torch.no_grad():
            # Downsample the gt mask to the size of 'proto_p'.
            downsampled_masks = F.interpolate(mask_gt[i].unsqueeze(0), (proto_h, proto_w),
                                              mode='bilinear', align_corners=False).squeeze(0)
            downsampled_masks = downsampled_masks.permute(1, 2, 0).contiguous()  # (138, 138, num_objects)
            # Binarize the gt mask because of the downsample operation.
            downsampled_masks = downsampled_masks.gt(0.5).float()

        # If exceeds the number of masks for training, select a random subset.
        old_num_pos = pos_coef.size(0)
        if old_num_pos > cfg.masks_to_train:
            perm = torch.randperm(pos_coef.size(0))
            select = perm[:cfg.masks_to_train]
            pos_coef = pos_coef[select]
            pos_prior_index = pos_prior_index[select]
            pos_prior_box = pos_prior_box[select]

        num_pos = pos_coef.size(0)
        pos_mask_gt = downsampled_masks[:, :, pos_prior_index]

        # Mask assembly by linear combination; '@' is matrix multiplication.
        mask_p = torch.sigmoid(proto_p[i] @ pos_coef.t())  # mask_p.shape: (138, 138, num_pos)
        mask_p = crop(mask_p, pos_prior_box)  # pos_prior_box.shape: (num_pos, 4)

        mask_loss = F.binary_cross_entropy(torch.clamp(mask_p, 0, 1), pos_mask_gt, reduction='none')

        # Normalize the mask loss to emulate roi pooling's effect on loss.
        pos_get_csize = center_size(pos_prior_box)
        mask_loss = mask_loss.sum(dim=(0, 1)) / pos_get_csize[:, 2] / pos_get_csize[:, 3]

        # Re-weight so a subsampled image contributes as if all of its
        # positives had been used.
        if old_num_pos > num_pos:
            mask_loss *= old_num_pos / num_pos

        loss_m += torch.sum(mask_loss)

    loss_m *= cfg.mask_alpha / proto_h / proto_w

    return loss_m
def after_nms(nms_outs, img_h, img_w, cfg=None, img_name=None):
    """Turn post-NMS detections into binary masks and integer boxes.

    Args:
        nms_outs: Dict with the keys 'class_ids', 'class', 'box', 'coef' and
            'proto', or ``None`` when nothing survived NMS.
        img_h: Target mask height in pixels.
        img_w: Target mask width in pixels.
        cfg: Optional config providing ``visual_thre``, ``save_lincomb`` and
            ``no_crop``. When falsy, no score filtering is done and masks are
            always cropped.
        img_name: Forwarded to ``draw_lincomb`` when ``cfg.save_lincomb`` is set.

    Returns:
        ``(class_ids, classes, boxes, masks)``. When there are no detections
        or none reaches the threshold, a list of four distinct empty tensors.

    Neither the ``nms_outs`` dict nor its 'box' tensor is modified.
    """
    if nms_outs is None:
        # Four separate tensors: `[t] * 4` would alias a single object.
        return [torch.Tensor() for _ in range(4)]

    if cfg and cfg.visual_thre > 0:
        keep = nms_outs['class'] >= cfg.visual_thre

        # Filter into a new dict so the caller's dict stays intact. 'proto'
        # is not per-detection and is passed through unfiltered.
        nms_outs = {k: (v if k == 'proto' else v[keep]) for k, v in nms_outs.items()}

        if nms_outs['class'].size(0) == 0:
            return [torch.Tensor() for _ in range(4)]

    class_ids = nms_outs['class_ids']
    boxes = nms_outs['box']
    classes = nms_outs['class']
    coefs = nms_outs['coef']
    proto_data = nms_outs['proto']

    if cfg and cfg.save_lincomb:
        draw_lincomb(proto_data, coefs, img_name)

    # Mask assembly: linear combination of the prototypes, then sigmoid.
    masks = torch.sigmoid(torch.matmul(proto_data, coefs.t()))

    if not cfg or not cfg.no_crop:
        # Crop masks by boxes.
        masks = crop(masks, boxes)

    # Move the detection axis first so interpolate sees it as channels.
    masks = masks.permute(2, 0, 1).contiguous()
    masks = F.interpolate(masks.unsqueeze(0), (img_h, img_w),
                          mode='bilinear', align_corners=False).squeeze(0)
    masks.gt_(0.5)  # Binarize the masks.

    # Work on a copy: the column assignments below write in place and must
    # not leak into the caller's box tensor.
    boxes = boxes.clone()
    # NOTE(review): sanitize_coordinates presumably maps the coordinate pairs
    # to ordered pixel values within the image size -- confirm against helper.
    boxes[:, 0], boxes[:, 2] = sanitize_coordinates(boxes[:, 0], boxes[:, 2], img_w)
    boxes[:, 1], boxes[:, 3] = sanitize_coordinates(boxes[:, 1], boxes[:, 3], img_h)
    boxes = boxes.long()

    return class_ids, classes, boxes, masks