def __call__(self, image, target):
    """Randomly crop ``image`` and remap the ground-truth boxes in ``target``.

    A sampling option is drawn from ``self.sample_options`` on every pass of
    the outer loop. ``None`` means "do not crop" and returns the inputs
    untouched; otherwise the option is a ``(min_iou, max_iou)`` pair where
    either bound may be ``None`` (unbounded). Up to five candidate windows
    are tried per option; when none is accepted a new option is drawn, so the
    method only returns once it has a valid result.

    Args:
        image: object exposing ``crop((left, top, right, bottom))`` in pixel
            units (presumably a PIL image -- confirm against the caller).
        target: dict with keys ``'boxes'`` (``[num_boxes, 4]`` tensor of
            left/top/right/bottom, expected to be normalized to 0-1),
            ``'labels'`` and ``'height_width'`` (unpacked as height, width).

    Returns:
        ``(image, target)``. After a crop, ``target['boxes']`` and
        ``target['labels']`` keep only boxes whose center lies inside the
        window, with coordinates re-normalized to the window.

    Note:
        The tensor originally stored in ``target['boxes']`` is clamped in
        place before the filtered copy replaces it.
    """
    # Keep drawing options until one of them produces a result.
    while True:
        option = random.choice(self.sample_options)
        if option is None:
            # "No crop" option: hand the inputs back unchanged.
            return image, target

        htot, wtot = target['height_width']

        lower_iou, upper_iou = option
        if lower_iou is None:
            lower_iou = float('-inf')
        if upper_iou is None:
            upper_iou = float('+inf')

        # At most five candidate windows per sampled option.
        for _ in range(5):
            # Relative window size; 0.3 * 0.3 is roughly 0.1 of the area.
            w = random.uniform(0.3, 1.0)
            h = random.uniform(0.3, 1.0)

            # Only accept aspect ratios within [0.5, 2].
            aspect = w / h
            if not 0.5 <= aspect <= 2:
                continue

            # Place the window inside the unit square.
            left = random.uniform(0, 1.0 - w)
            top = random.uniform(0, 1.0 - h)
            right = left + w
            bottom = top + h

            # Box coordinates live in the same 0-1 space as the window.
            bboxes = target["boxes"]
            window = torch.tensor([[left, top, right, bottom]])
            ious = calc_iou_tensor(bboxes, window)

            # Every box must have an IoU strictly inside the sampled range.
            iou_ok = (ious > lower_iou) & (ious < upper_iou)
            if not iou_ok.all():
                continue

            # Boxes are kept only if their center falls inside the window.
            xc = 0.5 * (bboxes[:, 0] + bboxes[:, 2])
            yc = 0.5 * (bboxes[:, 1] + bboxes[:, 3])
            masks = (xc > left) & (xc < right) & (yc > top) & (yc < bottom)

            # No box center inside the window: try another candidate.
            if not masks.any():
                continue

            # Clamp box edges to the window (in place on the original
            # tensor) so no coordinate ends up outside the crop.
            for col, bound in ((0, left), (1, top)):
                too_small = bboxes[:, col] < bound
                bboxes[too_small, col] = bound
            for col, bound in ((2, right), (3, bottom)):
                too_large = bboxes[:, col] > bound
                bboxes[too_large, col] = bound

            # Drop boxes (and their labels) whose center is outside.
            bboxes = bboxes[masks, :]
            labels = target['labels']
            labels = labels[masks]

            # Convert the relative window to pixel indices and crop.
            crop_box = (
                int(left * wtot),
                int(top * htot),
                int(right * wtot),
                int(bottom * htot),
            )
            image = image.crop(crop_box)

            # Re-express the kept boxes relative to the cropped window.
            rescale = ((left, w), (top, h), (left, w), (top, h))
            for col, (offset, extent) in enumerate(rescale):
                bboxes[:, col] = (bboxes[:, col] - offset) / extent

            # Publish the updated boxes and labels.
            target['boxes'] = bboxes
            target['labels'] = labels

            return image, target
def forward(self, ploc, plabel, gloc, glabel):
    # type: (Tensor, Tensor, Tensor, Tensor) -> Tensor
    """Compute the SSD loss with an additional IoU-based localization term.

    Args:
        ploc: predicted location regression parameters, ``[N, 4, num_dboxes]``.
        plabel: predicted class scores, ``[N, label_num, num_dboxes]``.
        gloc: ground-truth boxes assigned to each default box,
            ``[N, 4, num_dboxes]`` (passed to ``self._xywh2ltrb`` and
            ``self._location_vec``, so presumably in xywh form -- confirm).
        glabel: ground-truth label per default box, ``[N, num_dboxes]``;
            values ``> 0`` mark positive default boxes.

    Returns:
        Scalar tensor: per-image ``(localization + confidence)`` loss divided
        by the number of positives, averaged over the batch. Images without
        any positive default box contribute zero.

    Note:
        The batch size, the number of default boxes and the device are taken
        from the inputs, and ``ploc`` is not modified.
    """
    batch_size = glabel.size(0)
    num_dboxes = glabel.size(1)
    device = ploc.device

    # Per default box: the largest IoU between the decoded prediction and any
    # positive GT box of the image, and the IoU with its own assigned GT box.
    dbox_ious_max = torch.zeros((batch_size, num_dboxes), device=device)
    dbox_ious_new = torch.zeros((batch_size, num_dboxes), device=device)

    # Positive-sample mask, [N, num_dboxes].
    mask = glabel > 0

    # Decode the predictions into boxes and bring both sides to ltrb form.
    ploc_cxcy = self._location_vec_inverse(ploc)
    ploc_ltrb = self._xywh2ltrb(ploc_cxcy)
    gloc_ltrb = self._xywh2ltrb(gloc)

    for kk in range(batch_size):
        pos_mask = glabel[kk, :] > 0
        gt_boxes = gloc_ltrb[kk, :, pos_mask].transpose(0, 1)  # [npos, 4]
        pred_boxes = ploc_ltrb[kk, :, :].transpose(0, 1)  # [num_dboxes, 4]

        # Reducing over an empty dimension raises, so an image without
        # positives keeps its all-zero row (no GT box to overlap with).
        if gt_boxes.size(0) > 0:
            ious_pbox_gt = calc_iou_tensor(gt_boxes, pred_boxes)  # [npos, num_dboxes]
            # Best IoU over the GT boxes for every predicted box.
            best_truth_ious, _ = ious_pbox_gt.max(dim=0)
            dbox_ious_max[kk, :] = best_truth_ious

        # IoU of every predicted box with the GT box assigned to the same
        # default box (presumably an element-wise pairing -- see
        # calc_iou_tensor_diag).
        dbox_ious_new[kk, :] = calc_iou_tensor_diag(
            gloc_ltrb[kk, :, :].permute(1, 0), pred_boxes)

    iou_loss = 1 - dbox_ious_new

    # Number of positive default boxes per image, [N].
    pos_num = mask.sum(dim=1)

    # Ground-truth regression parameters, [N, 4, num_dboxes].
    vec_gd = self._location_vec(gloc)

    # Weight for the xy regression terms, treated as a constant (no gradient).
    # The decoded predictions computed above are reused here.
    with torch.no_grad():
        xy_weight = 2 * self.dboxes[:, 2:, :] / (
            ploc_cxcy[:, 2:, :] + gloc[:, 2:, :]) / self.scale_xy

    # Out-of-place products keep the caller's ``ploc`` untouched.
    gd_xy = vec_gd[:, :2, :] * xy_weight
    pred_xy = ploc[:, :2, :] * xy_weight

    # Localization loss: IoU term plus the weighted xy regression term,
    # summed over the coordinates and kept for positive boxes only.
    loc_loss = 2 * (iou_loss + self.location_loss(pred_xy, gd_xy).sum(dim=1))  # [N, num_dboxes]
    loc_loss = (mask.float() * loc_loss).sum(dim=1)  # [N]

    # Confidence loss used for the positive samples.
    con = self.cross_entropy_Iou(plabel, glabel, dbox_ious_new.unsqueeze(dim=1),
                                 isPositive=True)

    # Hard negative mining: positives are zeroed so they are never picked.
    con_neg = self.cross_entropy_Iou(plabel, glabel, dbox_ious_max.unsqueeze(dim=1),
                                     isPositive=False)
    con_neg[mask] = torch.tensor(0.0)

    # Sorting the sort indices yields the rank of every default box in the
    # descending order of its negative confidence loss.
    _, con_idx = con_neg.sort(dim=1, descending=True)
    _, con_rank = con_idx.sort(dim=1)

    # Use three negatives per positive, capped at the number of default boxes.
    neg_num = torch.clamp(3 * pos_num, max=mask.size(1)).unsqueeze(-1)
    neg_mask = con_rank < neg_num  # [N, num_dboxes]

    # Final confidence loss: selected positives plus selected negatives.
    con_loss = (con * mask.float() + con_neg * neg_mask.float()).sum(dim=1)  # [N]

    total_loss = loc_loss + con_loss

    # Images without any positive box are masked out of the result, and the
    # divisor is clamped to avoid a division by zero.
    num_mask = (pos_num > 0).float()
    pos_num = pos_num.float().clamp(min=1e-6)
    ret = (total_loss * num_mask / pos_num).mean(dim=0)
    return ret