def detect_loss(self, cls_score, rois_label, bbox_pred, rois_target, rois_inside_ws, rois_outside_ws):
    """Compute the detection-head losses for a batch of RoIs.

    Args:
        cls_score: class logits, shape [N, 2].
        rois_label: per-RoI class targets, shape [N].
        bbox_pred: predicted box regression deltas.
        rois_target: regression targets for the deltas.
        rois_inside_ws: inside weights applied elementwise in the smooth-L1 loss.
        rois_outside_ws: outside weights applied elementwise in the smooth-L1 loss.

    Returns:
        Tuple of (classification loss, bounding-box regression loss).
    """
    # Cross-entropy over the class logits.
    loss_cls = F.cross_entropy(cls_score, rois_label)
    # Weighted smooth-L1 loss on the box deltas.
    loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)
    return loss_cls, loss_bbox
def forward(self, base_feat, im_info, gt_boxes, num_boxes):
    """Region Proposal Network forward pass.

    Args:
        base_feat: backbone feature map, shape (batch, C, H, W).
        im_info: per-image metadata forwarded to the proposal layer.
        gt_boxes: ground-truth boxes; required when self.training is True.
        num_boxes: number of valid ground-truth boxes per image.

    Returns:
        (rois, rpn_loss_cls, rpn_loss_box); both losses are 0 in eval mode.
    """
    batch_size = base_feat.size(0)

    # Shared conv + ReLU over the backbone feature map.
    rpn_conv1 = F.relu(self.RPN_Conv(base_feat), inplace=True)

    # Objectness scores: fold to (b, 2, ...) so softmax runs over the 2-way
    # fg/bg axis, then restore the (b, nc_score_out, H, W) layout.
    rpn_cls_score = self.RPN_cls_score(rpn_conv1)
    rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2)
    # FIX: dim=1 made explicit — implicit-dim softmax is deprecated; for this
    # 4-D tensor the legacy behavior was dim 1, so results are unchanged.
    # (Matches the explicit dim already used by the other RPN in this file.)
    rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, 1)
    rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out)

    # Anchor box regression deltas.
    rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1)

    # Proposal layer: scores + deltas -> RoIs.
    cfg_key = 'TRAIN' if self.training else 'TEST'
    rois = self.RPN_proposal((rpn_cls_prob.data, rpn_bbox_pred.data, im_info, cfg_key))

    self.rpn_loss_cls = 0
    self.rpn_loss_box = 0

    # Generate training labels and build the RPN losses.
    if self.training:
        assert gt_boxes is not None
        rpn_data = self.RPN_anchor_target((rpn_cls_score.data, gt_boxes, im_info, num_boxes))

        # Classification loss, restricted to anchors whose label is not -1
        # (the -1 label marks ignored anchors).
        rpn_cls_score = rpn_cls_score_reshape.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2)
        rpn_label = rpn_data[0].view(batch_size, -1)
        rpn_keep = Variable(rpn_label.view(-1).ne(-1).nonzero().view(-1))
        rpn_cls_score = torch.index_select(rpn_cls_score.view(-1, 2), 0, rpn_keep)
        rpn_label = torch.index_select(rpn_label.view(-1), 0, rpn_keep.data)
        rpn_label = Variable(rpn_label.long())
        self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, rpn_label)
        fg_cnt = torch.sum(rpn_label.data.ne(0))  # foreground count (diagnostic only)

        rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[1:]

        # Regression loss; the inside/outside weights zero out anchors that
        # should not contribute.
        rpn_bbox_inside_weights = Variable(rpn_bbox_inside_weights)
        rpn_bbox_outside_weights = Variable(rpn_bbox_outside_weights)
        rpn_bbox_targets = Variable(rpn_bbox_targets)
        self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights,
                                            rpn_bbox_outside_weights, sigma=3, dim=[1, 2, 3])

    return rois, self.rpn_loss_cls, self.rpn_loss_box
def forward(self, im_data, im_info, gt_boxes, num_boxes):
    """Faster R-CNN forward pass over a single backbone feature map.

    Args:
        im_data: batched input images.
        im_info: per-image metadata passed to the RPN.
        gt_boxes: padded ground-truth boxes (used only when training).
        num_boxes: number of valid ground-truth boxes per image.

    Returns:
        (rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox,
        RCNN_loss_cls, RCNN_loss_bbox, rois_label); the losses are 0 and
        rois_label is None when not training.
    """
    batch_size = im_data.size(0)

    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # Backbone feature map.
    base_feat = self.RCNN_base(im_data)

    # RPN: proposals plus (in training) RPN losses.
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat, im_info, gt_boxes, num_boxes)

    # In training, resample the proposals against the ground truth to get
    # per-RoI labels and regression targets.
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    rois = Variable(rois)  # (batch_size, n_rois, 5)

    # RoI feature extraction, selected by configuration.
    if cfg.POOLING_MODE == 'crop':
        grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:], self.grid_size)
        grid_yx = torch.stack([grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]], 3).contiguous()
        pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
    elif cfg.POOLING_MODE == 'align':
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

    # Feed pooled features to the top model:
    # (batch*n_rois, C, 7, 7) -> (batch*n_rois, C').
    pooled_feat = self._head_to_tail(pooled_feat)

    # Per-class box deltas.
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # Keep only the 4 deltas belonging to each RoI's target class.
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

    # Object classification probabilities.
    cls_score = self.RCNN_cls_score(pooled_feat)
    # FIX: dim=1 made explicit — implicit-dim softmax is deprecated; for this
    # 2-D [N, n_classes] tensor the legacy behavior was dim 1, so results are
    # unchanged.
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0
    if self.training:
        # Classification loss.
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        # Bounding-box regression smooth-L1 loss.
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label
def forward(self, im_data, im_info, gt_boxes, num_boxes):
    """Three-stage cascade detection head on top of an FPN backbone.

    Stage 1 classifies/regresses the RPN proposals; stages 2 and 3 each
    decode the previous stage's boxes (``bbox_decode``), resample targets
    (``RCNN_proposal_target`` with ``stage=2``/``stage=3``), and refine them
    with their own heads. At test time the returned class probability is the
    average of all three stage classifiers evaluated on the stage-3 RoI
    features.

    Args:
        im_data: batched input images.
        im_info: per-image metadata (used by the RPN and RoI feature pooling).
        gt_boxes: padded ground-truth boxes (training only).
        num_boxes: number of valid ground-truth boxes per image.

    Returns:
        (rois, cls_prob_3rd_avg, bbox_pred_3rd, rpn_loss_cls, rpn_loss_bbox,
        RCNN_loss_cls, RCNN_loss_bbox, RCNN_loss_cls_2nd, RCNN_loss_bbox_2nd,
        RCNN_loss_cls_3rd, RCNN_loss_bbox_3rd, rois_label); all losses are 0
        and rois_label is None when not training.
    """
    batch_size = im_data.size(0)
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # Bottom-up pathway of the FPN backbone.
    c1 = self.RCNN_layer0(im_data)
    c2 = self.RCNN_layer1(c1)
    c3 = self.RCNN_layer2(c2)
    c4 = self.RCNN_layer3(c3)
    c5 = self.RCNN_layer4(c4)
    c6 = self.RCNN_layer5(c5)
    # Top-down pathway with lateral connections.
    # NOTE(review): p5/p4 are built by plain addition while p3/p2 go through
    # _upsample_add — this only works if those scales already match; confirm
    # against the layer definitions.
    p6 = self.RCNN_toplayer(c6)
    p5 = self.RCNN_latlayer1(c5) + p6
    p4 = self.RCNN_latlayer2(c4) + p5
    p3 = self._upsample_add(p4, self.RCNN_latlayer3(c3))
    p3 = self.RCNN_smooth1(p3)
    p2 = self._upsample_add(p3, self.RCNN_latlayer4(c2))
    p2 = self.RCNN_smooth2(p2)

    rpn_feature_maps = [p2, p3, p4, p5, p6]
    mrcnn_feature_maps = [p2, p3, p4, p5]

    # RPN proposals (plus RPN losses when training).
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(
        rpn_feature_maps, im_info, gt_boxes, num_boxes)

    # ---------------- stage 1 ----------------
    if self.training:
        # Sample RoIs against ground truth for the first head.
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, gt_assign, rois_target, rois_inside_ws, rois_outside_ws = roi_data
        ## NOTE: additionally, normalize proposals to range [0, 1],
        # this is necessary so that the following roi pooling
        # is correct on different feature maps
        # rois[:, :, 1::2] /= im_info[0][1]
        # rois[:, :, 2::2] /= im_info[0][0]
        rois = rois.view(-1, 5)
        rois_label = rois_label.view(-1).long()
        gt_assign = gt_assign.view(-1).long()
        pos_id = rois_label.nonzero().squeeze()  # indices of positive RoIs
        gt_assign_pos = gt_assign[pos_id]
        rois_label_pos = rois_label[pos_id]
        rois_label_pos_ids = pos_id
        rois_pos = Variable(rois[pos_id])
        rois = Variable(rois)
        rois_label = Variable(rois_label)
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        # At test time all RoIs are treated as "positive" (identity index set).
        rois_label = None
        gt_assign = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0
        rois = rois.view(-1, 5)
        pos_id = torch.arange(0, rois.size(0)).long().type_as(rois).long()
        rois_label_pos_ids = pos_id
        rois_pos = Variable(rois[pos_id])
        rois = Variable(rois)

    # Pool per-RoI features from the appropriate pyramid level (14x14 map).
    roi_pool_feat = self._PyramidRoI_Feat(mrcnn_feature_maps, rois, im_info)
    # Feed pooled features to the stage-1 top model.
    pooled_feat = self._head_to_tail(roi_pool_feat)
    # Stage-1 box deltas.
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # Keep only the 4 deltas belonging to each RoI's target class.
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.long().view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)
    # Stage-1 classification logits (probabilities are not needed here).
    cls_score = self.RCNN_cls_score(pooled_feat)
    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0
    if self.training:
        # Cross-entropy for classification, smooth-L1 for regression.
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)
    rois = rois.view(batch_size, -1, rois.size(1))
    bbox_pred = bbox_pred.view(batch_size, -1, bbox_pred.size(1))
    if self.training:
        rois_label = rois_label.view(batch_size, -1)

    # ---------------- stage 2 ----------------
    # Decode stage-1 refinements into absolute boxes for the next stage.
    rois = bbox_decode(rois, bbox_pred, batch_size, self.class_agnostic,
                       self.n_classes, im_info, self.training)
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes, stage=2)
        rois, rois_label, gt_assign, rois_target, rois_inside_ws, rois_outside_ws = roi_data
        rois = rois.view(-1, 5)
        rois_label = rois_label.view(-1).long()
        gt_assign = gt_assign.view(-1).long()
        pos_id = rois_label.nonzero().squeeze()
        gt_assign_pos = gt_assign[pos_id]
        rois_label_pos = rois_label[pos_id]
        rois_label_pos_ids = pos_id
        rois_pos = Variable(rois[pos_id])
        rois = Variable(rois)
        rois_label = Variable(rois_label)
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        gt_assign = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0
        rois = rois.view(-1, 5)
        pos_id = torch.arange(0, rois.size(0)).long().type_as(rois).long()
        rois_label_pos_ids = pos_id
        rois_pos = Variable(rois[pos_id])
        rois = Variable(rois)
    roi_pool_feat = self._PyramidRoI_Feat(mrcnn_feature_maps, rois, im_info)
    # Stage-2 head and predictors.
    pooled_feat = self._head_to_tail_2nd(roi_pool_feat)
    bbox_pred = self.RCNN_bbox_pred_2nd(pooled_feat)
    if self.training and not self.class_agnostic:
        # Select the per-target-class deltas, as in stage 1.
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.long().view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)
    cls_score = self.RCNN_cls_score_2nd(pooled_feat)
    RCNN_loss_cls_2nd = 0
    RCNN_loss_bbox_2nd = 0
    if self.training:
        RCNN_loss_cls_2nd = F.cross_entropy(cls_score, rois_label)
        RCNN_loss_bbox_2nd = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)
    rois = rois.view(batch_size, -1, rois.size(1))
    bbox_pred_2nd = bbox_pred.view(batch_size, -1, bbox_pred.size(1))
    if self.training:
        rois_label = rois_label.view(batch_size, -1)

    # ---------------- stage 3 ----------------
    rois = bbox_decode(rois, bbox_pred_2nd, batch_size, self.class_agnostic,
                       self.n_classes, im_info, self.training)
    # If training, resample once more against the ground truth.
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes, stage=3)
        rois, rois_label, gt_assign, rois_target, rois_inside_ws, rois_outside_ws = roi_data
        rois = rois.view(-1, 5)
        rois_label = rois_label.view(-1).long()
        gt_assign = gt_assign.view(-1).long()
        pos_id = rois_label.nonzero().squeeze()
        gt_assign_pos = gt_assign[pos_id]
        rois_label_pos = rois_label[pos_id]
        rois_label_pos_ids = pos_id
        rois_pos = Variable(rois[pos_id])
        rois = Variable(rois)
        rois_label = Variable(rois_label)
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        gt_assign = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0
        rois = rois.view(-1, 5)
        pos_id = torch.arange(0, rois.size(0)).long().type_as(rois).long()
        rois_label_pos_ids = pos_id
        rois_pos = Variable(rois[pos_id])
        rois = Variable(rois)
    roi_pool_feat = self._PyramidRoI_Feat(mrcnn_feature_maps, rois, im_info)
    # Stage-3 head and predictors.
    pooled_feat = self._head_to_tail_3rd(roi_pool_feat)
    bbox_pred = self.RCNN_bbox_pred_3rd(pooled_feat)
    if self.training and not self.class_agnostic:
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.long().view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)
    cls_score = self.RCNN_cls_score_3rd(pooled_feat)
    # NOTE(review): implicit-dim softmax is deprecated; legacy behavior for
    # this 2-D tensor is dim 1.
    cls_prob_3rd = F.softmax(cls_score)
    RCNN_loss_cls_3rd = 0
    RCNN_loss_bbox_3rd = 0
    if self.training:
        RCNN_loss_cls_3rd = F.cross_entropy(cls_score, rois_label)
        RCNN_loss_bbox_3rd = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)
    rois = rois.view(batch_size, -1, rois.size(1))
    cls_prob_3rd = cls_prob_3rd.view(batch_size, -1, cls_prob_3rd.size(1))
    bbox_pred_3rd = bbox_pred.view(batch_size, -1, bbox_pred.size(1))
    if self.training:
        rois_label = rois_label.view(batch_size, -1)

    if not self.training:
        # Test-time ensembling: run the stage-1 and stage-2 classifiers on the
        # stage-3 RoI features and average the three probability maps.
        pooled_feat_1st_3rd = self._head_to_tail(roi_pool_feat)
        cls_score_1st_3rd = self.RCNN_cls_score(pooled_feat_1st_3rd)
        cls_prob_1st_3rd = F.softmax(cls_score_1st_3rd)
        cls_prob_1st_3rd = cls_prob_1st_3rd.view(batch_size, -1, cls_prob_1st_3rd.size(1))
        pooled_feat_2nd_3rd = self._head_to_tail_2nd(roi_pool_feat)
        cls_score_2nd_3rd = self.RCNN_cls_score_2nd(pooled_feat_2nd_3rd)
        cls_prob_2nd_3rd = F.softmax(cls_score_2nd_3rd)
        cls_prob_2nd_3rd = cls_prob_2nd_3rd.view(batch_size, -1, cls_prob_2nd_3rd.size(1))
        cls_prob_3rd_avg = (cls_prob_1st_3rd + cls_prob_2nd_3rd + cls_prob_3rd) / 3
    else:
        cls_prob_3rd_avg = cls_prob_3rd

    return rois, cls_prob_3rd_avg, bbox_pred_3rd, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, RCNN_loss_cls_2nd, RCNN_loss_bbox_2nd, RCNN_loss_cls_3rd, RCNN_loss_bbox_3rd, rois_label
def forward(self, base_feat, im_info, gt_boxes, num_boxes):
    '''
    :param base_feat: shape=(b,1024,w,h), output of the feature extractor
    :param im_info: shape=(b,3), 3=[W,H,scale] (the exact meaning of the last
        value, e.g. 2.2901, is not fully clear — presumably the resize scale;
        TODO confirm)
    :param gt_boxes: shape=(b,20,5); 20 is a fixed padded maximum, not every
        image actually has 20 targets
    :param num_boxes: shape=(b); number of valid ground-truth boxes per image
    :return: (rois, rpn_loss_cls, rpn_loss_box)
    '''
    batch_size = base_feat.size(0)
    # Shared conv + ReLU: shape=(b,512,w,h).
    rpn_conv1 = F.relu(self.RPN_Conv(base_feat), inplace=True)
    # Objectness scores: shape=(b,2*9,w,h), i.e. fg/bg prediction for the
    # 9 anchors at every cell.
    rpn_cls_score = self.RPN_cls_score(rpn_conv1)
    rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2)  # (b,2,9*w,h)
    rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, 1)  # (b,2,9*w,h)
    rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out)  # (b,2*9,w,h)
    # Anchor regression deltas: shape=(b,4*9,w,h).
    rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1)
    # proposal layer
    cfg_key = 'TRAIN' if self.training else 'TEST'
    '''
    From the backbone features, two sibling convs predict every anchor's
    fg/bg score (shape=(b,2*9,w,h)) and its regression deltas
    (shape=(b,4*9,w,h)).
    '''
    rois = self.RPN_proposal(
        (rpn_cls_prob.data, rpn_bbox_pred.data, im_info, cfg_key))
    # rois shape=(b,2000,5); 5: [index of the image this box belongs to
    # (0..batch_size-1), x1, y1, x2, y2]
    '''
    How the rois are produced:
    1. the RPN predicts a score and regression deltas for every anchor
    2. RPN_proposal first shifts the initial anchors by the predicted deltas,
       takes the top 12000 shifted anchors by foreground score, runs NMS on
       them, and keeps the surviving boxes (with their coordinates and
       foreground scores)
    3. from the NMS survivors, the top 2000 by score become the proposals —
       the final output of the RPN (2000 rois)
    '''
    self.rpn_loss_cls = 0
    self.rpn_loss_box = 0
    # generating training labels and build the rpn loss
    if self.training:
        assert gt_boxes is not None
        rpn_data = self.RPN_anchor_target(
            (rpn_cls_score.data, gt_boxes, im_info, num_boxes))
        # rpn_cls_score.shape=(b,2*9,w,h): fg/bg prediction per anchor
        # gt_boxes: (b,20,5) padded ground truth; im_info: (b,3);
        # num_boxes: (b) valid gt count per image
        '''
        rpn_data = [
            labels.shape=(b,1,9*h,w)            label per anchor: 1 positive, 0 negative, -1 ignore
            bbox_targets.shape=(b,9*4,h,w)      regression target per anchor
            bbox_inside_weights.shape=(b,4*9,h,w)
            bbox_outside_weights.shape=(b,4*9,h,w)
        ]
        RPN_anchor_target matches every ORIGINAL anchor (9*w*h of them, not
        the prediction-shifted ones) to a label (1 positive / 0 negative /
        -1 ignore) and a regression target, and already handles an excess of
        positives/negatives by marking some as -1.
        '''
        # compute classification loss
        # rpn_cls_score_reshape.shape=(b,2,9*w,h)
        rpn_cls_score = rpn_cls_score_reshape.permute(
            0, 2, 3, 1).contiguous().view(
                batch_size, -1, 2)  # (b,9*w*h,2): fg/bg prediction per anchor
        rpn_label = rpn_data[0].view(
            batch_size, -1)  # (b,9*h*w): label of every anchor
        rpn_keep = Variable(
            rpn_label.view(-1).ne(-1).nonzero().view(-1))
        '''
        e.g. rpn_label = [[ 1,  1, 0,  1, -1, 0, -1, 1, 0],
                          [ 0,  0, 0, -1,  1, 1, -1, 0, 0]]
             rpn_keep  = [0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 13, 14, 16, 17]
        i.e. flatten the labels and keep the indices whose value is not -1 —
        the indices of the sampled positive and negative anchors.
        '''
        # Select the predicted scores and labels of the kept anchors; only
        # the sampled positives/negatives contribute to the fg/bg loss.
        rpn_cls_score = torch.index_select(rpn_cls_score.view(-1, 2), 0,
                                           rpn_keep)
        # rpn_cls_score.view(-1, 2).shape=(b*9*h*w,2); if rpn_keep.shape=(k)
        # (k = total positives + negatives) the result has shape (k,2).
        rpn_label = torch.index_select(rpn_label.view(-1), 0, rpn_keep.data)
        # rpn_label.shape=(k): labels of the kept anchors, e.g. [1,1,0,1,0,0,...]
        rpn_label = Variable(rpn_label.long())
        self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, rpn_label)
        # RPN classification loss computed.
        fg_cnt = torch.sum(
            rpn_label.data.ne(0))  # number of positive anchors (non-zero labels)
        rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[
            1:]
        # rpn_bbox_targets: (b,9*4,h,w) regression target per anchor
        # rpn_bbox_inside_weights: (b,4*9,h,w), 1 for positives and 0 for
        #   negatives — negatives get no regression loss
        # rpn_bbox_outside_weights: (b,4*9,h,w), 0 or 1/k where k is the
        #   total number of sampled anchors in the batch
        # compute bbox regression loss
        rpn_bbox_inside_weights = Variable(rpn_bbox_inside_weights)
        rpn_bbox_outside_weights = Variable(rpn_bbox_outside_weights)
        rpn_bbox_targets = Variable(rpn_bbox_targets)
        # rpn_bbox_pred.shape=(b,4*9,w,h): predicted anchor deltas.
        # Only positives contribute (inside weights zero out the rest); the
        # per-image sums are normalized by the sample count and averaged.
        self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred,
                                            rpn_bbox_targets,
                                            rpn_bbox_inside_weights,
                                            rpn_bbox_outside_weights,
                                            sigma=3,
                                            dim=[1, 2, 3])
        # self.rpn_loss_box is a scalar: the box loss averaged over the batch.
    return rois, self.rpn_loss_cls, self.rpn_loss_box
def forward(self, rpn_feature_maps, im_info, gt_boxes, num_boxes): n_feat_maps = len(rpn_feature_maps) rpn_cls_scores = [] rpn_cls_probs = [] rpn_bbox_preds = [] rpn_shapes = [] for i in range(n_feat_maps): feat_map = rpn_feature_maps[i] batch_size = feat_map.size(0) # return feature map after convrelu layer rpn_conv1 = F.relu(self.RPN_Conv(feat_map), inplace=True) # get rpn classification score rpn_cls_score = self.RPN_cls_score(rpn_conv1) rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2) rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape) rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out) # get rpn offsets to the anchor boxes rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1) rpn_shapes.append( [rpn_cls_score.size()[2], rpn_cls_score.size()[3]]) rpn_cls_scores.append( rpn_cls_score.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2)) rpn_cls_probs.append( rpn_cls_prob.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2)) rpn_bbox_preds.append( rpn_bbox_pred.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 4)) rpn_cls_score_alls = torch.cat(rpn_cls_scores, 1) rpn_cls_prob_alls = torch.cat(rpn_cls_probs, 1) rpn_bbox_pred_alls = torch.cat(rpn_bbox_preds, 1) n_rpn_pred = rpn_cls_score_alls.size(1) # proposal layer cfg_key = 'TRAIN' if self.training else 'TEST' rois = self.RPN_proposal( (rpn_cls_prob_alls.data, rpn_bbox_pred_alls.data, im_info, cfg_key, rpn_shapes)) self.rpn_loss_cls = 0 self.rpn_loss_box = 0 # generating training labels and build the rpn loss if self.training: assert gt_boxes is not None rpn_data = self.RPN_anchor_target( (rpn_cls_score_alls.data, gt_boxes, im_info, num_boxes, rpn_shapes)) # compute classification loss rpn_label = rpn_data[0].view(batch_size, -1) rpn_keep = Variable(rpn_label.view(-1).ne(-1).nonzero().view(-1)) rpn_cls_score = torch.index_select(rpn_cls_score_alls.view(-1, 2), 0, rpn_keep) rpn_label = torch.index_select(rpn_label.view(-1), 0, rpn_keep.data) rpn_label = Variable(rpn_label.long()) 
self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, rpn_label) fg_cnt = torch.sum(rpn_label.data.ne(0)) rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[ 1:] # compute bbox regression loss rpn_bbox_inside_weights = Variable(rpn_bbox_inside_weights.unsqueeze(2) \ .expand(batch_size, rpn_bbox_inside_weights.size(1), 4)) rpn_bbox_outside_weights = Variable(rpn_bbox_outside_weights.unsqueeze(2) \ .expand(batch_size, rpn_bbox_outside_weights.size(1), 4)) rpn_bbox_targets = Variable(rpn_bbox_targets) self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred_alls, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights, sigma=3) return rois, self.rpn_loss_cls, self.rpn_loss_box
def forward(self, base_feat, im_info, gt_boxes, num_boxes):
    """RPN forward pass with stochastic box-delta sampling.

    RPN_bbox_pred outputs 8 values per anchor, split as 4 log-variances
    followed by 4 means; a box delta is then drawn via the reparameterization
    trick (mean + sigma * noise * sample_sigma). Also returns an activation
    regularizer on the shared conv output and the post-NMS foreground scores.

    NOTE(review): tensors are created with ``.cuda()`` directly, so this path
    is CUDA-only.

    Returns:
        (rois, rpn_loss_cls, rpn_loss_box, scores, reg_loss); the two RPN
        losses are 0 in eval mode.
    """
    batch_size = base_feat.size(0)
    # Shared conv (note: no ReLU here, unlike the plain RPN variant).
    rpn_conv1 = self.RPN_Conv(base_feat)
    # L2 activation regularizer on the shared feature map.
    reg_loss = torch.FloatTensor([0.]).cuda()
    if self.reg_weight != 0.:
        reg_loss = (rpn_conv1 ** 2).mean() * self.reg_weight
    # Objectness scores, softmaxed over the 2-way fg/bg axis.
    rpn_cls_score = self.RPN_cls_score(rpn_conv1)
    rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2)
    rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, 1)
    rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out)
    # 8 values per anchor: [log_sigma^2 (4), mean (4)].
    rpn_bbox_pred = self.RPN_bbox_pred(rpn_conv1)
    # sample loc data
    bbox_deltas = rpn_bbox_pred.permute(0, 2, 3, 1).contiguous()
    ori_shape = bbox_deltas.shape
    bbox_deltas = bbox_deltas.view(batch_size, -1, 8)
    # Reparameterized sample: delta = eps * sigma * sample_sigma + mean.
    normal_dist = torch.randn(batch_size, bbox_deltas.size(1), 4).float().cuda()
    log_sigma_2 = bbox_deltas[:, :, :4]
    miu = bbox_deltas[:, :, 4:]
    sigma = torch.exp(log_sigma_2 / 2.)
    sample_loc_data = normal_dist * sigma * self.sample_sigma + miu
    # Back to NCHW layout with half the original channel count (4 per anchor).
    rpn_bbox_pred = sample_loc_data.view(batch_size, ori_shape[1], ori_shape[2], ori_shape[3] // 2)
    rpn_bbox_pred = rpn_bbox_pred.permute(0, 3, 1, 2).contiguous()
    # proposal layer
    cfg_key = 'TRAIN' if self.training else 'TEST'
    # scores is a list of foreground_scores after nms
    rois, scores = self.RPN_proposal((rpn_cls_prob.data, rpn_bbox_pred.data, im_info, cfg_key))
    self.rpn_loss_cls = 0
    self.rpn_loss_box = 0
    # generating training labels and build the rpn loss
    if self.training:
        assert gt_boxes is not None
        rpn_data = self.RPN_anchor_target((rpn_cls_score.data, gt_boxes, im_info, num_boxes))
        # Classification loss over anchors whose label is not -1 (ignore).
        rpn_cls_score = rpn_cls_score_reshape.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 2)
        rpn_label = rpn_data[0].view(batch_size, -1)
        rpn_keep = Variable(rpn_label.view(-1).ne(-1).nonzero().view(-1))
        rpn_cls_score = torch.index_select(rpn_cls_score.view(-1, 2), 0, rpn_keep)
        rpn_label = torch.index_select(rpn_label.view(-1), 0, rpn_keep.data)
        rpn_label = Variable(rpn_label.long())
        self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, rpn_label)
        fg_cnt = torch.sum(rpn_label.data.ne(0))  # foreground count (diagnostic only)
        rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[1:]
        # Regression loss computed on the SAMPLED deltas (not the raw means);
        # inside/outside weights zero out non-contributing anchors.
        rpn_bbox_inside_weights = Variable(rpn_bbox_inside_weights)
        rpn_bbox_outside_weights = Variable(rpn_bbox_outside_weights)
        rpn_bbox_targets = Variable(rpn_bbox_targets)
        self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights,
                                            rpn_bbox_outside_weights, sigma=3, dim=[1, 2, 3])
    return rois, self.rpn_loss_cls, self.rpn_loss_box, scores, reg_loss
def forward(self, data):
    """Faster R-CNN forward supporting a one- or two-pathway backbone.

    When ``self.pathway == 'two_pathway'``, ``data`` is
    (slow_batch, fast_tensor) where slow_batch packs
    (im_data, gt_boxes, num_boxes, im_info); otherwise ``data`` is
    (im_data, gt_boxes, num_boxes, im_info) directly. All tensors are moved
    to CUDA (hard-coded device).

    Returns:
        training: (rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox,
                   RCNN_loss_cls, RCNN_loss_bbox, rois_label)
        eval:     (rois, cls_prob, bbox_pred)
    """
    #batch_size = im_data.size(0)
    if self.pathway == 'two_pathway':
        # Unpack shared dims from the fast-pathway tensor.
        chan = data[1].shape[2]
        img_h = data[1].shape[3]
        img_w = data[1].shape[4]
        im_info = (data[0][3].view(-1, 3)).to(device="cuda")
        gt_boxes = (data[0][1].view(-1, cfg.MAX_NUM_GT_BOXES, self.classes + 4)).to(device="cuda")
        num_boxes = (data[0][2].view(-1)).to(device="cuda")
        im_data1 = (data[0][0].view(-1, chan, img_h, img_w)).to(device="cuda")
        batch_size = im_data1.shape[0]
        im_data2 = (data[1].view(-1, chan, img_h, img_w)).to(device="cuda")
        # Feed each pathway through its own base model, then fuse.
        #slow TSM way
        base_feat1 = self.RCNN_base1(im_data1)
        #fast non TSM way
        base_feat2 = self.RCNN_base2(im_data2)
        base_feat = self.fuselayer(base_feat1, base_feat2)
    else:
        # Single-pathway layout: (im_data, gt_boxes, num_boxes, im_info).
        chan = data[0].shape[2]
        height = data[0].shape[3]
        width = data[0].shape[4]
        im_info = (data[3].view(-1, 3)).to(device="cuda")
        gt_boxes = (data[1].view(-1, cfg.MAX_NUM_GT_BOXES, self.classes + 4)).to(device="cuda")
        num_boxes = (data[2].view(-1)).to(device="cuda")
        im_data = (data[0].view(-1, chan, height, width)).to(device="cuda")
        batch_size = im_data.shape[0]
        base_feat = self.RCNN_base1(im_data)
    # Feed the base feature map to the RPN to obtain rois.
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(
        base_feat, im_info, gt_boxes, num_boxes)
    # In training, resample proposals against the ground truth.
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes, val=0)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0
    rois = Variable(rois)
    # RoI feature extraction, selected by configuration.
    if cfg.POOLING_MODE == "align":
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == "crop":
        #TODO crop mode not implemented; falls back to align.
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == "pool":
        pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))
    # Feed pooled features to the top model.
    pooled_feat = self._head_to_tail(pooled_feat)
    # Box regression deltas.
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        if self.loss_type == 'focal' or self.loss_type == 'sigmoid':
            # Multi-label path: rois_label is a (N, classes) indicator matrix.
            rois_label = Variable(rois_label.view(
                -1, self.classes))  #.long()) #modified
            rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
            rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
            rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
            # Row/column indices of the non-zero (RoI, class) assignments.
            proposal_num = torch.nonzero(rois_label)[:, 0]
            class_num = torch.nonzero(rois_label)[:, 1]
            bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
            bbox_pred_select = bbox_pred_view.new(bbox_pred.size(0), 1, 4).zero_()
            # For RoIs assigned to several classes, average their per-class
            # deltas; otherwise take the single assigned class's deltas.
            for i in range(proposal_num.shape[0]):
                dup = torch.nonzero(proposal_num == proposal_num[i])
                if (dup.shape[0] > 1):
                    bbox_pred_select[proposal_num[i]] = bbox_pred_view[
                        proposal_num[i], class_num[dup], :].mean(0)
                else:
                    bbox_pred_select[proposal_num[i]] = bbox_pred_view[
                        proposal_num[i], class_num[i], :]
            bbox_pred = bbox_pred_select.squeeze(1)
    # Object classification probability.
    cls_score = self.RCNN_cls_score(pooled_feat)
    # NOTE(review): for loss_type == 'focal' neither branch below runs, so
    # cls_prob would be unbound when it is used later — confirm intended.
    if self.loss_type == "sigmoid":
        cls_prob = torch.sigmoid(cls_score)
    if self.loss_type == "softmax":
        cls_prob = torch.softmax(cls_score, 1)
    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0
    if self.training:
        # classification loss
        if self.loss_type == "sigmoid":
            RCNN_loss_cls = F.binary_cross_entropy_with_logits(
                cls_score, rois_label)
        elif self.loss_type == "softmax":
            # Collapse the (N, classes) indicator matrix into a single class
            # index per RoI for cross-entropy.
            rois_label = Variable(rois_label.view(-1, self.classes).long())
            rois_label_select = rois_label.new(rois_label.size(0)).zero_()
            proposal_num = torch.nonzero(rois_label)[:, 0]
            class_num = (torch.nonzero(rois_label)[:, 1])
            rois_label_select[proposal_num] = class_num
            rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
            rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
            rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
            # Keep only the deltas of each RoI's selected class.
            bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
            bbox_pred_select = torch.gather(
                bbox_pred_view, 1,
                rois_label_select.view(rois_label_select.size(0), 1,
                                       1).expand(rois_label_select.size(0), 1, 4))
            bbox_pred = bbox_pred_select.squeeze(1)
            RCNN_loss_cls = F.cross_entropy(cls_score, rois_label_select)
        # bounding box regression L1 loss
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)
    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)
    if self.training:
        return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, \
            RCNN_loss_cls, RCNN_loss_bbox, rois_label
    else:
        return rois, cls_prob, bbox_pred
def forward(self, im_data, im_info, gt_boxes, num_boxes):
    """Faster R-CNN forward pass: backbone -> RPN -> RoI pooling -> RCNN head.

    :param im_data: image batch, shape (b, 3, W, H)
    :param im_info: per-image meta, shape (b, 3) = [W, H, scale]
        (NOTE(review): exact meaning of the third entry is unclear from here —
        the original author was also unsure; confirm against the data loader)
    :param gt_boxes: shape (b, 20, 5); zero-padded — for each image only the
        first num_boxes[i] rows are real ground-truth boxes
    :param num_boxes: shape (b,); number of valid gt boxes per image
    :return: (rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox,
        RCNN_loss_cls, RCNN_loss_bbox, rois_label)
    """
    batch_size = im_data.size(0)
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # Backbone feature extraction; per the original notes base_feat is
    # (b, 1024, w, h) with w, h at 1/16 of the input resolution.
    base_feat = self.RCNN_base(im_data)

    # RPN: emits (b, 2000, 5) proposals, 5 = [batch_index, x1, y1, x2, y2],
    # plus its classification loss (computed only over sampled positive and
    # negative anchors) and its batch-averaged regression loss.
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat, im_info, gt_boxes, num_boxes)

    # During training, sample 128 rois per image from the proposals and match
    # them to ground truth to build second-stage targets.
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data
        # rois:        (b, 128, 5) sampled boxes, col 0 = batch index
        # rois_label:  (b, 128)   per-roi class target (the matched gt's class)
        # rois_target: (b, 128, 4) regression targets (zeroed for negatives)
        # rois_inside_ws / rois_outside_ws: (b, 128, 4) loss masks,
        #              [1,1,1,1] for positives, [0,0,0,0] for negatives
        rois_label = Variable(rois_label.view(-1).long())  # (b*128,)
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))  # (b*128, 4)
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))  # (b*128, 4)
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))  # (b*128, 4)
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    rois = Variable(rois)
    # train: rois is (b, 128, 5); test: (b, 2000, 5).

    # Extract a fixed-size feature per roi. NOTE(review): the original author
    # observed that although cfg.POOLING_MODE defaults to 'crop', it is set to
    # 'align' somewhere upstream — confirm before relying on the crop branch.
    if cfg.POOLING_MODE == 'crop':
        grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:], self.grid_size)
        grid_yx = torch.stack([grid_xy.data[:,:,:,1], grid_xy.data[:,:,:,0]], 3).contiguous()
        pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
    elif cfg.POOLING_MODE == 'align':
        # train: pooled_feat is (b*128, 1024, 7, 7); test: (b*2000, 1024, 7, 7)
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

    # Classifier tail provided by the subclass (e.g. the VGG/ResNet head);
    # flattens to (b*128, 2048) per the original notes.
    pooled_feat = self._head_to_tail(pooled_feat)

    # Per-class box regression: (b*128, num_classes * 4).
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # Keep only the 4 regression values of each roi's target class, i.e.
        # if the roi is matched to "car" take the "car" column of the (C, 4)
        # per-class predictions.
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(bbox_pred_view, 1, rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)  # (b*128, 4)

    # Classification branch: logits, then per-class probabilities.
    cls_score = self.RCNN_cls_score(pooled_feat)  # (b*128, num_classes)
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0

    if self.training:
        # classification loss over the sampled rois
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        # smooth-L1 regression loss, masked by the inside/outside weights so
        # only positive rois contribute
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)    # (b, rois, classes)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)  # train: (b, 128, 4); test: (b, 2000, classes*4)

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label
def forward(self, base_feat, gt_twins):
    """Temporal Proposal Subnet (RPN analogue) of the TD-CNN / R-C3D pipeline.

    :param base_feat: C3D backbone feature map, e.g. (1, 512, 96, 7, 7)
    :param gt_twins: (batch, K, 3); columns 0-1 are the ground-truth start/end
        frames, column 2 is the class label
    :return: if self.out_scores: (rois, rois_score, rpn_cls_prob,
        rpn_twin_pred, rpn_loss_cls, rpn_loss_twin, rpn_label, rpn_loss_mask);
        otherwise the same tuple without rois_score
    """
    batch_size = base_feat.size(0)

    # Two conv + ReLU stages over the backbone features.
    rpn_conv1 = F.relu(self.RPN_Conv1(base_feat), inplace=True)
    rpn_conv2 = F.relu(self.RPN_Conv2(rpn_conv1), inplace=True)
    # Pool away the spatial dims, then a conv on the temporal axis
    # (original notes: (1,512,96,1,1) -> (1,512,128,1,1) — TODO confirm).
    rpn_output_pool = self.RPN_output_pool(rpn_conv2)
    rpn_output_pool = F.relu(self.Conv_up(rpn_output_pool), inplace=True)

    # Per-anchor fg/bg scores: reshape to 2 channels so the dim-1 softmax is
    # a proper 2-way classification, then restore the anchor layout.
    rpn_cls_score = self.RPN_cls_score(rpn_output_pool)        # e.g. (1,20,96,1,1)
    rpn_cls_score_reshape = self.reshape(rpn_cls_score, 2)     # (1,2,960,1,1)
    rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, dim=1)
    rpn_cls_prob = self.reshape(rpn_cls_prob_reshape, self.nc_score_out)

    # Per-anchor twin (segment) regression: center offset and length.
    rpn_twin_pred = self.RPN_twin_pred(rpn_output_pool)

    # Proposal layer: decode anchors + regressions into scored segment
    # proposals; per the original notes the output is zero-padded past the
    # real foreground proposals.
    cfg_key = 'TRAIN' if self.training else 'TEST'
    if self.out_scores:
        rois, rois_score = self.RPN_proposal(
            (rpn_cls_prob.data, rpn_twin_pred.data, cfg_key))
    else:
        rois = self.RPN_proposal(
            (rpn_cls_prob.data, rpn_twin_pred.data, cfg_key))

    self.rpn_loss_cls = 0
    self.rpn_loss_twin = 0
    self.rpn_loss_mask = 0
    self.rpn_label = None

    # Generate training labels and build the RPN losses.
    if self.training:
        assert gt_twins is not None
        # rpn_data = [label_targets, twin_targets,
        #             twin_inside_weights, twin_outside_weights]
        rpn_data = self.RPN_anchor_target((rpn_cls_score.data, gt_twins))

        # --- classification loss ---
        rpn_cls_score = rpn_cls_score_reshape.permute(
            0, 2, 3, 4, 1).contiguous().view(batch_size, -1, 2)  # (1, 960, 2)
        self.rpn_label = rpn_data[0].view(batch_size, -1)
        # Labels are 1 (fg), 0 (bg) and -1 (ignore); keep only the sampled
        # fg/bg anchors.
        rpn_keep = Variable(self.rpn_label.view(-1).ne(-1).nonzero().view(-1))
        rpn_cls_score = torch.index_select(rpn_cls_score.view(-1, 2), 0, rpn_keep)
        self.rpn_label = torch.index_select(self.rpn_label.view(-1), 0, rpn_keep.data)
        self.rpn_label = Variable(self.rpn_label.long())
        # Binary cross-entropy over the sampled anchors.
        self.rpn_loss_cls = F.cross_entropy(rpn_cls_score, self.rpn_label)
        fg_cnt = torch.sum(self.rpn_label.data.ne(0))  # unused; kept from original

        # --- twin regression loss ---
        rpn_twin_targets, rpn_twin_inside_weights, rpn_twin_outside_weights = rpn_data[
            1:]
        rpn_twin_inside_weights = Variable(rpn_twin_inside_weights)
        rpn_twin_outside_weights = Variable(rpn_twin_outside_weights)
        rpn_twin_targets = Variable(rpn_twin_targets)
        # Smooth-L1, masked so only foreground anchors contribute.
        self.rpn_loss_twin = _smooth_l1_loss(
            rpn_twin_pred, rpn_twin_targets, rpn_twin_inside_weights,
            rpn_twin_outside_weights, sigma=3, dim=[1, 2, 3, 4])

    if self.out_scores:
        return rois, rois_score, rpn_cls_prob, rpn_twin_pred, self.rpn_loss_cls, self.rpn_loss_twin, self.rpn_label, self.rpn_loss_mask
    else:
        return rois, rpn_cls_prob, rpn_twin_pred, self.rpn_loss_cls, self.rpn_loss_twin, self.rpn_label, self.rpn_loss_mask
def forward(self, base_feat, im_info, gt_boxes, num_boxes):
    """Region Proposal Network forward pass.

    Scores and regresses anchors on the backbone feature map, decodes them
    into proposals, and (in training) stores the RPN classification and
    regression losses on self.

    :param base_feat: backbone features, (batch_size, channel, h, w)
    :param im_info: per-image size/scale rows
    :param gt_boxes: ground-truth boxes (required when training)
    :param num_boxes: number of valid gt boxes per image
    :return: (rois, self.rpn_loss_cls, self.rpn_loss_box); each roi row is
        [batch_idx, x1, y1, x2, y2]
    """
    batch_size = base_feat.size(0)

    # Shared conv + ReLU over the backbone features -> (b, 512, h, w).
    shared = F.relu(self.RPN_Conv(base_feat), inplace=True)

    # Objectness branch: (b, 2A, h, w) -> (b, 2, A*h, w) so the dim-1 softmax
    # is a 2-way fg/bg classification, then back to the anchor layout.
    objectness = self.RPN_cls_score(shared)
    objectness_2ch = self.reshape(objectness, 2)
    objectness_prob_2ch = F.softmax(objectness_2ch, 1)
    objectness_prob = self.reshape(objectness_prob_2ch, self.nc_score_out)

    # Regression branch: per-anchor box deltas, (b, 4A, h, w).
    deltas = self.RPN_bbox_pred(shared)

    # Proposal layer: anchors + deltas -> (b, post_nms_topN, 5) rois.
    phase = 'TRAIN' if self.training else 'TEST'
    rois = self.RPN_proposal(
        (objectness_prob.data, deltas.data, im_info, phase))

    self.rpn_loss_cls = 0
    self.rpn_loss_box = 0

    if self.training:
        assert gt_boxes is not None
        anchor_targets = self.RPN_anchor_target(
            (objectness.data, gt_boxes, im_info, num_boxes))

        # --- classification loss over sampled anchors ---
        scores_flat = objectness_2ch.permute(
            0, 2, 3, 1).contiguous().view(batch_size, -1, 2)
        labels = anchor_targets[0].view(batch_size, -1)
        # Anchors labelled -1 are "don't care"; keep only the sampled fg/bg.
        sampled = Variable(labels.view(-1).ne(-1).nonzero().view(-1))
        scores_flat = torch.index_select(scores_flat.view(-1, 2), 0, sampled)
        labels = torch.index_select(labels.view(-1), 0, sampled.data)
        labels = Variable(labels.long())
        self.rpn_loss_cls = F.cross_entropy(scores_flat, labels)
        fg_cnt = torch.sum(labels.data.ne(0))  # parity with original; unused

        # --- regression loss ---
        delta_targets, inside_w, outside_w = anchor_targets[1:]
        inside_w = Variable(inside_w)
        outside_w = Variable(outside_w)
        delta_targets = Variable(delta_targets)
        self.rpn_loss_box = _smooth_l1_loss(deltas, delta_targets, inside_w,
                                            outside_w, sigma=3, dim=[1, 2, 3])

    return rois, self.rpn_loss_cls, self.rpn_loss_box
def forward(self, im_data, im_info, gt_boxes, num_boxes):
    """Faster R-CNN forward: backbone -> RPN -> RoI Align -> RCNN head.

    Shapes per the original notes: im_data (1, c, w, h) — the raw image blob,
    e.g. (1, 3, 850, 600); im_info (1, 3); gt_boxes (1, 20, 5);
    num_boxes (1,).
    """
    batch_size = im_data.size(0)
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # Backbone feature map.
    base_feat = self.RCNN_base(im_data)

    # RPN: proposals plus its two losses.
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat, im_info, gt_boxes, num_boxes)

    # In training, resample the proposals against ground truth to build the
    # second-stage classification and regression targets.
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    rois = Variable(rois)
    # At test time rois is (1, 300, 5); column 0 is only the batch index (all
    # zeros here), not a label. gt_boxes rows are (x, 5), x = object count.

    # RoI pooling based on the predicted rois (POOLING_MODE = align here).
    pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))

    # Feed pooled features to the top model.
    pooled_feat = self.head_to_tail(pooled_feat)

    # Box-offset prediction from the pooled roi features.
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # Select the regression columns belonging to each roi's target class.
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0), int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(bbox_pred_view, 1, rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

    # Classification probabilities. Per the original notes, at test time
    # cls_score is (300, 21) and bbox_pred is (300, 84).
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0

    if self.training:
        # classification loss
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        # bounding box regression smooth-L1 loss
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)    # test: (1, 300, 21)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)  # test: (1, 300, 84)

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label
def forward(self, base_feat, im_info, gt_boxes, num_boxes):
    """RPN forward: score and regress anchors, emit proposals.

    In training mode the RPN classification and regression losses are also
    computed and stored on self; otherwise both are 0.

    :return: (rois, self.rpn_loss_cls, self.rpn_loss_box)
    """
    batch_size = base_feat.size(0)

    # Shared conv + ReLU.
    conv_feat = F.relu(self.RPN_Conv(base_feat), inplace=True)

    # Objectness: (b, 2A, h, w) -> (b, 2, A*h, w); softmax over dim 1 gives
    # per-anchor fg/bg probabilities, then restore the (b, 2A, h, w) layout.
    cls_logits = self.RPN_cls_score(conv_feat)
    cls_logits_2 = self.reshape(cls_logits, 2)
    cls_prob_2 = F.softmax(cls_logits_2, 1)
    cls_prob = self.reshape(cls_prob_2, self.nc_score_out)

    # Per-anchor box deltas.
    box_deltas = self.RPN_bbox_pred(conv_feat)

    # The proposal layer builds anchors at every feature-map position,
    # applies the deltas, then sorts/clips/NMS-filters. It returns rois only;
    # losses come from the anchor-target layer during training.
    key = 'TRAIN' if self.training else 'TEST'
    rois = self.RPN_proposal(
        (cls_prob.data, box_deltas.data, im_info, key))

    self.rpn_loss_cls = 0
    self.rpn_loss_box = 0

    # Generate training labels and build the RPN loss.
    if self.training:
        assert gt_boxes is not None
        # Anchor-target layer also generates anchors at every position and
        # matches them against gt_boxes.
        targets = self.RPN_anchor_target(
            (cls_logits.data, gt_boxes, im_info, num_boxes))

        # Flatten logits to (b, A*h*w, 2) to line up with the target labels.
        logits_flat = cls_logits_2.permute(
            0, 2, 3, 1).contiguous().view(batch_size, -1, 2)
        labels = targets[0].view(batch_size, -1)

        # Drop anchors labelled -1 ("ignore"); the survivors are the sampled
        # positives and negatives.
        keep_idx = Variable(labels.view(-1).ne(-1).nonzero().view(-1))
        logits_flat = torch.index_select(logits_flat.view(-1, 2), 0, keep_idx)
        labels = torch.index_select(labels.view(-1), 0, keep_idx)
        labels = Variable(labels.long())

        # Binary fg/bg loss; cross_entropy applies the softmax internally.
        self.rpn_loss_cls = F.cross_entropy(logits_flat, labels)
        fg_cnt = torch.sum(labels.data.ne(0))  # parity with original; unused

        # Regression loss from the anchor-target layer outputs.
        reg_targets, inside_weights, outside_weights = targets[1:]
        inside_weights = Variable(inside_weights)
        outside_weights = Variable(outside_weights)
        reg_targets = Variable(reg_targets)
        self.rpn_loss_box = _smooth_l1_loss(box_deltas, reg_targets,
                                            inside_weights, outside_weights,
                                            sigma=3, dim=[1, 2, 3])

    return rois, self.rpn_loss_cls, self.rpn_loss_box
def forward(self, im_data, im_info, gt_boxes, num_boxes):
    """Faster R-CNN forward: backbone -> RPN -> RoI Pool -> RCNN head.

    :param im_data: image batch, (b, c, h, w)
    :param im_info: per-image size/scale rows, (b, 3)
    :param gt_boxes: zero-padded ground-truth boxes (used when training)
    :param num_boxes: number of valid gt boxes per image
    :return: (rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox,
        RCNN_loss_cls, RCNN_loss_bbox, rois_label)
    """
    batch_size = im_data.size(0)
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # Backbone feature map.
    base_feat = self.RCNN_base(im_data)

    # RPN proposals plus its classification/regression losses.
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(
        base_feat, im_info, gt_boxes, num_boxes)

    # In training, resample proposals against ground truth to build the
    # second-stage targets.
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(
            rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(
            rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0

    rois = Variable(rois)

    # RoI pooling based on the predicted rois.
    pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))

    # Feed pooled features to the top model.
    pooled_feat = self._head_to_tail(pooled_feat)

    # Per-class box regression; during (non-agnostic) training keep only the
    # 4 regression values of each roi's target class.
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0),
                                        int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.view(rois_label.size(0), 1,
                            1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

    # Classification probabilities.
    cls_score = self.RCNN_cls_score(pooled_feat)
    # BUGFIX: F.softmax was called without `dim`, which is deprecated and
    # ambiguous; pass dim=1 explicitly (row-wise over class logits), matching
    # the sibling forward() implementations in this file. For the 2-D
    # cls_score this is exactly what the legacy implicit default computed.
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0

    if self.training:
        # classification loss
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        # bounding box regression smooth-L1 loss, masked so only positive
        # rois contribute
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                         rois_inside_ws, rois_outside_ws)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label
def forward(self, video_data, gt_twins):
    """TD-CNN (R-C3D) forward: C3D backbone -> temporal RPN -> RoI head.

    :param video_data: raw clip batch; prepare_data reshapes it — per the
        original notes — to (1, 3, 768, 112, 112)
    :param gt_twins: (1, 20, 3); cols 0-1 are ground-truth start/end frames,
        col 2 is the class label
    :return: training: (rois, cls_prob, twin_pred, rpn_loss_cls,
        rpn_loss_twin, RCNN_loss_cls, RCNN_loss_twin, rois_label);
        inference: (rois, cls_prob, twin_pred)
    """
    batch_size = video_data.size(0)
    gt_twins = gt_twins.data

    # prepare data
    video_data = self.prepare_data(video_data)

    # C3D backbone (first five conv stages): 512 x L/8 x H/16 x W/16 feature
    # map, e.g. (1, 512, 96, 7, 7).
    base_feat = self.RCNN_base(video_data)

    # Temporal RPN: rois is (1, 2000, 3) — col 0 reserved for the class
    # label (all zeros here), cols 1-2 are candidate start/end frames; the
    # tail past the real proposals is zero-padded.
    rois, _, _, rpn_loss_cls, rpn_loss_twin, _, _ = self.RCNN_rpn(base_feat, gt_twins)

    # In training, sample proposals against ground truth for the second-stage
    # targets.
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_twins)
        # shapes: (1,128,3), (1,128), (1,128,2), (1,128,2), (1,128,2)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())                        # (128,)
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))        # (128, 2)
        rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2)))    # (128, 2)
        rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2)))  # (128, 2)
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_twin = 0

    rois = Variable(rois)

    # Temporal RoI pooling over the predicted segments ('pool' is the active
    # mode per the original notes); result e.g. (128, 512, 4, 2, 2).
    if cfg.POOLING_MODE == 'pool':
        pooled_feat = self.RCNN_roi_temporal_pool(base_feat, rois.view(-1, 3))

    if cfg.USE_ATTENTION:  # disabled in this configuration per the notes
        pooled_feat = self.RCNN_attention(pooled_feat)

    # Classifier tail, producing (128, 4096) per the original notes.
    pooled_feat = self._head_to_tail(pooled_feat)

    # Per-class twin regression: nn.Linear(4096, 2 * num_classes),
    # so twin_pred is (128, 2 * num_classes).
    twin_pred = self.RCNN_twin_pred(pooled_feat)
    if self.training:
        # Keep only the 2 regression values of each roi's target class:
        # twin_pred becomes (128, 2).
        twin_pred_view = twin_pred.view(twin_pred.size(0), int(twin_pred.size(1) / 2), 2)
        twin_pred_select = torch.gather(twin_pred_view, 1, rois_label.view(rois_label.size(0), 1, 1).expand(rois_label.size(0), 1, 2))
        twin_pred = twin_pred_select.squeeze(1)

    # Multi-class classification branch.
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, dim=1)

    if DEBUG:
        print("tdcnn.py--base_feat.shape {}".format(base_feat.shape))
        print("tdcnn.py--rois.shape {}".format(rois.shape))
        print("tdcnn.py--tdcnn_tail.shape {}".format(pooled_feat.shape))
        print("tdcnn.py--cls_score.shape {}".format(cls_score.shape))
        print("tdcnn.py--twin_pred.shape {}".format(twin_pred.shape))

    RCNN_loss_cls = 0
    RCNN_loss_twin = 0

    if self.training:
        # classification loss
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        # segment regression smooth-L1 loss
        RCNN_loss_twin = _smooth_l1_loss(twin_pred, rois_target, rois_inside_ws, rois_outside_ws)

        # RuntimeError caused by mGPUs and higher pytorch version:
        # https://github.com/jwyang/faster-rcnn.pytorch/issues/226
        rpn_loss_cls = torch.unsqueeze(rpn_loss_cls, 0)
        rpn_loss_twin = torch.unsqueeze(rpn_loss_twin, 0)
        RCNN_loss_cls = torch.unsqueeze(RCNN_loss_cls, 0)
        RCNN_loss_twin = torch.unsqueeze(RCNN_loss_twin, 0)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    twin_pred = twin_pred.view(batch_size, rois.size(1), -1)

    if self.training:
        return rois, cls_prob, twin_pred, rpn_loss_cls, rpn_loss_twin, RCNN_loss_cls, RCNN_loss_twin, rois_label
    else:
        return rois, cls_prob, twin_pred
def _forward_train_joint(self, frame_1_box, frame_2, frame_2_box, num_box): """ This function use the rcnn_base_mv branch and the rcnn_base_residual. The following data are all Variables. We are trying to predict the offset between the boxes in frame 1 and frame 2 (i.e. frame_1_box and frame_2_box). We crop the feature using PSRoIPooling based frame_1_box to predict the offsets. :param frame_1_box: 3D tensor, bs x num_box x 6, each row is (x1, y1, x2, y2, class_id, target_id) :param frame_2: 4D tensor, bs x 5 x h x w, the 0:2 channel is motion vector, 2:5 channel is the residual :param frame_2_box: 2D tensor, bs x num_box x 6, each row is (x1, y1, x2, y2, class_id, target_id) :param num_box: 1D tensor, [bs], the number of gt boxes in differernt frames. Noted that the boxes in two frames that in one pair are the same. """ # we set a trace, if the weight of this layer has nan, then it will pause: # we use the fact that nan != nan is true feature_add_conv_weight = self.feature_add_conv.state_dict()['weight'] if (feature_add_conv_weight != feature_add_conv_weight).sum() > 0: print('\n there is nan in the weight of one layer\n') pdb.set_trace() batch_size = frame_2.size()[0] # get the base features feat_mv = self.RCNN_base_mv(frame_2[:, 0:2, :, :].contiguous()) feat_residual = self.RCNN_base_residual(frame_2[:, 2:5].contiguous()) # concate the features base_feat = torch.cat((feat_mv, feat_residual), dim=1) base_feat = self.feature_add_conv(base_feat) base_feat_loc = self.RCNN_bbox_base(base_feat) # PSRoIPooling frame_1_box_tmp = frame_1_box.data.contiguous( ) # [bs, num_box, 6], each row is [x1, y1, x2, y2, class_id, target_id] frame_2_box_tmp = frame_2_box.data.contiguous() # [bs, num_box, 6] # (1) generate rois rois_1 = frame_1_box_tmp.new( batch_size, frame_1_box_tmp.size()[1], 5).zero_() # each row is [batch_index, x1, y1, x2, y2] rois_1[:, :, 1:5] = frame_1_box_tmp[:, :, 0:4].clone() for bs_idx in range(batch_size): rois_1[bs_idx, :, 0] = bs_idx rois_1 = 
Variable(rois_1) # (2) pooling to get the offset pooled_feat_loc = self.RCNN_psroi_pool_loc( base_feat_loc, rois_1.view(-1, 5)) # [num_box, 4, pooled_size, pooled_size] bbox_pred = self.pooling(pooled_feat_loc) # [num_box, 4, 1, 1] bbox_pred = bbox_pred.squeeze() # [num_box, 4] bbox_pred = bbox_pred.view(batch_size, -1, 4) # compute the box regression target rois_1 = frame_1_box_tmp[:, :, 0:4].clone().contiguous() rois_2 = frame_2_box_tmp[:, :, 0:4].clone().contiguous() regression_targets = self._compute_bbox_targets(rois_1, rois_2) # compute the inside weights and outside weights num_box_1_tmp = num_box.data.int() # if (num_box_1_tmp == 0).sum() > 0: # a = 1 inside_weight = regression_targets.new(batch_size, regression_targets.size(1), 4).zero_() outside_weight = regression_targets.new(batch_size, regression_targets.size(1), 4).zero_() for bs_idx in range(batch_size): if num_box_1_tmp[bs_idx] > 0: inside_weight[bs_idx, 0:num_box_1_tmp[bs_idx], :] = 1 outside_weight[ bs_idx, 0:num_box_1_tmp[bs_idx], :] = 1.0 / num_box_1_tmp[bs_idx] # get the loss regression_targets = Variable(regression_targets) inside_weight = Variable(inside_weight) outside_weight = Variable(outside_weight) loss_bbox = _smooth_l1_loss(bbox_pred, regression_targets, inside_weight, outside_weight, dim=[2, 1]) # # # outside_weight = outside_weight.data # outside_weight[outside_weight > 0] = 1 # outside_weight = Variable(outside_weight) # bbox_pred = bbox_pred.view(-1, 4) # regression_targets = regression_targets.view(-1, 4) # inside_weight = inside_weight.view(-1, 4) # outside_weight = outside_weight.view(-1, 4) # loss_bbox_1 = _smooth_l1_loss(bbox_pred, regression_targets, inside_weight, outside_weight) return bbox_pred, loss_bbox
def forward(self, im_data, im_info, gt_boxes, num_boxes):
    """Faster R-CNN variant with entropy prior and regularization losses.

    On top of the standard two-stage pipeline this adds: an RPN prior loss
    (entropy of normalized foreground scores per image), a head feature
    regularization loss (mean squared activation), a sampled box head (the
    bbox branch predicts log-variance and mean and a Gaussian sample is
    used as the prediction), and a head prior loss computed on NMS-kept
    per-class scores.

    :return: (rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox,
        RCNN_loss_cls, RCNN_loss_bbox, rois_label, rpn_prior_loss,
        rpn_reg_loss, head_prior_loss, head_reg_loss)
    """
    batch_size = im_data.size(0)
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data

    # Backbone feature map.
    base_feat = self.RCNN_base(im_data)

    # RPN here additionally returns per-image foreground scores and its own
    # regularization loss.
    rois, rpn_loss_cls, rpn_loss_bbox, fg_scores, rpn_reg_loss = \
        self.RCNN_rpn(base_feat, im_info, gt_boxes, num_boxes)
    rpn_prior_loss = torch.FloatTensor([0.]).cuda()

    # In training, resample proposals against ground truth for stage-2 targets.
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data

        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(
            rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(
            rois_outside_ws.view(-1, rois_outside_ws.size(2)))

        # RPN prior loss: negative entropy of the per-image normalized
        # foreground score distribution, scaled by 1/gt_num.
        if self.rpn_prior_weight != 0.:
            for i in range(batch_size):
                gt_num = num_boxes[i].detach().cpu().item()
                score = fg_scores[i]
                score_sum = score.sum().detach().cpu().item()
                score = score / score_sum
                log_score = score * torch.log(score + 1e-6)  # p * log(p)
                rpn_prior_loss += (-1. * log_score.sum() / float(gt_num))
            rpn_prior_loss /= batch_size
            rpn_prior_loss *= self.rpn_prior_weight
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = torch.FloatTensor([0.]).cuda()
        rpn_loss_bbox = torch.FloatTensor([0.]).cuda()

    rois = Variable(rois)

    # RoI feature extraction, then the classifier tail.
    if cfg.POOLING_MODE == 'align':
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))
    pooled_feat = self._head_to_tail(pooled_feat)

    # Head feature regularization: mean squared activation.
    head_reg_loss = torch.FloatTensor([0.]).cuda()
    if self.training and self.head_reg_weight != 0.:
        head_reg_loss = (pooled_feat**2).mean() * self.head_reg_weight

    # Box branch predicts [log(sigma^2) (cols 0:4), mean (cols 4:)]; the box
    # used downstream is a Gaussian sample around the mean.
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    normal_dist = torch.randn(bbox_pred.size(0), 4).float().cuda()
    log_sigma_2 = bbox_pred[:, :4]
    miu = bbox_pred[:, 4:]
    sigma = torch.exp(log_sigma_2 / 2.)
    sample_loc_data = normal_dist * sigma * self.sample_sigma + miu
    bbox_pred = sample_loc_data

    if self.training and not self.class_agnostic:
        # Select the regression columns belonging to each roi's target class.
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0),
                                        int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.view(rois_label.size(0), 1,
                            1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)

    # Classification branch.
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)

    RCNN_loss_cls = torch.FloatTensor([0.]).cuda()
    RCNN_loss_bbox = torch.FloatTensor([0.]).cuda()

    if self.training:
        # classification loss
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        # bounding box regression smooth-L1 loss
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                         rois_inside_ws, rois_outside_ws)

    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)

    # Head prior loss: decode final boxes, run per-class NMS, and penalize
    # the entropy of the kept scores relative to the gt count of that class.
    head_prior_loss = torch.FloatTensor([0.]).cuda()
    if self.training and self.head_prior_weight != 0.:
        scores = cls_prob.data          # [batch, num_rois, classes] (detached)
        scores_gradient = cls_prob      # [batch, num_rois, classes] (keeps grad)
        boxes = rois.data[:, :, 1:5]    # [batch, num_rois, 4]
        if cfg.TRAIN.BBOX_REG:
            # Apply bounding-box regression deltas
            box_deltas = bbox_pred.data  # [batch, num_rois, 4]
            if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
                # Optionally denormalize deltas by the precomputed mean/stdev.
                if self.class_agnostic:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                    box_deltas = box_deltas.view(batch_size, -1, 4)
                else:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                    box_deltas = box_deltas.view(batch_size, -1, 4 * len(self.classes))
            pred_boxes = bbox_transform_inv(boxes, box_deltas, batch_size)
            pred_boxes = clip_boxes(pred_boxes, im_info.data, batch_size)
        else:
            # Simply repeat the boxes, once for each class
            print("no use bbox head in IB")
            pred_boxes = np.tile(boxes, (1, scores.shape[1]))

        # Undo the input scaling back to original image coordinates.
        pred_boxes /= im_info[:, 2].data[:, None, None]  # [batch, num_rois, 4]

        loss_count = 0.
        gt_classes = gt_boxes[:, :, -1].data  # [batch, num (zero-padded to 20)]
        for i in range(batch_size):
            for j in range(1, len(self.classes)):  # skip background class
                if not (gt_classes[i] == j).any():
                    # no gt of this class in this image
                    continue
                inds = torch.nonzero(
                    scores[i, :, j] > self.nms_threshold).view(-1)
                if inds.numel() == 0:
                    continue
                cls_scores = scores[i, :, j][inds]  # [num]
                cls_scores_gradient = scores_gradient[i, :, j][inds]
                _, order = torch.sort(cls_scores, 0, True)
                if self.class_agnostic:
                    cls_boxes = pred_boxes[i, inds, :]  # [num, 4]
                else:
                    cls_boxes = pred_boxes[i, inds][:, j * 4:(j + 1) * 4]
                cls_scores_gradient = cls_scores_gradient[order]
                keep = nms(cls_boxes[order, :], cls_scores[order],
                           cfg.TEST.NMS)
                score = cls_scores_gradient[keep.view(-1).long()]  # [num_keep]
                gt_num = (gt_classes[i] == j).sum().detach().cpu().item()
                if score.size(0) <= gt_num:
                    continue
                score_sum = score.sum().detach().cpu().item()
                score = score / score_sum
                log_score = score * torch.log(score + 1e-6)
                head_prior_loss += (-1. * log_score.sum() / float(gt_num))
                loss_count += 1.
        # NOTE(review): if every (image, class) pair was skipped, loss_count
        # stays 0 and this divides by zero — confirm that cannot happen here.
        head_prior_loss /= loss_count
        head_prior_loss *= self.head_prior_weight

    return rois, cls_prob, bbox_pred, \
        rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label, \
        rpn_prior_loss, rpn_reg_loss, head_prior_loss, head_reg_loss