def forward(self, input):
    """Convert RPN outputs into a zero-padded batch of region proposals.

    Pipeline: build the anchor grid over the feature map, apply the
    predicted deltas, clip to the image, rank by foreground score, keep the
    best ``RPN_PRE_NMS_TOP_N``, run NMS, keep the best
    ``RPN_POST_NMS_TOP_N`` and write them into a fixed-size output.
    Small-box filtering (``RPN_MIN_SIZE``) is not applied here.

    Args:
        input: indexable with at least four entries:
            ``input[0]`` 4-D class probabilities; channels from
            ``self._num_anchors`` onward are used as foreground scores,
            ``input[1]`` 4-D predicted box deltas,
            ``input[2]`` image info forwarded to ``clip_boxes``,
            ``input[3]`` key into ``cfg`` selecting the RPN settings.

    Returns:
        Tensor of shape ``(batch_size, post_nms_topN, 5)``. Column 0 holds
        the image index within the batch, columns 1-4 the proposal
        coordinates; rows past the number of kept proposals remain zero.
    """
    fg_scores = input[0][:, self._num_anchors:, :, :]
    bbox_deltas = input[1]
    im_info = input[2]
    rpn_cfg = cfg[input[3]]

    pre_nms_topN = rpn_cfg.RPN_PRE_NMS_TOP_N
    post_nms_topN = rpn_cfg.RPN_POST_NMS_TOP_N
    nms_thresh = rpn_cfg.RPN_NMS_THRESH
    min_size = rpn_cfg.RPN_MIN_SIZE  # read for parity; filtering is disabled

    batch_size = bbox_deltas.size(0)
    feat_height = fg_scores.size(2)
    feat_width = fg_scores.size(3)

    # One (x, y, x, y) shift per feature-map cell, in input-image pixels.
    grid_x, grid_y = np.meshgrid(
        np.arange(0, feat_width) * self._feat_stride,
        np.arange(0, feat_height) * self._feat_stride)
    flat_x = grid_x.ravel()
    flat_y = grid_y.ravel()
    shifts = torch.from_numpy(
        np.vstack((flat_x, flat_y, flat_x, flat_y)).transpose())
    shifts = shifts.contiguous().type_as(fg_scores).float()

    anchors_per_cell = self._num_anchors
    num_cells = shifts.size(0)
    total_anchors = num_cells * anchors_per_cell

    # Broadcast the base anchors over every cell, then share across batch.
    self._anchors = self._anchors.type_as(fg_scores)
    anchors = (self._anchors.view(1, anchors_per_cell, 4)
               + shifts.view(num_cells, 1, 4))
    anchors = anchors.view(1, total_anchors, 4).expand(
        batch_size, total_anchors, 4)

    # Put deltas and scores into the same (cell-major) order as the anchors.
    bbox_deltas = bbox_deltas.permute(0, 2, 3, 1).contiguous()
    bbox_deltas = bbox_deltas.view(batch_size, -1, 4)
    fg_scores = fg_scores.permute(0, 2, 3, 1).contiguous()
    fg_scores = fg_scores.view(batch_size, -1)

    proposals = bbox_transform_inv(anchors, bbox_deltas, batch_size)
    proposals = clip_boxes(proposals, im_info, batch_size)

    _, ranking = torch.sort(fg_scores, 1, True)
    output = fg_scores.new(batch_size, post_nms_topN, 5).zero_()

    # Compared against the element count of the whole score tensor, exactly
    # as before; slicing past the end is a no-op anyway.
    trim_before_nms = 0 < pre_nms_topN < fg_scores.numel()

    for i in range(batch_size):
        ranked = ranking[i]
        if trim_before_nms:
            ranked = ranked[:pre_nms_topN]

        boxes_i = proposals[i][ranked, :]
        scores_i = fg_scores[i][ranked].view(-1, 1)

        kept = nms(torch.cat((boxes_i, scores_i), 1), nms_thresh,
                   force_cpu=not cfg.USE_GPU_NMS)
        kept = kept.long().view(-1)
        if post_nms_topN > 0:
            kept = kept[:post_nms_topN]
        boxes_i = boxes_i[kept, :]

        # Rows beyond the kept proposals stay zero (padding).
        output[i, :, 0] = i
        output[i, :boxes_i.size(0), 1:] = boxes_i

    return output
def loop():
    """Evaluate a run of consecutive FPN checkpoints on the validation set.

    Parses the command line, configures ``cfg`` for the selected dataset,
    builds the network once, then for each of ``num_checkpoints`` iterations
    loads ``fpn_{checksession}_{checkepoch}_{checkpoint}.pth``, runs
    detection over every image, writes visualisations to ``images/``,
    pickles the detections, appends the per-class APs to ``result.txt`` and
    increments ``args.checkepoch`` so the next iteration loads the next
    checkpoint.

    Raises:
        ValueError: if ``args.net`` names an unsupported backbone.
        Exception: if the checkpoint input directory does not exist.
    """
    args = parse_args()

    print('Called with args:')
    print(args)

    if torch.cuda.is_available() and not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

    three_scales = ['ANCHOR_SCALES', '[8, 16, 32]', 'ANCHOR_RATIOS', '[0.5,1,2]']
    four_scales = ['ANCHOR_SCALES', '[4, 8, 16, 32]', 'ANCHOR_RATIOS', '[0.5,1,2]']
    # dataset -> (imdb_name, imdbval_name, config overrides)
    dataset_presets = {
        "pascal_voc": ("voc_2007_test", "voc_2007_test", three_scales),
        "pascal_voc_0712": ("voc_2007_trainval+voc_2012_trainval",
                            "voc_2007_test", three_scales),
        "coco": ("coco_2014_train+coco_2014_valminusminival",
                 "coco_2014_minival", four_scales),
        "imagenet": ("imagenet_train", "imagenet_val", three_scales),
        "vg": ("vg_150-50-50_minitrain", "vg_150-50-50_minival", four_scales),
    }
    preset = dataset_presets.get(args.dataset)
    if preset is not None:
        args.imdb_name, args.imdbval_name, preset_cfgs = preset
        args.set_cfgs = list(preset_cfgs)

    args.cfg_file = "cfgs/{}.yml".format(args.net)

    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)

    print('Using config:')
    pprint.pprint(cfg)
    np.random.seed(cfg.RNG_SEED)

    cfg.TRAIN.USE_FLIPPED = False
    imdb, roidb, ratio_list, ratio_index = combined_roidb(args.imdbval_name, False)
    imdb.competition_mode(on=True)

    print('{:d} roidb entries'.format(len(roidb)))

    # Initialize the network here.
    if args.net == 'vgg16':
        fpn = vgg16(imdb.classes, pretrained=False, class_agnostic=args.class_agnostic)
    elif args.net == 'res101':
        fpn = resnet(imdb.classes, 101, pretrained=False, class_agnostic=args.class_agnostic)
    elif args.net == 'res50':
        fpn = resnet(imdb.classes, 50, pretrained=True, class_agnostic=args.class_agnostic)
    elif args.net == 'res152':
        fpn = resnet(imdb.classes, 152, pretrained=True, class_agnostic=args.class_agnostic)
    else:
        # Previously this dropped into pdb and then failed on the unbound
        # ``fpn``; fail fast with a clear message instead.
        raise ValueError("network is not defined: {}".format(args.net))

    fpn.create_architecture()
    print('load model successfully!')

    # Reusable input holders, resized and refilled for every image.
    im_data = torch.FloatTensor(1)
    im_info = torch.FloatTensor(1)
    num_boxes = torch.LongTensor(1)
    gt_boxes = torch.FloatTensor(1)

    # ship to cuda
    if args.cuda:
        im_data = im_data.cuda()
        im_info = im_info.cuda()
        num_boxes = num_boxes.cuda()
        gt_boxes = gt_boxes.cuda()

    # make variable
    im_data = Variable(im_data)
    im_info = Variable(im_info)
    num_boxes = Variable(num_boxes)
    gt_boxes = Variable(gt_boxes)

    if args.cuda:
        cfg.CUDA = True
        fpn.cuda()

    start = time.time()
    max_per_image = 100
    vis = True  # args.vis
    thresh = 0.0
    num_checkpoints = 200  # consecutive epochs evaluated in one run

    save_name = 'faster_rcnn_10'
    num_images = len(imdb.image_index)
    all_boxes = [[[] for _ in range(num_images)]
                 for _ in range(imdb.num_classes)]
    output_dir = get_output_dir(imdb, save_name)

    for h in range(num_checkpoints):
        dataset = roibatchLoader(roidb, ratio_list, ratio_index, args.batch_size,
                                 imdb.num_classes, training=False, normalize=False)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size,
                                                 shuffle=False, num_workers=0,
                                                 pin_memory=True)
        data_iter = iter(dataloader)

        det_file = os.path.join(output_dir, 'detections.pkl')
        input_dir = args.load_dir + "/" + args.net + "/" + args.dataset
        if not os.path.exists(input_dir):
            raise Exception('There is no input directory for loading network from ' + input_dir)
        load_name = os.path.join(
            input_dir,
            'fpn_{}_{}_{}.pth'.format(args.checksession, args.checkepoch, args.checkpoint))
        print("load checkpoint %s" % (load_name))
        checkpoint = torch.load(load_name)
        fpn.load_state_dict(checkpoint['model'])
        if 'pooling_mode' in checkpoint.keys():
            cfg.POOLING_MODE = checkpoint['pooling_mode']

        fpn.eval()
        empty_array = np.transpose(np.array([[], [], [], [], []]), (1, 0))

        for i in range(num_images):
            # Built-in next() works on every iterator; the ``.next()``
            # method is a Python 2 leftover that newer DataLoader
            # iterators no longer provide.
            data = next(data_iter)
            im_data.data.resize_(data[0].size()).copy_(data[0])
            im_info.data.resize_(data[1].size()).copy_(data[1])
            gt_boxes.data.resize_(data[2].size()).copy_(data[2])
            num_boxes.data.resize_(data[3].size()).copy_(data[3])

            det_tic = time.time()
            rois, cls_prob, bbox_pred, \
                _, _, _, _, _ = fpn(im_data, im_info, gt_boxes, num_boxes)

            scores = cls_prob.data
            boxes = rois.data[:, :, 1:5]

            if cfg.TEST.BBOX_REG:
                # Apply bounding-box regression deltas
                box_deltas = bbox_pred.data
                if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
                    # Undo the precomputed mean/std normalisation. type_as
                    # keeps the constants on the deltas' device instead of
                    # forcing CUDA regardless of args.cuda.
                    stds = torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).type_as(box_deltas)
                    means = torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).type_as(box_deltas)
                    box_deltas = box_deltas.view(-1, 4) * stds + means
                    if args.class_agnostic:
                        box_deltas = box_deltas.view(1, -1, 4)
                    else:
                        box_deltas = box_deltas.view(1, -1, 4 * len(imdb.classes))

                pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
                pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
            else:
                pred_boxes = boxes

            # Divide by the third im_info entry of the first image (used as
            # the scale factor), moved to wherever pred_boxes lives.
            pred_boxes /= data[1][0][2].type_as(pred_boxes)

            scores = scores.squeeze()
            pred_boxes = pred_boxes.squeeze()
            det_toc = time.time()
            detect_time = det_toc - det_tic
            misc_tic = time.time()

            if vis:
                im = cv2.imread(imdb.image_path_at(i))
                im2show = np.copy(im)

            # Class 0 is skipped.
            for j in range(1, imdb.num_classes):
                inds = torch.nonzero(scores[:, j] > thresh).view(-1)
                # if there is det
                if inds.numel() > 0:
                    cls_scores = scores[:, j][inds]
                    _, order = torch.sort(cls_scores, 0, True)
                    if args.class_agnostic:
                        cls_boxes = pred_boxes[inds, :]
                    else:
                        cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]

                    cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
                    cls_dets = cls_dets[order]
                    # ``~args.cuda`` was a bitwise NOT on a bool (-2 / -1,
                    # both truthy), so the flag never varied; logical
                    # negation is what was meant.
                    keep = nms(cls_dets, cfg.TEST.NMS, not args.cuda)
                    cls_dets = cls_dets[keep.view(-1).long()]
                    if vis:
                        im2show = vis_detections(im2show, imdb.classes[j],
                                                 cls_dets.cpu().numpy(), 0.3)
                    all_boxes[j][i] = cls_dets.cpu().numpy()
                else:
                    all_boxes[j][i] = empty_array

            # Limit to max_per_image detections *over all classes*
            if max_per_image > 0:
                image_scores = np.hstack([all_boxes[j][i][:, -1]
                                          for j in range(1, imdb.num_classes)])
                if len(image_scores) > max_per_image:
                    image_thresh = np.sort(image_scores)[-max_per_image]
                    for j in range(1, imdb.num_classes):
                        keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0]
                        all_boxes[j][i] = all_boxes[j][i][keep, :]

            misc_toc = time.time()
            nms_time = misc_toc - misc_tic

            sys.stdout.write('im_detect: {:d}/{:d} {:.3f}s {:.3f}s \r'
                             .format(i + 1, num_images, detect_time, nms_time))
            sys.stdout.flush()

            if vis:
                cv2.imwrite('images/result%d_%d.png' % (args.checkepoch, i), im2show)

            # Release per-image tensors before the next forward pass.
            del data
            del pred_boxes
            del scores
            torch.cuda.empty_cache()

        with open(det_file, 'wb') as f:
            cPickle.dump(all_boxes, f, cPickle.HIGHEST_PROTOCOL)

        print('Evaluating detections')
        aps, clss = imdb.evaluate_detections(all_boxes, output_dir)

        # One line per checkpoint: ['cls:ap','cls:ap',...] ls:<session>_<epoch>
        per_class = ",".join("'" + str(name) + ":" + str(aps[idx]) + "'"
                             for idx, name in enumerate(clss))
        summary = "[" + per_class + "] ls:" + str(args.checksession) + "_" + str(args.checkepoch)
        with open("result.txt", 'a+') as f:
            f.write(summary + "\n")

        end = time.time()
        print("test time: %0.4fs" % (end - start))

        # Move on to the next epoch's checkpoint.
        args.checkepoch = args.checkepoch + 1

        del data_iter
        del dataset
        del dataloader
        torch.cuda.empty_cache()
        gc.collect()
def test_net(tdcnn_demo, dataloader, args):
    """Run temporal detection over ``dataloader`` and print the detections.

    For every video the per-class scores are thresholded, the surviving
    segments ("twins") are sorted by score, passed through NMS and printed.
    Results are collected per class and per video in a local table.

    Args:
        tdcnn_demo: model called as ``tdcnn_demo(video_data, gt_twins)`` and
            returning ``(rois, cls_prob, twin_pred)``.
        dataloader: iterable yielding
            ``(video_data, gt_twins, num_gt, video_info)`` batches.
        args: namespace providing ``vis``, ``num_videos`` and
            ``num_classes``.

    Returns:
        None. Detections and timings are printed.
    """
    start = time.time()
    # TODO: Add restriction for max_per_video
    max_per_video = 0

    thresh = 0.05 if args.vis else 0.005

    # all_twins[class][video] -> (num_dets, 3) array of [start, end, score]
    all_twins = [[[] for _ in range(args.num_videos)]
                 for _ in range(args.num_classes)]

    tdcnn_demo.eval()
    empty_array = np.transpose(np.array([[], [], []]), (1, 0))

    # Running index of the first video in the current batch. The previous
    # ``i * batch_size + b`` used the *current* batch's size, so a smaller
    # final batch overwrote the slots of earlier videos.
    video_offset = 0

    data_tic = time.time()
    for video_data, gt_twins, num_gt, video_info in dataloader:
        video_data = video_data.cuda()
        gt_twins = gt_twins.cuda()
        batch_size = video_data.shape[0]
        data_toc = time.time()
        data_time = data_toc - data_tic

        det_tic = time.time()
        rois, cls_prob, twin_pred = tdcnn_demo(video_data, gt_twins)
        scores_all = cls_prob.data
        twins = rois.data[:, :, 1:3]

        if cfg.TEST.TWIN_REG:
            # Apply bounding-twin regression deltas
            twin_deltas = twin_pred.data
            if cfg.TRAIN.TWIN_NORMALIZE_TARGETS_PRECOMPUTED:
                # Optionally normalize targets by a precomputed mean and stdev
                stds = torch.FloatTensor(cfg.TRAIN.TWIN_NORMALIZE_STDS).type_as(twin_deltas)
                means = torch.FloatTensor(cfg.TRAIN.TWIN_NORMALIZE_MEANS).type_as(twin_deltas)
                twin_deltas = twin_deltas.view(-1, 2) * stds + means
                twin_deltas = twin_deltas.view(batch_size, -1, 2 * args.num_classes)

            pred_twins_all = twin_transform_inv(twins, twin_deltas, batch_size)
            pred_twins_all = clip_twins(pred_twins_all, cfg.TRAIN.LENGTH[0], batch_size)
        else:
            # Simply repeat the twins, once for each class. The scores are
            # indexed below as [video][roi, class], so the class count is
            # the last dimension; np.tile cannot take a CUDA tensor and
            # tiled by the ROI count instead.
            pred_twins_all = twins.repeat(1, 1, scores_all.shape[2])

        det_toc = time.time()
        detect_time = det_toc - det_tic

        for b in range(batch_size):
            video_idx = video_offset + b
            misc_tic = time.time()
            print(video_info[b])
            scores = scores_all[b]
            pred_twins = pred_twins_all[b]

            # skip j = 0, because it's the background class
            for j in range(1, args.num_classes):
                inds = torch.nonzero(scores[:, j] > thresh).view(-1)
                # if there is det
                if inds.numel() > 0:
                    cls_scores = scores[:, j][inds]
                    _, order = torch.sort(cls_scores, 0, True)
                    cls_twins = pred_twins[inds][:, j * 2:(j + 1) * 2]

                    cls_dets = torch.cat((cls_twins, cls_scores.unsqueeze(1)), 1)
                    cls_dets = cls_dets[order]
                    keep = nms(cls_dets, cfg.TEST.NMS)
                    if len(keep) > 0:
                        cls_dets = cls_dets[keep.view(-1).long()]
                    print("activity: ", j)
                    print(cls_dets.cpu().numpy())

                    all_twins[j][video_idx] = cls_dets.cpu().numpy()
                else:
                    all_twins[j][video_idx] = empty_array

            # Limit to max_per_video detections *over all classes*
            if max_per_video > 0:
                video_scores = np.hstack([all_twins[j][video_idx][:, -1]
                                          for j in range(1, args.num_classes)])
                if len(video_scores) > max_per_video:
                    video_thresh = np.sort(video_scores)[-max_per_video]
                    for j in range(1, args.num_classes):
                        keep = np.where(all_twins[j][video_idx][:, -1] >= video_thresh)[0]
                        all_twins[j][video_idx] = all_twins[j][video_idx][keep, :]

            misc_toc = time.time()
            nms_time = misc_toc - misc_tic
            print('im_detect: {:d}/{:d} {:.3f}s {:.3f}s {:.3f}s'
                  .format(video_idx + 1, args.num_videos,
                          data_time / batch_size, detect_time / batch_size, nms_time))

        video_offset += batch_size
        data_tic = time.time()

    end = time.time()
    print("test time: %0.4fs" % (end - start))
for j in xrange(1, len(pascal_classes)): inds = torch.nonzero(scores[:, j] > thresh).view(-1) # if there is det if inds.numel() > 0: cls_scores = scores[:, j][inds] _, order = torch.sort(cls_scores, 0, True) if args.class_agnostic: cls_boxes = pred_boxes[inds, :] else: cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4] cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1) # cls_dets = torch.cat((cls_boxes, cls_scores), 1) cls_dets = cls_dets[order] keep = nms(cls_dets, cfg.TEST.NMS, force_cpu=not cfg.USE_GPU_NMS) cls_dets = cls_dets[keep.view(-1).long()] if vis: im2show = vis_detections(im2show, pascal_classes[j], cls_dets.cpu().numpy(), 0.5) misc_toc = time.time() nms_time = misc_toc - misc_tic if webcam_num == -1: sys.stdout.write('im_detect: {:d}/{:d} {:.3f}s {:.3f}s \r' \ .format(num_images + 1, len(imglist), detect_time, nms_time)) sys.stdout.flush() if vis and webcam_num == -1:
def interest(im2show, data, fpn, all_position, i, all_boxes, r_w, r_h, rat_w, rat_h):
    """Re-run detection on crops around known positions and merge results.

    For every box in ``all_position`` a ``WIDTH`` x ``HIGHT`` window centred
    on the box (scaled by ``rat_w`` / ``rat_h``) is cut out of ``data[0]``,
    pushed through ``fpn``, and detections scoring above 0.6 are mapped back
    from crop coordinates and appended to ``all_boxes[j][i]``.

    Relies on the module-level ``args``, ``cfg``, ``imdb``, ``im_data``,
    ``im_info``, ``gt_boxes``, ``num_boxes``, ``WIDTH`` and ``HIGHT``.

    Args:
        im2show: unused here; kept for interface compatibility.
        data: batch whose first entry is the image tensor, sliced as
            ``data[0][:, :, rows, cols]``.
        fpn: detection network.
        all_position: mapping whose values are ``(x1, y1, x2, y2)`` boxes.
        i: image index into ``all_boxes[j]``.
        all_boxes: ``all_boxes[class][image]`` detections, updated in place.
        r_w, r_h: unused here; kept for interface compatibility.
        rat_w, rat_h: horizontal / vertical scale between the coordinates in
            ``all_position`` and the pixels of ``data[0]``.

    Returns:
        None. Returns early, skipping the remaining positions, as soon as a
        crop turns out empty.
    """
    half_w = int(WIDTH / 2)
    half_h = int(HIGHT / 2)

    for value in all_position.values():
        # Centre of the box in data[0] pixel coordinates.
        x = int(((value[2] - value[0]) / 2 + value[0]) * rat_w)
        y = int(((value[3] - value[1]) / 2 + value[1]) * rat_h)
        # NOTE(review): a window reaching past the top/left border produces
        # a negative slice start, which wraps around -- confirm callers keep
        # the centres far enough from the border.
        data_tem = data[0][:, :, y - half_h:y + half_h, x - half_w:x + half_w]
        w = len(data_tem[0][0][0])
        h = len(data_tem[0][0])
        print("INER", w, h)
        if w <= 0 or h <= 0:
            # NOTE(review): this abandons every remaining position, not just
            # the empty crop -- confirm that is intended.
            return None

        # Synthetic im_info / gt_boxes / num_boxes for the crop.
        data_tem1 = torch.from_numpy(np.array([[h, w, w / h]])).float()
        data_tem2 = torch.from_numpy(np.array([[1, 1, 1, 1, 1]])).float()
        data_tem3 = torch.from_numpy(np.array([1])).long()
        if args.cuda:
            data_tem1 = data_tem1.cuda()
            data_tem2 = data_tem2.cuda()
            data_tem3 = data_tem3.cuda()

        im_data.data.resize_(data_tem.size()).copy_(data_tem)
        im_info.data.resize_(data_tem1.size()).copy_(data_tem1)
        gt_boxes.data.resize_(data_tem2.size()).copy_(data_tem2)
        num_boxes.data.resize_(data_tem3.size()).copy_(data_tem3)

        rois, cls_prob, bbox_pred, \
            _, _, _, _, _ = fpn(im_data, im_info, gt_boxes, num_boxes)

        scores = cls_prob.data
        boxes = rois.data[:, :, 1:5]  # drop the first column; the rest is the box

        if cfg.TEST.BBOX_REG:
            box_deltas = bbox_pred.data
            if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
                # type_as puts both constants on the deltas' device. The old
                # CPU branch called .cuda() on the stds only, mixing devices.
                stds = torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).type_as(box_deltas)
                means = torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).type_as(box_deltas)
                box_deltas = box_deltas.view(-1, 4) * stds + means
                if args.class_agnostic:
                    box_deltas = box_deltas.view(1, -1, 4)
                else:
                    box_deltas = box_deltas.view(1, -1, 4 * len(imdb.classes))
            pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
            pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
        else:
            pred_boxes = boxes

        pred_boxes /= data_tem1[0][2]
        scores = scores.squeeze()
        pred_boxes = pred_boxes.squeeze()

        for j in range(1, imdb.num_classes):  # every class except index 0
            inds = torch.nonzero(scores[:, j] > 0.6).view(-1)
            if inds.numel() == 0:
                continue

            cls_scores = scores[:, j][inds]
            _, order = torch.sort(cls_scores, 0, True)  # descending by score
            if args.class_agnostic:
                cls_boxes = pred_boxes[inds, :]
            else:
                cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]

            # Map crop coordinates back into the full image: columns 0/2 are
            # shifted and scaled horizontally, columns 1/3 vertically. Same
            # arithmetic as the former per-row loop, done in one step.
            cls_boxes[:, 0::2] = (cls_boxes[:, 0::2] + x - half_w) / rat_w
            cls_boxes[:, 1::2] = (cls_boxes[:, 1::2] + y - half_h) / rat_h

            cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
            cls_dets = cls_dets[order]
            # NOTE(review): the third positional argument is passed through
            # unchanged; whether it means "use CUDA" or "force CPU" depends
            # on the project's nms wrapper -- confirm.
            keep = nms(cls_dets, cfg.TEST.NMS, args.cuda)
            cls_dets = cls_dets[keep.view(-1).long()]

            new_dets = cls_dets.cpu().numpy()
            existing = all_boxes[j][i]
            # A length test works for both the initial [] and numpy arrays;
            # ``array == []`` has no well-defined truth value.
            if len(existing) == 0:
                all_boxes[j][i] = new_dets
            else:
                all_boxes[j][i] = np.vstack((existing, new_dets))
def generate_proposal(self, rpn_cls_probs, anchors, rpn_bbox_preds, im_info):
    """Decode, rank and NMS-filter RPN outputs into padded proposals.

    Gradients are cut: both prediction tensors are detached first.

    Args:
        rpn_cls_probs: 4-D class probabilities; channels from
            ``self.num_anchors`` onward are used as foreground scores.
        anchors: indexable whose first element holds the anchors handed to
            ``self.bbox_coder.decode_batch``.
        rpn_bbox_preds: 4-D box regression output, reshaped here to
            ``(N, -1, 4)``.
        im_info: image info forwarded to ``box_ops.clip_boxes``.

    Returns:
        Tuple ``(proposals_batch, proposals_order)``:
            proposals_batch: ``(N, post_nms_topN, 4)`` boxes, zero-padded.
            proposals_order: ``(N, post_nms_topN)`` flat indices of the kept
                proposals within the per-image score vector, padded with -1.
    """
    # TODO create a new Function
    base_anchors = anchors[0]

    # do not backward
    cls_probs = rpn_cls_probs.detach()
    bbox_preds = rpn_bbox_preds.detach()

    batch_size = bbox_preds.shape[0]
    # Channels last, then one row of 4 deltas per anchor.
    bbox_preds = bbox_preds.permute(0, 2, 3, 1).contiguous()
    bbox_preds = bbox_preds.view(batch_size, -1, 4)

    # Apply deltas to the anchors, then clip.
    decoded = self.bbox_coder.decode_batch(bbox_preds, base_anchors)
    decoded = box_ops.clip_boxes(decoded, im_info)

    fg_probs = cls_probs[:, self.num_anchors:, :, :]
    fg_probs = fg_probs.permute(0, 2, 3, 1).contiguous().view(batch_size, -1)
    _, fg_ranking = torch.sort(fg_probs, dim=1, descending=True)

    proposals_batch = torch.zeros(
        batch_size, self.post_nms_topN, 4).type_as(bbox_preds)
    proposals_order = torch.zeros(
        batch_size, self.post_nms_topN).fill_(-1).type_as(fg_ranking)

    per_image = zip(decoded, fg_probs, fg_ranking)
    for i, (boxes_i, probs_i, ranking_i) in enumerate(per_image):
        # pre nms: keep only the best-scoring candidates
        if self.pre_nms_topN > 0:
            ranking_i = ranking_i[:self.pre_nms_topN]
        boxes_i = boxes_i[ranking_i]
        probs_i = probs_i[ranking_i]

        # nms on rows of [box, score]
        dets_i = torch.cat((boxes_i, probs_i.unsqueeze(1)), 1)
        kept = nms(dets_i, self.nms_thresh).long().view(-1)

        # post nms
        if self.post_nms_topN > 0:
            kept = kept[:self.post_nms_topN]

        # Slots past the kept count keep their padding (0 / -1).
        count = kept.numel()
        proposals_batch[i, :count, :] = boxes_i[kept, :]
        proposals_order[i, :count] = ranking_i[kept]

    return proposals_batch, proposals_order
def forward(self, input):
    """Produce the RoIs (proposals) for a batch from the RPN outputs.

    Algorithm:
        1. Generate the anchor coordinates for every feature-map cell.
        2. Shift every anchor by its predicted regression values and clip
           boxes that leave the image.
        3. For each image in the batch:
             a. keep the highest foreground scores and their boxes, at most
                ``RPN_PRE_NMS_TOP_N`` of them;
             b. run NMS on those boxes and keep the best
                ``RPN_POST_NMS_TOP_N`` survivors;
             c. write the kept boxes into the batch output.
        4. Return the kept boxes of the whole batch.

    Removing boxes whose height or width is below a threshold is listed in
    the original algorithm but is not performed here.

    The concrete numbers quoted in the comments below (9 anchors per cell,
    stride 16, 12000/2000 for train, 6000/300 for test, NMS threshold 0.7)
    come from the original author's notes; the actual values are read from
    ``cfg`` and ``self``.

    Args:
        input: ``(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key)``. Per the
            original notes: ``rpn_cls_prob`` is (b, 2*9, h, w) with the
            first set of ``_num_anchors`` channels being bg probs and the
            second set fg probs; ``rpn_bbox_pred`` is (b, 4*9, h, w);
            ``im_info`` is (b, 3) and describes the original image size;
            ``cfg_key`` is 'train' or 'test'.

    Returns:
        Tensor of shape (b, post_nms_topN, 5); each row is
        [index of the image within the batch, x1, y1, x2, y2]. Unused rows
        are zero apart from the image index.
    """
    # Foreground probability of every predicted anchor.
    scores = input[0][:, self._num_anchors:, :, :]
    bbox_deltas = input[1]
    im_info = input[2]
    cfg_key = input[3]

    settings = cfg[cfg_key]
    pre_nms_topN = settings.RPN_PRE_NMS_TOP_N    # notes: train 12000, test 6000
    post_nms_topN = settings.RPN_POST_NMS_TOP_N  # notes: train 2000, test 300
    nms_thresh = settings.RPN_NMS_THRESH         # notes: 0.7 for both
    min_size = settings.RPN_MIN_SIZE             # minimum RPN box size; unused

    batch_size = bbox_deltas.size(0)
    feat_height, feat_width = scores.size(2), scores.size(3)

    # Offsets of every cell in input-image coordinates
    # (notes: 0, 16, 32, ... for a stride of 16).
    col_offsets = np.arange(0, feat_width) * self._feat_stride
    row_offsets = np.arange(0, feat_height) * self._feat_stride
    col_grid, row_grid = np.meshgrid(col_offsets, row_offsets)
    cols = col_grid.ravel()
    rows = row_grid.ravel()
    # One [x, y, x, y] row per cell -> (h*w, 4).
    shifts = torch.from_numpy(np.vstack((cols, rows, cols, rows)).transpose())
    shifts = shifts.contiguous().type_as(scores).float()

    A = self._num_anchors  # anchors per cell (notes: 9)
    K = shifts.size(0)     # number of cells, h*w

    # Base anchor boxes of a single cell, shape (A, 4).
    self._anchors = self._anchors.type_as(scores)
    # Every base anchor shifted to every cell: the default anchors.
    anchors = self._anchors.view(1, A, 4) + shifts.view(K, 1, 4)
    anchors = anchors.view(1, K * A, 4).expand(batch_size, K * A, 4)

    # Transpose and reshape the predictions so they follow the same order
    # as the anchors: (b, h*w*A, 4) deltas and (b, h*w*A) scores.
    bbox_deltas = bbox_deltas.permute(0, 2, 3, 1).contiguous()
    bbox_deltas = bbox_deltas.view(batch_size, -1, 4)
    scores = scores.permute(0, 2, 3, 1).contiguous()
    scores = scores.view(batch_size, -1)

    # Move the anchors by the predicted offsets; each result is
    # [x1, y1, x2, y2], the two corner points.
    proposals = bbox_transform_inv(anchors, bbox_deltas, batch_size)

    # Clip predicted boxes to the image.
    proposals = clip_boxes(proposals, im_info, batch_size)

    # order[b] lists box indices from highest to lowest score, e.g.
    # [2, 1, 0, 3, ...] means box 2 scores higher than box 1, and so on.
    _, order = torch.sort(scores, 1, True)

    output = scores.new(batch_size, post_nms_topN, 5).zero_()
    limit_candidates = 0 < pre_nms_topN < scores.numel()

    for i in range(batch_size):
        # Adjusted anchors of this image and their score ranking.
        order_single = order[i]
        if limit_candidates:
            # Indices of the highest-scoring boxes only.
            order_single = order_single[:pre_nms_topN]

        # Coordinates and scores of those boxes, best first.
        proposals_single = proposals[i][order_single, :]
        scores_single = scores[i][order_single].view(-1, 1)

        # NMS returns the indices of the boxes to keep.
        keep_idx_i = nms(
            torch.cat((proposals_single, scores_single), 1),
            nms_thresh,
            force_cpu=not cfg.USE_GPU_NMS)
        keep_idx_i = keep_idx_i.long().view(-1)

        if post_nms_topN > 0:
            # Only the first post_nms_topN survivors are kept.
            keep_idx_i = keep_idx_i[:post_nms_topN]
        proposals_single = proposals_single[keep_idx_i, :]

        # Zero padding at the end: rows past num_proposal stay untouched.
        num_proposal = proposals_single.size(0)
        output[i, :, 0] = i
        output[i, :num_proposal, 1:] = proposals_single

    return output
def forward(self, input):
    """Build region proposals from FPN-level RPN predictions.

    Anchors for all pyramid levels are generated in one go, shifted by the
    predicted deltas, clipped, ranked by score, trimmed to
    ``RPN_PRE_NMS_TOP_N``, passed through NMS and trimmed again to
    ``RPN_POST_NMS_TOP_N``. Filtering by ``RPN_MIN_SIZE`` is not applied.

    Args:
        input: indexable with at least five entries:
            ``input[0]`` 3-D scores; ``[:, :, 1]`` is taken as the score of
            each anchor,
            ``input[1]`` box deltas (commented upstream as
            batch_size x num_rois x 4),
            ``input[2]`` image info forwarded to ``clip_boxes``,
            ``input[3]`` key into ``cfg``,
            ``input[4]`` feature shapes handed to
            ``generate_anchors_all_pyramids``.

    Returns:
        Tensor of shape ``(batch_size, post_nms_topN, 5)``: column 0 is the
        image index in the batch, columns 1-4 the box; unused rows are
        zero-padded.
    """
    cls_scores = input[0][:, :, 1]
    box_deltas = input[1]
    im_info = input[2]
    stage_cfg = cfg[input[3]]
    feat_shapes = input[4]

    pre_nms_topN = stage_cfg.RPN_PRE_NMS_TOP_N
    post_nms_topN = stage_cfg.RPN_POST_NMS_TOP_N
    nms_thresh = stage_cfg.RPN_NMS_THRESH
    min_size = stage_cfg.RPN_MIN_SIZE  # read for parity; never applied

    batch_size = box_deltas.size(0)

    pyramid_anchors = generate_anchors_all_pyramids(
        self._fpn_scales,
        self._anchor_ratios,
        feat_shapes,
        self._fpn_feature_strides,
        self._fpn_anchor_stride)
    anchors = torch.from_numpy(pyramid_anchors).type_as(cls_scores)
    anchor_count = anchors.size(0)
    # The same anchor set is shared by every image of the batch.
    anchors = anchors.view(1, anchor_count, 4).expand(
        batch_size, anchor_count, 4)

    # Convert anchors into proposals via bbox transformations, then clip
    # the predicted boxes to the image.
    boxes = bbox_transform_inv(anchors, box_deltas, batch_size)
    boxes = clip_boxes(boxes, im_info, batch_size)

    _, by_score = torch.sort(cls_scores, 1, True)
    output = cls_scores.new(batch_size, post_nms_topN, 5).zero_()

    for i in range(batch_size):
        image_order = by_score[i]
        # The bound is the element count of the whole score tensor, as in
        # the reference implementation.
        if pre_nms_topN > 0 and pre_nms_topN < cls_scores.numel():
            image_order = image_order[:pre_nms_topN]

        image_boxes = boxes[i][image_order, :]
        image_scores = cls_scores[i][image_order].view(-1, 1)

        # NOTE(review): cfg.CUDA is forwarded positionally as the third nms
        # argument; its meaning depends on the project's nms wrapper.
        survivors = nms(torch.cat((image_boxes, image_scores), 1),
                        nms_thresh, cfg.CUDA)
        survivors = survivors.long().view(-1)
        if post_nms_topN > 0:
            survivors = survivors[:post_nms_topN]
        image_boxes = image_boxes[survivors, :]

        # padding 0 at the end.
        filled = image_boxes.size(0)
        output[i, :, 0] = i
        output[i, :filled, 1:] = image_boxes

    return output
def detect_img(self, img, gpus=0):
    """Run the detector on a single image and collect per-class detections.

    :param img: image as a numpy array indexed (row, column, channel); the
        channel axis is reversed before preprocessing (presumably a
        RGB <-> BGR swap -- confirm against the caller).
    :param gpus: not used by this method; kept so existing callers work.
    :return: dict obtained by merging the result of ``self.fetch_dets`` for
        every class that has at least one score above the threshold.
    """
    im = img[:, :, ::-1]
    blobs, im_scales = self._get_image_blob(im)
    im_blob = blobs
    im_info_np = np.array(
        [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32)

    # The blob is indexed (batch, row, column, channel); the network input
    # is (batch, channel, row, column).
    im_data_pt = torch.from_numpy(im_blob)
    im_data_pt = im_data_pt.permute(0, 3, 1, 2)
    im_info_pt = torch.from_numpy(im_info_np)

    # Device-side holders that are resized and filled below.
    im_data = torch.FloatTensor(1).cuda()
    im_info = torch.FloatTensor(1).cuda()
    num_boxes = torch.LongTensor(1).cuda()
    gt_boxes = torch.FloatTensor(1).cuda()

    im_data.data.resize_(im_data_pt.size()).copy_(im_data_pt)
    im_info.data.resize_(im_info_pt.size()).copy_(im_info_pt)
    # No ground truth at inference time: pass zeroed placeholders.
    gt_boxes.data.resize_(1, 1, 5).zero_()
    num_boxes.data.resize_(1).zero_()

    rois, cls_prob, bbox_pred, \
        rpn_loss_cls, rpn_loss_box, \
        RCNN_loss_cls, RCNN_loss_bbox, \
        rois_label = self.fasterRCNN(im_data, im_info, gt_boxes, num_boxes)

    scores = cls_prob.data
    boxes = rois.data[:, :, 1:5]
    num_classes = len(self.pascal_classes)

    if cfg.TEST.BBOX_REG:
        # Apply bounding-box regression deltas.
        box_deltas = bbox_pred.data
        if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
            # Undo the precomputed mean/stdev normalisation of the targets.
            stds = torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda()
            means = torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
            box_deltas = box_deltas.view(-1, 4) * stds + means
            # One box per RoI when class agnostic, one per class otherwise.
            coords = 4 if self.class_agnostic else 4 * num_classes
            box_deltas = box_deltas.view(1, -1, coords)

        pred_boxes = self.bbox_transform_inv(boxes, box_deltas, 1)
        pred_boxes = self.clip_boxes(pred_boxes, im_info.data, 1)
    elif self.class_agnostic:
        # The per-class loop below uses all columns as a single box, so the
        # RoIs are used as they are.  Clone because of the in-place rescale.
        pred_boxes = boxes.clone()
    else:
        # Repeat the RoI once per class along the coordinate axis.  This is
        # done in torch: the tensors live on the GPU, and the loop below
        # needs a tensor it can index and concatenate with the scores.
        pred_boxes = boxes.repeat(1, 1, num_classes)

    # Map the boxes back to the coordinates of the unscaled image.
    pred_boxes /= im_scales[0]

    scores = scores.squeeze()
    pred_boxes = pred_boxes.squeeze()

    all_res = {}
    thresh = 0.05
    # Class 0 is skipped (presumably the background class).
    for j in range(1, num_classes):
        inds = torch.nonzero(scores[:, j] > thresh).view(-1)
        if inds.numel() == 0:
            continue

        cls_scores = scores[:, j][inds]
        _, order = torch.sort(cls_scores, 0, True)
        if self.class_agnostic:
            cls_boxes = pred_boxes[inds, :]
        else:
            cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]

        # Rows of [box coordinates, score], highest score first, for NMS.
        cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
        cls_dets = cls_dets[order]
        keep = nms(cls_dets, cfg.TEST.NMS)
        cls_dets = cls_dets[keep.view(-1).long()]

        res = self.fetch_dets(self.pascal_classes[j], cls_dets.cpu().numpy(), 0.5)
        all_res.update(res)

    return all_res
def predict(cls, im_in):
    """Run detection on one image and return the detections and losses.

    Args:
        im_in: image array with exactly three dimensions.  The code reads
            ``im_in.shape``, so an array-like object is required.

    Returns:
        dict with two keys:
          'pred'    -- maps each class name that produced detections to a
                       list of rows ``[box coordinates..., score]``.
          'metrics' -- the loss values and ``rois_label`` exactly as
                       returned by the model.
    """
    assert len(im_in.shape) == 3, "RGB images only"

    # Load the model on first use.
    if cls.model is None:
        cls.model = cls.get_model()

    thresh = 0.05
    num_classes = len(cls.model.classes)

    with torch.no_grad():
        blobs, im_scales = _get_image_blob(im_in)
        assert len(im_scales) == 1, "Only single-image batch implemented"

        im_blob = blobs
        # (batch, row, column, channel) -> (batch, channel, row, column)
        im_data = Variable(
            torch.from_numpy(im_blob).permute(0, 3, 1, 2).cuda())
        im_info_np = np.array(
            [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],
            dtype=np.float32)
        im_info = Variable(torch.from_numpy(im_info_np).cuda())

        # No ground truth at inference time: pass zeroed placeholders.
        gt_boxes = Variable(torch.zeros(1, 1, 5).cuda())
        num_boxes = Variable(torch.zeros(1).cuda())

        (rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_box,
         RCNN_loss_cls, RCNN_loss_bbox, rois_label) = cls.model(
            im_data, im_info, gt_boxes, num_boxes)

        scores = cls_prob.data
        boxes = rois.data[:, :, 1:5]

        if cfg.TEST.BBOX_REG:
            # Apply bounding-box regression deltas.
            box_deltas = bbox_pred.data
            if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
                # Undo the precomputed mean/stdev normalisation.
                stds = torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda()
                means = torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                box_deltas = box_deltas.view(-1, 4) * stds + means
                box_deltas = box_deltas.view(1, -1, 4 * num_classes)

            pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
            pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
        else:
            # Repeat the RoI once per class along the coordinate axis.
            # This is done in torch: the tensors live on the GPU, and the
            # loop below slices 4 columns per class and concatenates them
            # with torch scores.
            pred_boxes = boxes.repeat(1, 1, num_classes)

        # Map the boxes back to the coordinates of the unscaled image.
        pred_boxes /= im_scales[0]

        scores = scores.squeeze()
        pred_boxes = pred_boxes.squeeze()

        result = dict()
        # Class 0 is skipped (presumably the background class).
        for j in range(1, num_classes):
            inds = torch.nonzero(scores[:, j] > thresh).view(-1)
            if inds.numel() == 0:
                continue

            cls_scores = scores[:, j][inds]
            _, order = torch.sort(cls_scores, 0, True)
            cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]

            # Rows of [box coordinates, score], highest score first.
            cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
            cls_dets = cls_dets[order]
            keep = nms(cls_dets, cfg.TEST.NMS, force_cpu=not cfg.USE_GPU_NMS)
            cls_dets = cls_dets[keep.view(-1).long()]
            result[cls.model.classes[j]] = cls_dets.cpu().numpy().tolist()

    return {
        'pred': result,
        'metrics': {
            'rpn_loss_cls': rpn_loss_cls,
            'rpn_loss_box': rpn_loss_box,
            'RCNN_loss_cls': RCNN_loss_cls,
            'RCNN_loss_bbox': RCNN_loss_bbox,
            'rois_label': rois_label
        }
    }
for j in xrange(1, imdb.num_classes): inds = torch.nonzero(scores[:, j] > thresh).view(-1) # if there is det if inds.numel() > 0: cls_scores = scores[:, j][inds] _, order = torch.sort(cls_scores, 0, True) if args.class_agnostic: cls_boxes = pred_boxes[inds, :] else: cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4] cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1) # cls_dets = torch.cat((cls_boxes, cls_scores), 1) cls_dets = cls_dets[order] keep = nms(cls_dets, cfg.TEST.NMS) cls_dets = cls_dets[keep.view(-1).long()] if vis: im2show = vis_detections(im2show, imdb.classes[j], cls_dets.cpu().numpy(), 0.3) all_boxes[j][i] = cls_dets.cpu().numpy() else: all_boxes[j][i] = empty_array # Limit to max_per_image detections *over all classes* if max_per_image > 0: image_scores = np.hstack([ all_boxes[j][i][:, -1] for j in xrange(1, imdb.num_classes) ]) if len(image_scores) > max_per_image:
def forward(self, input):
    """Build region proposals from RPN class probabilities and box deltas.

    ``input`` is the tuple (rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key).
    The first ``self._num_anchors`` channels of rpn_cls_prob are the
    background probabilities; the channels after them are the foreground
    probabilities and are the only ones used as scores.

    Algorithm:
      1. shift the base anchors to every feature-map cell
      2. decode the predicted deltas against the shifted anchors
      3. clip the resulting boxes to the image
      4. per image: rank by score, keep the top ``RPN_PRE_NMS_TOP_N``,
         apply NMS, keep at most ``RPN_POST_NMS_TOP_N``

    Returns a (batch_size, post_nms_topN, 5) tensor: column 0 is the image
    index inside the batch, columns 1-4 the proposal; unused rows are zero.
    """
    fg_prob = input[0][:, self._num_anchors:, :, :]
    deltas = input[1]
    im_info = input[2]
    cfg_key = input[3]

    settings = cfg[cfg_key]
    max_before_nms = settings.RPN_PRE_NMS_TOP_N
    max_after_nms = settings.RPN_POST_NMS_TOP_N
    overlap_thresh = settings.RPN_NMS_THRESH
    # Looked up as in the configuration, but the minimum-size filter is
    # not applied anywhere in this method.
    min_size = settings.RPN_MIN_SIZE

    batch_size = deltas.size(0)

    # One (x, y, x, y) offset per feature-map cell, in image pixels.
    rows, cols = fg_prob.size(2), fg_prob.size(3)
    col_offsets = np.arange(0, cols) * self._feat_stride
    row_offsets = np.arange(0, rows) * self._feat_stride
    grid_x, grid_y = np.meshgrid(col_offsets, row_offsets)
    stacked = np.vstack(
        (grid_x.ravel(), grid_y.ravel(), grid_x.ravel(), grid_y.ravel()))
    shifts = torch.from_numpy(stacked.transpose())
    shifts = shifts.contiguous().type_as(fg_prob).float()

    per_cell = self._num_anchors
    num_cells = shifts.size(0)
    total = num_cells * per_cell

    # Broadcast (1, A, 4) + (K, 1, 4) -> (K, A, 4), then share the anchor
    # set across the batch.
    self._anchors = self._anchors.type_as(fg_prob)
    anchors = self._anchors.view(1, per_cell, 4) + shifts.view(num_cells, 1, 4)
    anchors = anchors.view(1, total, 4).expand(batch_size, total, 4)

    # Move the channel axis last so that flattening yields the same
    # (cell, anchor) ordering as the anchors.
    deltas = deltas.permute(0, 2, 3, 1).contiguous().view(batch_size, -1, 4)
    scores = fg_prob.permute(0, 2, 3, 1).contiguous().view(batch_size, -1)

    # Decode the deltas, then clip the boxes to the image.
    proposals = clip_boxes(
        bbox_transform_inv(anchors, deltas, batch_size), im_info, batch_size)

    # Descending score order along the proposal axis.
    _, order = torch.sort(scores, 1, True)

    output = scores.new(batch_size, max_after_nms, 5).zero_()
    for i in range(batch_size):
        ranking = order[i]
        if 0 < max_before_nms < scores.numel():
            ranking = ranking[:max_before_nms]

        # Gather boxes and scores in ranked order; NMS indices returned
        # below refer to positions in this ranked subset.
        ranked_boxes = proposals[i][ranking, :]
        ranked_scores = scores[i][ranking].view(-1, 1)

        dets = torch.cat((ranked_boxes, ranked_scores), 1)
        kept = nms(dets, overlap_thresh, force_cpu=not cfg.USE_GPU_NMS)
        kept = kept.long().view(-1)
        if max_after_nms > 0:
            kept = kept[:max_after_nms]
        final_boxes = ranked_boxes[kept, :]

        # Zero padding at the end: rows past the kept proposals stay zero.
        output[i, :, 0] = i
        output[i, :final_boxes.size(0), 1:] = final_boxes

    return output
# else: # all_boxes[j][i] = empty_array for j in range(1, class_num): inds = torch.nonzero(scores[:, j] > thresh).view(-1) inds_threshold = torch.nonzero(scores[:, j] > 0.5).view(-1) print( 'inds > 0.05 num = {},inds_threshold > 0.5 ={} {}'.format( inds.numel(), inds_threshold.numel(), class_name[j])) # if there is det if inds.numel() > 0: cls_scores = scores[:, j][inds] _, order = torch.sort(cls_scores, 0, True) cls_boxes = pred_boxes[inds, :] cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1) # cls_dets = torch.cat((cls_boxes, cls_scores), 1) cls_dets = cls_dets[order] keep = nms(cls_dets, 0.2, force_cpu=not cfg.USE_GPU_NMS) cls_dets = cls_dets[keep.view(-1).long()] if vis: im2show = vis_detections(im2show, class_name[j], cls_dets.cpu().numpy(), args.thresh_class) misc_toc = time.time() nms_time = misc_toc - misc_tic prefix_img = img_filelist[8][0].split("/")[5] result_path = 'result/{}_{}.jpg'.format(prefix_img, str(epoch)) cv2.imwrite(result_path, im2show)
def forward(self, input):
    """Generate temporal proposals ("twins") from RPN scores and deltas.

    ``input`` is the tuple (rpn_cls_prob, rpn_twin_pred, cfg_key):
      input[0] -- rpn_cls_prob; the channels from ``self._num_anchors``
                  onward are used as the per-anchor scores (the first
                  ``_num_anchors`` channels are the background probs)
      input[1] -- rpn_twin_pred, the predicted twin deltas
      input[2] -- key into ``cfg`` selecting the RPN_* settings

    Algorithm:
      1. shift the base anchors to every temporal position
      2. decode the predicted deltas against the shifted anchors
      3. clip the twins via ``clip_twins``
      4. zero the score of twins rejected by ``_filter_twins_reverse``
      5. per batch element: rank by score, keep the top
         ``RPN_PRE_NMS_TOP_N``, apply NMS, keep at most
         ``RPN_POST_NMS_TOP_N``

    Returns:
        A (batch_size, post_nms_topN, 3) tensor: column 0 is the batch
        index, columns 1-2 are the kept twins, unused rows are zero.
        When ``self._out_scores`` is true, a tuple of that tensor and a
        (batch_size, post_nms_topN, 2) tensor of [batch index, score].
    """
    scores = input[0][:, self._num_anchors:, :, :, :]
    twin_deltas = input[1]
    cfg_key = input[2]

    pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
    min_size = cfg[cfg_key].RPN_MIN_SIZE

    # 1. Generate proposals from twin deltas and shifted anchors.
    length, height, width = scores.shape[-3:]
    if DEBUG:
        print('score map size: {}'.format(scores.shape))

    batch_size = twin_deltas.size(0)

    # One offset per temporal position of the score map.
    shifts = np.arange(0, length) * self._feat_stride
    shifts = torch.from_numpy(shifts.astype(float))
    shifts = shifts.contiguous().type_as(scores)

    # Enumerate all shifted anchors:
    #   (1, A, 2) base anchors + (K, 1, 1) shifts -> (K, A, 2)
    #   reshaped to (1, K * A, 2) and expanded over the batch.
    # NOTE(review): K only counts temporal positions while the deltas and
    # scores below are flattened over L * H * W, so this assumes
    # height == width == 1 -- TODO confirm.
    A = self._num_anchors
    K = shifts.shape[0]
    self._anchors = self._anchors.type_as(scores)
    anchors = self._anchors.view(1, A, 2) + shifts.view(K, 1, 1)
    anchors = anchors.view(1, K * A, 2).expand(batch_size, K * A, 2)

    # Move the channel axis last so that flattening orders the rows by
    # (l, h, w, a), matching the anchors.
    twin_deltas = twin_deltas.permute(0, 2, 3, 4, 1).contiguous()
    twin_deltas = twin_deltas.view(batch_size, -1, 2)

    # Same story for the scores.
    scores = scores.permute(0, 2, 3, 4, 1).contiguous()
    scores = scores.view(batch_size, -1)

    # Convert anchors into proposals via the twin transformation.
    proposals = twin_transform_inv(anchors, twin_deltas, batch_size)

    # 2. Clip the predicted twins; the limit passed is the score-map
    #    length scaled by the feature stride.
    proposals = clip_twins(proposals, length * self._feat_stride, batch_size)

    # 3. Twins flagged by the filter keep their slot, with the score set to 0.
    no_keep = self._filter_twins_reverse(proposals, min_size)
    scores[no_keep] = 0

    scores_keep = scores
    proposals_keep = proposals

    # Descending score order along the proposal axis.
    _, order = torch.sort(scores_keep, 1, True)

    output = scores.new(batch_size, post_nms_topN, 3).zero_()
    if self._out_scores:
        output_score = scores.new(batch_size, post_nms_topN, 2).zero_()

    for i in range(batch_size):
        proposals_single = proposals_keep[i]
        scores_single = scores_keep[i]

        # 4. sort all (proposal, score) pairs by score, highest first
        # 5. take the top pre_nms_topN
        order_single = order[i]
        if pre_nms_topN > 0 and pre_nms_topN < scores_keep.numel():
            order_single = order_single[:pre_nms_topN]

        proposals_single = proposals_single[order_single, :]
        scores_single = scores_single[order_single].view(-1, 1)

        # 6. apply NMS to rows of [twin start, twin end, score]
        # 7. take the top post_nms_topN of what NMS returns
        keep_idx_i = nms(
            torch.cat((proposals_single, scores_single), 1),
            nms_thresh,
            force_cpu=not cfg.USE_GPU_NMS)
        keep_idx_i = keep_idx_i.long().view(-1)

        if post_nms_topN > 0:
            keep_idx_i = keep_idx_i[:post_nms_topN]
        proposals_single = proposals_single[keep_idx_i, :]
        scores_single = scores_single[keep_idx_i, :]

        # Zero padding at the end: rows past num_proposal stay zero.
        num_proposal = proposals_single.size(0)
        output[i, :, 0] = i
        output[i, :num_proposal, 1:] = proposals_single

        if self._out_scores:
            output_score[i, :, 0] = i
            # scores_single is (num_proposal, 1) while the destination
            # slice is (num_proposal,); select the single column so the
            # shapes agree instead of relying on an implicit reshape.
            output_score[i, :num_proposal, 1] = scores_single[:, 0]

    if self._out_scores:
        return output, output_score
    else:
        return output
def forward(self, input):
    """Build region proposals from RPN class probabilities and box deltas.

    input[0]: rpn_cls_prob.data; the channels from ``self._num_anchors``
              onward are used as the per-anchor scores
    input[1]: rpn_bbox_pred.data, the predicted box deltas
    input[2]: im_info, handed to ``clip_boxes``
    input[3]: cfg_key selecting the RPN_* settings in ``cfg``

    Returns a (batch_size, post_nms_topN, 5) tensor.  Column 0 identifies
    the image of the batch a row belongs to, columns 1-4 hold the proposal;
    rows past the number of kept proposals are zero.
    """
    scores = input[0][:, self._num_anchors:, :, :]
    bbox_deltas = input[1]
    im_info = input[2]
    cfg_key = input[3]

    rpn_settings = cfg[cfg_key]
    top_n_before = rpn_settings.RPN_PRE_NMS_TOP_N
    top_n_after = rpn_settings.RPN_POST_NMS_TOP_N
    iou_thresh = rpn_settings.RPN_NMS_THRESH
    # Looked up for parity with the configuration; not used below.
    min_size = rpn_settings.RPN_MIN_SIZE

    batch_size = bbox_deltas.size(0)
    feat_height, feat_width = scores.size(2), scores.size(3)

    # Pixel offset of every feature-map column and row.
    x_steps = np.arange(0, feat_width) * self._feat_stride
    y_steps = np.arange(0, feat_height) * self._feat_stride
    # After meshgrid both arrays are (H, W): x varies along columns,
    # y along rows.
    x_grid, y_grid = np.meshgrid(x_steps, y_steps)
    x_flat, y_flat = x_grid.ravel(), y_grid.ravel()
    # One (x, y, x, y) row per cell -> (H * W, 4).
    shifts = torch.from_numpy(
        np.vstack((x_flat, y_flat, x_flat, y_flat)).transpose())
    shifts = shifts.contiguous().type_as(scores).float()

    anchors_per_cell = self._num_anchors
    cell_count = shifts.size(0)
    anchor_count = cell_count * anchors_per_cell

    # (1, A, 4) + (K, 1, 4) broadcasts to (K, A, 4); flatten and share the
    # result across the batch.
    self._anchors = self._anchors.type_as(scores)
    shifted = self._anchors.view(1, anchors_per_cell, 4) + shifts.view(cell_count, 1, 4)
    anchors = shifted.view(1, anchor_count, 4).expand(batch_size, anchor_count, 4)

    # Channel axis last, then flatten to (batch, H * W * A, 4) so the rows
    # line up with the anchors.
    bbox_deltas = bbox_deltas.permute(0, 2, 3, 1).contiguous()
    bbox_deltas = bbox_deltas.view(batch_size, -1, 4)

    # Same reordering for the scores -> (batch, H * W * A).
    scores = scores.permute(0, 2, 3, 1).contiguous()
    scores = scores.view(batch_size, -1)

    # 1. convert anchors into proposals
    proposals = bbox_transform_inv(anchors, bbox_deltas, batch_size)
    # 2. clip predicted boxes to the image
    proposals = clip_boxes(proposals, im_info, batch_size)

    # Sort along dimension 1 in descending order.
    _, order = torch.sort(scores, 1, True)

    def _select(idx):
        """Return the proposals of image ``idx`` that survive NMS."""
        ranked = order[idx]
        if 0 < top_n_before < scores.numel():
            ranked = ranked[:top_n_before]

        # (n, 4) boxes and (n, 1) scores in ranked order.
        boxes = proposals[idx][ranked, :]
        box_scores = scores[idx][ranked].view(-1, 1)

        kept = nms(torch.cat((boxes, box_scores), 1), iou_thresh,
                   force_cpu=not cfg.USE_GPU_NMS)
        kept = kept.long().view(-1)
        if top_n_after > 0:
            kept = kept[:top_n_after]
        return boxes[kept, :]

    output = scores.new(batch_size, top_n_after, 5).zero_()
    for i in range(batch_size):
        selected = _select(i)
        # Column 0 tells the images of a batch apart; rows past the
        # selected proposals keep their zero padding.
        output[i, :, 0] = i
        output[i, :selected.size(0), 1:] = selected

    return output