def prosess(img_path):
    """Run single-image tamper detection and display the result in the GUI.

    Reads the image at ``img_path``, runs it through the module-level
    ``model`` (a Faster R-CNN-style network), draws detections for the
    single foreground class ('tampered'), writes the visualization to
    ``temp.jpg`` and shows it in the Tk label ``label_img``.

    NOTE(review): this function depends on module-level globals
    (``im_data``, ``im_info``, ``gt_boxes``, ``num_boxes``, ``model``,
    ``label_img``) that must be initialized elsewhere — confirm before reuse.

    :param img_path: path to the input image file.
    :return: None (side effects only: writes temp.jpg, updates label_img).
    """
    im_in = np.array(imread(img_path))
    # RGB -> BGR channel flip (network expects OpenCV-style BGR input)
    im = im_in[:, :, ::-1]
    # Build the network input blob and record the resize scale
    blobs, im_scales = _get_image_blob(im)
    im_blob = blobs
    im_info_np = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32)
    # Prepare network input tensors: NHWC -> NCHW
    im_data_pt = torch.from_numpy(im_blob)
    im_data_pt = im_data_pt.permute(0, 3, 1, 2)
    im_info_pt = torch.from_numpy(im_info_np)
    # Copy into the pre-allocated global tensors in place
    im_data.resize_(im_data_pt.size()).copy_(im_data_pt)
    im_info.resize_(im_info_pt.size()).copy_(im_info_pt)
    gt_boxes.resize_(1, 1, 5).zero_()
    num_boxes.resize_(1).zero_()
    # Forward pass (losses are unused at inference time)
    rois, cls_prob, bbox_pred, \
    rpn_loss_cls, rpn_loss_box, \
    RCNN_loss_cls, RCNN_loss_bbox, \
    rois_label = model(im_data, im_info, gt_boxes, num_boxes)
    scores = cls_prob.data
    boxes = rois.data[:, :, 1:5]
    box_deltas = bbox_pred.data
    # Un-normalize the regression deltas with the training-time mean/std
    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
    # 4 * 2: per-class deltas for a 2-class model (background + 'tampered')
    box_deltas = box_deltas.view(1, -1, 4 * 2)
    pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
    pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
    # Map boxes back to the original image scale
    pred_boxes /= im_scales[0]
    scores = scores.squeeze()
    pred_boxes = pred_boxes.squeeze()
    # Keep proposals whose foreground (class 1) score clears 0.05
    inds = torch.nonzero(scores[:, 1] > 0.05).view(-1)
    # Image that will carry the drawn detections
    im2show = np.copy(im)
    # if there is det
    if inds.numel() > 0:
        cls_scores = scores[:, 1][inds]
        _, order = torch.sort(cls_scores, 0, True)
        # Columns 4:8 are the deltas-decoded box of class index 1
        cls_boxes = pred_boxes[inds][:, 4:8]
        cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
        # cls_dets = torch.cat((cls_boxes, cls_scores), 1)
        cls_dets = cls_dets[order]
        # keep = nms(cls_dets, cfg.TEST.NMS, force_cpu=not cfg.USE_GPU_NMS)
        # torchvision-style NMS: (boxes, scores, iou_threshold)
        keep = nms(cls_boxes[order, :], cls_scores[order], cfg.TEST.NMS)
        cls_dets = cls_dets[keep.view(-1).long()]
        # Debug visualization (draw boxes scoring above 0.5)
        im2show = vis_detections(im2show, 'tampered', cls_dets.cpu().numpy(), 0.5)
        # Output result
        # cv2.imshow('test', im2show)
        # cv2.waitKey(0)
    else:
        print('No bbox!')
    # Temporarily save the result so PIL can re-open it
    cv2.imwrite('temp.jpg', im2show)
    # to PIL
    img = Image.open('temp.jpg')
    img = img.resize((640, 480))
    img = ImageTk.PhotoImage(img)
    label_img.config(image=img)
    # Keep a reference so Tk does not garbage-collect the PhotoImage
    label_img.image = img
    pass
def eval_result(args, logger, epoch, output_dir):
    """Evaluate a saved ThunderNet checkpoint on the validation imdb.

    Loads ``thundernet_epoch_{epoch}.pth`` from ``output_dir``, runs
    detection over the whole validation set, applies NMS and a per-image
    detection cap, dumps raw detections to ``detections.pkl`` and logs
    mAP@0.5 to the TensorBoard ``logger``.

    :param args: parsed CLI namespace (uses cuda, net, imdbval_name,
                 class_agnostic, use_tfboard; batch_size is forced to 1).
    :param logger: TensorBoard-style writer (add_image / add_scalar).
    :param epoch: epoch number used to locate the checkpoint file.
    :param output_dir: directory containing the checkpoint (rebound below
                       to the imdb output dir for results).
    """
    if torch.cuda.is_available() and not args.cuda:
        print(
            "WARNING: You have a CUDA device, so you should probably run with --cuda"
        )
    # Evaluation is strictly single-image
    args.batch_size = 1
    imdb, roidb, ratio_list, ratio_index = combined_roidb(
        args.imdbval_name, False)
    imdb.competition_mode(on=True)
    load_name = os.path.join(output_dir, 'thundernet_epoch_{}.pth'.format(epoch, ))
    # Network depth is encoded in the net name, e.g. "snet_146"
    layer = int(args.net.split("_")[1])
    _RCNN = snet(imdb.classes, layer, pretrained=False, class_agnostic=args.class_agnostic)
    _RCNN.create_architecture()
    print("load checkpoint %s" % (load_name))
    if args.cuda:
        checkpoint = torch.load(load_name)
    else:
        checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage)  # Load all tensors onto the CPU
    _RCNN.load_state_dict(checkpoint['model'])
    # Placeholder tensors; resized in place for every batch below
    im_data = torch.FloatTensor(1)
    im_info = torch.FloatTensor(1)
    num_boxes = torch.LongTensor(1)
    gt_boxes = torch.FloatTensor(1)
    # ship to cuda
    if args.cuda:
        im_data = im_data.cuda()
        im_info = im_info.cuda()
        num_boxes = num_boxes.cuda()
        gt_boxes = gt_boxes.cuda()
    # make variable (PyTorch 0.4.0+)
    with torch.no_grad():
        im_data = Variable(im_data)
        im_info = Variable(im_info)
        num_boxes = Variable(num_boxes)
        gt_boxes = Variable(gt_boxes)
    if args.cuda:
        cfg.CUDA = True
    if args.cuda:
        _RCNN.cuda()
    start = time.time()
    max_per_image = 100
    vis = True
    # With visualization on, use a small positive threshold to limit drawing
    if vis:
        thresh = 0.05
    else:
        thresh = 0.0
    save_name = 'thundernet'
    num_images = len(imdb.image_index)
    # all_boxes[cls][image] = N x 5 array [x1, y1, x2, y2, score] or []
    all_boxes = [[[] for _ in xrange(num_images)]
                 for _ in xrange(imdb.num_classes)]
    output_dir = get_output_dir(imdb, save_name)
    dataset = roibatchLoader(roidb, ratio_list, ratio_index, args.batch_size, \
                             imdb.num_classes, training=False, normalize=False)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=0,
                                             pin_memory=True)
    data_iter = iter(dataloader)
    _t = {'im_detect': time.time(), 'misc': time.time()}
    det_file = os.path.join(output_dir, 'detections.pkl')
    _RCNN.eval()
    # Shape (0, 5) placeholder for classes with no detections
    empty_array = np.transpose(np.array([[], [], [], [], []]), (1, 0))
    for i in range(num_images):
        data = next(data_iter)
        with torch.no_grad():
            # data = (image, im_info, gt_boxes, num_boxes)
            im_data.resize_(data[0].size()).copy_(data[0])
            im_info.resize_(data[1].size()).copy_(data[1])
            gt_boxes.resize_(data[2].size()).copy_(data[2])
            num_boxes.resize_(data[3].size()).copy_(data[3])
        det_tic = time.time()
        with torch.no_grad():
            # time_measure = (RPN, Pre-RoI, RoI, Subnet) stage timings
            time_measure, \
            rois, cls_prob, bbox_pred, \
            rpn_loss_cls, rpn_loss_box, \
            RCNN_loss_cls, RCNN_loss_bbox, \
            rois_label = _RCNN(im_data, im_info, gt_boxes, num_boxes)
        scores = cls_prob.data
        boxes = rois.data[:, :, 1:5]
        if cfg.TEST.BBOX_REG:
            # Apply bounding-box regression deltas
            box_deltas = bbox_pred.data
            if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
                # Optionally normalize targets by a precomputed mean and stdev
                if args.class_agnostic:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                    box_deltas = box_deltas.view(args.batch_size, -1, 4)
                else:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                    box_deltas = box_deltas.view(args.batch_size, -1, 4 * len(imdb.classes))
            pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
            pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
        else:
            # Simply repeat the boxes, once for each class
            pred_boxes = np.tile(boxes, (1, scores.shape[1]))
        # Undo the test-time image scaling; data[1][0] holds (H, W, scale)
        pred_boxes /= data[1][0][2].item()
        scores = scores.squeeze()
        pred_boxes = pred_boxes.squeeze()
        det_toc = time.time()
        detect_time = det_toc - det_tic
        misc_tic = time.time()
        if vis:
            im = cv2.imread(imdb.image_path_at(i))
            im2show = np.copy(im)
        for j in xrange(1, imdb.num_classes):
            inds = torch.nonzero(scores[:, j] > thresh).view(-1)
            # if there is det
            if inds.numel() > 0:
                cls_scores = scores[:, j][inds]
                _, order = torch.sort(cls_scores, 0, True)
                if args.class_agnostic:
                    cls_boxes = pred_boxes[inds, :]
                else:
                    cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]
                cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
                # cls_dets = torch.cat((cls_boxes, cls_scores), 1)
                cls_dets = cls_dets[order]
                # torchvision-style NMS: (boxes, scores, iou_threshold)
                keep = nms(cls_boxes[order, :], cls_scores[order], cfg.TEST.NMS)
                cls_dets = cls_dets[keep.view(-1).long()]
                if vis:
                    vis_detections(im2show, imdb.classes[j], color_list[j].tolist(), cls_dets.cpu().numpy(), 0.6)
                all_boxes[j][i] = cls_dets.cpu().numpy()
            else:
                all_boxes[j][i] = empty_array
        # Limit to max_per_image detections *over all classes*
        if max_per_image > 0:
            image_scores = np.hstack(
                [all_boxes[j][i][:, -1] for j in xrange(1, imdb.num_classes)])
            if len(image_scores) > max_per_image:
                image_thresh = np.sort(image_scores)[-max_per_image]
                for j in xrange(1, imdb.num_classes):
                    keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0]
                    all_boxes[j][i] = all_boxes[j][i][keep, :]
        misc_toc = time.time()
        nms_time = misc_toc - misc_tic
        sys.stdout.write(
            'im_detect: {:d}/{:d}\tDetect: {:.3f}s (RPN: {:.3f}s, Pre-RoI: {:.3f}s, RoI: {:.3f}s, Subnet: {:.3f}s)\tNMS: {:.3f}s\r' \
            .format(i + 1, num_images, detect_time, time_measure[0],
                    time_measure[1], time_measure[2], time_measure[3], nms_time))
        sys.stdout.flush()
        # Periodically log a visualization to TensorBoard (BGR -> RGB first)
        if vis and i % 200 == 0 and args.use_tfboard:
            im2show = im2show[:, :, ::-1]
            logger.add_image('pred_image_{}'.format(i),
                             trans.ToTensor()(Image.fromarray(
                                 im2show.astype('uint8'))),
                             global_step=i)
            # cv2.imwrite('result.png', im2show)
            # pdb.set_trace()
            # cv2.imshow('test', im2show)
            # cv2.waitKey(0)
    with open(det_file, 'wb') as f:
        pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL)
    print('Evaluating detections')
    ap_50 = imdb.evaluate_detections(all_boxes, output_dir)
    logger.add_scalar("map_50", ap_50, global_step=epoch)
    end = time.time()
    print("test time: %0.4fs" % (end - start))
def forward(self, im_data, im_info, gt_boxes, num_boxes):
    """Faster R-CNN forward pass extended with a GLCC global/local head.

    Runs the standard backbone -> RPN -> RoI pooling -> classification /
    regression pipeline, then decodes the highest-scoring "global"
    (classes 1-3) and "local"/redline (classes 4-6) detections and feeds
    pooled features through the GLCC classifier head.

    Fixes applied in this revision (the logic is otherwise unchanged):
      * ``cls_scores_l`` was indexed with ``inds`` instead of ``inds_l``;
      * ``cls_des[ind]`` was a NameError typo for ``cls_dets[ind]``;
      * the local best box was assigned to ``region_g`` (clobbering it)
        instead of ``region_l``, and indexed with ``ind`` not ``ind_l``,
        so ``region_l`` stayed empty and the later argmax crashed;
      * ``F.Relu`` / ``nn.Droopout2d`` do not exist -> ``F.relu`` /
        ``nn.Dropout2d`` (the first call also discarded its input);
      * the local branch is skipped unless ``inds_l`` is non-empty, to
        avoid indexing with an empty index set.

    :param im_data: image batch tensor (N, C, H, W).
    :param im_info: per-image (height, width, scale) tensor.
    :param gt_boxes: ground-truth boxes, used only in training mode.
    :param num_boxes: number of valid ground-truth boxes per image.
    :return: (rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox,
              RCNN_loss_cls, RCNN_loss_bbox, rois_label, x) where ``x``
              is the GLCC head output.
    """
    batch_size = im_data.size(0)
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data
    # feed image data to base model to obtain base feature map
    base_feat = self.RCNN_base(im_data)
    # feed base feature map to RPN to obtain rois
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(
        base_feat, im_info, gt_boxes, num_boxes)
    # if it is training phase, then use ground truth bboxes for refining
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data
        rois_label = Variable(rois_label.view(-1).long())
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(
            rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(
            rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        rois_label = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0
    rois = Variable(rois)
    # do roi pooling based on predicted rois
    if cfg.POOLING_MODE == 'crop':
        # pdb.set_trace()
        # pooled_feat_anchor = _crop_pool_layer(base_feat, rois.view(-1, 5))
        grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:],
                                   self.grid_size)
        grid_yx = torch.stack(
            [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]],
            3).contiguous()
        pooled_feat = self.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            pooled_feat = F.max_pool2d(pooled_feat, 2, 2)
    elif cfg.POOLING_MODE == 'align':
        pooled_feat = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))
    # feed pooled features to top model
    pooled_feat = self._head_to_tail(pooled_feat)
    # compute bbox offset; keep an unselected copy for full-class decoding
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    _bbox_pred = bbox_pred.clone()
    if self.training and not self.class_agnostic:
        # select the corresponding columns according to roi labels
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0),
                                        int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.view(rois_label.size(0), 1,
                            1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)
    # compute object classification probability
    cls_score = self.RCNN_cls_score(pooled_feat)
    cls_prob = F.softmax(cls_score, 1)
    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0
    if self.training:
        # classification loss
        RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        # bounding box regression L1 loss
        RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                         rois_inside_ws, rois_outside_ws)
    cls_prob = cls_prob.view(batch_size, rois.size(1), -1)
    bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1)
    ## Get global and local region from Faster R-CNN
    cuda = True
    pascal_classes = np.array([
        '__background__', 'Blue mackerel', 'Chub mackerel', 'Hybrid',
        'Blue mackerel redline', 'Chub mackerel redline', 'Hybrid redline'
    ])
    scores = cls_prob.data
    boxes = rois.data[:, :, 1:5]
    box_deltas = _bbox_pred.data
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Un-normalize deltas with the training-time mean/std
        if self.class_agnostic:
            if cuda > 0:
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                    + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
            else:
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) \
                    + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
            box_deltas = box_deltas.view(1, -1, 4)
        else:
            if cuda > 0:
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                    + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
            else:
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) \
                    + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
            box_deltas = box_deltas.view(1, -1, 4 * len(pascal_classes))
    pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
    pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
    scores = scores.squeeze()
    pred_boxes = pred_boxes.squeeze()
    print(pred_boxes.data.cpu().numpy().shape)
    print(scores.data.cpu().numpy().shape)
    print(scores)
    # get global region (classes 1-3) and local redline region (classes 4-6)
    thresh = 0.05
    region_g = np.ndarray((0, 5))
    region_l = np.ndarray((0, 5))
    for j in range(1, 4):
        inds = torch.nonzero(scores[:, j] > thresh).view(-1)
        inds_l = torch.nonzero(scores[:, j + 3] > thresh).view(-1)
        # both branches must have candidates; empty inds_l would crash below
        if inds.numel() > 0 and inds_l.numel() > 0:
            cls_scores = scores[:, j][inds]
            # FIX: was scores[:, j + 3][inds] — wrong index set
            cls_scores_l = scores[:, j + 3][inds_l]
            _, order = torch.sort(cls_scores, 0, True)
            _, order_l = torch.sort(cls_scores_l, 0, True)
            if self.class_agnostic:
                cls_boxes = pred_boxes[inds]
                cls_boxes_l = pred_boxes[inds_l]
            else:
                cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]
                cls_boxes_l = pred_boxes[inds_l][:, (j + 3) * 4:(j + 4) * 4]
            cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
            cls_dets_l = torch.cat(
                (cls_boxes_l, cls_scores_l.unsqueeze(1)), 1)
            cls_dets = cls_dets[order]
            cls_dets_l = cls_dets_l[order_l]
            ind = np.argmax(cls_dets[..., -1])
            ind_l = np.argmax(cls_dets_l[..., -1])
            # FIX: was cls_des[ind] (NameError)
            region_g = np.vstack((region_g, cls_dets[ind]))
            # FIX: was region_g = np.vstack((region_l, cls_dets_l[ind]));
            # that clobbered region_g and left region_l empty
            region_l = np.vstack((region_l, cls_dets_l[ind_l]))
            #keep = nms(cls_dets, cfg.TEST.NMS, force_cpu=not cfg.USE_GPU_NMS)
            #cls_dets = cls_dets[keep.view(-1).long()]
    print(region_g)
    print(region_l)
    # NOTE(review): if no class produced detections these argmax calls
    # still fail on empty arrays — upstream guard may be needed; confirm.
    region_g = region_g[np.argmax(region_g[..., -1])]
    region_l = region_l[np.argmax(region_l[..., -1])]
    print(region_g)
    print(region_l)
    ## GLCC
    # global region
    if cfg.POOLING_MODE == 'crop':
        grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:],
                                   self.grid_size)
        grid_yx = torch.stack(
            [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]],
            3).contiguous()
        pooled_feat_g = self.RCNN_roi_crop(base_feat,
                                           Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            pooled_feat_g = F.max_pool2d(pooled_feat_g, 2, 2)
    elif cfg.POOLING_MODE == 'align':
        pooled_feat_g = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat_g = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))
    # local region
    if cfg.POOLING_MODE == 'crop':
        grid_xy = _affine_grid_gen(rois.view(-1, 5), base_feat.size()[2:],
                                   self.grid_size)
        grid_yx = torch.stack(
            [grid_xy.data[:, :, :, 1], grid_xy.data[:, :, :, 0]],
            3).contiguous()
        pooled_feat_l = self.RCNN_roi_crop(base_feat,
                                           Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            pooled_feat_l = F.max_pool2d(pooled_feat_l, 2, 2)
    elif cfg.POOLING_MODE == 'align':
        pooled_feat_l = self.RCNN_roi_align(base_feat, rois.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat_l = self.RCNN_roi_pool(base_feat, rois.view(-1, 5))
    # GLCC head: concat global/local features, conv + 2 fc layers with
    # dropout, then the output projection.
    x = torch.cat((pooled_feat_g, pooled_feat_l), dim=0)
    x = self.glcc_conv1(x)
    x = F.relu(x)  # FIX: was F.Relu() which both misspells relu and drops x
    x = self.glcc_fc1(x)
    x = F.relu(x)
    # NOTE(review): inline nn.Dropout2d() modules default to training mode,
    # so dropout is active even at eval time — confirm this is intended.
    x = nn.Dropout2d()(x)
    x = self.glcc_fc2(x)
    x = F.relu(x)
    x = nn.Dropout2d()(x)  # FIX: was nn.Droopout2d (AttributeError)
    x = self.glcc_fc_out(x)
    #GLCC_loss_cls = 0
    #if self.training:
    # GLCC_loss_cls = F.cross_entropy(x, t)
    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label, x
# NOTE(review): mid-function fragment — the enclosing test loop (providing
# bbox_pred, boxes, scores, args, classes, im_scales, vis, det_tic) is
# defined outside this chunk.
if cfg.TEST.BBOX_REG:
    # Apply bounding-box regression deltas
    box_deltas = bbox_pred.data
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Optionally normalize targets by a precomputed mean and stdev
        # FIXME: pytorch normalize
        if args.class_agnostic:
            box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
            box_deltas = box_deltas.view(1, -1, 4)
        else:
            box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
            # Per-class deltas: 4 columns per class
            box_deltas = box_deltas.view(1, -1, 4 * len(classes))
    pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
    pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
else:
    # Simply repeat the boxes, once for each class
    pred_boxes = np.tile(boxes, (1, scores.shape[1]))
# Map boxes back to the original image scale
pred_boxes /= im_scales[0]
scores = scores.squeeze()
pred_boxes = pred_boxes.squeeze()
det_toc = time.time()
detect_time = det_toc - det_tic
misc_tic = time.time()
if vis:
    im2show = np.copy(im)
# Per-class post-processing continues beyond this chunk
for j in xrange(1, len(classes)):
+ torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda() box_deltas = box_deltas.view( n_legs, batch_size, -1, 4 * len(imagenet_vid_classes)) pred_boxes = bbox_transform_inv_legs(boxes, box_deltas, batch_size) pred_boxes = clip_boxes(pred_boxes, im_info.data, batch_size) else: # Simply repeat the boxes, once for each class raise NotImplementedError trk_box_deltas = tracking_pred.unsqueeze(0).data #TODO Check whether this is necessary trk_box_deltas = trk_box_deltas.view(-1,4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \ + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda() trk_box_deltas = trk_box_deltas.view(1, -1, 4) pred_trk_boxes = bbox_transform_inv(trk_boxes, trk_box_deltas, 1) pred_trk_boxes = clip_boxes(pred_trk_boxes, im_info.permute(1, 0, 2)[0].data, 1) # Assume scales are same for frames in the same video im_scale = im_info.data.squeeze(0)[0][-1] #im_scales = im_info[:,:,2].data.contiguous().view(1,-1,1,1).permute(1,0,2,3) pred_boxes /= im_scale pred_trk_boxes /= im_scale # squeeze batch dim #scores = scores.squeeze(1) #pred_boxes = pred_boxes.squeeze(1) #pred_trk_boxes = pred_trk_boxes.squeeze(0) # Permute such that we have (frame_sample_id, n_legs, n_boxes, ...) pred_boxes = pred_boxes.permute( 1, 0, 2, 3).contiguous() #2*1*300*4=>1*2*300*4
def det_im(self, im_file):
    """Detect objects of ``self.vrd_classes`` in a single image file.

    Loads the image, runs ``self.fasterRCNN``, decodes and NMS-filters
    per-class detections, writes a visualization to ``img/im_det.jpg``
    and returns the accumulated results dict.

    :param im_file: path to the input image.
    :return: dict with keys 'box' (N x 4 array), 'cls' and 'confs'
             (lists), filled by ``res_detections``.
    """
    max_per_image = 100
    thresh = 0.05
    total_tic = time.time()
    # im = cv2.imread(im_file)
    im_in = np.array(imread(im_file))
    # Promote grayscale images to 3 channels
    if len(im_in.shape) == 2:
        im_in = im_in[:, :, np.newaxis]
        im_in = np.concatenate((im_in, im_in, im_in), axis=2)
    # rgb -> bgr
    im = im_in[:, :, ::-1]
    blobs, im_scales = self._get_image_blob(im)
    assert len(im_scales) == 1, "Only single-image batch implemented"
    im_blob = blobs
    im_info_np = np.array(
        [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],
        dtype=np.float32)
    # NHWC -> NCHW for the network
    im_data_pt = torch.from_numpy(im_blob)
    im_data_pt = im_data_pt.permute(0, 3, 1, 2)
    im_info_pt = torch.from_numpy(im_info_np)
    # Fill the pre-allocated input tensors in place
    self.im_data.data.resize_(im_data_pt.size()).copy_(im_data_pt)
    self.im_info.data.resize_(im_info_pt.size()).copy_(im_info_pt)
    self.gt_boxes.data.resize_(1, 1, 5).zero_()
    self.num_boxes.data.resize_(1).zero_()
    det_tic = time.time()
    # Forward pass (loss outputs unused at inference)
    rois, cls_prob, bbox_pred, \
    rpn_loss_cls, rpn_loss_box, \
    RCNN_loss_cls, RCNN_loss_bbox, \
    rois_label = self.fasterRCNN(self.im_data, self.im_info, self.gt_boxes, self.num_boxes)
    scores = cls_prob.data
    boxes = rois.data[:, :, 1:5]
    if cfg.TEST.BBOX_REG:
        # Apply bounding-box regression deltas
        box_deltas = bbox_pred.data
        if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
            # Optionally normalize targets by a precomputed mean and stdev
            if self.args.class_agnostic:
                if self.args.cuda > 0:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                else:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
                box_deltas = box_deltas.view(1, -1, 4)
            else:
                if self.args.cuda > 0:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                else:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
                box_deltas = box_deltas.view(1, -1, 4 * len(self.vrd_classes))
        pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
        pred_boxes = clip_boxes(pred_boxes, self.im_info.data, 1)
    else:
        # Simply repeat the boxes, once for each class
        pred_boxes = np.tile(boxes, (1, scores.shape[1]))
    # Map boxes back to the original image scale
    pred_boxes /= im_scales[0]
    scores = scores.squeeze()
    pred_boxes = pred_boxes.squeeze()
    det_toc = time.time()
    detect_time = det_toc - det_tic
    misc_tic = time.time()
    im2show = np.copy(im)
    # Result accumulator populated by res_detections below
    res = {}
    res['box'] = np.zeros((0, 4))
    res['cls'] = []
    res['confs'] = []
    for j in xrange(1, len(self.vrd_classes)):
        inds = torch.nonzero(scores[:, j] > thresh).view(-1)
        # if there is det
        if inds.numel() > 0:
            cls_scores = scores[:, j][inds]
            _, order = torch.sort(cls_scores, 0, True)
            if self.args.class_agnostic:
                cls_boxes = pred_boxes[inds, :]
            else:
                cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]
            cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
            # cls_dets = torch.cat((cls_boxes, cls_scores), 1)
            cls_dets = cls_dets[order]
            # Legacy NMS API: dets tensor + threshold (+ force_cpu flag)
            keep = nms(cls_dets, cfg.TEST.NMS, force_cpu=not cfg.USE_GPU_NMS)
            cls_dets = cls_dets[keep.view(-1).long()]
            # Draws detections above 0.5 and appends them into res
            im2show = res_detections(im2show, j, self.vrd_classes[j],
                                     cls_dets.cpu().numpy(), res, 0.5)
    misc_toc = time.time()
    nms_time = misc_toc - misc_tic
    sys.stdout.write('im_detect: {:.3f}s {:.3f}s \r'.format(
        detect_time, nms_time))
    sys.stdout.flush()
    cv2.imwrite('img/im_det.jpg', im2show)
    return res
def objgrasp_inference(o_cls_prob,
                       o_box_output,
                       g_cls_prob,
                       g_box_output,
                       im_info,
                       rois=None,
                       class_agnostic=True,
                       n_classes=None,
                       g_box_prior=None,
                       for_vis=False,
                       topN_g=False,
                       recover_imscale=True):
    """
    :param o_cls_prob: N x N_cls tensor
    :param o_box_output: N x 4 tensor
    :param g_cls_prob: N x K*A x 2 tensor
    :param g_box_output: N x K*A x 5 tensor
    :param im_info: size 4 tensor
    :param rois: N x 4 tensor
    :param g_box_prior: N x K*A * 5 tensor
    :return:
    Note:
    1 This function simultaneously supports ROI-GN with or without object
    branch. If no object branch, o_cls_prob and o_box_output will be none,
    and object detection results are shown in the form of ROIs.
    2 This function can only detect one image per invoking.
    """
    o_scores = o_cls_prob
    # NOTE(review): rois is sliced before the `rois is None` check below —
    # a None rois would raise TypeError here, not the intended RuntimeError.
    rois = rois[:, 1:5]
    g_scores = g_cls_prob
    if for_vis:
        o_thresh = 0.5
    else:
        o_thresh = 0.
        # Non-visualization mode forces top-1 grasp per object
        topN_g = 1
    if not topN_g:
        g_thresh = 0.5
    else:
        g_thresh = 0.
    if rois is None:
        raise RuntimeError("You must specify rois for ROI-GN.")
    if g_box_prior is None:
        raise NotImplementedError(
            "Inference for anchor free algorithms has not been implemented.")
    # infer grasp boxes
    normalizer = {
        'mean': cfg.FCGN.BBOX_NORMALIZE_MEANS,
        'std': cfg.FCGN.BBOX_NORMALIZE_STDS
    }
    g_box_output = box_unnorm_torch(g_box_output,
                                    normalizer,
                                    d_box=5,
                                    class_agnostic=True,
                                    n_cls=None)
    g_box_output = g_box_output.view(g_box_prior.size())
    # N x K*A x 5 decoded (x, y, w, h, theta) grasps relative to each ROI
    grasp_pred = grasp_decode(g_box_output, g_box_prior)
    # N x K*A x 1 per-ROI width/height for the in-ROI validity mask
    rois_w = (rois[:, 2] -
              rois[:, 0]).view(-1).unsqueeze(1).unsqueeze(2).expand_as(
                  grasp_pred[:, :, 0:1])
    rois_h = (rois[:, 3] -
              rois[:, 1]).view(-1).unsqueeze(1).unsqueeze(2).expand_as(
                  grasp_pred[:, :, 1:2])
    # keep grasps whose center lies inside the ROI (mask currently unused)
    keep_mask = (grasp_pred[:, :, 0:1] > 0) & (grasp_pred[:, :, 1:2] > 0) & \
                (grasp_pred[:, :, 0:1] < rois_w) & (grasp_pred[:, :, 1:2] < rois_h)
    grasp_scores = g_scores.contiguous().view(rois.size(0), -1, 2)
    # N x 1 x 1 ROI top-left corner for shifting grasps to image coords
    xleft = rois[:, 0].view(-1).unsqueeze(1).unsqueeze(2)
    ytop = rois[:, 1].view(-1).unsqueeze(1).unsqueeze(2)
    # rois offset
    grasp_pred[:, :, 0:1] = grasp_pred[:, :, 0:1] + xleft
    grasp_pred[:, :, 1:2] = grasp_pred[:, :, 1:2] + ytop
    # N x K*A x 8 corner-point representation
    grasp_pred_boxes = labels2points(grasp_pred).contiguous().view(
        rois.size(0), -1, 8)
    # N x K*A positive-class grasp scores
    grasp_pos_scores = grasp_scores[:, :, 1]
    if topN_g:
        # N x K*A rank of every grasp by descending score
        _, grasp_score_idx = torch.sort(grasp_pos_scores,
                                        dim=-1,
                                        descending=True)
        _, grasp_idx_rank = torch.sort(grasp_score_idx, dim=-1)
        # N x K*A mask selecting each ROI's top-N grasps
        topn_grasp = topN_g
        grasp_maxscore_mask = (grasp_idx_rank < topn_grasp)
        # N x topN
        grasp_maxscores = grasp_pos_scores[grasp_maxscore_mask].contiguous(
        ).view(rois.size()[:1] + (topn_grasp, ))
        # N x topN x 8
        grasp_pred_boxes = grasp_pred_boxes[grasp_maxscore_mask].view(
            rois.size()[:1] + (topn_grasp, 8))
    else:
        raise NotImplementedError(
            "Now ROI-GN only supports top-N grasp detection for each object.")
    # infer object boxes
    if cfg.TRAIN.COMMON.BBOX_REG:
        if cfg.TRAIN.COMMON.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
            normalizer = {
                'mean': cfg.TRAIN.COMMON.BBOX_NORMALIZE_MEANS,
                'std': cfg.TRAIN.COMMON.BBOX_NORMALIZE_STDS
            }
            box_output = box_unnorm_torch(o_box_output, normalizer, 4,
                                          class_agnostic, n_classes)
        pred_boxes = bbox_transform_inv(rois, box_output, 1)
        pred_boxes = clip_boxes(pred_boxes, im_info, 1)
    else:
        pred_boxes = rois.clone()
    if recover_imscale:
        # im_info[3] / im_info[2] undo the test-time image scaling
        pred_boxes = box_recover_scale_torch(pred_boxes, im_info[3],
                                             im_info[2])
        grasp_pred_boxes = box_recover_scale_torch(grasp_pred_boxes,
                                                   im_info[3], im_info[2])
    # index 0 (background) intentionally left as an empty list
    all_box = [[]]
    all_grasp = [[]]
    for j in xrange(1, n_classes):
        if class_agnostic or not cfg.TRAIN.COMMON.BBOX_REG:
            cls_boxes = pred_boxes
        else:
            cls_boxes = pred_boxes[:, j * 4:(j + 1) * 4]
        cls_dets, cls_scores, box_keep_inds = box_filter(cls_boxes,
                                                         o_scores[:, j],
                                                         o_thresh,
                                                         use_nms=True)
        cls_dets = np.concatenate((cls_dets, np.expand_dims(cls_scores, -1)),
                                  axis=-1)
        grasps = (grasp_pred_boxes.cpu().numpy())[box_keep_inds]
        if for_vis:
            # replace the score column by the class label for visualization
            cls_dets[:, -1] = j
        else:
            grasps = np.squeeze(grasps, axis=1)
        all_box.append(cls_dets)
        all_grasp.append(grasps)
    if for_vis:
        # flatten per-class results, dropping the background placeholder
        all_box = np.concatenate(all_box[1:], axis=0)
        all_grasp = np.concatenate(all_grasp[1:], axis=0)
    return all_box, all_grasp
# NOTE(review): mid-function fragment — bbox_pred, boxes, scores, args,
# imdb and data come from the enclosing (unseen) evaluation loop.
if cfg.TEST.BBOX_REG:
    # Calculate prediction boxes (x1,x2,y1,y2) using bboxes and offset
    box_deltas = bbox_pred.data
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Optionally normalize targets by a precomputed mean and stdev
        if args.class_agnostic:
            box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
            box_deltas = box_deltas.view(1, -1, 4)
        else:
            box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
            box_deltas = box_deltas.view(1, -1, 4 * len(imdb.classes))
    pred_boxes = bbox_transform_inv(
        boxes, box_deltas, 1)  # Change proposed rois to predicted bbox by using deltas
    pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)  # Clip bboxes to fit the image size
else:
    # Simply repeat the boxes, once for each class
    pred_boxes = np.tile(boxes, (1, scores.shape[1]))
# Affine transform the prediction boxes pixel by the scale of the patch
pred_boxes /= data[1][0][2].item()  # data[1] : [850, 600, 1.6997] (H, W, scale)
#########################################################################################
# conf_threshold, nms_threshold, max_boxes, n_classes, coord_h, coord_w =
# post_proc = PostProc(conf_threshold, nms_threshold, max_boxes, n_classes, coord_h, coord_w)
scores = scores.squeeze()
def forward(self, im_data, im_info, gt_boxes, num_boxes):
    """FPN-based Faster R-CNN forward pass with optional model ensembling.

    Builds a ResNet-style bottom-up pyramid (c1-c5), a top-down FPN
    (p2-p6), runs the RPN over the pyramid, pools ROI features, and
    computes classification/regression outputs and (in training) losses.
    The ``Use_emsemble`` section can blend scores with a VGG and/or DetNet
    classifier, but ``Use_emsemble`` is hard-coded to False, so that whole
    section is currently dead code.

    :param im_data: image batch tensor (N, C, H, W).
    :param im_info: per-image (height, width, scale) tensor.
    :param gt_boxes: ground-truth boxes (training only).
    :param num_boxes: number of valid ground-truth boxes per image.
    :return: (rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox,
              RCNN_loss_cls, RCNN_loss_bbox, rois_label)
    """
    batch_size = im_data.size(0)
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data
    # feed image data to base model to obtain base feature map
    # Bottom-up
    c1 = self.RCNN_layer0(im_data)
    c2 = self.RCNN_layer1(c1)
    c3 = self.RCNN_layer2(c2)
    c4 = self.RCNN_layer3(c3)
    c5 = self.RCNN_layer4(c4)
    # Top-down
    p5 = self.RCNN_toplayer(c5)
    p4 = self._upsample_add(p5, self.RCNN_latlayer1(c4))
    p4 = self.RCNN_smooth1(p4)
    p3 = self._upsample_add(p4, self.RCNN_latlayer2(c3))
    p3 = self.RCNN_smooth2(p3)
    p2 = self._upsample_add(p3, self.RCNN_latlayer3(c2))
    p2 = self.RCNN_smooth3(p2)
    p6 = self.maxpool2d(p5)
    rpn_feature_maps = [p2, p3, p4, p5, p6]
    mrcnn_feature_maps = [p2, p3, p4, p5]
    rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(
        rpn_feature_maps, im_info, gt_boxes, num_boxes)
    # if it is training phase, then use ground truth bboxes for refining
    if self.training:
        roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes)
        rois, rois_label, gt_assign, rois_target, rois_inside_ws, rois_outside_ws = roi_data
        ## NOTE: additionally, normalize proposals to range [0, 1],
        # this is necessary so that the following roi pooling
        # is correct on different feature maps
        # rois[:, :, 1::2] /= im_info[0][1]
        # rois[:, :, 2::2] /= im_info[0][0]
        rois = rois.view(-1, 5)
        rois_label = rois_label.view(-1).long()
        gt_assign = gt_assign.view(-1).long()
        pos_id = rois_label.nonzero().squeeze()
        gt_assign_pos = gt_assign[pos_id]
        rois_label_pos = rois_label[pos_id]
        rois_label_pos_ids = pos_id
        rois_pos = Variable(rois[pos_id])
        rois = Variable(rois)
        rois_label = Variable(rois_label)
        rois_target = Variable(rois_target.view(-1, rois_target.size(2)))
        rois_inside_ws = Variable(
            rois_inside_ws.view(-1, rois_inside_ws.size(2)))
        rois_outside_ws = Variable(
            rois_outside_ws.view(-1, rois_outside_ws.size(2)))
    else:
        ## NOTE: additionally, normalize proposals to range [0, 1],
        # this is necessary so that the following roi pooling
        # is correct on different feature maps
        # rois[:, :, 1::2] /= im_info[0][1]
        # rois[:, :, 2::2] /= im_info[0][0]
        rois_label = None
        gt_assign = None
        rois_target = None
        rois_inside_ws = None
        rois_outside_ws = None
        rpn_loss_cls = 0
        rpn_loss_bbox = 0
        rois = rois.view(-1, 5)
        pos_id = torch.arange(0, rois.size(0)).long().type_as(rois).long()
        rois_label_pos_ids = pos_id
        rois_pos = Variable(rois[pos_id])
        rois = Variable(rois)
    # pooling features based on rois, output 14x14 map (128,64,7,7)
    roi_pool_feat = self._PyramidRoI_Feat(mrcnn_feature_maps, rois, im_info)
    # NOTE(review): Use_emsemble is hard-coded False, so everything inside
    # the following block is dead code; flags/paths below are also
    # hard-coded and machine-specific.
    Use_emsemble = False
    emsemble_vgg, emsemble_detnet = [False, True]
    if Use_emsemble:
        if emsemble_vgg:
            model_vgg = Cnn()
            model_vgg = model_vgg.cuda()
            ## vgg net
            pretrained_model_vgg = '/home/lab30202/lq/ai_future/single_classsification_vgg/model_save/galxay_star_classification_vgg.pth'  # path where the pretrained model weights are saved
            pretrained_dict = torch.load(pretrained_model_vgg)
            model_dict = model_vgg.state_dict()
            # keep only weights whose names exist in the current model
            pretrained_dict = {
                k: v
                for k, v in pretrained_dict.items() if k in model_dict
            }
            model_dict.update(pretrained_dict)
            model_vgg.load_state_dict(model_dict)
            feature_map_vgg = model_vgg.convnet(im_data)
            # 128 sampled rois in training, 300 at test time
            if self.training:
                idx_l = [x for x in range(0, 128, 1)]
            else:
                idx_l = [x for x in range(0, 300, 1)]
            idx_l = torch.LongTensor(idx_l)
            feat = self.RCNN_roi_align(feature_map_vgg, rois[idx_l], 0.5)
            roi_pool_vgg = feat.view(feat.shape[0], -1)
            cls_score_vgg = model_vgg.fc(roi_pool_vgg)
            # cls_prob_vgg = F.softmax(cls_score_vgg,dim=1)
        if emsemble_detnet:
            ## detnet
            detnet = Detnet()
            detnet = detnet.cuda()
            # Bottom-up
            c1_det = detnet.RCNN_layer0_det(im_data)
            c2_det = detnet.RCNN_layer1_det(c1_det)
            c3_det = detnet.RCNN_layer2_det(c2_det)
            c4_det = detnet.RCNN_layer3_det(c3_det)
            c5_det = detnet.RCNN_layer4_det(c4_det)
            c6_det = detnet.RCNN_layer5_det(c5_det)
            # Top-down
            p6_det = detnet.RCNN_toplayer_det(c6_det)
            p5_det = detnet.RCNN_latlayer1_det(c5_det) + p6_det
            p4_det = detnet.RCNN_latlayer2_det(c4_det) + p5_det
            p3_det = detnet._upsample_add(
                p4_det, detnet.RCNN_latlayer3_det(c3_det))
            p3_det = detnet.RCNN_smooth1_det(p3_det)
            p2_det = detnet._upsample_add(
                p3_det, detnet.RCNN_latlayer4_det(c2_det))
            p2_det = detnet.RCNN_smooth2_det(p2_det)
            rpn_feature_maps_det = [p2_det, p3_det, p4_det, p5_det, p6_det]
            mrcnn_feature_maps_det = [p2_det, p3_det, p4_det, p5_det]
            rois_det, rpn_loss_cls_det, rpn_loss_bbox_det = self.RCNN_rpn(
                rpn_feature_maps_det, im_info, gt_boxes, num_boxes)
            if self.training:
                roi_data_det = self.RCNN_proposal_target(
                    rois_det, gt_boxes, num_boxes)
                rois_det, rois_label_det, gt_assign_det, rois_target_det, rois_inside_ws_det, rois_outside_ws_det = roi_data_det
                rois_det = rois_det.view(-1, 5)
                rois_label_det = rois_label_det.view(-1).long()
                gt_assign_det = gt_assign_det.view(-1).long()
                pos_id_det = rois_label_det.nonzero().squeeze()
                gt_assign_pos_det = gt_assign_det[pos_id_det]
                rois_label_pos_det = rois_label_det[pos_id_det]
                rois_label_pos_ids_det = pos_id_det
                rois_pos_det = Variable(rois_det[pos_id_det])
                rois_det = Variable(rois_det)
                rois_label_det = Variable(rois_label_det)
                rois_target_det = Variable(
                    rois_target_det.view(-1, rois_target_det.size(2)))
                rois_inside_ws_det = Variable(
                    rois_inside_ws_det.view(-1, rois_inside_ws_det.size(2)))
                rois_outside_ws_det = Variable(
                    rois_outside_ws_det.view(-1, rois_outside_ws_det.size(2)))
            else:
                rois_label_det = None
                gt_assign_det = None
                rois_target_det = None
                rois_inside_ws_det = None
                rois_outside_ws_det = None
                rpn_loss_cls_det = 0
                rpn_loss_bbox_det = 0
                rois_det = rois_det.view(-1, 5)
                pos_id_det = torch.arange(
                    0, rois_det.size(0)).long().type_as(rois_det).long()
                rois_label_pos_ids_det = pos_id_det
                rois_pos_det = Variable(rois_det[pos_id_det])
                rois_det = Variable(rois_det)
            # NOTE(review): pools with `rois`, not `rois_det` — confirm intended
            feat_det = self._PyramidRoI_Feat(mrcnn_feature_maps_det, rois,
                                             im_info)
        if emsemble_detnet:
            pooled_feat_det = detnet._head_to_tail(feat_det)
            cls_score_det = self.RCNN_cls_score(pooled_feat_det)
        else:
            # NOTE(review): this branch needs model_vgg/feat_det from the
            # emsemble_vgg/emsemble_detnet branches above — it can only run
            # safely when emsemble_vgg is True.
            roi_pool_det = feat_det.view(feat_det.shape[0], -1)
            cls_score_det = model_vgg.fc(roi_pool_det)
    pooled_feat = self._head_to_tail(roi_pool_feat)
    # compute bbox offset
    bbox_pred = self.RCNN_bbox_pred(pooled_feat)
    if self.training and not self.class_agnostic:
        # select the corresponding columns according to roi labels
        bbox_pred_view = bbox_pred.view(bbox_pred.size(0),
                                        int(bbox_pred.size(1) / 4), 4)
        bbox_pred_select = torch.gather(
            bbox_pred_view, 1,
            rois_label.long().view(rois_label.size(0), 1,
                                   1).expand(rois_label.size(0), 1, 4))
        bbox_pred = bbox_pred_select.squeeze(1)
    # compute object classification probability
    cls_score = self.RCNN_cls_score(pooled_feat)
    # cls_prob = F.softmax(cls_score,dim=1)
    if Use_emsemble:
        # blend the head scores with the auxiliary classifier(s)
        if emsemble_detnet and emsemble_vgg:
            cls_score_liner = 0.5 * cls_score + 0.3 * cls_score_vgg + 0.2 * cls_score_det
            cls_score = model_vgg.fc_new(cls_score_liner)
            cls_prob = F.softmax(cls_score, dim=1)
        elif emsemble_vgg and not emsemble_detnet:
            cls_score_liner = cls_score + cls_score_vgg
            cls_score = model_vgg.fc_new(cls_score_liner)
            cls_prob = F.softmax(cls_score, dim=1)
        elif emsemble_detnet and not emsemble_vgg:
            cls_score_liner = cls_score + cls_score_det
            cls_score = detnet.fc_add(cls_score_liner)
            cls_prob = F.softmax(cls_score, dim=1)
    else:
        # NOTE(review): recomputes cls_score already computed above —
        # redundant but harmless
        cls_score = self.RCNN_cls_score(pooled_feat)
        cls_prob = F.softmax(cls_score, dim=1)
    RCNN_loss_cls = 0
    RCNN_loss_bbox = 0
    if self.training:
        # loss (cross entropy) for object classification;
        # loss variants are selected by these hard-coded flags
        Use_focal_loss = True
        Use_label_smoothing = False
        Use_Giou_loss = False
        if not Use_focal_loss:
            if Use_label_smoothing:
                # criteria = LabelSmoothSoftmaxCE(label_smoothing=0.1)
                criteria = LabelSmoothSoftmaxCE(lb_pos=0.9, lb_neg=5e-3)
                RCNN_loss_cls = criteria(cls_score, rois_label)
            else:
                RCNN_loss_cls = F.cross_entropy(cls_score, rois_label)
        else:
            FL = FocalLoss(class_num=self.n_classes, alpha=1, gamma=2)
            RCNN_loss_cls = FL(cls_score, rois_label)
            RCNN_loss_cls = RCNN_loss_cls.type(torch.FloatTensor).cuda()
        # loss (l1-norm) for bounding box regression
        if Use_Giou_loss:
            rois1 = rois.view(batch_size, -1, rois.size(1))
            boxes = rois1.data[:, :, 1:5]
            bbox_pred1 = bbox_pred.view(batch_size, -1, bbox_pred.size(1))
            box_deltas = bbox_pred1.data
            # if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
            #     # Optionally normalize targets by a precomputed mean and stdev
            #     box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
            #         + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
            #     box_deltas = box_deltas.view(1, -1, 4 * len(self.classes))
            pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
            pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
            pred_boxes /= im_info[0][2].cuda()
            # RCNN_loss_bbox = generalized_iou_loss(rois_target,bbox_pred)
            _, _, RCNN_loss_bbox = Giou_np(pred_boxes, boxes)
        else:
            RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target,
                                             rois_inside_ws, rois_outside_ws)
    # reshape flat per-ROI outputs back to (batch, n_rois, ...)
    rois = rois.view(batch_size, -1, rois.size(1))
    cls_prob = cls_prob.view(batch_size, -1, cls_prob.size(1))
    bbox_pred = bbox_pred.view(batch_size, -1, bbox_pred.size(1))
    if self.training:
        rois_label = rois_label.view(batch_size, -1)
        # add a leading dim so losses stack across data-parallel replicas
        rpn_loss_cls = torch.unsqueeze(rpn_loss_cls, 0)
        rpn_loss_bbox = torch.unsqueeze(rpn_loss_bbox, 0)
        RCNN_loss_cls = torch.unsqueeze(RCNN_loss_cls, 0)
        RCNN_loss_bbox = torch.unsqueeze(RCNN_loss_bbox, 0)
    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label
def _get_single_obj_det_results(self, rois, cls_prob, bbox_pred, im_info):
    """Turn raw network outputs into final per-image object detections.

    Applies (optional) bounding-box regression, clips boxes to the image,
    runs per-class NMS and score thresholding, and concatenates the
    survivors of every foreground class.

    Args:
        rois: RoIs from the RPN; boxes are read from ``rois[:, :, 1:5]``.
        cls_prob: per-RoI class probabilities.
        bbox_pred: per-RoI regression deltas.
        im_info: image meta used for clipping boxes to image bounds.

    Returns:
        A 2-D tensor of detections, one row per kept box: ``[x1, y1, x2, y2]``
        in training mode, or ``[x1, y1, x2, y2, class_idx]`` otherwise.
        An empty tensor (typed like ``rois``) when nothing survives.
    """
    scores = cls_prob.data
    boxes = rois.data[:, :, 1:5]
    results = []
    if cfg.TEST.COMMON.BBOX_REG:
        # Apply bounding-box regression deltas.
        box_deltas = bbox_pred.data
        if cfg.TRAIN.COMMON.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
            # Optionally denormalize targets by the precomputed mean/stdev.
            if self.class_agnostic:
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.COMMON.BBOX_NORMALIZE_STDS).type_as(box_deltas) \
                             + torch.FloatTensor(cfg.TRAIN.COMMON.BBOX_NORMALIZE_MEANS).type_as(box_deltas)
                box_deltas = box_deltas.view(1, -1, 4)
            else:
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.COMMON.BBOX_NORMALIZE_STDS).type_as(box_deltas) \
                             + torch.FloatTensor(cfg.TRAIN.COMMON.BBOX_NORMALIZE_MEANS).type_as(box_deltas)
                box_deltas = box_deltas.view(1, -1, 4 * self.n_classes)
        pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
        pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
    else:
        # Simply repeat the boxes, once for each class.
        pred_boxes = np.tile(boxes, (1, scores.shape[1]))
    scores = scores.squeeze()
    pred_boxes = pred_boxes.squeeze()
    thresh = 0
    # Class 0 is background, so start from 1.
    # FIX: was `xrange` (Python 2 only); the file uses torch.no_grad()
    # elsewhere, i.e. a Python-3 era PyTorch.
    for j in range(1, self.n_classes):
        inds = torch.nonzero(scores[:, j] > thresh).view(-1)
        # if there is det
        if inds.numel() > 0:
            cls_scores = scores[:, j][inds]
            _, order = torch.sort(cls_scores, 0, True)
            if self.class_agnostic:
                cls_boxes = pred_boxes[inds, :]
            else:
                cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]
            cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
            cls_dets = cls_dets[order]
            keep = nms(cls_dets, cfg.TEST.COMMON.NMS, force_cpu=not cfg.USE_GPU_NMS)
            cls_dets = cls_dets[keep.view(-1).long()]
            final_keep = torch.nonzero(cls_dets[:, -1] > cfg.TEST.COMMON.OBJ_DET_THRESHOLD).squeeze()
            result = cls_dets[final_keep]
            # unsqueeze result to 2 dims (single survivor comes back 1-D)
            if result.numel() > 0 and result.dim() == 1:
                result = result.unsqueeze(0)
            # in testing, concat object labels
            if final_keep.numel() > 0:
                if self.training:
                    result = result[:, :4]
                else:
                    result = torch.cat([result[:, :4], j * torch.ones(result.size(0), 1).type_as(result)], 1)
            if result.numel() > 0:
                results.append(result)
    if len(results):
        final = torch.cat(results, 0)
    else:
        final = torch.Tensor([]).type_as(rois)
    return final
def bld_train(args, ann_path=None, step=0):
    """Run one active-learning training round of Faster R-CNN.

    Pipeline: configure the dataset/cfg from ``args`` -> load the fully
    labeled source roidb and the weakly labeled target roidb -> run a frozen
    "expectation" model over the target set to generate pseudo ground-truth
    boxes (data distillation) -> merge both roidbs -> train ``fasterRCNN``
    and checkpoint every epoch.

    Args:
        args: parsed command-line namespace (dataset, net, lr, cuda, ...).
        ann_path: root directory containing 'source' and 'target' annotations.
        step: active-learning step index, used to name log/output dirs.
    """
    if args.use_tfboard:
        from model.utils.logger import Logger
        # Set the logger.
        # FIX: the third component used to be "/activestep..."; a leading
        # slash makes os.path.join discard './.logs' and the method name,
        # so logs were written to /activestepN at the filesystem root.
        logger = Logger(
            os.path.join('./.logs', args.active_method, "activestep" + str(step)))

    # Map the dataset name to imdb names and anchor configuration.
    if args.dataset == "pascal_voc":
        args.imdb_name = "voc_2007_trainval"
        args.imdbval_name = "voc_2007_test"
        args.set_cfgs = [
            'ANCHOR_SCALES', '[8, 16, 32]', 'ANCHOR_RATIOS', '[0.5,1,2]',
            'MAX_NUM_GT_BOXES', '20'
        ]
    elif args.dataset == "pascal_voc_0712":
        args.imdb_name = "voc_2007_trainval+voc_2012_trainval"
        args.imdbval_name = "voc_2007_test"
        args.set_cfgs = [
            'ANCHOR_SCALES', '[8, 16, 32]', 'ANCHOR_RATIOS', '[0.5,1,2]',
            'MAX_NUM_GT_BOXES', '20'
        ]
    elif args.dataset == "coco":
        args.imdb_name = "coco_2014_train"
        args.imdbval_name = "coco_2014_minival"
        args.set_cfgs = [
            'ANCHOR_SCALES', '[4, 8, 16, 32]', 'ANCHOR_RATIOS', '[0.5,1,2]',
            'MAX_NUM_GT_BOXES', '50'
        ]
    elif args.dataset == "imagenet":
        args.imdb_name = "imagenet_train"
        args.imdbval_name = "imagenet_val"
        args.set_cfgs = [
            'ANCHOR_SCALES', '[4, 8, 16, 32]', 'ANCHOR_RATIOS', '[0.5,1,2]',
            'MAX_NUM_GT_BOXES', '30'
        ]
    elif args.dataset == "vg":
        # train sizes: train, smalltrain, minitrain
        # train scale: ['150-50-20', '150-50-50', '500-150-80', '750-250-150', '1750-700-450', '1600-400-20']
        args.imdb_name = "vg_150-50-50_minitrain"
        args.imdbval_name = "vg_150-50-50_minival"
        args.set_cfgs = [
            'ANCHOR_SCALES', '[4, 8, 16, 32]', 'ANCHOR_RATIOS', '[0.5,1,2]',
            'MAX_NUM_GT_BOXES', '50'
        ]
    elif args.dataset == "voc_coco":
        args.imdb_name = "voc_coco_2007_train+voc_coco_2007_val"
        args.imdbval_name = "voc_coco_2007_test"
        args.set_cfgs = [
            'ANCHOR_SCALES', '[8, 16, 32]', 'ANCHOR_RATIOS', '[0.5,1,2]',
            'MAX_NUM_GT_BOXES', '20'
        ]
    else:
        raise NotImplementedError

    args.cfg_file = "cfgs/{}_ls.yml".format(
        args.net) if args.large_scale else "cfgs/{}.yml".format(args.net)
    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)

    if torch.cuda.is_available() and not args.cuda:
        print(
            "WARNING: You have a CUDA device, so you should probably run with --cuda"
        )

    # train set = source set + target set
    # -- Note: Use validation set and disable the flipped to enable faster loading.
    cfg.TRAIN.USE_FLIPPED = True
    cfg.USE_GPU_NMS = args.cuda

    # Source train set is fully labeled; target set is weakly labeled.
    imdb, roidb, ratio_list, ratio_index = combined_roidb(
        args.imdb_name, ann_path=os.path.join(ann_path, 'source'))
    imdb_tg, roidb_tg, ratio_list_tg, ratio_index_tg = combined_roidb(
        args.imdb_name, ann_path=os.path.join(ann_path, 'target'))

    print('{:d} roidb entries for source set'.format(len(roidb)))
    print('{:d} roidb entries for target set'.format(len(roidb_tg)))

    output_dir = args.save_dir + "/" + args.net + "/" + args.dataset + "/" + args.active_method + "/activestep" + str(
        step)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    sampler_batch_tg = None  # do not sample target set
    bs_tg = 4
    dataset_tg = roibatchLoader(roidb_tg, ratio_list_tg, ratio_index_tg, bs_tg, \
                                imdb_tg.num_classes, training=True)
    assert imdb.num_classes == imdb_tg.num_classes
    dataloader_tg = torch.utils.data.DataLoader(dataset_tg,
                                                batch_size=bs_tg,
                                                sampler=sampler_batch_tg,
                                                num_workers=args.num_workers,
                                                worker_init_fn=_rand_fn())

    # Initialize the tensor holders; they are resized per batch below.
    im_data = torch.FloatTensor(1)
    im_info = torch.FloatTensor(1)
    num_boxes = torch.LongTensor(1)
    gt_boxes = torch.FloatTensor(1)
    image_label = torch.FloatTensor(1)
    confidence = torch.FloatTensor(1)

    # ship to cuda
    if args.cuda:
        im_data = im_data.cuda()
        im_info = im_info.cuda()
        num_boxes = num_boxes.cuda()
        gt_boxes = gt_boxes.cuda()
        image_label = image_label.cuda()
        confidence = confidence.cuda()

    # make variable (legacy autograd API used throughout this file)
    im_data = Variable(im_data)
    im_info = Variable(im_info)
    num_boxes = Variable(num_boxes)
    gt_boxes = Variable(gt_boxes)
    image_label = Variable(image_label)
    confidence = Variable(confidence)

    if args.cuda:
        cfg.CUDA = True

    # Initialize the network to be trained.
    if args.net == 'vgg16':
        fasterRCNN = vgg16(imdb.classes, pretrained=True, class_agnostic=args.class_agnostic)
    elif args.net == 'res101':
        fasterRCNN = resnet(imdb.classes, 101, pretrained=True, class_agnostic=args.class_agnostic)
    elif args.net == 'res50':
        fasterRCNN = resnet(imdb.classes, 50, pretrained=True, class_agnostic=args.class_agnostic)
    elif args.net == 'res152':
        fasterRCNN = resnet(imdb.classes, 152, pretrained=True, class_agnostic=args.class_agnostic)
    else:
        print("network is not defined")
        raise NotImplementedError

    # Initialize the (frozen) expectation network used for pseudo-labels.
    if args.net == 'vgg16':
        fasterRCNN_val = vgg16(imdb.classes, pretrained=True, class_agnostic=args.class_agnostic)
    elif args.net == 'res101':
        fasterRCNN_val = resnet(imdb.classes, 101, pretrained=True, class_agnostic=args.class_agnostic)
    elif args.net == 'res50':
        fasterRCNN_val = resnet(imdb.classes, 50, pretrained=True, class_agnostic=args.class_agnostic)
    elif args.net == 'res152':
        fasterRCNN_val = resnet(imdb.classes, 152, pretrained=True, class_agnostic=args.class_agnostic)
    else:
        print("network is not defined")
        raise NotImplementedError

    fasterRCNN.create_architecture()
    fasterRCNN_val.create_architecture()

    lr = args.lr

    # Per-parameter options: biases get doubled lr and (optionally) no decay.
    params = []
    for key, value in dict(fasterRCNN.named_parameters()).items():
        if value.requires_grad:
            if 'bias' in key:
                params += [{'params': [value], 'lr': lr * (cfg.TRAIN.DOUBLE_BIAS + 1), \
                            'weight_decay': cfg.TRAIN.BIAS_DECAY and cfg.TRAIN.WEIGHT_DECAY or 0}]
            else:
                params += [{
                    'params': [value],
                    'lr': lr,
                    'weight_decay': cfg.TRAIN.WEIGHT_DECAY
                }]

    if args.optimizer == "adam":
        lr = lr * 0.1
        optimizer = torch.optim.Adam(params)
    elif args.optimizer == "sgd":
        optimizer = torch.optim.SGD(params, momentum=cfg.TRAIN.MOMENTUM)
    else:
        raise NotImplementedError

    if args.resume:
        load_name = os.path.join(
            output_dir,
            'faster_rcnn_{}_{}_{}.pth'.format(args.checksession, args.checkepoch, args.checkpoint))
        print("loading checkpoint %s" % (load_name))
        checkpoint = torch.load(load_name)
        args.session = checkpoint['session']
        args.start_epoch = checkpoint['epoch']
        fasterRCNN.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr = optimizer.param_groups[0]['lr']
        if 'pooling_mode' in checkpoint.keys():
            cfg.POOLING_MODE = checkpoint['pooling_mode']
        print("loaded checkpoint %s" % (load_name))

    # Load the expectation model weights.
    print("load checkpoint for expectation model: %s" % args.model_path)
    checkpoint = torch.load(args.model_path)
    fasterRCNN_val.load_state_dict(checkpoint['model'])
    if 'pooling_mode' in checkpoint.keys():
        cfg.POOLING_MODE = checkpoint['pooling_mode']
    fasterRCNN_val.eval()

    if args.mGPUs:
        fasterRCNN = nn.DataParallel(fasterRCNN)
        #fasterRCNN_val = nn.DataParallel(fasterRCNN_val)

    if args.cuda:
        fasterRCNN.cuda()
        fasterRCNN_val.cuda()

    # ---- Pseudo-label generation over the target set ----
    fname = "noisy_annotations.pkl"
    if not os.path.isfile(fname):
        for batch_k, data in enumerate(dataloader_tg):
            im_data.data.resize_(data[0].size()).copy_(data[0])
            im_info.data.resize_(data[1].size()).copy_(data[1])
            gt_boxes.data.resize_(data[2].size()).copy_(data[2])
            num_boxes.data.resize_(data[3].size()).copy_(data[3])
            image_label.data.resize_(data[4].size()).copy_(data[4])
            b_size = len(im_data)
            # Expectation pass with the frozen model.
            rois, cls_prob, bbox_pred, \
                _, _, _, _, _ = fasterRCNN_val(im_data, im_info, gt_boxes, num_boxes)
            scores = cls_prob.data
            boxes = rois.data[:, :, 1:5]
            if cfg.TRAIN.BBOX_REG:
                # Apply bounding-box regression deltas.
                box_deltas = bbox_pred.data
                if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
                    # Optionally denormalize targets by precomputed mean/stdev.
                    if args.class_agnostic:
                        box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                                     + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                        box_deltas = box_deltas.view(b_size, -1, 4)
                    else:
                        box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                                     + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                        box_deltas = box_deltas.view(b_size, -1, 4 * len(imdb.classes))
                pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
                pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
            else:
                # Simply repeat the boxes, once for each class.
                pred_boxes = np.tile(boxes, (1, scores.shape[1]))

            # Choose the confident samples as pseudo ground truth.
            for b_idx in range(b_size):
                # Undo the image scaling so boxes are in original coordinates.
                pred_boxes[b_idx] /= data[1][b_idx][2]
                # FIX: was `xrange` (Python 2 only).
                for j in range(1, imdb.num_classes):
                    if image_label.data[b_idx, j] != 1:
                        continue  # skip classes not in the image-level label

                    # Filter degenerate boxes (zero width or height).
                    not_keep = (pred_boxes[b_idx][:, j * 4] == pred_boxes[b_idx][:, j * 4 + 2]) | \
                               (pred_boxes[b_idx][:, j * 4 + 1] == pred_boxes[b_idx][:, j * 4 + 3])
                    keep = torch.nonzero(not_keep == 0).view(-1)

                    # Lower the threshold until at least one detection passes.
                    thresh = 0.5
                    while torch.nonzero(
                            scores[b_idx, :, j][keep] > thresh).view(-1).numel() <= 0:
                        thresh = thresh * 0.5
                    inds = torch.nonzero(
                        scores[b_idx, :, j][keep] > thresh).view(-1)
                    if inds.numel() <= 0:
                        print('Warning!!!!!!! It should not appear!!')
                        continue

                    # Find the first empty gt slot (class 0 marks "unused").
                    missing_list = np.where(gt_boxes.data[b_idx, :, 4] == 0)[0]
                    if (len(missing_list) == 0):
                        continue
                    missing_id = missing_list[0]
                    cls_scores = scores[b_idx, :, j][keep][inds]
                    cls_boxes = pred_boxes[b_idx][keep][inds][:, j * 4:(j + 1) * 4]
                    cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
                    keep = nms(cls_dets, 0.2)  # Magic number ????
                    keep = keep.view(-1).tolist()
                    sys.stdout.write(
                        'from {} predictions choose-> min({},4) as pseudo label \r'
                        .format(len(cls_scores), len(keep)))
                    sys.stdout.flush()
                    _, order = torch.sort(cls_scores[keep], 0, True)
                    if len(keep) == 0:
                        continue
                    # Keep at most 4 pseudo boxes per class, bounded by the
                    # gt buffer size (20 slots).
                    max_keep = 4
                    for pgt_k in range(max_keep):
                        if len(order) <= pgt_k:
                            break
                        if missing_id + pgt_k >= 20:
                            break
                        gt_boxes.data[b_idx, missing_id + pgt_k, :4] = cls_boxes[keep][order[len(order) - 1 - pgt_k]]
                        gt_boxes.data[b_idx, missing_id + pgt_k, 4] = j  # class
                        num_boxes[b_idx] = num_boxes[b_idx] + 1

                # Write pseudo labels back into the target roidb entry.
                sample = roidb_tg[dataset_tg.ratio_index[batch_k * bs_tg + b_idx]]
                pgt_boxes = np.array([
                    gt_boxes[b_idx, x, :4].cpu().data.numpy()
                    for x in range(int(num_boxes[b_idx]))
                ])
                # NOTE(review): `.data[0]` is the pre-0.4 PyTorch scalar API,
                # used consistently across this file; left unchanged.
                pgt_classes = np.array([
                    gt_boxes[b_idx, x, 4].cpu().data[0]
                    for x in range(int(num_boxes[b_idx]))
                ])
                sample["boxes"] = pgt_boxes
                sample["gt_classes"] = pgt_classes

                # DEBUG
                assert np.array_equal(sample["label"], image_label[b_idx].cpu().data.numpy()), \
                    "Image labels are not equal! {} vs {}".format(sample["label"], image_label[b_idx].cpu().data.numpy())
    else:
        pass
        # with open(fname) as f:  # Python 3: open(..., 'rb')
        #     roidb_tg = pickle.load(f)

    print("-- Optimization Stage --")
    # Optimization
    print("######################################################l")
    roidb.extend(roidb_tg)  # merge two datasets
    print('before filtering, there are %d images...' % (len(roidb)))
    # Drop entries that ended up with no boxes at all.
    # FIX: removed a dead `if True: ... else: ...` whose branches were identical.
    i = 0
    while i < len(roidb):
        if len(roidb[i]['boxes']) == 0:
            del roidb[i]
            i -= 1
        i += 1
    print('after filtering, there are %d images...' % (len(roidb)))

    from roi_data_layer.roidb import rank_roidb_ratio
    ratio_list, ratio_index = rank_roidb_ratio(roidb)
    train_size = len(roidb)
    sampler_batch = sampler(train_size, args.batch_size)
    dataset = roibatchLoader(roidb, ratio_list, ratio_index, args.batch_size, \
                             imdb.num_classes, training=True)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=args.batch_size,
                                             sampler=sampler_batch,
                                             num_workers=args.num_workers,
                                             worker_init_fn=_rand_fn())
    iters_per_epoch = int(train_size / args.batch_size)
    print("Training set size is {}".format(train_size))

    for epoch in range(args.start_epoch, args.max_epochs + 1):
        fasterRCNN.train()
        loss_temp = 0
        start = time.time()
        epoch_start = start

        # adjust learning rate
        if epoch % (args.lr_decay_step + 1) == 0:
            adjust_learning_rate(optimizer, args.lr_decay_gamma)
            lr *= args.lr_decay_gamma

        data_iter = iter(dataloader)
        for step in range(iters_per_epoch):
            data = next(data_iter)
            im_data.data.resize_(data[0].size()).copy_(data[0])
            im_info.data.resize_(data[1].size()).copy_(data[1])
            gt_boxes.data.resize_(data[2].size()).copy_(data[2])
            num_boxes.data.resize_(data[3].size()).copy_(data[3])
            image_label.data.resize_(data[4].size()).copy_(data[4])
            conf_data = torch.zeros(gt_boxes.size(0), gt_boxes.size(1)).cuda()
            confidence.data.resize_(conf_data.size()).copy_(conf_data)

            fasterRCNN.zero_grad()
            rois, cls_prob, bbox_pred, \
                rpn_loss_cls, rpn_loss_box, \
                RCNN_loss_cls, RCNN_loss_bbox, \
                rois_label = fasterRCNN(im_data, im_info, gt_boxes, num_boxes)

            loss = rpn_loss_cls.mean() + rpn_loss_box.mean() \
                   + RCNN_loss_cls.mean() + RCNN_loss_bbox.mean()
            loss_temp += loss.data[0]

            # backward
            optimizer.zero_grad()
            loss.backward()
            if args.net == "vgg16":
                clip_gradient(fasterRCNN, 10.)
            optimizer.step()

            if step % args.disp_interval == 0:
                end = time.time()
                if step > 0:
                    loss_temp /= args.disp_interval

                if args.mGPUs:
                    loss_rpn_cls = rpn_loss_cls.mean().data[0]
                    loss_rpn_box = rpn_loss_box.mean().data[0]
                    loss_rcnn_cls = RCNN_loss_cls.mean().data[0]
                    loss_rcnn_box = RCNN_loss_bbox.mean().data[0]
                    fg_cnt = torch.sum(rois_label.data.ne(0))
                    bg_cnt = rois_label.data.numel() - fg_cnt
                else:
                    loss_rpn_cls = rpn_loss_cls.data[0]
                    loss_rpn_box = rpn_loss_box.data[0]
                    loss_rcnn_cls = RCNN_loss_cls.data[0]
                    loss_rcnn_box = RCNN_loss_bbox.data[0]
                    fg_cnt = torch.sum(rois_label.data.ne(0))
                    bg_cnt = rois_label.data.numel() - fg_cnt

                print("[session %d][epoch %2d][iter %4d/%4d] loss: %.4f, lr: %.2e" \
                      % (args.session, epoch, step, iters_per_epoch, loss_temp, lr))
                print("\t\t\tfg/bg=(%d/%d), time cost: %f" % (fg_cnt, bg_cnt, end - start))
                print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box %.4f" \
                      % (loss_rpn_cls, loss_rpn_box, loss_rcnn_cls, loss_rcnn_box))

                if args.use_tfboard:
                    info = {
                        'loss': loss_temp,
                        'loss_rpn_cls': loss_rpn_cls,
                        'loss_rpn_box': loss_rpn_box,
                        'loss_rcnn_cls': loss_rcnn_cls,
                        'loss_rcnn_box': loss_rcnn_box
                    }
                    for tag, value in info.items():
                        logger.scalar_summary(tag, value, step)
                    images = []
                    for k in range(args.batch_size):
                        image = draw_bounding_boxes(
                            im_data[k].data.cpu().numpy(),
                            gt_boxes[k].data.cpu().numpy(),
                            im_info[k].data.cpu().numpy(),
                            num_boxes[k].data.cpu().numpy())
                        images.append(image)
                    logger.image_summary("Train epoch %2d, iter %4d/%4d" % (epoch, step, iters_per_epoch), \
                                         images, step)

                loss_temp = 0
                start = time.time()

        # Checkpoint at the end of every epoch.
        if args.mGPUs:
            save_name = os.path.join(
                output_dir,
                'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step))
            save_checkpoint(
                {
                    'session': args.session,
                    'epoch': epoch + 1,
                    'model': fasterRCNN.module.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'pooling_mode': cfg.POOLING_MODE,
                    'class_agnostic': args.class_agnostic,
                }, save_name)
        else:
            save_name = os.path.join(
                output_dir,
                'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step))
            save_checkpoint(
                {
                    'session': args.session,
                    'epoch': epoch + 1,
                    'model': fasterRCNN.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'pooling_mode': cfg.POOLING_MODE,
                    'class_agnostic': args.class_agnostic,
                }, save_name)
        print('save model: {}'.format(save_name))

        epoch_end = time.time()
        print('Epoch time cost: {}'.format(epoch_end - epoch_start))

    print('finished!')
def forward(self, im_data, im_info, gt_boxes, num_boxes):
    """Run the base Faster R-CNN, pick the top global/local region per class
    pair, RoI-pool both regions, and classify their concatenated features
    through the GLCC head.

    Args:
        im_data: input image batch tensor.
        im_info: per-image meta (height, width, scale).
        gt_boxes: ground-truth boxes (forwarded to the base detector).
        num_boxes: number of valid ground-truth boxes.

    Returns:
        The base detector outputs (rois, cls_prob, bbox_pred, losses,
        rois_label) plus ``x``, the GLCC classification logits.
    """
    batch_size = im_data.size(0)
    im_info = im_info.data
    gt_boxes = gt_boxes.data
    num_boxes = num_boxes.data
    rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, \
        RCNN_loss_cls, RCNN_loss_bbox, rois_label \
        = self.FRCN(im_data, im_info, gt_boxes, num_boxes)

    # Get the shared feature map for global/local region pooling.
    base_feat = self.FRCN.RCNN_base(im_data)

    scores = cls_prob.data
    boxes = rois.data[:, :, 1:5]
    box_deltas = self.FRCN._bbox_pred.data
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        if self.class_agnostic:
            if self.use_cuda > 0:
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(
                    cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() + torch.FloatTensor(
                        cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
            else:
                # FIX: was `torch.FlaotTensor` (typo -> AttributeError) and
                # used `*` instead of `+` for the means, unlike every other
                # denormalization branch in this file.
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(
                    cfg.TRAIN.BBOX_NORMALIZE_STDS) + torch.FloatTensor(
                        cfg.TRAIN.BBOX_NORMALIZE_MEANS)
            box_deltas = box_deltas.view(1, -1, 4)
        else:
            if self.use_cuda > 0:
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(
                    cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() + torch.FloatTensor(
                        cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
            else:
                # FIX: was `torhc.FlaotTensor` (double typo -> NameError).
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(
                    cfg.TRAIN.BBOX_NORMALIZE_STDS) + torch.FloatTensor(
                        cfg.TRAIN.BBOX_NORMALIZE_MEANS)
            box_deltas = box_deltas.view(1, -1, 4 * len(self.classes))

    pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
    pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)

    scores = scores.squeeze()
    pred_boxes = pred_boxes.squeeze()

    # Pick the highest-scoring global (classes 1..3) and local (classes 4..6)
    # region for each of the 3 class pairs.
    thresh = 0.00
    region_g = np.ndarray((0, 5))
    region_l = np.ndarray((0, 5))
    for j in range(1, 4):
        inds = torch.nonzero(scores[:, j] >= thresh).view(-1)
        inds_l = torch.nonzero(scores[:, j + 3] >= thresh).view(-1)
        if inds.numel() > 0 and inds_l.numel() > 0:
            cls_scores = scores[:, j][inds]
            cls_scores_l = scores[:, j + 3][inds_l]
            _, order = torch.sort(cls_scores, 0, True)
            _, order_l = torch.sort(cls_scores_l, 0, True)
            if self.class_agnostic:
                cls_boxes = pred_boxes[inds]
                cls_boxes_l = pred_boxes[inds_l]
            else:
                cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]
                cls_boxes_l = pred_boxes[inds_l][:, (j + 3) * 4:(j + 4) * 4]
            cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
            cls_dets_l = torch.cat(
                (cls_boxes_l, cls_scores_l.unsqueeze(1)), 1)
            cls_dets = cls_dets[order]
            cls_dets_l = cls_dets_l[order_l]
            region_g = np.vstack(
                (region_g, cls_dets[np.argmax(cls_dets[..., -1])]))
            region_l = np.vstack(
                (region_l, cls_dets_l[np.argmax(cls_dets_l[..., -1])]))

    # Debug visualization of the chosen global (red) and local (yellow) boxes.
    if True:
        print(region_g)
        print(region_l)
        im = im_data.cpu().numpy()[0]
        im = np.transpose(im, (1, 2, 0))[..., ::-1]
        im -= im.min()
        im /= im.max()
        # FIX: `np.float` alias was removed from NumPy (>=1.24); the builtin
        # `float` is the exact equivalent (float64).
        plt.imshow(im.astype(float))
        ax = plt.axes()
        ax.add_patch(
            plt.Rectangle((region_g[0, 0], region_g[0, 1]),
                          region_g[0, 2] - region_g[0, 0],
                          region_g[0, 3] - region_g[0, 1],
                          fill=False,
                          edgecolor='red',
                          linewidth=1))
        ax.add_patch(
            plt.Rectangle((region_l[0, 0], region_l[0, 1]),
                          region_l[0, 2] - region_l[0, 0],
                          region_l[0, 3] - region_l[0, 1],
                          fill=False,
                          edgecolor='yellow',
                          linewidth=1))
        plt.show()

    # Convert the two winning regions to RoI format (batch_idx, x1, y1, x2, y2)
    # on the feature-map scale (/16 is the base network stride here).
    rois_g = np.zeros((1, 1, 5), dtype=np.float32)
    rois_g[0, 0, 1:5] = region_g[0, :4] / 16.
    rois_l = np.zeros((1, 1, 5), dtype=np.float32)
    rois_l[0, 0, 1:5] = region_l[0, :4] / 16.
    GPU = 0
    rois_g = torch.tensor(rois_g, dtype=torch.float).to(GPU)
    rois_l = torch.tensor(rois_l, dtype=torch.float).to(GPU)

    # Pool the global region.
    # NOTE(review): 'crop' uses self.FRCN.RCNN_roi_crop but 'align'/'pool'
    # use self.RCNN_roi_align / self.RCNN_roi_pool — confirm both attribute
    # locations exist on this module.
    if cfg.POOLING_MODE == 'crop':
        grid_xy = _affine_grid_gen(rois_g.view(-1, 5), base_feat.size()[2:], self.FRCN.grid_size)
        grid_yx = torch.stack([grid_xy.data[..., 1], grid_xy.data[..., 0]], 3).contiguous()
        pooled_feat_g = self.FRCN.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            pooled_feat_g = F.max_pool2d(pooled_feat_g, 2, 2)
    elif cfg.POOLING_MODE == 'align':
        pooled_feat_g = self.RCNN_roi_align(base_feat, rois_g.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat_g = self.RCNN_roi_pool(base_feat, rois_g.view(-1, 5))

    # Pool the local region.
    if cfg.POOLING_MODE == 'crop':
        grid_xy = _affine_grid_gen(rois_l.view(-1, 5), base_feat.size()[2:], self.FRCN.grid_size)
        grid_yx = torch.stack([grid_xy.data[..., 1], grid_xy.data[..., 0]], 3).contiguous()
        pooled_feat_l = self.FRCN.RCNN_roi_crop(base_feat, Variable(grid_yx).detach())
        if cfg.CROP_RESIZE_WITH_MAX_POOL:
            pooled_feat_l = F.max_pool2d(pooled_feat_l, 2, 2)
    elif cfg.POOLING_MODE == 'align':
        pooled_feat_l = self.RCNN_roi_align(base_feat, rois_l.view(-1, 5))
    elif cfg.POOLING_MODE == 'pool':
        pooled_feat_l = self.RCNN_roi_pool(base_feat, rois_l.view(-1, 5))

    # GLCC head: conv -> fc1 -> fc2 -> output over concatenated features.
    x = torch.cat((pooled_feat_g, pooled_feat_l), dim=1)
    x = self.glcc_conv1(x)
    x = F.relu(x)
    x = x.view(-1, self.roipool * self.roipool * 512)
    x = self.glcc_fc1(x)
    # FIX: was `s = F.relu(x)`, which discarded the activation; the fc2
    # branch below applies relu in-place on `x`, so fc1 should too.
    x = F.relu(x)
    x = nn.Dropout2d()(x)
    x = self.glcc_fc2(x)
    x = F.relu(x)
    x = nn.Dropout2d()(x)
    x = self.glcc_fc_out(x)

    return rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_bbox, RCNN_loss_cls, RCNN_loss_bbox, rois_label, x
def object_detection(im, result):
    """Detect objects in a single image with the module-level ``fasterRCNN``.

    Relies on module-level state: ``fasterRCNN``, ``im_data``, ``im_info``,
    ``gt_boxes``, ``num_boxes``, ``args``, ``vis``, ``thresh`` and
    ``pascal_classes``.

    Args:
        im: preprocessed (BGR) image array fed to ``_get_image_blob``.
        result: image to draw detections on when ``vis`` is truthy.

    Returns:
        (result_box, im2show): a list of ``[cls_dets_tensor, class_idx]``
        pairs, and the visualization image (``None`` when ``vis`` is falsy).
    """
    blobs, im_scales = _get_image_blob(im)
    assert len(im_scales) == 1, "Only single-image batch implemented"
    im_blob = blobs
    im_info_np = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],
                          dtype=np.float32)

    im_data_pt = torch.from_numpy(im_blob)
    im_data_pt = im_data_pt.permute(0, 3, 1, 2)
    im_info_pt = torch.from_numpy(im_info_np)

    im_data.data.resize_(im_data_pt.size()).copy_(im_data_pt)
    im_info.data.resize_(im_info_pt.size()).copy_(im_info_pt)
    gt_boxes.data.resize_(1, 1, 5).zero_()
    num_boxes.data.resize_(1).zero_()

    det_tic = time.time()
    rois, cls_prob, bbox_pred, \
        rpn_loss_cls, rpn_loss_box, \
        RCNN_loss_cls, RCNN_loss_bbox, \
        rois_label = fasterRCNN(im_data, im_info, gt_boxes, num_boxes)

    scores = cls_prob.data
    boxes = rois.data[:, :, 1:5]
    result_box = []
    if cfg.TEST.BBOX_REG:
        # Apply bounding-box regression deltas.
        box_deltas = bbox_pred.data
        if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
            # Optionally denormalize targets by precomputed mean/stdev.
            if args.class_agnostic:
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                             + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                box_deltas = box_deltas.view(1, -1, 4)
            else:
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                             + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                box_deltas = box_deltas.view(1, -1, 4 * len(pascal_classes))
        pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
        pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
    else:
        # Simply repeat the boxes, once for each class.
        pred_boxes = np.tile(boxes, (1, scores.shape[1]))

    pred_boxes /= im_scales[0]
    scores = scores.squeeze()
    pred_boxes = pred_boxes.squeeze()
    det_toc = time.time()
    detect_time = det_toc - det_tic
    misc_tic = time.time()

    # FIX: im2show was only assigned under `if vis:` but is returned
    # unconditionally, raising NameError when vis is falsy.
    im2show = None
    if vis:
        im2show = np.copy(result)

    # FIX: was `xrange` (Python 2 only).
    for j in range(1, len(pascal_classes)):
        inds = torch.nonzero(scores[:, j] > thresh).view(-1)
        # if there is det
        if inds.numel() > 0:
            cls_scores = scores[:, j][inds]
            _, order = torch.sort(cls_scores, 0, True)
            if args.class_agnostic:
                cls_boxes = pred_boxes[inds, :]
            else:
                cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]
            cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
            cls_dets = cls_dets[order]
            keep = nms(cls_dets, cfg.TEST.NMS)
            cls_dets = cls_dets[keep.view(-1).long()]
            result_box.append([cls_dets.cpu(), j])
            if vis:
                im2show = vis_detections(im2show, pascal_classes[j],
                                         cls_dets.cpu().numpy(), 0.5)

    misc_toc = time.time()
    nms_time = misc_toc - misc_tic
    return result_box, im2show
def pose_est(self, im_in):
    """Estimate the best grasp pose in an RGB image.

    Runs the pose-aware Faster R-CNN (``self.fasterRCNN``), keeps per-class
    detections above ``self.thresh``, and—when ``self.vis`` is on—draws them
    and ranks candidate poses by angle score.

    Args:
        im_in: RGB image array (converted to BGR internally).

    Returns:
        (pose_highest, im2show_copy, im2showRGB): the highest-angle-score
        pose across all classes (``[]`` if none was selected), the annotated
        BGR image as uint8, and its RGB conversion.
    """
    # rgb -> bgr
    im = im_in[:, :, ::-1]

    blobs, im_scales = _get_image_blob(im)
    im_blob = blobs
    im_info_np = np.array(
        [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],
        dtype=np.float32)

    im_data_pt = torch.from_numpy(im_blob)
    im_data_pt = im_data_pt.permute(0, 3, 1, 2)
    im_info_pt = torch.from_numpy(im_info_np)

    with torch.no_grad():
        self.im_data.resize_(im_data_pt.size()).copy_(im_data_pt)
        self.im_info.resize_(im_info_pt.size()).copy_(im_info_pt)
        self.gt_boxes.resize_(1, 1, 6).zero_()
        self.num_boxes.resize_(1).zero_()

    rois, cls_prob, bbox_pred, \
        rpn_loss_cls, rpn_loss_box, \
        RCNN_loss_cls, RCNN_loss_bbox, \
        rois_label, ps_prob, RCNN_loss_ps, rois_pose = self.fasterRCNN(self.im_data, self.im_info, self.gt_boxes, self.num_boxes)

    scores = cls_prob.data
    boxes = rois.data[:, :, 1:5]
    scores_ps = ps_prob.data

    if cfg.TEST.BBOX_REG:  # Test using bounding-box regressors, True
        # Apply bounding-box regression deltas.
        box_deltas = bbox_pred.data  # (1, 300, 4)
        # True, set in lib/model/utils
        if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
            # Optionally denormalize targets by precomputed mean/stdev.
            if self.class_agnostic:  # our case
                if self.cuda > 0:
                    # (300, 4); BBOX_NORMALIZE_STDS=(0.1, 0.1, 0.2, 0.2),
                    # BBOX_NORMALIZE_MEANS=(0.0, 0.0, 0.0, 0.0)
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                                 + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                else:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) \
                                 + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
                box_deltas = box_deltas.view(1, -1, 4)  # (1, 300, 4)

        # boxes: RoIs output from RPN, in image coordinates.
        pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
        # Clip boxes to image boundaries.
        pred_boxes = clip_boxes(pred_boxes, self.im_info.data, 1)
    else:
        # Simply repeat the boxes, once for each class.
        pred_boxes = np.tile(boxes, (1, scores.shape[1]))

    pred_boxes /= im_scales[0]

    # (1,300,5) --> (300,5). 5: classes
    scores = scores.squeeze()
    pred_boxes = pred_boxes.squeeze()
    scores_ps = scores_ps.squeeze()

    # FIX: im2show was only created under `if self.vis:`, but
    # `np.copy(im2show)` at the end is unconditional, so self.vis=False
    # raised NameError. Creating the copy unconditionally is behaviorally
    # identical when vis is on.
    im2show = np.copy(im)

    daset_classes = self.grasp_classes
    pose_lists = []
    pose_highest_lists = []
    pose_highest = []
    # Start from index 1: index 0 is the background class.
    for j in range(1, len(daset_classes)):
        inds = torch.nonzero(scores[:, j] > self.thresh).view(-1)
        if inds.numel() > 0:
            cls_scores = scores[:, j][inds]
            ps_scores = scores_ps[inds]
            ps_scores_max_values, ps_scores_inds = torch.max(ps_scores, 1)
            _, order = torch.sort(cls_scores, 0, True)
            if self.class_agnostic:
                cls_boxes = pred_boxes[inds, :]
            else:
                cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]

            # Each row: box(4) + class score + best pose score + pose index.
            cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1),
                                  ps_scores_max_values.unsqueeze(1),
                                  ps_scores_inds.unsqueeze(1).float()), 1)
            cls_dets = cls_dets[order]
            keep = nms(cls_boxes[order, :], cls_scores[order], cfg.TEST.NMS)
            cls_dets = cls_dets[keep.view(-1).long()]

            if self.vis:
                # only show bboxes having class score > 0.5
                im2show, pose_list = vis_detections(im2show,
                                                    daset_classes[j],
                                                    cls_dets.cpu().numpy(),
                                                    thresh=0.5)
                # pose_list: list of all bboxes for each class
                if len(pose_list):
                    if len(pose_list) > 1:
                        # sort all bboxes of 1 class according to angle score
                        pose_list.sort(key=itemgetter(5), reverse=True)
                    pose_lists.append(pose_list)
                    # only keep the bbox having the highest angle score of each class
                    pose_highest_lists.append(pose_list[0])
                    print('pose_highest_lists: {}'.format(
                        pose_highest_lists))

    # NOTE(review): pose_highest is only assigned when more than one class
    # produced a candidate; with exactly one class it stays [] — confirm
    # this is the intended behavior before changing it.
    if len(pose_highest_lists) > 1:
        # sort all highest bboxes of all classes, according to angle score
        pose_highest_lists.sort(key=itemgetter(5), reverse=True)
        # get the highest angle score bbox across all classes' bboxes
        pose_highest = pose_highest_lists[0]
        print('pose_highest: {}'.format(pose_highest))

    im2show_copy = np.copy(im2show)
    im2show_copy = im2show_copy.astype(np.uint8)
    im2showRGB = cv2.cvtColor(im2show_copy, cv2.COLOR_BGR2RGB)
    return pose_highest, im2show_copy, im2showRGB
def frcnn(train):
    """Run Faster R-CNN (COCO classes) over every image in ``args.image_dir``.

    When ``train`` is falsy, detects objects in each image and writes a
    per-image class-score vector (indexed by the external ``labels.json``
    vocabulary) to ``annotation_dict.json``.  When ``train`` is truthy,
    does nothing.

    Relies on module-level helpers: parse_args, cfg_from_file/cfg_from_list,
    vgg16/resnet, _get_image_blob, bbox_transform_inv, clip_boxes, nms.
    """
    args = parse_args()
    print('Called with args:')
    print(args)
    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)
    from model.utils.config import cfg
    cfg.USE_GPU_NMS = args.cuda
    print('Using config:')
    pprint.pprint(cfg)
    np.random.seed(cfg.RNG_SEED)
    # train set
    # -- Note: Use validation set and disable the flipped to enable faster loading.
    input_dir = args.load_dir + "/" + args.net + "/" + args.dataset
    if not os.path.exists(input_dir):
        raise Exception(
            'There is no input directory for loading network from ' + input_dir)
    load_name = os.path.join(
        input_dir,
        'faster_rcnn_{}_{}_{}.pth'.format(args.checksession, args.checkepoch,
                                          args.checkpoint))
    # NOTE(review): '___background__' has three leading underscores; the
    # conventional label is '__background__'. Kept as-is since only the
    # foreground names are looked up below — confirm against labels.json.
    pascal_classes = np.asarray([
        '___background__', u'person', u'bicycle', u'car', u'motorcycle',
        u'airplane', u'bus', u'train', u'truck', u'boat', u'traffic light',
        u'fire hydrant', u'stop sign', u'parking meter', u'bench', u'bird',
        u'cat', u'dog', u'horse', u'sheep', u'cow', u'elephant', u'bear',
        u'zebra', u'giraffe', u'backpack', u'umbrella', u'handbag', u'tie',
        u'suitcase', u'frisbee', u'skis', u'snowboard', u'sports ball',
        u'kite', u'baseball bat', u'baseball glove', u'skateboard',
        u'surfboard', u'tennis racket', u'bottle', u'wine glass', u'cup',
        u'fork', u'knife', u'spoon', u'bowl', u'banana', u'apple',
        u'sandwich', u'orange', u'broccoli', u'carrot', u'hot dog', u'pizza',
        u'donut', u'cake', u'chair', u'couch', u'potted plant', u'bed',
        u'dining table', u'toilet', u'tv', u'laptop', u'mouse', u'remote',
        u'keyboard', u'cell phone', u'microwave', u'oven', u'toaster',
        u'sink', u'refrigerator', u'book', u'clock', u'vase', u'scissors',
        u'teddy bear', u'hair drier', u'toothbrush'
    ])
    # initilize the network here.
    # args.imdb_name = "coco_2014_train+coco_2014_valminusminival"
    # imdb, roidb, ratio_list, ratio_index = combined_roidb(args.imdb_name)
    if args.net == 'vgg16':
        fasterRCNN = vgg16(pascal_classes, pretrained=True,
                           class_agnostic=args.class_agnostic)
    elif args.net == 'res101':
        fasterRCNN = resnet(pascal_classes, 101, pretrained=False,
                            class_agnostic=args.class_agnostic)
    elif args.net == 'res50':
        fasterRCNN = resnet(pascal_classes, 50, pretrained=False,
                            class_agnostic=args.class_agnostic)
    elif args.net == 'res152':
        fasterRCNN = resnet(pascal_classes, 152, pretrained=False,
                            class_agnostic=args.class_agnostic)
    else:
        print("network is not defined")
        pdb.set_trace()
    fasterRCNN.create_architecture()
    print("load checkpoint %s" % (load_name))
    if args.cuda > 0:
        checkpoint = torch.load(load_name)
    else:
        # Remap GPU-saved weights onto CPU when CUDA is unavailable.
        checkpoint = torch.load(load_name,
                                map_location=(lambda storage, loc: storage))
    fasterRCNN.load_state_dict(checkpoint['model'])
    if 'pooling_mode' in checkpoint.keys():
        cfg.POOLING_MODE = checkpoint['pooling_mode']
    print('load model successfully!')
    print("load checkpoint %s" % (load_name))
    # initilize the tensor holder here; resized in-place per image below.
    im_data = torch.FloatTensor(1)
    im_info = torch.FloatTensor(1)
    num_boxes = torch.LongTensor(1)
    gt_boxes = torch.FloatTensor(1)
    # ship to cuda
    if args.cuda > 0:
        im_data = im_data.cuda()
        im_info = im_info.cuda()
        num_boxes = num_boxes.cuda()
        gt_boxes = gt_boxes.cuda()
    # make variable
    with torch.no_grad():
        im_data = Variable(im_data)
        im_info = Variable(im_info)
        num_boxes = Variable(num_boxes)
        gt_boxes = Variable(gt_boxes)
    if args.cuda > 0:
        cfg.CUDA = True
    if args.cuda > 0:
        fasterRCNN.cuda()
    fasterRCNN.eval()
    thresh = 0.5
    imglist = os.listdir(args.image_dir)
    num_images = len(imglist)
    print('Loaded Photo: {} images.'.format(num_images))
    import json, re
    from tqdm import tqdm
    d = {}
    pbar = tqdm(imglist)
    if not train:
        # Hoisted out of the loop: the label vocabulary is loop-invariant.
        with open(
                '/home/nesa320/huangshicheng/gitforwork/gsnn/graph/labels.json',
                'r') as f:
            lis = json.load(f)
        for i in pbar:
            im_file = os.path.join(args.image_dir, i)
            im_name = i
            im_in = np.array(imread(im_file))
            if len(im_in.shape) == 2:
                # Grayscale image: replicate the single channel to 3 channels.
                im_in = im_in[:, :, np.newaxis]
                im_in = np.concatenate((im_in, im_in, im_in), axis=2)
            # rgb -> bgr
            im = im_in[:, :, ::-1]
            blobs, im_scales = _get_image_blob(im)
            assert len(im_scales) == 1, "Only single-image batch implemented"
            im_blob = blobs
            im_info_np = np.array(
                [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],
                dtype=np.float32)
            im_data_pt = torch.from_numpy(im_blob)
            im_data_pt = im_data_pt.permute(0, 3, 1, 2)  # NHWC -> NCHW
            im_info_pt = torch.from_numpy(im_info_np)
            im_data.data.resize_(im_data_pt.size()).copy_(im_data_pt)
            im_info.data.resize_(im_info_pt.size()).copy_(im_info_pt)
            gt_boxes.data.resize_(1, 1, 5).zero_()
            num_boxes.data.resize_(1).zero_()
            rois, cls_prob, bbox_pred, \
                rpn_loss_cls, rpn_loss_box, \
                RCNN_loss_cls, RCNN_loss_bbox, \
                rois_label = fasterRCNN(im_data, im_info, gt_boxes, num_boxes)
            scores = cls_prob.data
            boxes = rois.data[:, :, 1:5]
            if cfg.TEST.BBOX_REG:
                # Apply bounding-box regression deltas
                box_deltas = bbox_pred.data
                if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
                    # Optionally normalize targets by a precomputed mean and stdev
                    if args.class_agnostic:
                        if args.cuda > 0:
                            box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                                + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                        else:
                            box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) \
                                + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
                        box_deltas = box_deltas.view(1, -1, 4)
                    else:
                        if args.cuda > 0:
                            box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                                + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                        else:
                            box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) \
                                + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
                        box_deltas = box_deltas.view(1, -1,
                                                     4 * len(pascal_classes))
                pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
                pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
            else:
                # Simply repeat the boxes, once for each class
                pred_boxes = np.tile(boxes, (1, scores.shape[1]))
            pred_boxes /= im_scales[0]  # back to original image scale
            scores = scores.squeeze()
            pred_boxes = pred_boxes.squeeze()
            sm_lis = np.zeros(len(lis))
            # Was `xrange` (Python 2 only) — NameError under Python 3.
            for j in range(1, len(pascal_classes)):
                inds = torch.nonzero(scores[:, j] > thresh).view(-1)
                # if there is det
                if inds.numel() > 0:
                    cls_scores = scores[:, j][inds]
                    _, order = torch.sort(cls_scores, 0, True)
                    if args.class_agnostic:
                        cls_boxes = pred_boxes[inds, :]
                    else:
                        cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]
                    cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)),
                                         1)
                    cls_dets = cls_dets[order]
                    keep = nms(cls_dets, cfg.TEST.NMS,
                               force_cpu=not cfg.USE_GPU_NMS)
                    cls_dets = cls_dets[keep.view(-1).long()]
                    # Highest-scoring surviving detection for this class.
                    score = cls_dets[0][-1]
                    try:
                        sm_lis[lis.index(pascal_classes[j])] = score.numpy()
                    except Exception:
                        # Best-effort: class name absent from labels.json, or
                        # score still on GPU (.numpy() fails on CUDA tensors).
                        pass
            # Key is the digits of the file name (was re.sub("\D", ...) — an
            # invalid escape sequence; raw string is the correct form).
            d[re.sub(r"\D", "", im_name)] = sm_lis.tolist()
        with open('annotation_dict' + '.json', 'w') as f:
            json.dump(d, f, indent=2)
    else:
        pass
torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda() dim_orien = dim_orien * \ torch.FloatTensor(cfg.TRAIN.DIM_NORMALIZE_STDS).cuda() + \ torch.FloatTensor(cfg.TRAIN.DIM_NORMALIZE_MEANS).cuda() box_delta_left = box_delta_left.view(1, -1, 4 * len(imdb._classes)) box_delta_right = box_delta_right.view(1, -1, 4 * len(imdb._classes)) dim_orien = dim_orien.view(1, -1, 5 * len(imdb._classes)) kpts_delta = kpts_delta.view(1, -1, 1) left_delta = left_delta.view(1, -1, 1) right_delta = right_delta.view(1, -1, 1) max_prob = max_prob.view(1, -1, 1) pred_boxes_left = \ bbox_transform_inv(boxes_left, box_delta_left, 1) pred_boxes_right = \ bbox_transform_inv(boxes_right, box_delta_right, 1) pred_kpts, kpts_type = \ kpts_transform_inv(boxes_left, kpts_delta, cfg.KPTS_GRID) pred_left = \ border_transform_inv(boxes_left, left_delta, cfg.KPTS_GRID) pred_right = \ border_transform_inv(boxes_left, right_delta, cfg.KPTS_GRID) pred_boxes_left = clip_boxes(pred_boxes_left, im_info.data, 1) pred_boxes_right = clip_boxes(pred_boxes_right, im_info.data, 1) pred_boxes_left /= im_info[0, 2].data pred_boxes_right /= im_info[0, 2].data pred_kpts /= im_info[0, 2].data
def detect(image, threshold=0.5, max_bbox=20):
    """Detect objects in a single RGB image with the module-level fasterRCNN.

    Args:
        image: RGB image array (H, W, 3).
        threshold: minimum class score for a detection to be reported.
        max_bbox: maximum number of boxes reported per class.

    Returns:
        Flat list; for each kept detection appends, in order:
        score, xmin, ymin, xmax, ymax, class name.

    Uses module-level state: im_data/im_info/gt_boxes/num_boxes holders,
    fasterRCNN, args, cfg, pascal_classes.
    """
    thresh = 0.05  # low pre-NMS score cut; `threshold` filters final output
    vis = True
    result = []
    # rgb -> bgr
    im = image[:, :, ::-1]
    blobs, im_scales = _get_image_blob(im)
    assert len(im_scales) == 1, "Only single-image batch implemented"
    im_blob = blobs
    im_info_np = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],
                          dtype=np.float32)
    im_data_pt = torch.from_numpy(im_blob)
    im_data_pt = im_data_pt.permute(0, 3, 1, 2)  # NHWC -> NCHW
    im_info_pt = torch.from_numpy(im_info_np)
    im_data.data.resize_(im_data_pt.size()).copy_(im_data_pt)
    im_info.data.resize_(im_info_pt.size()).copy_(im_info_pt)
    gt_boxes.data.resize_(1, 1, 5).zero_()
    num_boxes.data.resize_(1).zero_()
    rois, cls_prob, bbox_pred, \
        rpn_loss_cls, rpn_loss_box, \
        RCNN_loss_cls, RCNN_loss_bbox, \
        rois_label = fasterRCNN(im_data, im_info, gt_boxes, num_boxes)
    scores = cls_prob.data
    boxes = rois.data[:, :, 1:5]
    if cfg.TEST.BBOX_REG:
        # Apply bounding-box regression deltas
        box_deltas = bbox_pred.data
        if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
            # Optionally normalize targets by a precomputed mean and stdev
            if args.class_agnostic:
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                    + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                box_deltas = box_deltas.view(1, -1, 4)
            else:
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                    + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                box_deltas = box_deltas.view(1, -1, 4 * len(pascal_classes))
        pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
        pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
    else:
        # Simply repeat the boxes, once for each class
        pred_boxes = np.tile(boxes, (1, scores.shape[1]))
    pred_boxes /= im_scales[0]  # back to original image scale
    scores = scores.squeeze()
    pred_boxes = pred_boxes.squeeze()
    # Was `xrange` (Python 2 only) — NameError under Python 3.
    for j in range(1, len(pascal_classes)):
        inds = torch.nonzero(scores[:, j] > thresh).view(-1)
        # if there is det
        if inds.numel() > 0:
            cls_scores = scores[:, j][inds]
            _, order = torch.sort(cls_scores, 0, True)
            if args.class_agnostic:
                cls_boxes = pred_boxes[inds, :]
            else:
                cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]
            cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
            cls_dets = cls_dets[order]
            keep = nms(cls_dets, cfg.TEST.NMS, force_cpu=not cfg.USE_GPU_NMS)
            cls_dets = cls_dets[keep.view(-1).long()]
            if vis:
                dets = cls_dets.cpu().numpy()
                for i in range(np.minimum(max_bbox, dets.shape[0])):
                    bbox = tuple(int(np.round(x)) for x in dets[i, :4])
                    score = dets[i, -1]
                    if score > threshold:
                        # NOTE(review): dets rows are (x1, y1, x2, y2, score),
                        # yet they are unpacked as ymin, xmin, ymax, xmax —
                        # so the appended "xmin" is actually y1, etc. Output
                        # order preserved for caller compatibility; confirm
                        # against consumers before renaming.
                        ymin, xmin, ymax, xmax = bbox
                        result.append(score)
                        result.append(int(xmin))
                        result.append(int(ymin))
                        result.append(int(xmax))
                        result.append(int(ymax))
                        result.append(pascal_classes[j])
    return result
def Predict(self, im_in, area):
    """Detect objects in `im_in`, save an annotated image for `area`, and
    return the class names of the three highest-confidence detections
    ordered left-to-right.

    Args:
        im_in: input image array; grayscale (H, W) is expanded to 3 channels.
        area: identifier used to name the saved result image.

    Returns:
        dict with keys "Left", "Mid", "Right" mapping to detected item names.
        NOTE(review): indexes [0..2] directly — raises IndexError when fewer
        than three classes are detected; confirm callers guarantee >= 3.
    """
    # initilize the tensor holder here; resized in-place below.
    im_data = torch.FloatTensor(1)
    im_info = torch.FloatTensor(1)
    num_boxes = torch.LongTensor(1)
    gt_boxes = torch.FloatTensor(1)
    # ship to cuda
    if self.cuda > 0:
        im_data = im_data.cuda()
        im_info = im_info.cuda()
        num_boxes = num_boxes.cuda()
        gt_boxes = gt_boxes.cuda()
    # make variable
    with torch.no_grad():
        im_data = Variable(im_data)
        im_info = Variable(im_info)
        num_boxes = Variable(num_boxes)
        gt_boxes = Variable(gt_boxes)
    if self.cuda > 0:
        cfg.CUDA = True
    if self.cuda > 0:
        self.fasterRCNN.cuda()
    self.fasterRCNN.eval()
    # im_in = cv2.imread(im_file)
    if len(im_in.shape) == 2:
        # Grayscale: replicate channel to get a 3-channel image.
        im_in = im_in[:, :, np.newaxis]
        im_in = np.concatenate((im_in, im_in, im_in), axis=2)
    # rgb -> bgr
    im_in = im_in[:, :, ::-1]
    # NOTE(review): the slice above reverses channels and cvtColor reverses
    # them again, so `im` ends up in the original channel order — confirm
    # whether this double conversion is intentional.
    im = cv2.cvtColor(im_in, cv2.COLOR_BGR2RGB)
    blobs, im_scales = self._get_image_blob(im)
    assert len(im_scales) == 1, "Only single-image batch implemented"
    im_blob = blobs
    im_info_np = np.array(
        [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],
        dtype=np.float32)
    im_data_pt = torch.from_numpy(im_blob)
    im_data_pt = im_data_pt.permute(0, 3, 1, 2)  # NHWC -> NCHW
    im_info_pt = torch.from_numpy(im_info_np)
    im_data.data.resize_(im_data_pt.size()).copy_(im_data_pt)
    im_info.data.resize_(im_info_pt.size()).copy_(im_info_pt)
    gt_boxes.data.resize_(1, 1, 5).zero_()
    num_boxes.data.resize_(1).zero_()
    rois, cls_prob, bbox_pred, \
        rpn_loss_cls, rpn_loss_box, \
        RCNN_loss_cls, RCNN_loss_bbox, \
        rois_label = self.fasterRCNN(im_data, im_info, gt_boxes, num_boxes)
    scores = cls_prob.data
    boxes = rois.data[:, :, 1:5]
    if cfg.TEST.BBOX_REG:
        # Apply bounding-box regression deltas
        box_deltas = bbox_pred.data
        if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
            # Optionally normalize targets by a precomputed mean and stdev
            if self.class_agnostic:
                if self.cuda > 0:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                else:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
                box_deltas = box_deltas.view(1, -1, 4)
            else:
                if self.cuda > 0:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                else:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
                box_deltas = box_deltas.view(1, -1,
                                             4 * len(self.pascal_classes))
        pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
        pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
    else:
        # Simply repeat the boxes, once for each class
        _ = torch.from_numpy(np.tile(boxes, (1, scores.shape[1])))
        pred_boxes = _.cuda() if self.cuda > 0 else _
    pred_boxes /= im_scales[0]  # back to original image scale
    scores = scores.squeeze()
    pred_boxes = pred_boxes.squeeze()
    ItemAndBoxes_all = []
    im2show = np.copy(im)
    # Was `xrange` (Python 2 only) — NameError under Python 3.
    for j in range(1, len(self.pascal_classes)):
        inds = torch.nonzero(scores[:, j] > self.thresh).view(-1)
        # if there is det
        if inds.numel() > 0:
            cls_scores = scores[:, j][inds]
            _, order = torch.sort(cls_scores, 0, True)
            if self.class_agnostic:
                cls_boxes = pred_boxes[inds, :]
            else:
                cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]
            cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
            # cls_dets = torch.cat((cls_boxes, cls_scores), 1)
            cls_dets = cls_dets[order]
            keep = nms(cls_dets, cfg.TEST.NMS, force_cpu=not cfg.USE_GPU_NMS)
            cls_dets = cls_dets[keep.view(-1).long()]
            im2show, ItemAndBoxes = vis_detections(im2show,
                                                   self.pascal_classes[j],
                                                   cls_dets.cpu().numpy(),
                                                   self.visThresh)
            ItemAndBoxes_all.append(ItemAndBoxes)
    # Keep the 3 most confident classes, then order them left-to-right
    # (x[2] presumably is confidence, x[1][0] the left coordinate — TODO
    # confirm against vis_detections' return shape).
    ItemAndBoxes_all = sorted(ItemAndBoxes_all,
                              key=lambda x: x[2],
                              reverse=True)
    ItemAndBoxes_all = ItemAndBoxes_all[0:3]
    ItemAndBoxes_all = sorted(ItemAndBoxes_all, key=lambda x: x[1][0])
    if self.vis == 1:
        cv2.namedWindow("result", 0)
        cv2.resizeWindow("result", 1080, 720)
        cv2.imshow('result', im2show)
        cv2.waitKey(0)
    result_path = os.path.join(self.image_dir, str(area) + ".jpg")
    cv2.imwrite(result_path, im2show)
    return {
        "Left": ItemAndBoxes_all[0][0],
        "Mid": ItemAndBoxes_all[1][0],
        "Right": ItemAndBoxes_all[2][0]
    }
def objdet_inference(cls_prob,
                     box_output,
                     im_info,
                     box_prior=None,
                     class_agnostic=True,
                     n_classes=None,
                     for_vis=False,
                     recover_imscale=True):
    """
    :param cls_prob: predicted class info
    :param box_output: predicted bounding boxes (for anchor-based detection,
        it indicates deltas of boxes).
    :param im_info: image scale information, for recovering the original
        bounding box scale before image resizing.
    :param box_prior: anchors, RoIs, e.g.
    :param class_agnostic: whether the boxes are class-agnostic. For faster
        RCNN, it is class-specific by default.
    :param n_classes: number of object classes
    :param for_vis: the results are for visualization or validation.
    :param recover_imscale: whether the predicted bounding boxes are
        recovered to the original scale.
    :return: a list of bounding boxes, one class corresponding to one
        element (index 0 is an empty background placeholder). If for_vis,
        the per-class arrays are concatenated instead.
    """
    assert box_output.dim(
    ) == 2, "Multi-instance batch inference has not been implemented."
    # Score threshold: only filter when producing boxes for visualization.
    if for_vis:
        thresh = cfg.TEST.COMMON.OBJ_DET_THRESHOLD
    else:
        thresh = 0.
    scores = cls_prob
    # TODO: Inference for anchor free algorithms has not been implemented.
    if box_prior is None:
        raise NotImplementedError(
            "Inference for anchor free algorithms has not been implemented.")
    if cfg.TRAIN.COMMON.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Undo the training-time normalization of regression targets.
        normalizer = {
            'mean': cfg.TRAIN.COMMON.BBOX_NORMALIZE_MEANS,
            'std': cfg.TRAIN.COMMON.BBOX_NORMALIZE_STDS
        }
        box_output = box_unnorm_torch(box_output, normalizer, 4,
                                      class_agnostic, n_classes)
    else:
        raise RuntimeError(
            "BBOX_NORMALIZE_TARGETS_PRECOMPUTED is forced to be True in our version."
        )
    pred_boxes = bbox_transform_inv(box_prior, box_output, 1)
    pred_boxes = clip_boxes(pred_boxes, im_info, 1)
    scores = scores.squeeze()
    pred_boxes = pred_boxes.squeeze()
    if recover_imscale:
        pred_boxes = box_recover_scale_torch(pred_boxes, im_info[3],
                                             im_info[2])
    all_box = [[]]  # index 0 reserved for the background class
    # Was `xrange` (Python 2 only) — NameError under Python 3.
    for j in range(1, n_classes):
        if class_agnostic:
            cls_boxes = pred_boxes
        else:
            cls_boxes = pred_boxes[:, j * 4:(j + 1) * 4]
        cls_dets, cls_scores, _ = box_filter(cls_boxes,
                                             scores[:, j],
                                             thresh,
                                             use_nms=True)
        cls_dets = np.concatenate((cls_dets, np.expand_dims(cls_scores, -1)),
                                  axis=-1)
        if for_vis:
            # For visualization the last column carries the class index
            # instead of the score.
            cls_dets[:, -1] = j
        all_box.append(cls_dets)
    if for_vis:
        return np.concatenate(all_box[1:], axis=0)
    return all_box
def test_net(model=None, image=None, params=None, bg=None, cls=None):
    """Run speech-bubble detection on one preprocessed image.

    Args:
        model: Faster R-CNN network; called as model(im_data, im_info,
            gt_boxes, num_boxes).
        image: original RGB image array (channel-reversed before drawing).
        params: (blob, scale, label) — blob is the NHWC network input,
            scale its resize factors, label the class-name list.
        bg, cls: passed through to bubble_utils.get_cnt_bubble.

    Returns:
        (demo, image, bubbles, dets_bubbles) as produced by the last
        call to bubble_utils.get_cnt_bubble (or the initial empty values
        when nothing is detected).
    """
    blob, scale, label = params
    with torch.no_grad():
        # pre-processing data for passing net: CUDA tensor holders,
        # resized in place below.
        im_data = Variable(torch.FloatTensor(1).cuda())
        im_info = Variable(torch.FloatTensor(1).cuda())
        num_boxes = Variable(torch.LongTensor(1).cuda())
        gt_boxes = Variable(torch.FloatTensor(1).cuda())
    im_info_np = np.array([[blob.shape[1], blob.shape[2], scale[0]]],
                          dtype=np.float32)
    im_data_pt = torch.from_numpy(blob)
    im_data_pt = im_data_pt.permute(0, 3, 1, 2)  # NHWC -> NCHW
    im_info_pt = torch.from_numpy(im_info_np)
    with torch.no_grad():
        # resize holders to this image and copy the data in
        im_data.resize_(im_data_pt.size()).copy_(im_data_pt)
        im_info.resize_(im_info_pt.size()).copy_(im_info_pt)
        gt_boxes.resize_(1, 1, 5).zero_()
        num_boxes.resize_(1).zero_()
        rois, cls_prob, bbox_pred, \
            rpn_loss_cls, rpn_loss_box, \
            RCNN_loss_cls, RCNN_loss_bbox, \
            rois_label = model(im_data, im_info, gt_boxes, num_boxes)
    # predict
    scores = cls_prob.data
    boxes = rois.data[:, :, 1:5]
    if opt.TEST_BBOX_REG:
        # Apply bounding-box regression deltas, undoing the training-time
        # target normalization first.
        box_deltas = bbox_pred.data
        if opt.TRAIN_BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
            if opt.cuda:
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(opt.TRAIN_BBOX_NORMALIZE_STDS).cuda() \
                    + torch.FloatTensor(opt.TRAIN_BBOX_NORMALIZE_MEANS).cuda()
            else:
                box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(opt.TRAIN_BBOX_NORMALIZE_STDS) \
                    + torch.FloatTensor(opt.TRAIN_BBOX_NORMALIZE_MEANS)
            box_deltas = box_deltas.view(1, -1, 4 * len(label))
        pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
        pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
    pred_boxes /= scale[0]  # back to original image scale
    scores = scores.squeeze()
    pred_boxes = pred_boxes.squeeze()
    image = np.copy(image[:, :, ::-1])  # rgb -> bgr copy for drawing
    demo = image.copy()
    bubbles = []
    dets_bubbles = []
    for j in range(1, len(label)):  # skip background class 0
        inds = torch.nonzero(scores[:, j] > opt.THRESH).view(-1)
        if inds.numel() > 0:
            cls_scores = scores[:, j][inds]
            _, order = torch.sort(cls_scores, 0, True)
            cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]
            cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
            cls_dets = cls_dets[order]
            keep = nms(cls_boxes[order, :], cls_scores[order], opt.TEST_NMS)
            cls_dets = cls_dets[keep.view(-1).long()].cpu().numpy()
            # post-processing : get contours of speech bubble
            demo, image, bubbles, dets_bubbles = bubble_utils.get_cnt_bubble(
                image, image.copy(), label[j], cls_dets, cls, bg=bg)
    return demo, image, bubbles, dets_bubbles
def forward(self, input):
    """FPN RPN proposal layer: turn per-anchor scores and deltas into a
    fixed-size batch of top-scoring, NMS-filtered RoIs.

    Args:
        input: 5-tuple of
            [0] class probs (fg prob taken from channel 1),
            [1] bbox deltas (batch_size x num_rois x 4),
            [2] im_info, [3] cfg key ('TRAIN'/'TEST'), [4] feature-map shapes.

    Returns:
        Tensor (batch_size, post_nms_topN, 5); column 0 is the batch index,
        columns 1:5 the proposal coordinates; unused rows stay zero.
    """
    # Algorithm:
    #
    # for each (H, W) location i
    #   generate A anchor boxes centered on cell i
    #   apply predicted bbox deltas at cell i to each of the A anchors
    # clip predicted boxes to image
    # remove predicted boxes with either height or width < threshold
    # sort all (proposal, score) pairs by score from highest to lowest
    # take top pre_nms_topN proposals before NMS
    # apply NMS with threshold 0.7 to remaining proposals
    # take after_nms_topN proposals after NMS
    # return the top proposals (-> RoIs top, scores top)

    # the first set of _num_anchors channels are bg probs
    # the second set are the fg probs
    scores = input[0][:, :, 1]  # batch_size x num_rois x 1
    bbox_deltas = input[1]  # batch_size x num_rois x 4
    im_info = input[2]
    cfg_key = input[3]
    feat_shapes = input[4]
    pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
    min_size = cfg[cfg_key].RPN_MIN_SIZE
    batch_size = bbox_deltas.size(0)
    # Anchors for every FPN level, concatenated; same dtype/device as scores.
    anchors = torch.from_numpy(
        generate_anchors_all_pyramids(
            self._fpn_scales, self._anchor_ratios, feat_shapes,
            self._fpn_feature_strides,
            self._fpn_anchor_stride)).type_as(scores)
    num_anchors = anchors.size(0)
    anchors = anchors.view(1, num_anchors,
                           4).expand(batch_size, num_anchors, 4)
    # Convert anchors into proposals via bbox transformations
    proposals = bbox_transform_inv(anchors, bbox_deltas, batch_size)
    # 2. clip predicted boxes to image
    proposals = clip_boxes(proposals, im_info, batch_size)
    # keep_idx = self._filter_boxes(proposals, min_size).squeeze().long().nonzero().squeeze()
    scores_keep = scores
    proposals_keep = proposals
    _, order = torch.sort(scores_keep, 1, True)  # descending per image
    output = scores.new(batch_size, post_nms_topN, 5).zero_()
    for i in range(batch_size):
        # # 3. remove predicted boxes with either height or width < threshold
        # # (NOTE: convert min_size to input image scale stored in im_info[2])
        proposals_single = proposals_keep[i]
        scores_single = scores_keep[i]
        # # 4. sort all (proposal, score) pairs by score from highest to lowest
        # # 5. take top pre_nms_topN (e.g. 6000)
        order_single = order[i]
        # NOTE(review): compares against scores_keep.numel() (whole batch),
        # not scores_keep[i].numel() — matches the upstream implementation;
        # confirm before changing.
        if pre_nms_topN > 0 and pre_nms_topN < scores_keep.numel():
            order_single = order_single[:pre_nms_topN]
        proposals_single = proposals_single[order_single, :]
        scores_single = scores_single[order_single].view(-1, 1)
        # 6. apply nms (e.g. threshold = 0.7)
        # 7. take after_nms_topN (e.g. 300)
        # 8. return the top proposals (-> RoIs top)
        keep_idx_i = nms(proposals_single, scores_single.squeeze(1),
                         nms_thresh)
        # keep_idx_i = nms(proposals_single, scores_single, nms_thresh)
        keep_idx_i = keep_idx_i.long().view(-1)
        if post_nms_topN > 0:
            keep_idx_i = keep_idx_i[:post_nms_topN]
        proposals_single = proposals_single[keep_idx_i, :]
        scores_single = scores_single[keep_idx_i, :]
        # padding 0 at the end.
        num_proposal = proposals_single.size(0)
        output[i, :, 0] = i
        output[i, :num_proposal, 1:] = proposals_single
    return output
def detect(self, cv_img, is_rgb=True):
    """Two-flow food/dish/drink detection on one image.

    Runs the two-headed fasterRCNN (flow_id 0 and 1), merges their class
    scores/boxes, classifies 'food' crops with self.food_classifier, merges
    food boxes into overlapping dishes, and keeps at most one representative
    drink and one food.

    Args:
        cv_img: image array (height, width, channels).
        is_rgb: when True the channels are reversed to BGR first.

    Returns:
        (results, im2show): results is a list of
        [x1, y1, x2, y2, class_name, food_index, food_name, amount 0..100];
        im2show is the (possibly resized) image with drawn detections.
    """
    # - image shape is (height,width,no_channels)
    results = []
    im_in = np.array(cv_img)
    if is_rgb:
        im = im_in[:, :, ::-1]  # rgb -> bgr
    else:
        im = im_in
    blobs, im_scales = self._get_image_blob(im)  # prep_type = 'caffe' is applied
    assert len(im_scales) == 1, "Only single-image batch implemented"
    im_blob = blobs
    im_info_np = np.array(
        [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],
        dtype=np.float32)
    im_data_pt = torch.from_numpy(im_blob)
    im_data_pt = im_data_pt.permute(0, 3, 1, 2)  # NHWC -> NCHW
    im_info_pt = torch.from_numpy(im_info_np)
    with torch.no_grad():
        self.im_data.resize_(im_data_pt.size()).copy_(im_data_pt)
        self.im_info.resize_(im_info_pt.size()).copy_(im_info_pt)
        self.gt_boxes.resize_(1, 1, 5).zero_()
        self.num_boxes.resize_(1).zero_()
    det_tic = time.time()
    # Two forward passes, one per detector head ("flow").
    rois0, cls_prob0, bbox_pred0, _, _, _, _, _, share_pred0, _, progress_pred0, _ = \
        self.fasterRCNN(self.im_data, self.im_info, self.gt_boxes,
                        self.num_boxes, flow_id=0)
    rois1, cls_prob1, bbox_pred1, _, _, _, _, _, share_pred1, _, _, _ = \
        self.fasterRCNN(self.im_data, self.im_info, self.gt_boxes,
                        self.num_boxes, flow_id=1)
    # rois0 # [1, 300, 5]
    rois = torch.cat((rois0, rois1), dim=1)
    # share_pred0 # [1, 300, 1]; broadcast flow-1's scalar share prediction
    # over flow-0's roi count so the two can be concatenated.
    share_ones = torch.ones(
        (share_pred0.shape[0], rois0.shape[1], 1)).cuda() * share_pred1
    share_pred = torch.cat((share_pred0, share_ones), dim=1)
    progress_pred = progress_pred0  # kept for interface parity; unused below
    # Merge per-flow class probs / box deltas into the unified class list.
    # cls_prob0 # [1, 300, 3], bbox_pred0 # [1, 300, 12 (3x4)]
    cls_prob = torch.zeros(
        (cls_prob0.shape[0], cls_prob0.shape[1] + cls_prob1.shape[1],
         len(self.classes_total))).cuda()
    bbox_pred = torch.zeros(
        (bbox_pred0.shape[0], bbox_pred0.shape[1] + bbox_pred1.shape[1],
         4 * len(self.classes_total))).cuda()
    # Fix: num_batch0 was assigned only inside the classes0 branch, so a
    # class present only in classes1 processed first raised NameError.
    num_batch0 = cls_prob0.shape[1]
    for j, j_name in enumerate(self.classes_total):
        if j_name in self.classes0:
            j_idx = (j_name == self.classes0).nonzero()[0][0]
            cls_prob[:, :num_batch0, j] = cls_prob0[:, :, j_idx]
            bbox_pred[:, :num_batch0, j * 4:(j + 1) * 4] = \
                bbox_pred0[:, :, j_idx * 4:(j_idx + 1) * 4]
        if j_name in self.classes1:
            j_idx = (j_name == self.classes1).nonzero()[0][0]
            num_batch1 = cls_prob1.shape[1]
            cls_prob[:, num_batch0:num_batch0 + num_batch1, j] = \
                cls_prob1[:, :, j_idx]
            bbox_pred[:, num_batch0:num_batch0 + num_batch1, j * 4:(j + 1) * 4] = \
                bbox_pred1[:, :, j_idx * 4:(j_idx + 1) * 4]
    scores = cls_prob.data
    boxes = rois.data[:, :, 1:5]
    if cfg.TEST.BBOX_REG:
        # Apply bounding-box regression deltas
        box_deltas = bbox_pred.data
        if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
            # Optionally normalize targets by a precomputed mean and stdev
            if self.class_agnostic:
                if self.cuda > 0:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                else:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
                box_deltas = box_deltas.view(1, -1, 4)
            else:
                if self.cuda > 0:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                else:
                    box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) \
                        + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
                box_deltas = box_deltas.view(1, -1,
                                             4 * len(self.classes_total))
        pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
        pred_boxes = clip_boxes(pred_boxes, self.im_info.data, 1)
    else:
        # Simply repeat the boxes, once for each class
        pred_boxes = np.tile(boxes, (1, scores.shape[1]))
    pred_boxes /= im_scales[0]  # back to original image scale
    if self.use_share_regress:
        share_pred = share_pred.squeeze()
    scores = scores.squeeze()
    pred_boxes = pred_boxes.squeeze()
    det_toc = time.time()
    detect_time = det_toc - det_tic  # kept for profiling
    # Fix: im2show/im_scale were computed only under
    # `if self.vis or self.save_result:` yet are used unconditionally below
    # (bbox_draw scaling, box_y test) -> NameError when both flags are off.
    # Compute them always; behavior with the flags set is unchanged.
    h, w = im.shape[:2]
    if max(h, w) > 800:
        # Downscale the display copy so its longest side is 800 px.
        if h > 800:
            im2show = cv2.resize(im, (int(800 / h * w), 800))
        if w > 800:
            im2show = cv2.resize(im, (800, int(800 / w * h)))
        h_display = im2show.shape[0]
        im_scale = h_display / h
    else:
        im2show = np.copy(im)
        im_scale = 1.0
    im_pil = torchvision.transforms.ToPILImage(mode=None)(im[:, :, ::-1])
    im_width, im_height = im_pil.size
    # (A large commented-out duplicate of the loop below was removed.)
    for j in range(1, len(self.classes_total)):
        inds = torch.nonzero(scores[:, j] > self.thresh).view(
            -1)  # find index with scores > threshold in j-class
        # if there is det
        if inds.numel() > 0:
            cls_scores = scores[:, j][inds]
            if self.use_share_regress:
                share_pred_inds = share_pred[inds]
            _, order = torch.sort(cls_scores, 0, True)
            if self.class_agnostic:
                cls_boxes = pred_boxes[inds, :]
            else:
                cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]
            if self.use_share_regress:
                cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1),
                                      share_pred_inds.unsqueeze(1)), 1)
            else:
                cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
            cls_dets = cls_dets[order]
            keep = nms(cls_boxes[order, :], cls_scores[order], cfg.TEST.NMS)
            cls_dets = cls_dets[keep.view(-1).long()]
            # im: original image, (768, 1024, 3)
            # im_data: blob image, (1, 3, 600, 800)
            # cls_dets: x1, y1, x2, y2, score
            if self.classes_total[j] == 'food':
                # Crop each food box (with margin) and run the classifier.
                for k in range(cls_dets.shape[0]):
                    crop_margin_ratio = 0.1
                    x1 = int(cls_dets[k, 0])
                    y1 = int(cls_dets[k, 1])
                    x2 = int(cls_dets[k, 2])
                    y2 = int(cls_dets[k, 3])
                    crop_h_margin = (y2 - y1) * crop_margin_ratio / 2.
                    crop_w_margin = (x2 - x1) * crop_margin_ratio / 2.
                    x1 = x1 - crop_w_margin
                    y1 = y1 - crop_h_margin
                    x2 = x2 + crop_w_margin
                    y2 = y2 + crop_h_margin
                    if x1 < 0:
                        x1 = 0
                    if y1 < 0:
                        y1 = 0
                    if x2 > im_width - 1:
                        x2 = im_width - 1
                    if y2 > im_height - 1:
                        y2 = im_height - 1
                    im_crop = im_pil.crop((x1, y1, x2, y2))
                    im_crop = self.food_classifier.test_transform(im_crop)
                    im_crop = torch.unsqueeze(im_crop, dim=0)
                    if self.food_classifier.eval_crop_type == 'TenCrop':
                        bs, ncrops, c, h, w = im_crop.size()
                        im_crop = im_crop.view(-1, c, h, w)
                    food_output = self.food_classifier.classify(im_crop)
                    if self.food_classifier.eval_crop_type == 'TenCrop':
                        food_output = food_output.view(
                            bs, ncrops, -1).mean(1)  # avg over crops
                    topk_score, topk_index = torch.topk(food_output, 5, dim=1)
                    food_class = [
                        self.food_classifier.idx_to_class[
                            topk_index[0][l].item()] for l in range(5)
                    ]
                    food_score = torch.nn.functional.softmax(topk_score[0],
                                                             dim=0)
                    bbox_draw = cls_dets.detach().cpu().numpy()[k:k + 1, :]
                    bbox_draw[:, :4] = bbox_draw[:, :4] * im_scale
                    # Keep only confident boxes in the lower image half
                    # (presumably the table region — TODO confirm).
                    box_y = (bbox_draw[0, 1] + bbox_draw[0, 3]) / 2.
                    if bbox_draw[0, 4] >= self.vis_th and \
                            box_y > im2show.shape[0] / 2:
                        # - result is a list of [x1,y1,x2,y2,class_id]
                        results.append([
                            int(bbox_draw[0][0]),
                            int(bbox_draw[0][1]),
                            int(bbox_draw[0][2]),
                            int(bbox_draw[0][3]),
                            self.classes_total[j],
                            topk_index[0][0].item(),  # food_class index
                            food_class[0],
                            bbox_draw[0][5].item()
                        ])
            else:
                bbox_draw = cls_dets.detach().cpu().numpy()
                bbox_draw[:, :4] = bbox_draw[:, :4] * im_scale
                for k in range(cls_dets.shape[0]):
                    box_y = (bbox_draw[k, 1] + bbox_draw[k, 3]) / 2.
                    if bbox_draw[k, 4] >= self.vis_th and \
                            box_y > im2show.shape[0] / 2:
                        results.append([
                            int(bbox_draw[k][0]),
                            int(bbox_draw[k][1]),
                            int(bbox_draw[k][2]),
                            int(bbox_draw[k][3]), self.classes_total[j], 0, 0,
                            0
                        ])
    # dish-food converter
    # every dish find the food and its amount
    # if food is not found, zero amount is assigned.
    print('0.results:', results)
    new_results = []
    for item in results:
        x1, y1, x2, y2, class_name, food_index, food_name, food_amount = item
        if class_name == 'dish':
            new_results.append(item)
    print('1.new_results:', new_results)
    for item in results:
        x1, y1, x2, y2, class_name, food_index, food_name, food_amount = item
        if class_name == 'food':
            is_find_dish = False
            for dish_i, dish_item in enumerate(new_results):
                d_x1, d_y1, d_x2, d_y2, _, _, _, dish_amount = dish_item
                # check overlap
                overlap_ratio = self.get_overlap_ratio_meal(
                    food_bbox=[x1, y1, x2, y2],
                    dish_bbox=[d_x1, d_y1, d_x2, d_y2])
                if overlap_ratio > 0.9:
                    # Fold this food into the overlapping dish.
                    new_results[dish_i][5] = food_index
                    new_results[dish_i][6] = food_name
                    new_results[dish_i][7] += food_amount
                    is_find_dish = True
            if not is_find_dish:
                new_results.append(item)
    print('2.new_results:', new_results)
    # Sentinel representatives; amount 200 > any real amount (0..100).
    rep_drink = 0, 0, 0, 0, 'drink', -1, -1, 200
    rep_food = 0, 0, 0, 0, 'food', -1, -1, 200
    for dish_i, dish_item in enumerate(new_results):
        # Food indices 94/64 denote drinks — TODO confirm mapping.
        if new_results[dish_i][5] == 94 or new_results[dish_i][5] == 64:
            new_results[dish_i][4] = 'drink'
        else:
            new_results[dish_i][4] = 'food'
        # Clamp the accumulated amount to [0, 1], then express as 0..100.
        new_amount = new_results[dish_i][7]
        if new_amount > 1.0:
            new_amount = 1.0
        if new_amount < 0.0:
            new_amount = 0.0
        new_results[dish_i][7] = int(round(new_amount * 100))
        # Track the lowest-amount drink and food as representatives.
        if new_results[dish_i][4] == 'drink' and \
                new_results[dish_i][7] < rep_drink[7]:
            rep_drink = new_results[dish_i]
        if new_results[dish_i][4] == 'food' and \
                new_results[dish_i][7] < rep_food[7]:
            rep_food = new_results[dish_i]
    results = []
    if rep_drink[7] <= 100:
        results.append(rep_drink)
    if rep_food[7] <= 100:
        results.append(rep_food)
    # dish-food converter - end
    print('3.results: ', results)  # drink one, food one
    if self.save_result:
        for item in results:
            # item = [x1, y1, x2, y2, category, (food_name), (amount)]
            if item[4] == 'food' or item[4] == 'drink':
                str_name = '%s (%.2f)' % (item[4], item[7])
            else:
                str_name = '%s' % (item[4])
            bbox_draw = np.array([[item[0], item[1], item[2], item[3], 1.0]])
            color_index = 1
            im2show = vis_detections_korean_ext2(
                im2show,
                str_name,
                bbox_draw,
                box_color=self.list_box_color[color_index],
                text_color=(255, 255, 255),
                text_bg_color=self.list_box_color[color_index],
                fontsize=20,
                thresh=self.vis_th,
                draw_score=False,
                draw_text_out_of_box=True)
    if self.vis:
        cv2.imwrite('debug.png', im2show)
    return results, im2show
# NOTE(review): fragment of a per-image evaluation loop — the enclosing
# function/loop header lies outside this chunk; `boxes`, `scores`, `data`,
# `i`, `det_tic` and `vis` are defined earlier.
if cfg.TEST.BBOX_REG:
    # Apply bounding-box regression deltas
    box_deltas = bbox_pred.data
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Optionally normalize targets by a precomputed mean and stdev
        if args.class_agnostic:
            box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                         + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
            # single regression shared by all classes
            box_deltas = box_deltas.view(1, -1, 4)
        else:
            box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                         + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
            # one 4-vector of deltas per class
            box_deltas = box_deltas.view(1, -1, 4 * len(imdb.classes))

    pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
    pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
else:
    # Simply repeat the boxes, once for each class
    pred_boxes = np.tile(boxes, (1, scores.shape[1]))

# rescale boxes back to the original image size
# (data[1] presumably holds im_info = [h, w, scale] — TODO confirm with the loader)
pred_boxes /= data[1][0][2]

scores = scores.squeeze()
pred_boxes = pred_boxes.squeeze()
det_toc = time.time()
detect_time = det_toc - det_tic  # wall-clock detection time for this image
misc_tic = time.time()
if vis:
    im = cv2.imread(imdb.image_path_at(i))
    im2show = np.copy(im)
# predicted boxes boxes = rois.data[:, :, 1:5] if cfg.TEST.BBOX_REG: box_deltas = bbox_pred.data if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: if args.class_agnostic: box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \ + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda() box_deltas = box_deltas.view(batch_size, -1, 4) else: box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \ + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda() box_deltas = box_deltas.view(batch_size, -1, 4 * len(imagenet_vid_classes)) pred_boxes = bbox_transform_inv(boxes, box_deltas, batch_size) pred_boxes = clip_boxes(pred_boxes, im_info.data, batch_size) else: # Simply repeat the boxes, once for each class raise NotImplementedError # Assume scales are same for frames in the same video im_scale = im_info.data[0][-1] pred_boxes /= im_scale pred_boxes = pred_boxes.squeeze() scores = scores.squeeze() det_toc = time.time() detect_time = det_toc - det_tic misc_tic = time.time() for j in xrange(1, imdb.num_classes):