def main(args):
    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)

    cfg.USE_GPU_NMS = args.cuda
    np.random.seed(cfg.RNG_SEED)

    pascal_classes = np.asarray(['__background__', 'targetobject', 'hand'])
    args.set_cfgs = [
        'ANCHOR_SCALES', '[8, 16, 32, 64]', 'ANCHOR_RATIOS', '[0.5, 1, 2]'
    ]

    # initialize the network here.
    if args.net == 'vgg16':
        fasterRCNN = vgg16(pascal_classes,
                           pretrained=False,
                           class_agnostic=args.class_agnostic)
    elif args.net == 'res101':
        fasterRCNN = resnet(pascal_classes,
                            101,
                            pretrained=False,
                            class_agnostic=args.class_agnostic)
    elif args.net == 'res50':
        fasterRCNN = resnet(pascal_classes,
                            50,
                            pretrained=False,
                            class_agnostic=args.class_agnostic)
    elif args.net == 'res152':
        fasterRCNN = resnet(pascal_classes,
                            152,
                            pretrained=False,
                            class_agnostic=args.class_agnostic)
    else:
        raise ValueError('network "%s" is not defined' % args.net)

    fasterRCNN.create_architecture()

    load_name = 'models/res101_handobj_100K/pascal_voc/faster_rcnn_1_8_132028.pth'
    print("load checkpoint %s" % load_name)
    if args.cuda > 0:
        checkpoint = torch.load(load_name)
    else:
        checkpoint = torch.load(load_name,
                                map_location=(lambda storage, loc: storage))
    fasterRCNN.load_state_dict(checkpoint['model'])
    if 'pooling_mode' in checkpoint.keys():
        cfg.POOLING_MODE = checkpoint['pooling_mode']
    print('loaded model successfully!')

    lr = cfg.TRAIN.LEARNING_RATE
    momentum = cfg.TRAIN.MOMENTUM
    weight_decay = cfg.TRAIN.WEIGHT_DECAY

    def _get_image_blob(im):
        """Converts an image into a network input.

        Arguments:
            im (ndarray): a color image in BGR order

        Returns:
            blob (ndarray): a data blob holding an image pyramid
            im_scale_factors (list): list of image scales (relative to im)
                used in the image pyramid
        """
        im_orig = im.astype(np.float32, copy=True)
        im_orig -= cfg.PIXEL_MEANS

        im_shape = im_orig.shape
        im_size_min = np.min(im_shape[0:2])
        im_size_max = np.max(im_shape[0:2])

        processed_ims = []
        im_scale_factors = []

        for target_size in cfg.TEST.SCALES:
            im_scale = float(target_size) / float(im_size_min)
            # Prevent the biggest axis from exceeding MAX_SIZE
            if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
                im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
            im = cv2.resize(im_orig,
                            None,
                            None,
                            fx=im_scale,
                            fy=im_scale,
                            interpolation=cv2.INTER_LINEAR)
            im_scale_factors.append(im_scale)
            processed_ims.append(im)

        # Create a blob to hold the input images
        blob = im_list_to_blob(processed_ims)

        return blob, np.array(im_scale_factors)
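    # Usage sketch (illustrative, not part of the original script): with the
    # default test config (cfg.TEST.SCALES = (600,), cfg.TEST.MAX_SIZE = 1000),
    # a 480x640 BGR frame is scaled by 600/480 = 1.25 into a (1, 600, 800, 3)
    # blob with a single scale factor:
    #
    #     blob, scales = _get_image_blob(cv2.imread('example.jpg'))  # BGR
    #     assert blob.shape[0] == 1 and len(scales) == 1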
    # initialize the tensor holders here.
    im_data = torch.FloatTensor(1)
    im_info = torch.FloatTensor(1)
    num_boxes = torch.LongTensor(1)
    gt_boxes = torch.FloatTensor(1)
    box_info = torch.FloatTensor(1)

    # ship to cuda
    if args.cuda > 0:
        im_data = im_data.cuda()
        im_info = im_info.cuda()
        num_boxes = num_boxes.cuda()
        gt_boxes = gt_boxes.cuda()

    if args.cuda > 0:
        cfg.CUDA = True
        fasterRCNN.cuda()

    fasterRCNN.eval()

    with torch.no_grad():
        start = time.time()
        max_per_image = 100
        thresh_hand = args.thresh_hand
        thresh_obj = args.thresh_obj
        vis = args.vis

        webcam_num = args.webcam_num
        # Set up the webcam or collect the image directory
        if webcam_num >= 0:
            cap = cv2.VideoCapture(webcam_num)
            num_images = 0
        else:
            print(f'image dir = {args.image_dir}')
            print(f'save dir = {args.save_dir}')
            imglist = os.listdir(args.image_dir)
            num_images = len(imglist)
            print('Loaded {} images.'.format(num_images))

        while num_images >= 0:
            total_tic = time.time()
            if webcam_num == -1:
                num_images -= 1

            # Get a frame from the webcam
            if webcam_num >= 0:
                if not cap.isOpened():
                    raise RuntimeError(
                        'Webcam could not be opened. Please check the connection.')
                ret, frame = cap.read()
                im_in = np.array(frame)
            # Load the demo image
            else:
                im_file = os.path.join(args.image_dir, imglist[num_images])
                im_in = np.array(imread(im_file))
                # optional resize:
                # im_in = np.array(Image.fromarray(im_in).resize((640, 360)))

            # grayscale -> 3-channel
            if len(im_in.shape) == 2:
                im_in = im_in[:, :, np.newaxis]
                im_in = np.concatenate((im_in, im_in, im_in), axis=2)
            # rgb -> bgr
            im = im_in[:, :, ::-1]

            blobs, im_scales = _get_image_blob(im)
            assert len(im_scales) == 1, "Only single-image batch implemented"
            im_blob = blobs
            im_info_np = np.array(
                [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],
                dtype=np.float32)

            im_data_pt = torch.from_numpy(im_blob)
            im_data_pt = im_data_pt.permute(0, 3, 1, 2)
            im_info_pt = torch.from_numpy(im_info_np)

            im_data.resize_(im_data_pt.size()).copy_(im_data_pt)
            im_info.resize_(im_info_pt.size()).copy_(im_info_pt)
            gt_boxes.resize_(1, 1, 5).zero_()
            num_boxes.resize_(1).zero_()
            box_info.resize_(1, 1, 5).zero_()

            det_tic = time.time()
            (rois, cls_prob, bbox_pred, rpn_loss_cls, rpn_loss_box,
             RCNN_loss_cls, RCNN_loss_bbox, rois_label,
             loss_list) = fasterRCNN(im_data, im_info, gt_boxes, num_boxes,
                                     box_info)

            scores = cls_prob.data
            boxes = rois.data[:, :, 1:5]

            # extract the predicted auxiliary parameters
            contact_vector = loss_list[0][0]  # hand contact state info
            offset_vector = loss_list[1][0].detach()  # offset vector (factored into a unit vector and a magnitude)
            lr_vector = loss_list[2][0].detach()  # hand side info (left/right)

            # get hand contact state
            _, contact_indices = torch.max(contact_vector, 2)
            contact_indices = contact_indices.squeeze(0).unsqueeze(-1).float()

            # get hand side
            lr = torch.sigmoid(lr_vector) > 0.5
            lr = lr.squeeze(0).float()

            if cfg.TEST.BBOX_REG:
                # Apply bounding-box regression deltas
                box_deltas = bbox_pred.data
                if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
                    # Optionally normalize targets by a precomputed mean and stdev
                    if args.class_agnostic:
                        if args.cuda > 0:
                            box_deltas = box_deltas.view(-1, 4) \
                                * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                                + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                        else:
                            box_deltas = box_deltas.view(-1, 4) \
                                * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) \
                                + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
                        box_deltas = box_deltas.view(1, -1, 4)
                    else:
                        if args.cuda > 0:
                            box_deltas = box_deltas.view(-1, 4) \
                                * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                                + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                        else:
                            box_deltas = box_deltas.view(-1, 4) \
                                * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) \
                                + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
                        box_deltas = box_deltas.view(1, -1,
                                                     4 * len(pascal_classes))

                pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
                pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
            else:
                # Simply repeat the boxes, once for each class
                pred_boxes = np.tile(boxes, (1, scores.shape[1]))

            pred_boxes /= im_scales[0]

            scores = scores.squeeze()
            pred_boxes = pred_boxes.squeeze()
            det_toc = time.time()
            detect_time = det_toc - det_tic
            misc_tic = time.time()

            if vis:
                im2show = np.copy(im)

            obj_dets, hand_dets = None, None
            for j in range(1, len(pascal_classes)):
                if pascal_classes[j] == 'hand':
                    inds = torch.nonzero(scores[:, j] > thresh_hand,
                                         as_tuple=False).view(-1)
                elif pascal_classes[j] == 'targetobject':
                    inds = torch.nonzero(scores[:, j] > thresh_obj,
                                         as_tuple=False).view(-1)

                # if there are detections for this class
                if inds.numel() > 0:
                    cls_scores = scores[:, j][inds]
                    _, order = torch.sort(cls_scores, 0, True)
                    if args.class_agnostic:
                        cls_boxes = pred_boxes[inds, :]
                    else:
                        cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]

                    cls_dets = torch.cat(
                        (cls_boxes, cls_scores.unsqueeze(1),
                         contact_indices[inds],
                         offset_vector.squeeze(0)[inds], lr[inds]), 1)
                    cls_dets = cls_dets[order]
                    keep = nms(cls_boxes[order, :], cls_scores[order],
                               cfg.TEST.NMS)
                    cls_dets = cls_dets[keep.view(-1).long()]
                    if pascal_classes[j] == 'targetobject':
                        obj_dets = cls_dets.cpu().numpy()
                    if pascal_classes[j] == 'hand':
                        hand_dets = cls_dets.cpu().numpy()

            if vis:
                # visualization
                im2show = vis_detections_filtered_objects_PIL(
                    im2show, obj_dets, hand_dets, thresh_hand, thresh_obj)

            misc_toc = time.time()
            nms_time = misc_toc - misc_tic

            if webcam_num == -1:
                sys.stdout.write(
                    'im_detect: {:d}/{:d} {:.3f}s {:.3f}s \r'.format(
                        num_images + 1, len(imglist), detect_time, nms_time))
                sys.stdout.flush()

            if vis and webcam_num == -1:
                folder_name = args.save_dir
                os.makedirs(folder_name, exist_ok=True)
                result_path = os.path.join(
                    folder_name, imglist[num_images][:-4] + "_det.png")
                im2show.save(result_path)
            else:
                im2showRGB = cv2.cvtColor(im2show, cv2.COLOR_BGR2RGB)
                cv2.imshow("frame", im2showRGB)
                total_toc = time.time()
                total_time = total_toc - total_tic
                frame_rate = 1 / total_time
                print('Frame rate:', frame_rate)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        if webcam_num >= 0:
            cap.release()
            cv2.destroyAllWindows()
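    # Note on the detection format (inferred from the torch.cat above; the
    # exact column layout is an assumption based on this script, not a
    # documented API): each row of obj_dets / hand_dets packs
    #     [x1, y1, x2, y2, score, contact_state, dx, dy, magnitude, lr]
    # where contact_state is the argmax over the contact classes, (dx, dy,
    # magnitude) is the hand-to-object offset factored into a unit vector and
    # a magnitude, and lr is 0/1 for left/right hand.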
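
# The original argument parser is not included in this file. The sketch below
# is a hypothetical entry point that wires up the attributes main() actually
# reads (args.cfg_file, args.net, args.cuda, ...); the flag names and defaults
# are assumptions, not the repository's official CLI.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Hand-object detection demo')
    parser.add_argument('--cfg_file', default=None)
    parser.add_argument('--set_cfgs', nargs='+', default=None)
    parser.add_argument('--net', default='res101',
                        choices=['vgg16', 'res50', 'res101', 'res152'])
    parser.add_argument('--cuda', type=int, default=1)
    parser.add_argument('--class_agnostic', action='store_true')
    parser.add_argument('--thresh_hand', type=float, default=0.5)
    parser.add_argument('--thresh_obj', type=float, default=0.5)
    parser.add_argument('--vis', action='store_true')
    parser.add_argument('--webcam_num', type=int, default=-1)
    parser.add_argument('--image_dir', default='images')
    parser.add_argument('--save_dir', default='images_det')
    main(parser.parse_args())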