def seg_maskrcnnresults():
    classifier = PointNetInstanceSeg(n_classes=opt.n_cat)
    if opt.model != '':
        classifier.load_state_dict(torch.load(opt.model))
    classifier.cuda()
    classifier = classifier.eval()
    if opt.dataset == 'Real':
        file_path = os.path.join(opt.dataset, 'test_list.txt')
        cam_fx, cam_fy, cam_cx, cam_cy = 591.0125, 590.16775, 322.525, 244.11084
        result_dir = 'results/mrcnn_results/{}_test_pointnet_seg'.format(opt.dataset)
    else:
        file_path = os.path.join(opt.dataset, 'val_list.txt')
        cam_fx, cam_fy, cam_cx, cam_cy = 577.5, 577.5, 319.5, 239.5
        result_dir = 'results/mrcnn_results/{}_val_pointnet_seg'.format(opt.dataset)
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    norm_scale = 1000.0
    xmap = np.array([[i for i in range(640)] for j in range(480)])
    ymap = np.array([[j for i in range(640)] for j in range(480)])
    # get test data list
    img_list = [os.path.join(file_path.split('/')[0], line.rstrip('\n'))
                for line in open(os.path.join(opt.data_dir, file_path))]
    t_start = time.time()
    for path in tqdm(img_list):
        img_path = os.path.join(opt.data_dir, path)
        depth = load_depth(img_path)
        # load Mask R-CNN detection results
        img_path_parsing = img_path.split('/')
        mrcnn_path = os.path.join('results/mrcnn_results', opt.data, 'results_{}_{}_{}.pkl'.format(
            opt.data.split('_')[-1], img_path_parsing[-2], img_path_parsing[-1]))
        with open(mrcnn_path, 'rb') as f:
            mrcnn_result = cPickle.load(f)
        num_insts = len(mrcnn_result['class_ids'])
        f_mask = np.zeros((num_insts, depth.shape[0], depth.shape[1]), dtype=int)
        # prepare frame data
        f_points, f_choose, f_catId = [], [], []
        valid_inst = []
        result = {}
        for i in range(num_insts):
            cat_id = mrcnn_result['class_ids'][i] - 1
            rmin, rmax, cmin, cmax = get_bbox(mrcnn_result['rois'][i])
            # sample points
            depth_valid = depth > 0
            choose_depth = depth_valid[rmin:rmax, cmin:cmax].flatten().nonzero()[0]
            if len(choose_depth) < 32:
                continue
            else:
                valid_inst.append(i)
            # process objects with valid depth observation
            if len(choose_depth) > opt.n_pts:
                c_mask = np.zeros(len(choose_depth), dtype=int)
                c_mask[:opt.n_pts] = 1
                np.random.shuffle(c_mask)
                choose_depth = choose_depth[c_mask.nonzero()]
            else:
                choose_depth = np.pad(choose_depth, (0, opt.n_pts - len(choose_depth)), 'wrap')
            depth_masked = depth[rmin:rmax, cmin:cmax].flatten()[choose_depth][:, np.newaxis]
            xmap_masked = xmap[rmin:rmax, cmin:cmax].flatten()[choose_depth][:, np.newaxis]
            ymap_masked = ymap[rmin:rmax, cmin:cmax].flatten()[choose_depth][:, np.newaxis]
            pt2 = depth_masked / norm_scale
            pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
            pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
            points = np.concatenate((pt0, pt1, pt2), axis=1)
            # get frustum angle (according to the center pixel of the 2D box)
            box2d_center = np.array([(cmin + cmax) / 2.0, (rmin + rmax) / 2.0])
            depth_center = 1.0
            x_center = (box2d_center[0] - cam_cx) * depth_center / cam_fx
            y_center = (cam_cy - box2d_center[1]) * depth_center / cam_fy
            angle_y = -1 * np.arctan2(depth_center, x_center)
            angle_x = -1 * np.arctan2((depth_center**2 + x_center**2)**0.5, y_center)
            # get point cloud after frustum rotation
            points = get_center_view_point_set(points, angle_y, angle_x)  # (n, 3)
            f_points.append(points)
            f_catId.append(cat_id)
            f_choose.append(choose_depth)
        if len(valid_inst):
            f_points = torch.cuda.FloatTensor(f_points)
            f_catId = torch.cuda.LongTensor(f_catId)
            f_one_hot_vec = F.one_hot(f_catId, opt.n_cat)
            f_points = f_points.transpose(2, 1)
            logits = classifier(f_points, f_one_hot_vec)
            logits_choice = logits.data.max(2)[1]
            logits_np = logits_choice.cpu().data.numpy()
            for i in range(len(valid_inst)):
                inst_idx = valid_inst[i]
                choose_depth = f_choose[i]
                logits_np_inst = logits_np[i]
                choose_logits_np = logits_np_inst.nonzero()
                rmin, rmax, cmin, cmax = get_bbox(mrcnn_result['rois'][inst_idx])
                roi_mask = np.zeros(((rmax - rmin) * (cmax - cmin)), dtype=int)
                roi_mask[choose_depth[choose_logits_np]] = 1
                roi_mask = roi_mask.reshape((rmax - rmin, cmax - cmin))
                f_mask[inst_idx][rmin:rmax, cmin:cmax] = roi_mask
        result['class_ids'] = mrcnn_result['class_ids']
        result['rois'] = mrcnn_result['rois']
        result['scores'] = mrcnn_result['scores']
        result['masks'] = (f_mask.transpose(1, 2, 0) > 0)
        save_path = os.path.join(result_dir, 'results_{}_{}_{}.pkl'.format(
            opt.data.split('_')[-1], img_path_parsing[-2], img_path_parsing[-1]))
        with open(save_path, 'wb') as f:
            cPickle.dump(result, f)
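# NOTE: `get_center_view_point_set` is called throughout this file but defined
# elsewhere in the repository. The sketch below is only an illustration of what
# such a frustum-rotation helper might look like, assuming (in the spirit of
# Frustum PointNets) that it rotates the point cloud about the camera Y and X
# axes by the two frustum angles computed above; the name is suffixed "_sketch"
# because it is a hypothetical stand-in, not the repository implementation.
# It relies on the module-level `import numpy as np` already used in this file.
def get_center_view_point_set_sketch(points, angle_y, angle_x):
    """Rotate points (n, 3) so the frustum center axis roughly aligns with the camera axis."""
    rot_y = angle_y + np.pi / 2.0
    cos_y, sin_y = np.cos(rot_y), np.sin(rot_y)
    R_y = np.array([[cos_y, 0.0, -sin_y],
                    [0.0,   1.0,  0.0],
                    [sin_y, 0.0,  cos_y]])
    rot_x = angle_x + np.pi / 2.0
    cos_x, sin_x = np.cos(rot_x), np.sin(rot_x)
    R_x = np.array([[1.0, 0.0,    0.0],
                    [0.0, cos_x, -sin_x],
                    [0.0, sin_x,  cos_x]])
    # apply the Y-axis rotation first, then the X-axis rotation, to every point
    return points @ R_y.T @ R_x.T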
def detect():
    # resume model
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    estimator = DeformNet(opt.n_cat, opt.nv_prior)
    estimator.cuda()
    estimator.load_state_dict(torch.load(opt.model))
    estimator.eval()
    # get test data list
    img_list = [os.path.join(file_path.split('/')[0], line.rstrip('\n'))
                for line in open(os.path.join(opt.data_dir, file_path))]
    # frame-by-frame test
    t_inference = 0.0
    t_umeyama = 0.0
    inst_count = 0
    img_count = 0
    t_start = time.time()
    for path in tqdm(img_list):
        img_path = os.path.join(opt.data_dir, path)
        raw_rgb = cv2.imread(img_path + '_color.png')[:, :, :3]
        raw_rgb = raw_rgb[:, :, ::-1]
        raw_depth = load_depth(img_path)
        # load Mask R-CNN detection results
        img_path_parsing = img_path.split('/')
        mrcnn_path = os.path.join('results/mrcnn_results', opt.data, 'results_{}_{}_{}.pkl'.format(
            opt.data.split('_')[-1], img_path_parsing[-2], img_path_parsing[-1]))
        with open(mrcnn_path, 'rb') as f:
            mrcnn_result = cPickle.load(f)
        num_insts = len(mrcnn_result['class_ids'])
        f_sRT = np.zeros((num_insts, 4, 4), dtype=float)
        f_size = np.zeros((num_insts, 3), dtype=float)
        # prepare frame data
        f_points, f_rgb, f_choose, f_catId, f_prior = [], [], [], [], []
        valid_inst = []
        for i in range(num_insts):
            cat_id = mrcnn_result['class_ids'][i] - 1
            prior = mean_shapes[cat_id]
            rmin, rmax, cmin, cmax = get_bbox(mrcnn_result['rois'][i])
            mask = np.logical_and(mrcnn_result['masks'][:, :, i], raw_depth > 0)
            choose = mask[rmin:rmax, cmin:cmax].flatten().nonzero()[0]
            # There is no depth observation for the background in the CAMERA dataset.
            # Because of how the bbox is computed in get_bbox, there may be no
            # foreground points left after cropping the mask, caused by false
            # positives of Mask R-CNN where most of the region is background.
            if len(choose) < 32:
                f_sRT[i] = np.identity(4, dtype=float)
                f_size[i] = 2 * np.amax(np.abs(prior), axis=0)
                continue
            else:
                valid_inst.append(i)
            # process objects with valid depth observation
            if len(choose) > opt.n_pts:
                c_mask = np.zeros(len(choose), dtype=int)
                c_mask[:opt.n_pts] = 1
                np.random.shuffle(c_mask)
                choose = choose[c_mask.nonzero()]
            else:
                choose = np.pad(choose, (0, opt.n_pts - len(choose)), 'wrap')
            depth_masked = raw_depth[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
            xmap_masked = xmap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
            ymap_masked = ymap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
            pt2 = depth_masked / norm_scale
            pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
            pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
            points = np.concatenate((pt0, pt1, pt2), axis=1)
            rgb = raw_rgb[rmin:rmax, cmin:cmax, :]
            rgb = cv2.resize(rgb, (opt.img_size, opt.img_size), interpolation=cv2.INTER_LINEAR)
            rgb = norm_color(rgb)
            crop_w = rmax - rmin
            ratio = opt.img_size / crop_w
            col_idx = choose % crop_w
            row_idx = choose // crop_w
            choose = (np.floor(row_idx * ratio) * opt.img_size + np.floor(col_idx * ratio)).astype(np.int64)
            # concatenate instances
            f_points.append(points)
            f_rgb.append(rgb)
            f_choose.append(choose)
            f_catId.append(cat_id)
            f_prior.append(prior)
        if len(valid_inst):
            f_points = torch.cuda.FloatTensor(f_points)
            f_rgb = torch.stack(f_rgb, dim=0).cuda()
            f_choose = torch.cuda.LongTensor(f_choose)
            f_catId = torch.cuda.LongTensor(f_catId)
            f_prior = torch.cuda.FloatTensor(f_prior)
            # inference
            torch.cuda.synchronize()
            t_now = time.time()
            assign_mat, deltas = estimator(f_points, f_rgb, f_choose, f_catId, f_prior)
            # assign_mat, deltas = estimator(f_rgb, f_choose, f_catId, f_prior)
            inst_shape = f_prior + deltas
            assign_mat = F.softmax(assign_mat, dim=2)
            f_coords = torch.bmm(assign_mat, inst_shape)  # bs x n_pts x 3
            torch.cuda.synchronize()
            t_inference += (time.time() - t_now)
            f_coords = f_coords.detach().cpu().numpy()
            f_points = f_points.cpu().numpy()
            f_choose = f_choose.cpu().numpy()
            f_insts = inst_shape.detach().cpu().numpy()
            t_now = time.time()
            for i in range(len(valid_inst)):
                inst_idx = valid_inst[i]
                choose = f_choose[i]
                _, choose = np.unique(choose, return_index=True)
                nocs_coords = f_coords[i, choose, :]
                f_size[inst_idx] = 2 * np.amax(np.abs(f_insts[i]), axis=0)
                points = f_points[i, choose, :]
                _, _, _, pred_sRT = estimateSimilarityTransform(nocs_coords, points)
                if pred_sRT is None:
                    pred_sRT = np.identity(4, dtype=float)
                f_sRT[inst_idx] = pred_sRT
            t_umeyama += (time.time() - t_now)
            img_count += 1
            inst_count += len(valid_inst)
        # save results
        result = {}
        with open(img_path + '_label.pkl', 'rb') as f:
            gts = cPickle.load(f)
        result['gt_class_ids'] = gts['class_ids']
        result['gt_bboxes'] = gts['bboxes']
        result['gt_RTs'] = gts['poses']
        result['gt_scales'] = gts['size']
        result['gt_handle_visibility'] = gts['handle_visibility']
        result['pred_class_ids'] = mrcnn_result['class_ids']
        result['pred_bboxes'] = mrcnn_result['rois']
        result['pred_scores'] = mrcnn_result['scores']
        result['pred_RTs'] = f_sRT
        result['pred_scales'] = f_size
        image_short_path = '_'.join(img_path_parsing[-3:])
        save_path = os.path.join(result_dir, 'results_{}.pkl'.format(image_short_path))
        with open(save_path, 'wb') as f:
            cPickle.dump(result, f)
    # write statistics
    fw = open('{0}/eval_logs.txt'.format(result_dir), 'w')
    messages = []
    messages.append("Total images: {}".format(len(img_list)))
    messages.append("Valid images: {}, Total instances: {}, Average: {:.2f}/image".format(
        img_count, inst_count, inst_count / img_count))
    messages.append("Inference time: {:06f} Average: {:06f}/image".format(t_inference, t_inference / img_count))
    messages.append("Umeyama time: {:06f} Average: {:06f}/image".format(t_umeyama, t_umeyama / img_count))
    messages.append("Total time: {:06f}".format(time.time() - t_start))
    for msg in messages:
        print(msg)
        fw.write(msg + '\n')
    fw.close()
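# NOTE: `estimateSimilarityTransform` is part of the NOCS/SPD utilities and is
# not defined in this file. As a rough, hypothetical illustration of the
# Umeyama alignment it performs (recovering scale s, rotation R and translation
# t such that points ~ s * R @ nocs + t), a closed-form sketch is given below.
# It assumes the module-level `import numpy as np`; the real helper also
# includes RANSAC-style outlier handling that this sketch omits.
def umeyama_similarity_sketch(src, dst):
    """Closed-form similarity transform mapping src (n, 3) onto dst (n, 3)."""
    src_mean, dst_mean = src.mean(axis=0), dst.mean(axis=0)
    src_c, dst_c = src - src_mean, dst - dst_mean
    # cross-covariance between centered destination and source points
    cov = dst_c.T @ src_c / src.shape[0]
    U, D, Vt = np.linalg.svd(cov)
    S = np.eye(3)
    if np.linalg.det(U) * np.linalg.det(Vt) < 0:
        S[2, 2] = -1  # reflection correction
    R = U @ S @ Vt
    var_src = (src_c ** 2).sum() / src.shape[0]
    scale = np.trace(np.diag(D) @ S) / var_src
    t = dst_mean - scale * R @ src_mean
    sRT = np.identity(4)
    sRT[:3, :3] = scale * R
    sRT[:3, 3] = t
    return scale, R, t, sRT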
def evaluate():
    # get test data list
    img_list = [os.path.join(file_path.split('/')[0], line.rstrip('\n'))
                for line in open(os.path.join(opt.data_dir, file_path))]
    total_count = np.zeros((opt.n_cat,), dtype=int)
    acc = np.zeros((opt.n_cat,), dtype=float)   # accuracy
    pcs = np.zeros((opt.n_cat,), dtype=float)   # precision
    rcal = np.zeros((opt.n_cat,), dtype=float)  # recall
    all_dtc_num = 0
    no_gt_num = 0
    t_start = time.time()
    for path in tqdm(img_list):
        img_path = os.path.join(opt.data_dir, path)
        raw_depth = load_depth(img_path)
        # load Mask R-CNN detection results
        img_path_parsing = img_path.split('/')
        mrcnn_path = os.path.join('results/mrcnn_results', opt.data, 'results_{}_{}_{}.pkl'.format(
            opt.data.split('_')[-1], img_path_parsing[-2], img_path_parsing[-1]))
        with open(mrcnn_path, 'rb') as f:
            mrcnn_result = cPickle.load(f)
        pred_num_insts = len(mrcnn_result['class_ids'])
        pred_class_ids = mrcnn_result['class_ids']
        # load label
        with open(img_path + '_label.pkl', 'rb') as f:
            gts = cPickle.load(f)
        mask = cv2.imread(img_path + '_mask.png')[:, :, 2]
        gt_num_insts = len(gts['class_ids'])
        gt_class_ids = gts['class_ids']
        for i in range(pred_num_insts):
            all_dtc_num += 1
            map_to_gt = []
            for j in range(len(gt_class_ids)):
                if gt_class_ids[j] != pred_class_ids[i]:
                    continue
                rmin1, rmax1, cmin1, cmax1 = get_bbox(mrcnn_result['rois'][i])
                rmin2, rmax2, cmin2, cmax2 = get_bbox(gts['bboxes'][j])
                pred_box = [cmin1, rmin1, cmax1, rmax1]
                gt_box = [cmin2, rmin2, cmax2, rmax2]
                iou = cal_iou(pred_box, gt_box)
                if iou < opt.iou_thd:
                    continue
                # match found
                map_to_gt.append(np.array([j, iou]))
            if len(map_to_gt) == 0:
                no_gt_num += 1
                continue
            max_iou_idx = np.argmax(np.array(map_to_gt)[:, 1])
            j = int(map_to_gt[max_iou_idx][0])
            # calculate segmentation accuracy
            gt_mask = mask == gts['instance_ids'][j]
            pre_mask = mrcnn_result['masks'][:, :, i]
            mask_bias = gt_mask == pre_mask
            ins_mask_bias = mask_bias[rmin1:rmax1, cmin1:cmax1]
            mask_TP = np.logical_and(gt_mask, pre_mask)
            ins_mask_TP = mask_TP[rmin1:rmax1, cmin1:cmax1]
            ins_depth = raw_depth[rmin1:rmax1, cmin1:cmax1]
            ins_depth_idxs = np.where(ins_depth > 0)
            correct_seg_num = np.sum(ins_mask_bias[ins_depth_idxs[0], ins_depth_idxs[1]].astype(float))
            TP_seg_num = np.sum(ins_mask_TP[ins_depth_idxs[0], ins_depth_idxs[1]].astype(float))
            acc_ins = correct_seg_num / ins_depth_idxs[0].shape[0]
            pcs_ins = TP_seg_num / np.sum(pre_mask[rmin1:rmax1, cmin1:cmax1][ins_depth_idxs[0], ins_depth_idxs[1]].astype(float))
            rcal_ins = TP_seg_num / np.sum(gt_mask[rmin1:rmax1, cmin1:cmax1][ins_depth_idxs[0], ins_depth_idxs[1]].astype(float))
            total_count[pred_class_ids[i] - 1] += 1
            acc[pred_class_ids[i] - 1] += acc_ins
            pcs[pred_class_ids[i] - 1] += pcs_ins
            rcal[pred_class_ids[i] - 1] += rcal_ins
    # compute accuracy
    catId_to_name = {0: 'bottle', 1: 'bowl', 2: 'camera', 3: 'can', 4: 'laptop', 5: 'mug'}
    acc, pcs, rcal = 100 * (acc / total_count), 100 * (pcs / total_count), 100 * (rcal / total_count)
    overall_acc, overall_pcs, overall_rcal = np.mean(acc), np.mean(pcs), np.mean(rcal)
    no_gt_ratio = 100 * (no_gt_num / all_dtc_num)
    fw = open('{0}/seg_acc_pcs_rcal.txt'.format(result_dir), 'a')
    messages = []
    messages.append('segmentation results:')
    messages.append('{:>12s}{:>12s}{:>12s}{:>12s}'.format('category', 'accuracy', 'precision', 'recall'))
    for i in range(acc.shape[0]):
        messages.append("{:>12s}{:>12.2f}{:>12.2f}{:>12.2f}".format(catId_to_name[i], acc[i], pcs[i], rcal[i]))
    messages.append("{:>12s}{:>12.2f}{:>12.2f}{:>12.2f}".format('overall', overall_acc, overall_pcs, overall_rcal))
    messages.append("{:>12s}{:>12.2f}".format('mismatch', no_gt_ratio))
    for msg in messages:
        print(msg)
        fw.write(msg + '\n')
    fw.close()
def __getitem__(self, index):
    data_parsing = self.data_list[index].split('_')
    assert self.source in ['CAMERA', 'CAMERA+Real']
    if 'scene' in data_parsing[0]:
        img_path = os.path.join(self.data_dir, 'Real', '_'.join(data_parsing[:2]))
        cam_fx, cam_fy, cam_cx, cam_cy = self.real_intrinsics
    else:
        img_path = os.path.join(self.data_dir, 'CAMERA', data_parsing[0])
        cam_fx, cam_fy, cam_cx, cam_cy = self.camera_intrinsics
    rgb = cv2.imread(img_path + '_color.png')[:, :, :3]
    rgb = rgb[:, :, ::-1]
    mask = cv2.imread(img_path + '_mask.png')[:, :, 2]
    with open(img_path + '_label.pkl', 'rb') as f:
        gts = cPickle.load(f)
    # select one foreground object
    inst_id = int(data_parsing[-1])
    idx = np.where(np.array(gts['instance_ids']) == inst_id)[0][0]
    rmin, rmax, cmin, cmax = get_bbox(gts['bboxes'][idx])
    # sample points
    depth = load_depth(img_path)
    mask = np.equal(mask, inst_id)
    mask = np.logical_and(mask, depth > 0)
    depth_valid = depth > 0
    choose_depth = depth_valid[rmin:rmax, cmin:cmax].flatten().nonzero()[0]
    seg = mask[rmin:rmax, cmin:cmax].flatten().astype(np.float64)
    if len(choose_depth) > self.n_pts:
        c_mask = np.zeros(len(choose_depth), dtype=int)
        c_mask[:self.n_pts] = 1
        np.random.shuffle(c_mask)
        choose_depth = choose_depth[c_mask.nonzero()]
    else:
        choose_depth = np.pad(choose_depth, (0, self.n_pts - len(choose_depth)), 'wrap')
    seg = seg[choose_depth]
    depth_masked = depth[rmin:rmax, cmin:cmax].flatten()[choose_depth][:, np.newaxis]
    xmap_masked = self.xmap[rmin:rmax, cmin:cmax].flatten()[choose_depth][:, np.newaxis]
    ymap_masked = self.ymap[rmin:rmax, cmin:cmax].flatten()[choose_depth][:, np.newaxis]
    pt2 = depth_masked / self.norm_scale
    pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
    pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
    points = np.concatenate((pt0, pt1, pt2), axis=1)
    # resize cropped image to standard size and adjust 'choose' accordingly
    rgb = rgb[rmin:rmax, cmin:cmax, :]
    rgb = cv2.resize(rgb, (self.img_size, self.img_size), interpolation=cv2.INTER_LINEAR)
    crop_w = rmax - rmin
    ratio = self.img_size / crop_w
    col_idx = choose_depth % crop_w
    row_idx = choose_depth // crop_w
    choose_depth = (np.floor(row_idx * ratio) * self.img_size + np.floor(col_idx * ratio)).astype(np.int64)
    # get frustum angle (according to the center pixel of the 2D box)
    box2d_center = np.array([(cmin + cmax) / 2.0, (rmin + rmax) / 2.0])
    depth_center = 1.0
    x_center = (box2d_center[0] - cam_cx) * depth_center / cam_fx
    y_center = (cam_cy - box2d_center[1]) * depth_center / cam_fy
    angle_y = -1 * np.arctan2(depth_center, x_center)
    angle_x = -1 * np.arctan2((depth_center**2 + x_center**2)**0.5, y_center)
    # get point cloud
    if self.rotate_to_center:  # True
        points = self.get_center_view_point_set(points, angle_y, angle_x)  # (n, 3) points after frustum rotation
    # visual_points(points)
    # label
    cat_id = gts['class_ids'][idx] - 1  # convert to 0-indexed
    # data augmentation
    translation = gts['translations'][idx]
    if self.mode == 'train':
        # color jitter
        rgb = self.colorjitter(Image.fromarray(np.uint8(rgb)))
        rgb = np.array(rgb)
        # point shift
        add_t = np.random.uniform(-self.shift_range, self.shift_range, (1, 3))
        translation = translation + add_t[0]
        # point jitter
        add_t = add_t + np.clip(0.001 * np.random.randn(points.shape[0], 3), -0.005, 0.005)
        points = np.add(points, add_t)
    rgb = self.transform(rgb)
    points = points.astype(np.float32)
    return points, rgb, seg, cat_id, choose_depth
def __getitem__(self, index):
    img_path = os.path.join(self.data_dir, self.img_list[index])
    rgb = cv2.imread(img_path + '_color.png')[:, :, :3]
    img = cv2.imread(img_path + '_color.png')
    rgb = rgb[:, :, ::-1]
    depth = load_depth(img_path)
    mask = cv2.imread(img_path + '_mask.png')[:, :, 2]
    coord = cv2.imread(img_path + '_coord.png')[:, :, :3]
    coord = coord[:, :, (2, 1, 0)]
    coord = np.array(coord, dtype=np.float32) / 255
    coord[:, :, 2] = 1 - coord[:, :, 2]
    with open(img_path + '_label.pkl', 'rb') as f:
        gts = cPickle.load(f)
    if 'CAMERA' in img_path.split('/'):
        cam_fx, cam_fy, cam_cx, cam_cy = self.camera_intrinsics
    else:
        cam_fx, cam_fy, cam_cx, cam_cy = self.real_intrinsics
    # select one foreground object
    idx = random.randint(0, len(gts['instance_ids']) - 1)
    inst_id = gts['instance_ids'][idx]
    rmin, rmax, cmin, cmax = get_bbox(gts['bboxes'][idx])
    if self.vis:
        cv2.rectangle(img, (cmin, rmin), (cmax, rmax), (255, 0, 0), 1)
        cv2.imshow('image', img)
        cv2.waitKey()
    # sample points
    mask = np.equal(mask, inst_id)
    mask = np.logical_and(mask, depth > 0)
    choose = mask[rmin:rmax, cmin:cmax].flatten().nonzero()[0]
    if len(choose) > self.n_pts:
        c_mask = np.zeros(len(choose), dtype=int)
        c_mask[:self.n_pts] = 1
        np.random.shuffle(c_mask)
        choose = choose[c_mask.nonzero()]
    else:
        choose = np.pad(choose, (0, self.n_pts - len(choose)), 'wrap')
    depth_masked = depth[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
    xmap_masked = self.xmap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
    ymap_masked = self.ymap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
    pt2 = depth_masked / self.norm_scale
    pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
    pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
    points = np.concatenate((pt0, pt1, pt2), axis=1)
    nocs = coord[rmin:rmax, cmin:cmax, :].reshape((-1, 3))[choose, :] - 0.5
    # resize cropped image to standard size and adjust 'choose' accordingly
    rgb = rgb[rmin:rmax, cmin:cmax, :]
    rgb = cv2.resize(rgb, (self.img_size, self.img_size), interpolation=cv2.INTER_LINEAR)
    crop_w = rmax - rmin
    ratio = self.img_size / crop_w
    col_idx = choose % crop_w
    row_idx = choose // crop_w
    choose = (np.floor(row_idx * ratio) * self.img_size + np.floor(col_idx * ratio)).astype(np.int64)
    # label
    cat_id = gts['class_ids'][idx] - 1  # convert to 0-indexed
    model = self.models[gts['model_list'][idx]].astype(np.float32)  # 1024 points
    prior = self.mean_shapes[cat_id].astype(np.float32)
    scale = gts['scales'][idx]
    rotation = gts['rotations'][idx]
    translation = gts['translations'][idx]
    # data augmentation
    if self.mode == 'train':
        # color jitter
        rgb = self.colorjitter(Image.fromarray(np.uint8(rgb)))
        rgb = np.array(rgb)
        # point shift
        add_t = np.random.uniform(-self.shift_range, self.shift_range, (1, 3))
        translation = translation + add_t[0]
        # point jitter
        add_t = add_t + np.clip(0.001 * np.random.randn(points.shape[0], 3), -0.005, 0.005)
        points = np.add(points, add_t)
    rgb = self.transform(rgb)
    points = points.astype(np.float32)
    points_item = np.copy(points)
    points_fru, points_NP, points_SC = [], [], []
    if 'fru' in self.points_process:
        # get frustum angle
        box2d_center = np.array([(cmin + cmax) / 2.0, (rmin + rmax) / 2.0])
        center_depth = 1.0
        center_x = (box2d_center[0] - cam_cx) * center_depth / cam_fx
        center_y = (cam_cy - box2d_center[1]) * center_depth / cam_fy
        frustum_angle_for_y = -1 * np.arctan2(center_depth, center_x)
        frustum_angle_for_x = -1 * np.arctan2((center_depth**2 + center_x**2)**0.5, center_y)
        # get point cloud after frustum rotation
        points_item = get_center_view_point_set(points_item, frustum_angle_for_y, frustum_angle_for_x)
        points_item = points_item.astype(np.float32)
        points_fru = np.copy(points_item)
    if 'NP' in self.points_process:
        # normalize the point coordinates
        points_item = np.subtract(points_item, points_item.mean(axis=0))
        points_item = points_item.astype(np.float32)
        points_NP = np.copy(points_item)
    if 'SC' in self.points_process:
        # scale the points to the same size as the prior
        x_coplanar = (np.unique(points_item[:, 0])).size == 1
        y_coplanar = (np.unique(points_item[:, 1])).size == 1
        z_coplanar = (np.unique(points_item[:, 2])).size == 1
        if x_coplanar or y_coplanar or z_coplanar:
            pass
        else:
            prior_pcd = o3d.geometry.PointCloud()
            prior_pcd.points = o3d.utility.Vector3dVector(prior)
            prior_box = prior_pcd.get_axis_aligned_bounding_box()
            prior_extent = prior_box.get_extent()
            points_item_pcd = o3d.geometry.PointCloud()
            points_item_pcd.points = o3d.utility.Vector3dVector(points_item)
            points_item_box = points_item_pcd.get_oriented_bounding_box()
            points_item_extent = points_item_box.extent
            scale_ = np.linalg.norm(prior_extent) / np.linalg.norm(points_item_extent)
            box_points = points_item_box.get_box_points()
            box_points_np = np.zeros((8, 3))
            for k in range(8):
                box_points_np[k] = box_points.pop()
            points_item_box_center = np.mean(box_points_np, axis=0)
            points_item_pcd.scale(scale_, points_item_box_center)
            points_item = np.asarray(points_item_pcd.points)
            points_item = points_item.astype(np.float32)
            points_SC = np.copy(points_item)
    # if self.points_process == 'fru':
    #     points_pro = points_fru
    # elif self.points_process == 'NP':
    #     # normalize the point coordinates
    #     points_NP = np.subtract(points_fru, points_fru.mean(axis=0))
    #     points_NP = points_NP.astype(np.float32)
    #     points_pro = points_NP
    # else:
    #     print('points_process error flag')
    # visualization of points before and after processing
    points_pro = points_item
    if self.vis:
        visual_points(points, points_fru, points_NP, points_SC, prior)
    # adjust nocs coords for mug category
    if cat_id == 5:
        T0 = self.mug_meta[gts['model_list'][idx]][0]
        s0 = self.mug_meta[gts['model_list'][idx]][1]
        nocs = s0 * (nocs + T0)
    # map ambiguous rotation to canonical rotation
    if cat_id in self.sym_ids:
        rotation = gts['rotations'][idx]
        # assume continuous axis rotation symmetry
        theta_x = rotation[0, 0] + rotation[2, 2]
        theta_y = rotation[0, 2] - rotation[2, 0]
        r_norm = math.sqrt(theta_x**2 + theta_y**2)
        s_map = np.array([[theta_x / r_norm, 0.0, -theta_y / r_norm],
                          [0.0, 1.0, 0.0],
                          [theta_y / r_norm, 0.0, theta_x / r_norm]])
        rotation = rotation @ s_map
        nocs = nocs @ s_map
    sRT = np.identity(4, dtype=np.float32)
    sRT[:3, :3] = scale * rotation
    sRT[:3, 3] = translation
    nocs = nocs.astype(np.float32)
    return points, points_pro, rgb, choose, cat_id, model, prior, sRT, nocs
def detect():
    # resume model
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    estimator = DeformNet(opt.n_cat, opt.nv_prior)
    estimator.cuda()
    estimator.load_state_dict(torch.load(opt.model))
    estimator.eval()
    # get test data list
    img_list = [os.path.join(file_path.split('/')[0], line.rstrip('\n'))
                for line in open(os.path.join(opt.data_dir, file_path))]
    # frame-by-frame test
    t_inference = 0.0
    t_umeyama = 0.0
    inst_count = 0
    img_count = 0
    t_start = time.time()
    for path in tqdm(img_list):
        img_path = os.path.join(opt.data_dir, path)
        raw_rgb = cv2.imread(img_path + '_color.png')[:, :, :3]
        img = cv2.imread(img_path + '_color.png')
        raw_rgb = raw_rgb[:, :, ::-1]
        raw_depth = load_depth(img_path)
        # load detection results from Mask R-CNN or Mask R-CNN + fusion segmentation
        img_path_parsing = img_path.split('/')
        if opt.fusseg:
            if opt.data == 'val':
                mrcnn_path = os.path.join('results/mrcnn_results', 'CAMERA_val_fus_seg', 'results_{}_{}_{}.pkl'.format(
                    opt.data.split('_')[-1], img_path_parsing[-2], img_path_parsing[-1]))
            else:
                mrcnn_path = os.path.join('results/mrcnn_results', 'Real_test_fus_seg', 'results_{}_{}_{}.pkl'.format(
                    opt.data.split('_')[-1], img_path_parsing[-2], img_path_parsing[-1]))
        else:
            mrcnn_path = os.path.join('results/mrcnn_results', opt.data, 'results_{}_{}_{}.pkl'.format(
                opt.data.split('_')[-1], img_path_parsing[-2], img_path_parsing[-1]))
        with open(mrcnn_path, 'rb') as f:
            mrcnn_result = cPickle.load(f)
        num_insts = len(mrcnn_result['class_ids'])
        f_sRT = np.zeros((num_insts, 4, 4), dtype=float)
        f_size = np.zeros((num_insts, 3), dtype=float)
        # prepare frame data
        f_points, f_points_pro, f_rgb, f_choose, f_catId, f_prior = [], [], [], [], [], []
        valid_inst = []
        for i in range(num_insts):
            cat_id = mrcnn_result['class_ids'][i] - 1
            prior = mean_shapes[cat_id]
            rmin, rmax, cmin, cmax = get_bbox(mrcnn_result['rois'][i])
            mask = np.logical_and(mrcnn_result['masks'][:, :, i], raw_depth > 0)
            choose = mask[rmin:rmax, cmin:cmax].flatten().nonzero()[0]
            # There is no depth observation for the background in the CAMERA dataset.
            # Because of how the bbox is computed in get_bbox, there may be no
            # foreground points left after cropping the mask, caused by false
            # positives of Mask R-CNN where most of the region is background.
            if len(choose) < 32:
                f_sRT[i] = np.identity(4, dtype=float)
                f_size[i] = 2 * np.amax(np.abs(prior), axis=0)
                continue
            else:
                valid_inst.append(i)
            # process objects with valid depth observation
            if len(choose) > opt.n_pts:
                c_mask = np.zeros(len(choose), dtype=int)
                c_mask[:opt.n_pts] = 1
                np.random.shuffle(c_mask)
                choose = choose[c_mask.nonzero()]
            else:
                choose = np.pad(choose, (0, opt.n_pts - len(choose)), 'wrap')
            depth_masked = raw_depth[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
            xmap_masked = xmap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
            ymap_masked = ymap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
            pt2 = depth_masked / norm_scale
            pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
            pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
            points = np.concatenate((pt0, pt1, pt2), axis=1)
            # point normalization (or not)
            points_item = np.copy(points)
            if opt.points_process != '':
                points_fru, points_NP, points_SC = [], [], []
                if 'fru' in opt.points_process:
                    # get frustum angle
                    box2d_center = np.array([(cmin + cmax) / 2.0, (rmin + rmax) / 2.0])
                    center_depth = 1.0
                    center_x = (box2d_center[0] - cam_cx) * center_depth / cam_fx
                    center_y = (cam_cy - box2d_center[1]) * center_depth / cam_fy
                    frustum_angle_for_y = -1 * np.arctan2(center_depth, center_x)
                    frustum_angle_for_x = -1 * np.arctan2((center_depth**2 + center_x**2)**0.5, center_y)
                    # get point cloud after frustum rotation
                    points_item = get_center_view_point_set(points_item, frustum_angle_for_y, frustum_angle_for_x)
                    points_item = points_item.astype(np.float32)
                    points_fru = np.copy(points_item)
                if 'NP' in opt.points_process:
                    # normalize the point coordinates
                    points_item = np.subtract(points_item, points_item.mean(axis=0))
                    points_item = points_item.astype(np.float32)
                    points_NP = np.copy(points_item)
                if 'SC' in opt.points_process:
                    x_coplanar = (np.unique(points_item[:, 0])).size == 1
                    y_coplanar = (np.unique(points_item[:, 1])).size == 1
                    z_coplanar = (np.unique(points_item[:, 2])).size == 1
                    if x_coplanar or y_coplanar or z_coplanar:
                        print('coplanar img_path: {}'.format(img_path))
                    # scale the points to the same size as the prior
                    else:
                        prior_pcd = o3d.geometry.PointCloud()
                        prior_pcd.points = o3d.utility.Vector3dVector(prior)
                        prior_box = prior_pcd.get_axis_aligned_bounding_box()
                        prior_extent = prior_box.get_extent()
                        points_item_pcd = o3d.geometry.PointCloud()
                        points_item_pcd.points = o3d.utility.Vector3dVector(points_item)
                        points_item_box = points_item_pcd.get_oriented_bounding_box()
                        points_item_extent = points_item_box.extent
                        scale_ = np.linalg.norm(prior_extent) / np.linalg.norm(points_item_extent)
                        box_points = points_item_box.get_box_points()
                        box_points_np = np.zeros((8, 3))
                        for k in range(8):
                            box_points_np[k] = box_points.pop()
                        points_item_box_center = np.mean(box_points_np, axis=0)
                        points_item_pcd.scale(scale_, points_item_box_center)
                        points_item = np.asarray(points_item_pcd.points)
                        points_item = points_item.astype(np.float32)
                        points_SC = np.copy(points_item)
                if opt.vis:
                    cv2.rectangle(img, (cmin, rmin), (cmax, rmax), (255, 0, 0), 1)
                    cv2.imshow('image', img)
                    cv2.waitKey()
                    visual_points(points, points_fru, points_NP, points_SC, prior)
            points_pro = points_item
            # if opt.vis:
            #     cv2.rectangle(img, (cmin, rmin), (cmax, rmax), (255, 0, 0), 1)
            #     cv2.imshow('image', img)
            #     cv2.waitKey()
            #     visual_points(points_pro, prior)
            rgb = raw_rgb[rmin:rmax, cmin:cmax, :]
            rgb = cv2.resize(rgb, (opt.img_size, opt.img_size), interpolation=cv2.INTER_LINEAR)
            rgb = norm_color(rgb)
            crop_w = rmax - rmin
            ratio = opt.img_size / crop_w
            col_idx = choose % crop_w
            row_idx = choose // crop_w
            choose = (np.floor(row_idx * ratio) * opt.img_size + np.floor(col_idx * ratio)).astype(np.int64)
            # concatenate instances
            f_points.append(points)
            f_points_pro.append(points_pro)
            f_rgb.append(rgb)
            f_choose.append(choose)
            f_catId.append(cat_id)
            f_prior.append(prior)
        if len(valid_inst):
            f_points = torch.cuda.FloatTensor(f_points)
            f_points_pro = torch.cuda.FloatTensor(f_points_pro)
            f_rgb = torch.stack(f_rgb, dim=0).cuda()
            f_choose = torch.cuda.LongTensor(f_choose)
            f_catId = torch.cuda.LongTensor(f_catId)
            f_prior = torch.cuda.FloatTensor(f_prior)
            # inference
            torch.cuda.synchronize()
            t_now = time.time()
            assign_mat, deltas = estimator(f_points_pro, f_rgb, f_choose, f_catId, f_prior)
            # assign_mat, deltas = estimator(f_rgb, f_choose, f_catId, f_prior)
            inst_shape = f_prior + deltas
            assign_mat = F.softmax(assign_mat, dim=2)
            f_coords = torch.bmm(assign_mat, inst_shape)  # bs x n_pts x 3
            torch.cuda.synchronize()
            t_inference += (time.time() - t_now)
            f_coords = f_coords.detach().cpu().numpy()
            f_points = f_points.cpu().numpy()
            f_choose = f_choose.cpu().numpy()
            f_insts = inst_shape.detach().cpu().numpy()
            t_now = time.time()
            for i in range(len(valid_inst)):
                inst_idx = valid_inst[i]
                choose = f_choose[i]
                _, choose = np.unique(choose, return_index=True)
                nocs_coords = f_coords[i, choose, :]
                f_size[inst_idx] = 2 * np.amax(np.abs(f_insts[i]), axis=0)
                points = f_points[i, choose, :]
                _, _, _, pred_sRT = estimateSimilarityTransform(nocs_coords, points)
                if pred_sRT is None:
                    pred_sRT = np.identity(4, dtype=float)
                f_sRT[inst_idx] = pred_sRT
            t_umeyama += (time.time() - t_now)
            img_count += 1
            inst_count += len(valid_inst)
        # save results
        result = {}
        with open(img_path + '_label.pkl', 'rb') as f:
            gts = cPickle.load(f)
        result['gt_class_ids'] = gts['class_ids']
        result['gt_bboxes'] = gts['bboxes']
        result['gt_RTs'] = gts['poses']
        result['gt_scales'] = gts['size']
        result['gt_handle_visibility'] = gts['handle_visibility']
        result['pred_class_ids'] = mrcnn_result['class_ids']
        result['pred_bboxes'] = mrcnn_result['rois']
        result['pred_scores'] = mrcnn_result['scores']
        result['pred_RTs'] = f_sRT
        result['pred_scales'] = f_size
        image_short_path = '_'.join(img_path_parsing[-3:])
        save_path = os.path.join(opt.result_dir, 'results_{}.pkl'.format(image_short_path))
        with open(save_path, 'wb') as f:
            cPickle.dump(result, f)
    # write statistics
    fw = open('{0}/eval_logs.txt'.format(opt.result_dir), 'w')
    messages = []
    messages.append("Total images: {}".format(len(img_list)))
    messages.append("Valid images: {}, Total instances: {}, Average: {:.2f}/image".format(
        img_count, inst_count, inst_count / img_count))
    messages.append("Inference time: {:06f} Average: {:06f}/image".format(t_inference, t_inference / img_count))
    messages.append("Umeyama time: {:06f} Average: {:06f}/image".format(t_umeyama, t_umeyama / img_count))
    messages.append("Total time: {:06f}".format(time.time() - t_start))
    for msg in messages:
        print(msg)
        fw.write(msg + '\n')
    fw.close()
def __getitem__(self, index):
    img_path = os.path.join(self.data_dir, self.img_list[index])
    rgb = cv2.imread(img_path + '_color.png')[:, :, :3]
    rgb = rgb[:, :, ::-1]
    depth = load_depth(img_path)
    mask = cv2.imread(img_path + '_mask.png')[:, :, 2]
    coord = cv2.imread(img_path + '_coord.png')[:, :, :3]
    coord = coord[:, :, (2, 1, 0)]
    coord = np.array(coord, dtype=np.float32) / 255
    coord[:, :, 2] = 1 - coord[:, :, 2]
    with open(img_path + '_label.pkl', 'rb') as f:
        gts = cPickle.load(f)
    if 'CAMERA' in img_path.split('/'):
        cam_fx, cam_fy, cam_cx, cam_cy = self.camera_intrinsics
    else:
        cam_fx, cam_fy, cam_cx, cam_cy = self.real_intrinsics
    # select one foreground object
    idx = random.randint(0, len(gts['instance_ids']) - 1)
    inst_id = gts['instance_ids'][idx]
    rmin, rmax, cmin, cmax = get_bbox(gts['bboxes'][idx])
    # sample points
    mask = np.equal(mask, inst_id)
    mask = np.logical_and(mask, depth > 0)
    choose = mask[rmin:rmax, cmin:cmax].flatten().nonzero()[0]
    if len(choose) > self.n_pts:
        c_mask = np.zeros(len(choose), dtype=int)
        c_mask[:self.n_pts] = 1
        np.random.shuffle(c_mask)
        choose = choose[c_mask.nonzero()]
    else:
        choose = np.pad(choose, (0, self.n_pts - len(choose)), 'wrap')
    depth_masked = depth[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
    xmap_masked = self.xmap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
    ymap_masked = self.ymap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
    pt2 = depth_masked / self.norm_scale
    pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
    pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
    points = np.concatenate((pt0, pt1, pt2), axis=1)
    nocs = coord[rmin:rmax, cmin:cmax, :].reshape((-1, 3))[choose, :] - 0.5
    # resize cropped image to standard size and adjust 'choose' accordingly
    rgb = rgb[rmin:rmax, cmin:cmax, :]
    rgb = cv2.resize(rgb, (self.img_size, self.img_size), interpolation=cv2.INTER_LINEAR)
    crop_w = rmax - rmin
    ratio = self.img_size / crop_w
    col_idx = choose % crop_w
    row_idx = choose // crop_w
    choose = (np.floor(row_idx * ratio) * self.img_size + np.floor(col_idx * ratio)).astype(np.int64)
    # label
    cat_id = gts['class_ids'][idx] - 1  # convert to 0-indexed
    model = self.models[gts['model_list'][idx]].astype(np.float32)  # 1024 points
    prior = self.mean_shapes[cat_id].astype(np.float32)
    scale = gts['scales'][idx]
    rotation = gts['rotations'][idx]
    translation = gts['translations'][idx]
    # data augmentation
    if self.mode == 'train':
        # color jitter
        rgb = self.colorjitter(Image.fromarray(np.uint8(rgb)))
        rgb = np.array(rgb)
        # point shift
        add_t = np.random.uniform(-self.shift_range, self.shift_range, (1, 3))
        translation = translation + add_t[0]
        # point jitter
        add_t = add_t + np.clip(0.001 * np.random.randn(points.shape[0], 3), -0.005, 0.005)
        points = np.add(points, add_t)
    rgb = self.transform(rgb)
    points = points.astype(np.float32)
    # adjust nocs coords for mug category
    if cat_id == 5:
        T0 = self.mug_meta[gts['model_list'][idx]][0]
        s0 = self.mug_meta[gts['model_list'][idx]][1]
        nocs = s0 * (nocs + T0)
    # map ambiguous rotation to canonical rotation
    if cat_id in self.sym_ids:
        rotation = gts['rotations'][idx]
        # assume continuous axis rotation symmetry
        theta_x = rotation[0, 0] + rotation[2, 2]
        theta_y = rotation[0, 2] - rotation[2, 0]
        r_norm = math.sqrt(theta_x**2 + theta_y**2)
        s_map = np.array([[theta_x / r_norm, 0.0, -theta_y / r_norm],
                          [0.0, 1.0, 0.0],
                          [theta_y / r_norm, 0.0, theta_x / r_norm]])
        rotation = rotation @ s_map
        nocs = nocs @ s_map
    sRT = np.identity(4, dtype=np.float32)
    sRT[:3, :3] = scale * rotation
    sRT[:3, 3] = translation
    nocs = nocs.astype(np.float32)
    return points, rgb, choose, cat_id, model, prior, sRT, nocs
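# NOTE: `get_bbox` is shared by the loaders and detectors in this file but
# defined elsewhere in the repository. The simplified sketch below is only an
# illustration under the assumption that a RoI [y1, x1, y2, x2] is turned into
# a crop window clamped to a 480x640 image; the real helper additionally snaps
# the window to a fixed set of square sizes, which this sketch does not do.
def get_bbox_sketch(bbox, img_h=480, img_w=640):
    """Clamp a RoI [y1, x1, y2, x2] to image bounds and return rmin, rmax, cmin, cmax."""
    y1, x1, y2, x2 = bbox
    rmin, rmax = max(0, int(y1)), min(img_h, int(y2))
    cmin, cmax = max(0, int(x1)), min(img_w, int(x2))
    return rmin, rmax, cmin, cmax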
def detect():
    # resume model
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    estimator = DeformNet(opt.n_cat, opt.nv_prior)
    estimator.cuda()
    estimator.load_state_dict(torch.load(opt.model))
    estimator.eval()
    # get test data list
    img_list = [os.path.join(file_path.split('/')[0], line.rstrip('\n'))
                for line in open(os.path.join(opt.data_dir, file_path))]
    # TODO: test, chamfer distance
    chamferD = ChamferLoss()
    cd_num = torch.zeros(6)
    prior_cd = torch.zeros(6)
    deform_cd = torch.zeros(6)
    for path in tqdm(img_list):
        img_path = os.path.join(opt.data_dir, path)
        raw_rgb = cv2.imread(img_path + '_color.png')[:, :, :3]
        raw_rgb = raw_rgb[:, :, ::-1]
        raw_depth = load_depth(img_path)
        # load Mask R-CNN detection results
        img_path_parsing = img_path.split('/')
        mrcnn_path = os.path.join('results/mrcnn_results', opt.data, 'results_{}_{}_{}.pkl'.format(
            opt.data.split('_')[-1], img_path_parsing[-2], img_path_parsing[-1]))
        with open(mrcnn_path, 'rb') as f:
            mrcnn_result = cPickle.load(f)
        with open(img_path + '_label.pkl', 'rb') as f:
            gts = cPickle.load(f)
        num_insts = len(mrcnn_result['class_ids'])
        f_sRT = np.zeros((num_insts, 4, 4), dtype=float)
        f_size = np.zeros((num_insts, 3), dtype=float)
        # prepare frame data
        f_points, f_rgb, f_choose, f_catId, f_prior, f_model = [], [], [], [], [], []
        valid_inst = []
        for i in range(num_insts):
            cat_id = mrcnn_result['class_ids'][i] - 1
            prior = mean_shapes[cat_id]
            rmin, rmax, cmin, cmax = get_bbox(mrcnn_result['rois'][i])
            mask = np.logical_and(mrcnn_result['masks'][:, :, i], raw_depth > 0)
            choose = mask[rmin:rmax, cmin:cmax].flatten().nonzero()[0]
            # There is no depth observation for the background in the CAMERA dataset.
            # Because of how the bbox is computed in get_bbox, there may be no
            # foreground points left after cropping the mask, caused by false
            # positives of Mask R-CNN where most of the region is background.
            if len(choose) < 32:
                f_sRT[i] = np.identity(4, dtype=float)
                f_size[i] = 2 * np.amax(np.abs(prior), axis=0)
                continue
            else:
                valid_inst.append(i)
            # process objects with valid depth observation
            if len(choose) > opt.n_pts:
                c_mask = np.zeros(len(choose), dtype=int)
                c_mask[:opt.n_pts] = 1
                np.random.shuffle(c_mask)
                choose = choose[c_mask.nonzero()]
            else:
                choose = np.pad(choose, (0, opt.n_pts - len(choose)), 'wrap')
            depth_masked = raw_depth[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
            xmap_masked = xmap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
            ymap_masked = ymap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis]
            pt2 = depth_masked / norm_scale
            pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
            pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
            points = np.concatenate((pt0, pt1, pt2), axis=1)
            rgb = raw_rgb[rmin:rmax, cmin:cmax, :]
            rgb = cv2.resize(rgb, (opt.img_size, opt.img_size), interpolation=cv2.INTER_LINEAR)
            rgb = norm_color(rgb)
            crop_w = rmax - rmin
            ratio = opt.img_size / crop_w
            col_idx = choose % crop_w
            row_idx = choose // crop_w
            choose = (np.floor(row_idx * ratio) * opt.img_size + np.floor(col_idx * ratio)).astype(np.int64)
            # concatenate instances
            try:
                idx_gt = np.argwhere(gts['class_ids'] - 1 == cat_id).item()
            except:
                valid_inst.remove(i)
                continue
            model = models[gts['model_list'][idx_gt]].astype(np.float32)  # 1024 points
            f_model.append(model)
            f_points.append(points)
            f_rgb.append(rgb)
            f_choose.append(choose)
            f_catId.append(cat_id)
            f_prior.append(prior)
        if len(valid_inst):
            f_points = torch.cuda.FloatTensor(f_points)
            f_rgb = torch.stack(f_rgb, dim=0).cuda()
            f_choose = torch.cuda.LongTensor(f_choose)
            f_catId = torch.cuda.LongTensor(f_catId)
            f_prior = torch.cuda.FloatTensor(f_prior)
            f_model = torch.cuda.FloatTensor(f_model)
            # inference
            torch.cuda.synchronize()
            assign_mat, deltas = estimator(f_points, f_rgb, f_choose, f_catId, f_prior)
            # assign_mat, deltas = estimator(f_rgb, f_choose, f_catId, f_prior)
            # reconstructed points
            inst_shape = f_prior + deltas.detach()
            for i in range(len(valid_inst)):
                prior_loss, _, _ = chamferD(f_prior[i].unsqueeze(0), f_model[i].unsqueeze(0))
                deform_loss, _, _ = chamferD(inst_shape[i].unsqueeze(0), f_model[i].unsqueeze(0))
                idx = f_catId[i]
                cd_num[idx] += 1
                prior_cd[idx] += prior_loss.item()
                deform_cd[idx] += deform_loss.item()
    deform_cd_metric = (deform_cd / cd_num) * 1000
    print("recon: {:.2f} , {:.2f} , {:.2f} , {:.2f} , {:.2f} , {:.2f} , {:.2f}".format(
        deform_cd_metric[0], deform_cd_metric[1], deform_cd_metric[2], deform_cd_metric[3],
        deform_cd_metric[4], deform_cd_metric[5], torch.mean(deform_cd_metric)))
    prior_cd_metric = (prior_cd / cd_num) * 1000
    print("prior: {:.2f} , {:.2f} , {:.2f} , {:.2f} , {:.2f} , {:.2f} , {:.2f}".format(
        prior_cd_metric[0], prior_cd_metric[1], prior_cd_metric[2], prior_cd_metric[3],
        prior_cd_metric[4], prior_cd_metric[5], torch.mean(prior_cd_metric)))
def seg_maskrcnnresults():
    classifier = FusionInstanceSeg(n_classes=opt.n_cat)
    if opt.model != '':
        classifier.load_state_dict(torch.load(opt.model))
    classifier.cuda()
    classifier = classifier.eval()
    if opt.dataset == 'Real':
        file_path = os.path.join(opt.dataset, 'test_list.txt')
        cam_fx, cam_fy, cam_cx, cam_cy = 591.0125, 590.16775, 322.525, 244.11084
        result_dir = 'results/mrcnn_results/{}_test_fus_seg'.format(opt.dataset)
    else:
        file_path = os.path.join(opt.dataset, 'val_list.txt')
        cam_fx, cam_fy, cam_cx, cam_cy = 577.5, 577.5, 319.5, 239.5
        result_dir = 'results/mrcnn_results/{}_val_fus_seg'.format(opt.dataset)
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    norm_scale = 1000.0
    norm_color = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    xmap = np.array([[i for i in range(640)] for j in range(480)])
    ymap = np.array([[j for i in range(640)] for j in range(480)])
    # get test data list
    img_list = [os.path.join(file_path.split('/')[0], line.rstrip('\n'))
                for line in open(os.path.join(opt.data_dir, file_path))]
    total_count = np.zeros((opt.n_cat,), dtype=int)
    acc = np.zeros((opt.n_cat,), dtype=float)   # accuracy
    pcs = np.zeros((opt.n_cat,), dtype=float)   # precision
    rcal = np.zeros((opt.n_cat,), dtype=float)  # recall
    all_dtc_num = 0
    no_gt_num = 0
    t_start = time.time()
    for path in tqdm(img_list):
        img_path = os.path.join(opt.data_dir, path)
        raw_rgb = cv2.imread(img_path + '_color.png')[:, :, :3]
        raw_rgb = raw_rgb[:, :, ::-1]
        depth = load_depth(img_path)
        # load label
        with open(img_path + '_label.pkl', 'rb') as f:
            gts = cPickle.load(f)
        gt_mask = cv2.imread(img_path + '_mask.png')[:, :, 2]
        gt_num_insts = len(gts['class_ids'])
        gt_class_ids = gts['class_ids']
        # load Mask R-CNN detection results
        img_path_parsing = img_path.split('/')
        mrcnn_path = os.path.join('results/mrcnn_results', opt.data, 'results_{}_{}_{}.pkl'.format(
            opt.data.split('_')[-1], img_path_parsing[-2], img_path_parsing[-1]))
        with open(mrcnn_path, 'rb') as f:
            mrcnn_result = cPickle.load(f)
        num_insts = len(mrcnn_result['class_ids'])
        mrcnn_class_ids = mrcnn_result['class_ids']
        f_mask = np.zeros((num_insts, depth.shape[0], depth.shape[1]), dtype=int)
        # prepare frame data
        f_points, f_rgb, f_choose, f_catId = [], [], [], []
        f_raw_choose = []
        valid_inst = []
        result = {}
        for i in range(num_insts):
            cat_id = mrcnn_result['class_ids'][i] - 1
            rmin, rmax, cmin, cmax = get_bbox(mrcnn_result['rois'][i])
            # sample points
            depth_valid = depth > 0
            choose_depth = depth_valid[rmin:rmax, cmin:cmax].flatten().nonzero()[0]
            if len(choose_depth) < 32:
                continue
            else:
                valid_inst.append(i)
            # process objects with valid depth observation
            if len(choose_depth) > opt.n_pts:
                c_mask = np.zeros(len(choose_depth), dtype=int)
                c_mask[:opt.n_pts] = 1
                np.random.shuffle(c_mask)
                choose_depth = choose_depth[c_mask.nonzero()]
            else:
                choose_depth = np.pad(choose_depth, (0, opt.n_pts - len(choose_depth)), 'wrap')
            depth_masked = depth[rmin:rmax, cmin:cmax].flatten()[choose_depth][:, np.newaxis]
            xmap_masked = xmap[rmin:rmax, cmin:cmax].flatten()[choose_depth][:, np.newaxis]
            ymap_masked = ymap[rmin:rmax, cmin:cmax].flatten()[choose_depth][:, np.newaxis]
            pt2 = depth_masked / norm_scale
            pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx
            pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy
            points = np.concatenate((pt0, pt1, pt2), axis=1)
            # get frustum angle (according to the center pixel of the 2D box)
            box2d_center = np.array([(cmin + cmax) / 2.0, (rmin + rmax) / 2.0])
            depth_center = 1.0
            x_center = (box2d_center[0] - cam_cx) * depth_center / cam_fx
            y_center = (cam_cy - box2d_center[1]) * depth_center / cam_fy
            angle_y = -1 * np.arctan2(depth_center, x_center)
            angle_x = -1 * np.arctan2((depth_center**2 + x_center**2)**0.5, y_center)
            # get point cloud after frustum rotation
            points = get_center_view_point_set(points, angle_y, angle_x)  # (n, 3)
            rgb = raw_rgb[rmin:rmax, cmin:cmax, :]
            rgb = cv2.resize(rgb, (opt.img_size, opt.img_size), interpolation=cv2.INTER_LINEAR)
            rgb = norm_color(rgb)
            crop_w = rmax - rmin
            ratio = opt.img_size / crop_w
            col_idx = choose_depth % crop_w
            row_idx = choose_depth // crop_w
            raw_choose = np.copy(choose_depth)
            choose_depth = (np.floor(row_idx * ratio) * opt.img_size + np.floor(col_idx * ratio)).astype(np.int64)
            f_points.append(points)
            f_rgb.append(rgb)
            f_catId.append(cat_id)
            f_choose.append(choose_depth)
            f_raw_choose.append(raw_choose)
        if len(valid_inst):
            f_points = torch.cuda.FloatTensor(f_points)
            f_rgb = torch.stack(f_rgb, dim=0).cuda()
            f_catId = torch.cuda.LongTensor(f_catId)
            f_one_hot_vec = F.one_hot(f_catId, opt.n_cat)
            f_choose = torch.cuda.LongTensor(f_choose)
            f_points = f_points.transpose(2, 1)
            logits = classifier(f_points, f_rgb, f_one_hot_vec, f_choose)
            logits_choice = logits.data.max(2)[1]
            logits_np = logits_choice.cpu().data.numpy()
            f_choose = f_choose.cpu().numpy()
            for i in range(len(valid_inst)):
                inst_idx = valid_inst[i]
                choose_depth = f_choose[i]
                raw_choose = f_raw_choose[i]
                logits_np_inst = logits_np[i]
                choose_logits_np = logits_np_inst.nonzero()
                rmin, rmax, cmin, cmax = get_bbox(mrcnn_result['rois'][inst_idx])
                roi_mask = np.zeros(((rmax - rmin) * (cmax - cmin)), dtype=int)
                roi_mask[raw_choose[choose_logits_np]] = 1
                roi_mask = roi_mask.reshape((rmax - rmin, cmax - cmin))
                f_mask[inst_idx][rmin:rmax, cmin:cmax] = roi_mask
                all_dtc_num += 1
                map_to_gt = []
                for j in range(len(gt_class_ids)):
                    if gt_class_ids[j] != mrcnn_class_ids[inst_idx]:
                        continue
                    pred_box = [cmin, rmin, cmax, rmax]
                    rmin2, rmax2, cmin2, cmax2 = get_bbox(gts['bboxes'][j])
                    gt_box = [cmin2, rmin2, cmax2, rmax2]
                    iou = cal_iou(pred_box, gt_box)
                    if iou < opt.iou_thd:
                        continue
                    # match found
                    map_to_gt.append(np.array([j, iou]))
                if len(map_to_gt) == 0:
                    no_gt_num += 1
                else:
                    max_iou_idx = np.argmax(np.array(map_to_gt)[:, 1])
                    j = int(map_to_gt[max_iou_idx][0])
                    gt_mask_ins = gt_mask == gts['instance_ids'][j]
                    gt_roi_mask = gt_mask_ins[rmin:rmax, cmin:cmax]
                    raw_choose, choose_raw_choose = np.unique(raw_choose, return_index=True)
                    gt_logits = gt_roi_mask.flatten()[raw_choose]
                    logits_bias = logits_np_inst[choose_raw_choose] == gt_logits
                    logits_TP = np.logical_and(logits_np_inst[choose_raw_choose], gt_logits)
                    correct_seg_num = np.sum(np.array(logits_bias))
                    TP_seg_num = np.sum(np.array(logits_TP))
                    acc_ins = correct_seg_num / len(raw_choose)
                    pcs_ins = TP_seg_num / np.sum(logits_np_inst[choose_raw_choose])
                    rcal_ins = TP_seg_num / np.sum(gt_logits)
                    total_count[mrcnn_class_ids[inst_idx] - 1] += 1
                    acc[mrcnn_class_ids[inst_idx] - 1] += acc_ins
                    pcs[mrcnn_class_ids[inst_idx] - 1] += pcs_ins
                    rcal[mrcnn_class_ids[inst_idx] - 1] += rcal_ins
        result['class_ids'] = mrcnn_result['class_ids']
        result['rois'] = mrcnn_result['rois']
        result['scores'] = mrcnn_result['scores']
        result['masks'] = (f_mask.transpose(1, 2, 0) > 0)
        if opt.save_pkl:
            save_path = os.path.join(result_dir, 'results_{}_{}_{}.pkl'.format(
                opt.data.split('_')[-1], img_path_parsing[-2], img_path_parsing[-1]))
            with open(save_path, 'wb') as f:
                cPickle.dump(result, f)
    # compute accuracy
    catId_to_name = {0: 'bottle', 1: 'bowl', 2: 'camera', 3: 'can', 4: 'laptop', 5: 'mug'}
    acc, pcs, rcal = 100 * (acc / total_count), 100 * (pcs / total_count), 100 * (rcal / total_count)
    overall_acc, overall_pcs, overall_rcal = np.mean(acc), np.mean(pcs), np.mean(rcal)
    no_gt_ratio = 100 * (no_gt_num / all_dtc_num)
    fw = open('{0}/seg_acc_pcs.txt'.format(result_dir), 'a')
    messages = []
    messages.append('segmentation results:')
    messages.append('{:>12s}{:>12s}{:>12s}{:>12s}'.format('category', 'accuracy', 'precision', 'recall'))
    for i in range(acc.shape[0]):
        messages.append("{:>12s}{:>12.2f}{:>12.2f}{:>12.2f}".format(catId_to_name[i], acc[i], pcs[i], rcal[i]))
    messages.append("{:>12s}{:>12.2f}{:>12.2f}{:>12.2f}".format('overall', overall_acc, overall_pcs, overall_rcal))
    messages.append("{:>12s}{:>12.2f}".format('mismatch', no_gt_ratio))
    for msg in messages:
        print(msg)
        fw.write(msg + '\n')
    fw.close()