def parse_groundtruths(end_points, config_dict):
    """Build per-sample ground-truth (class, box corners) lists from label tensors.

    Args:
        end_points: dict of label tensors
            {center_label, heading_class_label, heading_residual_label,
            size_class_label, size_residual_label, sem_cls_label,
            box_label_mask}
        config_dict: dict {dataset_config}; the dataset config must provide
            class2angle and class2size

    Returns:
        batch_gt_map_cls: a list of len == batch_size (BS)
            [gt_list_i], i = 0, 1, ..., BS-1
            where gt_list_i = [(gt_sem_cls, gt_box_corners)_j]
            where j runs over the objects of sample i whose
            box_label_mask entry equals 1.
            The same list is also stored in end_points['batch_gt_map_cls'].
    """
    dataset_config = config_dict['dataset_config']

    centers = end_points['center_label']
    heading_cls = end_points['heading_class_label']
    heading_res = end_points['heading_residual_label']
    size_cls = end_points['size_class_label']
    size_res = end_points['size_residual_label']
    valid = end_points['box_label_mask']
    sem_cls = end_points['sem_cls_label']

    batch_size, max_obj = centers.shape[0], centers.shape[1]  # max_obj == MAX_NUM_OBJ

    # NOTE: no flip_axis_to_camera is applied to the centers in this function.
    centers_np = centers[:, :, 0:3].detach().cpu().numpy()
    corners = np.zeros((batch_size, max_obj, 8, 3))

    # Pass 1: corner points for every box that is not masked out.
    for b in range(batch_size):
        for k in range(max_obj):
            if valid[b, k] == 0:
                continue
            angle = dataset_config.class2angle(
                heading_cls[b, k].detach().cpu().numpy(),
                heading_res[b, k].detach().cpu().numpy())
            size = dataset_config.class2size(
                int(size_cls[b, k].detach().cpu().numpy()),
                size_res[b, k].detach().cpu().numpy())
            corners[b, k] = get_3d_box(size, angle, centers_np[b, k, :])

    # Pass 2: pair each valid box with its semantic class.
    batch_gt_map_cls = []
    for b in range(batch_size):
        sample_boxes = []
        for k in range(corners.shape[1]):
            if valid[b, k] == 1:
                sample_boxes.append((sem_cls[b, k].item(), corners[b, k]))
        batch_gt_map_cls.append(sample_boxes)

    end_points['batch_gt_map_cls'] = batch_gt_map_cls
    return batch_gt_map_cls
def predictions2corners3d(end_points, config_dict):
    """Convert network box predictions to OBB parameters and eight corner points.

    For every proposal the highest-scoring heading bin and size cluster are
    selected, their residuals gathered, and the dataset config turns them
    into a heading angle and a box size.

    Args:
        end_points: dict {center, heading_scores, heading_residuals,
            size_scores, size_residuals}
        config_dict: dict; only 'dataset_config' is read here (it must
            provide class2angle and class2size)

    Returns:
        pred_corners_3d_upright_camera: ndarray (num_batch, num_proposals, 8, 3)
        pred_box_parameters: ndarray (num_batch, num_proposals, 7), laid out
            as [center (3), box size (3), heading angle (1)]; the center is
            copied unchanged from end_points['center']
    """
    dataset_config = config_dict['dataset_config']

    pred_center = end_points['center']  # B,num_proposal,3
    pred_heading_class = torch.argmax(end_points['heading_scores'], -1)  # B,num_proposal
    pred_heading_residual = torch.gather(
        end_points['heading_residuals'], 2,
        pred_heading_class.unsqueeze(-1)).squeeze(2)  # B,num_proposal
    pred_size_class = torch.argmax(end_points['size_scores'], -1)  # B,num_proposal
    pred_size_residual = torch.gather(
        end_points['size_residuals'], 2,
        pred_size_class.unsqueeze(-1).unsqueeze(-1).repeat(1, 1, 1, 3)
    ).squeeze(2)  # B,num_proposal,3

    # Move everything to host memory once; doing it per proposal inside the
    # loop costs one device synchronisation per element.
    center_np = pred_center.detach().cpu().numpy()
    heading_class_np = pred_heading_class.detach().cpu().numpy()
    heading_residual_np = pred_heading_residual.detach().cpu().numpy()
    size_class_np = pred_size_class.detach().cpu().numpy()
    size_residual_np = pred_size_residual.detach().cpu().numpy()

    bsize, num_proposal = center_np.shape[0], center_np.shape[1]

    pred_box_parameters = np.zeros((bsize, num_proposal, 7), dtype=np.float32)
    pred_box_parameters[:, :, 0:3] = center_np
    pred_corners_3d_upright_camera = np.zeros((bsize, num_proposal, 8, 3), dtype=np.float32)

    # Since we operate in upright_depth coord for points, while util functions
    # assume upright_camera coord.
    pred_center_upright_camera = flip_axis_to_camera(center_np)

    for i in range(bsize):
        for j in range(num_proposal):
            heading_angle = dataset_config.class2angle(
                heading_class_np[i, j], heading_residual_np[i, j])
            box_size = dataset_config.class2size(
                int(size_class_np[i, j]), size_residual_np[i, j])
            pred_box_parameters[i, j, 3:6] = box_size
            pred_box_parameters[i, j, 6] = heading_angle
            pred_corners_3d_upright_camera[i, j] = get_3d_box(
                box_size, heading_angle, pred_center_upright_camera[i, j, :])

    return pred_corners_3d_upright_camera, pred_box_parameters
def get_roi_ptcloud(inputs, batch_pred_boxes_params, enlarge_ratio=1.2, num_point_roi=512, min_num_point=100):
    """Crop a fixed-size point set around every predicted box.

    Args:
        inputs: dict {'point_clouds'}; tensor with the point clouds of the
            whole scene, indexed as (B, N, C)
        batch_pred_boxes_params: (B, num_proposals, 7) numpy array of
            predicted boxes, read as [center (3), size (3), heading angle]
        enlarge_ratio: scalar factor applied to the box size before cropping
        num_point_roi: number of points sampled from each enlarged box
        min_num_point: a box containing fewer points than this is treated
            as empty

    Returns:
        batch_pc_roi: (B, num_proposals, num_point_roi, C) float32 numpy
            array; rows of empty boxes stay all-zero
        nonempty_roi_mask: (B, num_proposals) numpy array, 1 where the box
            held at least min_num_point points and 0 otherwise
    """
    scene_points = inputs['point_clouds'].detach().cpu().numpy()  # B,N,C
    num_batch, num_boxes = batch_pred_boxes_params.shape[:2]
    num_feat = scene_points.shape[2]

    batch_pc_roi = np.zeros((num_batch, num_boxes, num_point_roi, num_feat), dtype=np.float32)
    nonempty_roi_mask = np.ones((num_batch, num_boxes))

    for b in range(num_batch):
        cloud = scene_points[b, :, :]  # (N,C)
        for k in range(num_boxes):
            params = batch_pred_boxes_params[b, k, :]  # (7)
            center_upright_camera = flip_axis_to_camera(params[0:3])
            enlarged_size = params[3:6] * enlarge_ratio
            # Corners are built in camera coordinates, then mapped back to
            # the depth coordinates the point cloud lives in.
            corners = flip_axis_to_depth(
                get_3d_box(enlarged_size, params[6], center_upright_camera))
            pc_in_box, _ = extract_pc_in_box3d(cloud, corners)

            if len(pc_in_box) < min_num_point:
                nonempty_roi_mask[b, k] = 0
                continue
            batch_pc_roi[b, k, :, :] = random_sampling(pc_in_box, num_point_roi)

    return batch_pc_roi, nonempty_roi_mask
def groundtruths2corners3d(end_points, config_dict):
    """Convert ground-truth labels to OBB parameters and eight corner points.

    Args:
        end_points: dict {center_label, heading_class_label,
            heading_residual_label, size_class_label, size_residual_label,
            box_label_mask}
        config_dict: dict {dataset_config}; the dataset config must provide
            class2angle and class2size

    Returns:
        gt_corners_3d_upright_camera: ndarray (num_batch, MAX_NUM_OBJ, 8, 3)
        gt_box_parameters: ndarray (num_batch, MAX_NUM_OBJ, 7), laid out as
            [center (3), box size (3), heading angle (1)]

        Entries whose box_label_mask is 0 keep all-zero corners and zero
        size/heading; their center columns are still copied from
        center_label.
    """
    dataset_config = config_dict['dataset_config']

    centers = end_points['center_label']
    heading_cls = end_points['heading_class_label']
    heading_res = end_points['heading_residual_label']
    size_cls = end_points['size_class_label']
    size_res = end_points['size_residual_label']
    valid = end_points['box_label_mask']

    num_batch, max_obj = centers.shape[0], centers.shape[1]  # max_obj == MAX_NUM_OBJ

    gt_box_parameters = np.zeros((num_batch, max_obj, 7), dtype=np.float32)
    gt_box_parameters[:, :, 0:3] = centers.detach().cpu().numpy()
    gt_corners_3d_upright_camera = np.zeros((num_batch, max_obj, 8, 3), dtype=np.float32)
    centers_upright_camera = flip_axis_to_camera(centers[:, :, 0:3].detach().cpu().numpy())

    for b in range(num_batch):
        for k in range(max_obj):
            if valid[b, k] == 0:
                # padded slot: leave zeros
                continue
            angle = dataset_config.class2angle(
                heading_cls[b, k].detach().cpu().numpy(),
                heading_res[b, k].detach().cpu().numpy())
            size = dataset_config.class2size(
                int(size_cls[b, k].detach().cpu().numpy()),
                size_res[b, k].detach().cpu().numpy())
            gt_box_parameters[b, k, 3:6] = size
            gt_box_parameters[b, k, 6] = angle
            gt_corners_3d_upright_camera[b, k] = get_3d_box(
                size, angle, centers_upright_camera[b, k, :])

    return gt_corners_3d_upright_camera, gt_box_parameters
def dump_results(args, scanrefer, data, config):
    """Dump visualisation files for every sample of one batch.

    For each sample the aligned scene mesh, the colored input point cloud,
    the ground-truth reference box and the predicted reference box are
    written below ``<CONF.PATH.OUTPUT>/<args.folder>/vis/<scene_id>/``.

    Args:
        args: options object; ``args.folder`` and ``args.use_color`` are read
        scanrefer: indexable annotations; each entry provides "scene_id",
            "object_id", "object_name" and "ann_id"
        data: dict of batched tensors with inputs, labels and network
            outputs (see the keys read below)
        config: dataset config providing ``param2obb``

    Returns:
        None; all results are written to disk.
    """
    dump_dir = os.path.join(CONF.PATH.OUTPUT, args.folder, "vis")
    os.makedirs(dump_dir, exist_ok=True)

    # from inputs
    ids = data['scan_idx'].detach().cpu().numpy()
    point_clouds = data['point_clouds'].cpu().numpy()
    batch_size = point_clouds.shape[0]

    pcl_color = data["pcl_color"].detach().cpu().numpy()
    if args.use_color:
        # presumably inverts a (rgb - MEAN_COLOR_RGB) / 256 normalisation
        # applied when the data was loaded -- TODO confirm against the loader
        pcl_color = (pcl_color * 256 + MEAN_COLOR_RGB).astype(np.int64)

    # from network outputs
    # detection
    objectness = torch.argmax(data['objectness_scores'], 2).float()
    pred_objectness = objectness.detach().cpu().numpy()
    pred_center = data['center'].detach().cpu().numpy()  # (B,K,3)
    pred_heading_class = torch.argmax(data['heading_scores'], -1)  # B,num_proposal
    pred_heading_residual = torch.gather(
        data['heading_residuals'], 2,
        pred_heading_class.unsqueeze(-1))  # B,num_proposal,1
    pred_heading_class = pred_heading_class.detach().cpu().numpy()  # B,num_proposal
    pred_heading_residual = pred_heading_residual.squeeze(2).detach().cpu().numpy()  # B,num_proposal
    pred_size_class = torch.argmax(data['size_scores'], -1)  # B,num_proposal
    pred_size_residual = torch.gather(
        data['size_residuals'], 2,
        pred_size_class.unsqueeze(-1).unsqueeze(-1).repeat(1, 1, 1, 3))  # B,num_proposal,1,3
    pred_size_residual = pred_size_residual.squeeze(2).detach().cpu().numpy()  # B,num_proposal,3

    # reference
    pred_ref_scores = data["cluster_ref"].detach().cpu().numpy()
    pred_ref_scores_softmax = F.softmax(
        data["cluster_ref"] * objectness * data['pred_mask'],
        dim=1).detach().cpu().numpy()

    # post-processing
    nms_masks = data['pred_mask'].detach().cpu().numpy()  # B,num_proposal

    # ground truth
    gt_center = data['center_label'].cpu().numpy()  # (B,MAX_NUM_OBJ,3)
    gt_heading_class = data['heading_class_label'].cpu().numpy()  # B,K2
    gt_heading_residual = data['heading_residual_label'].cpu().numpy()  # B,K2
    gt_size_class = data['size_class_label'].cpu().numpy()  # B,K2
    gt_size_residual = data['size_residual_label'].cpu().numpy()  # B,K2,3
    # reference
    gt_ref_labels = data["ref_box_label"].detach().cpu().numpy()

    for i in range(batch_size):
        # basic info
        idx = ids[i]
        scene_id = scanrefer[idx]["scene_id"]
        object_id = scanrefer[idx]["object_id"]
        object_name = scanrefer[idx]["object_name"]
        ann_id = scanrefer[idx]["ann_id"]

        # scene_output
        scene_dump_dir = os.path.join(dump_dir, scene_id)
        # exist_ok avoids the check-then-create race of exists() + mkdir()
        os.makedirs(scene_dump_dir, exist_ok=True)

        # Dump the original scene point clouds
        mesh = align_mesh(scene_id)
        mesh.write(os.path.join(scene_dump_dir, 'mesh.ply'))
        write_ply_rgb(point_clouds[i], pcl_color[i],
                      os.path.join(scene_dump_dir, 'pc.ply'))

        # filter out the valid ground truth reference box
        assert gt_ref_labels[i].shape[0] == gt_center[i].shape[0]
        gt_ref_idx = np.argmax(gt_ref_labels[i], 0)

        # visualize the gt reference box
        # NOTE: for each object there should be only one gt reference box
        gt_ply_path = os.path.join(
            scene_dump_dir, 'gt_{}_{}.ply'.format(object_id, object_name))
        gt_obb = config.param2obb(gt_center[i, gt_ref_idx, 0:3],
                                  gt_heading_class[i, gt_ref_idx],
                                  gt_heading_residual[i, gt_ref_idx],
                                  gt_size_class[i, gt_ref_idx],
                                  gt_size_residual[i, gt_ref_idx])
        gt_bbox = get_3d_box(gt_obb[3:6], gt_obb[6], gt_obb[0:3])

        # several annotations can refer to the same object; write its GT
        # box only once
        if not os.path.exists(gt_ply_path):
            write_bbox(gt_obb, 0, gt_ply_path)

        # find the valid reference prediction
        pred_masks = nms_masks[i] * pred_objectness[i] == 1
        assert pred_ref_scores[i].shape[0] == pred_center[i].shape[0]
        pred_ref_idx = np.argmax(pred_ref_scores[i] * pred_masks, 0)

        # visualize the predicted reference box
        pred_obb = config.param2obb(pred_center[i, pred_ref_idx, 0:3],
                                    pred_heading_class[i, pred_ref_idx],
                                    pred_heading_residual[i, pred_ref_idx],
                                    pred_size_class[i, pred_ref_idx],
                                    pred_size_residual[i, pred_ref_idx])
        pred_bbox = get_3d_box(pred_obb[3:6], pred_obb[6], pred_obb[0:3])

        iou = box3d_iou(gt_bbox, pred_bbox)
        if isinstance(iou, (tuple, list)):
            # Some box3d_iou variants return a pair (other callers unpack
            # two values); the file name below needs the scalar 3D IoU.
            iou = iou[0]

        write_bbox(
            pred_obb, 1,
            os.path.join(
                scene_dump_dir,
                'pred_{}_{}_{}_{:.5f}_{:.5f}.ply'.format(
                    object_id, object_name, ann_id,
                    pred_ref_scores_softmax[i, pred_ref_idx], iou)))
def predict(args):
    """Run the model over the test split and dump the predicted reference boxes.

    For every sample the proposal with the highest reference score (among
    proposals predicted as objects and, unless ``args.no_nms`` is set, kept
    by NMS) is converted to eight box corners. All records are written to
    ``<CONF.PATH.OUTPUT>/<args.folder>/pred.json``.

    Args:
        args: options object; ``args.no_nms`` and ``args.folder`` are read
            here, the rest is forwarded to the data/model factories.
    """
    print("predict bounding boxes...")
    # constant
    DC = ScannetDatasetConfig()

    # init training dataset
    print("preparing data...")
    scanrefer, scene_list = get_scanrefer(args)

    # dataloader
    _, dataloader = get_dataloader(args, scanrefer, scene_list, "test", DC)

    # model
    model = get_model(args, DC)

    # config
    POST_DICT = {
        "remove_empty_box": True,
        "use_3d_nms": True,
        "nms_iou": 0.25,
        "use_old_type_nms": False,
        "cls_nms": True,
        "per_class_proposal": True,
        "conf_thresh": 0.05,
        "dataset_config": DC
    } if not args.no_nms else None

    # predict
    print("predicting...")
    pred_bboxes = []
    for data_dict in tqdm(dataloader):
        for key in data_dict:
            data_dict[key] = data_dict[key].cuda()

        # Inference only: no_grad avoids building the autograd graph.
        with torch.no_grad():
            # feed
            data_dict = model(data_dict)
            # NOTE(review): requires a get_loss that accepts the `detection`
            # keyword -- confirm against the get_loss imported by this module.
            _, data_dict = get_loss(
                data_dict=data_dict,
                config=DC,
                detection=False,
                reference=True
            )

            objectness_preds_batch = torch.argmax(data_dict['objectness_scores'], 2).long()

            if POST_DICT:
                _ = parse_predictions(data_dict, POST_DICT)
                nms_masks = torch.LongTensor(data_dict['pred_mask']).cuda()

                # construct valid mask
                pred_masks = (nms_masks * objectness_preds_batch == 1).float()
            else:
                # construct valid mask
                pred_masks = (objectness_preds_batch == 1).float()

            pred_ref = torch.argmax(data_dict['cluster_ref'] * pred_masks, 1)  # (B,)
            pred_center = data_dict['center']  # (B,K,3)
            pred_heading_class = torch.argmax(data_dict['heading_scores'], -1)  # B,num_proposal
            pred_heading_residual = torch.gather(
                data_dict['heading_residuals'], 2,
                pred_heading_class.unsqueeze(-1)).squeeze(2)  # B,num_proposal
            pred_size_class = torch.argmax(data_dict['size_scores'], -1)  # B,num_proposal
            pred_size_residual = torch.gather(
                data_dict['size_residuals'], 2,
                pred_size_class.unsqueeze(-1).unsqueeze(-1).repeat(1, 1, 1, 3)
            ).squeeze(2)  # B,num_proposal,3

        # One host transfer per batch instead of five per sample.
        pred_ref = pred_ref.detach().cpu().numpy()
        pred_center = pred_center.detach().cpu().numpy()
        pred_heading_class = pred_heading_class.detach().cpu().numpy()
        pred_heading_residual = pred_heading_residual.detach().cpu().numpy()
        pred_size_class = pred_size_class.detach().cpu().numpy()
        pred_size_residual = pred_size_residual.detach().cpu().numpy()

        for i in range(pred_ref.shape[0]):
            # box of the selected proposal
            pred_ref_idx = pred_ref[i]
            pred_obb = DC.param2obb(
                pred_center[i, pred_ref_idx, 0:3],
                pred_heading_class[i, pred_ref_idx],
                pred_heading_residual[i, pred_ref_idx],
                pred_size_class[i, pred_ref_idx],
                pred_size_residual[i, pred_ref_idx]
            )
            pred_bbox = get_3d_box(pred_obb[3:6], pred_obb[6], pred_obb[0:3])

            # construct the multiple mask
            multiple = data_dict["unique_multiple"][i].item()

            # construct the others mask (category id 17 is flagged as "others")
            others = 1 if data_dict["object_cat"][i] == 17 else 0

            # store data
            scanrefer_idx = data_dict["scan_idx"][i].item()
            pred_bboxes.append({
                "scene_id": scanrefer[scanrefer_idx]["scene_id"],
                "object_id": scanrefer[scanrefer_idx]["object_id"],
                "ann_id": scanrefer[scanrefer_idx]["ann_id"],
                "bbox": pred_bbox.tolist(),
                "unique_multiple": multiple,
                "others": others
            })

    # dump
    print("dumping...")
    pred_path = os.path.join(CONF.PATH.OUTPUT, args.folder, "pred.json")
    with open(pred_path, "w") as f:
        json.dump(pred_bboxes, f, indent=4)

    print("done!")
def __getitem__(self, idx):
    """Assemble one referring-expression sample as a dict of numpy arrays.

    Loads the scene referenced by ``self.scanrefer[idx]``, builds the point
    cloud with the configured feature channels, samples ``self.num_points``
    points and, for every split except "test", creates the detection,
    voting and reference labels (optionally after augmentation).

    Args:
        idx: index into ``self.scanrefer``

    Returns:
        data_dict: dict mapping field names to numpy arrays/scalars, plus
        "load_time" (seconds spent in this call). For the "test" split all
        label arrays stay zero-filled.
    """
    start = time.time()

    scene_id = self.scanrefer[idx]["scene_id"]
    object_id = int(self.scanrefer[idx]["object_id"])
    object_name = " ".join(self.scanrefer[idx]["object_name"].split("_"))
    ann_id = self.scanrefer[idx]["ann_id"]

    # get language features
    lang_feat = self.lang[scene_id][str(object_id)][ann_id]
    # token count + 2, capped at MAX_DES_LEN + 2
    lang_len = min(len(self.scanrefer[idx]["token"]) + 2,
                   CONF.TRAIN.MAX_DES_LEN + 2)

    # get pc
    mesh_vertices = self.scene_data[scene_id]["mesh_vertices"]
    instance_labels = self.scene_data[scene_id]["instance_labels"]
    semantic_labels = self.scene_data[scene_id]["semantic_labels"]
    instance_bboxes = self.scene_data[scene_id]["instance_bboxes"]

    if not self.use_color:
        point_cloud = mesh_vertices[:, 0:3]  # do not use color for now
        pcl_color = mesh_vertices[:, 3:6]
    else:
        # Copy before normalising: the slice is a view of the array stored
        # in self.scene_data, so an in-place update would re-normalise the
        # stored colors every time this scene is accessed.
        point_cloud = mesh_vertices[:, 0:6].copy()
        point_cloud[:, 3:6] = (point_cloud[:, 3:6] - MEAN_COLOR_RGB) / 256.0
        pcl_color = point_cloud[:, 3:6]

    if self.use_normal:
        normals = mesh_vertices[:, 6:9]
        point_cloud = np.concatenate([point_cloud, normals], 1)

    if self.use_multiview:
        # load multiview database lazily on first use
        if self.multiview_data == {}:
            self.multiview_data = h5py.File(MULTIVIEW_DATA, "r", libver="latest")

        multiview = self.multiview_data[scene_id]
        point_cloud = np.concatenate([point_cloud, multiview], 1)

    if self.use_height:
        # NOTE: np.percentile takes q in [0, 100], so this is the 0.99th
        # percentile of z, i.e. a value close to the minimum.
        floor_height = np.percentile(point_cloud[:, 2], 0.99)
        height = point_cloud[:, 2] - floor_height
        point_cloud = np.concatenate(
            [point_cloud, np.expand_dims(height, 1)], 1)

    point_cloud, choices = random_sampling(point_cloud,
                                           self.num_points,
                                           return_choices=True)
    instance_labels = instance_labels[choices]
    semantic_labels = semantic_labels[choices]
    pcl_color = pcl_color[choices]

    # ------------------------------- LABELS ------------------------------
    target_bboxes = np.zeros((MAX_NUM_OBJ, 6))
    target_bboxes_mask = np.zeros((MAX_NUM_OBJ))
    angle_classes = np.zeros((MAX_NUM_OBJ, ))
    angle_residuals = np.zeros((MAX_NUM_OBJ, ))
    size_classes = np.zeros((MAX_NUM_OBJ, ))
    size_residuals = np.zeros((MAX_NUM_OBJ, 3))

    ref_box_label = np.zeros(MAX_NUM_OBJ)  # bbox label for reference target
    ref_center_label = np.zeros(3)  # bbox center for reference target
    ref_heading_class_label = 0
    ref_heading_residual_label = 0
    ref_size_class_label = 0
    ref_size_residual_label = np.zeros(3)  # bbox size residual for reference target
    ref_box_corner_label = np.zeros((8, 3))

    # Initialised for every split: these are exported unconditionally below
    # and would otherwise be undefined for the "test" split.
    gt_box_corner_label = np.zeros((MAX_NUM_OBJ, 8, 3))
    gt_box_masks = np.zeros((MAX_NUM_OBJ, ))
    gt_box_object_ids = np.zeros((MAX_NUM_OBJ, ))

    if self.split != "test":
        num_bbox = min(instance_bboxes.shape[0], MAX_NUM_OBJ)
        target_bboxes_mask[0:num_bbox] = 1
        target_bboxes[0:num_bbox, :] = instance_bboxes[:MAX_NUM_OBJ, 0:6]

        point_votes = np.zeros([self.num_points, 3])
        point_votes_mask = np.zeros(self.num_points)

        # ------------------------------- DATA AUGMENTATION ------------------------------
        if self.augment:
            if np.random.random() > 0.5:
                # Flipping along the YZ plane
                point_cloud[:, 0] = -1 * point_cloud[:, 0]
                target_bboxes[:, 0] = -1 * target_bboxes[:, 0]

            if np.random.random() > 0.5:
                # Flipping along the XZ plane
                point_cloud[:, 1] = -1 * point_cloud[:, 1]
                target_bboxes[:, 1] = -1 * target_bboxes[:, 1]

            # Rotation along X-axis
            rot_angle = (np.random.random() * np.pi / 18) - np.pi / 36  # -5 ~ +5 degree
            rot_mat = rotx(rot_angle)
            point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                         np.transpose(rot_mat))
            target_bboxes = rotate_aligned_boxes_along_axis(
                target_bboxes, rot_mat, "x")

            # Rotation along Y-axis
            rot_angle = (np.random.random() * np.pi / 18) - np.pi / 36  # -5 ~ +5 degree
            rot_mat = roty(rot_angle)
            point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                         np.transpose(rot_mat))
            target_bboxes = rotate_aligned_boxes_along_axis(
                target_bboxes, rot_mat, "y")

            # Rotation along up-axis/Z-axis
            rot_angle = (np.random.random() * np.pi / 18) - np.pi / 36  # -5 ~ +5 degree
            rot_mat = rotz(rot_angle)
            point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                         np.transpose(rot_mat))
            target_bboxes = rotate_aligned_boxes_along_axis(
                target_bboxes, rot_mat, "z")

            # Translation
            point_cloud, target_bboxes = self._translate(
                point_cloud, target_bboxes)

        # compute votes *AFTER* augmentation
        # generate votes
        # Note: since there's no map between bbox instance labels and
        # pc instance_labels (it had been filtered
        # in the data preparation step) we'll compute the instance bbox
        # from the points sharing the same instance label.
        for i_instance in np.unique(instance_labels):
            # find all points belong to that instance
            ind = np.where(instance_labels == i_instance)[0]
            # find the semantic label
            if semantic_labels[ind[0]] in DC.nyu40ids:
                x = point_cloud[ind, :3]
                center = 0.5 * (x.min(0) + x.max(0))
                point_votes[ind, :] = center - x
                point_votes_mask[ind] = 1.0
        point_votes = np.tile(point_votes, (1, 3))  # make 3 votes identical

        class_ind = [
            DC.nyu40id2class[int(sem_id)]
            for sem_id in instance_bboxes[:num_bbox, -2]
        ]
        # NOTE: set size class as semantic class. Consider use size2class.
        size_classes[0:num_bbox] = class_ind
        size_residuals[0:num_bbox, :] = target_bboxes[
            0:num_bbox, 3:6] - DC.mean_size_arr[class_ind, :]

        # construct the reference target label for each bbox
        for i, gt_id in enumerate(instance_bboxes[:num_bbox, -1]):
            if gt_id == object_id:
                ref_box_label[i] = 1
                ref_center_label = target_bboxes[i, 0:3]
                ref_heading_class_label = angle_classes[i]
                ref_heading_residual_label = angle_residuals[i]
                ref_size_class_label = size_classes[i]
                ref_size_residual_label = size_residuals[i]

                # construct ground truth box corner coordinates
                ref_obb = DC.param2obb(ref_center_label,
                                       ref_heading_class_label,
                                       ref_heading_residual_label,
                                       ref_size_class_label,
                                       ref_size_residual_label)
                ref_box_corner_label = get_3d_box(ref_obb[3:6], ref_obb[6],
                                                  ref_obb[0:3])

        # construct all GT bbox corners
        all_obb = DC.param2obb_batch(
            target_bboxes[:num_bbox, 0:3],
            angle_classes[:num_bbox].astype(np.int64),
            angle_residuals[:num_bbox],
            size_classes[:num_bbox].astype(np.int64),
            size_residuals[:num_bbox])
        all_box_corner_label = get_3d_box_batch(all_obb[:, 3:6],
                                                all_obb[:, 6],
                                                all_obb[:, 0:3])

        # store
        gt_box_corner_label[:num_bbox] = all_box_corner_label
        gt_box_masks[:num_bbox] = 1
        # Slice to num_bbox so scenes with more than MAX_NUM_OBJ boxes do
        # not cause a shape mismatch.
        gt_box_object_ids[:num_bbox] = instance_bboxes[:num_bbox, -1]
    else:
        num_bbox = 1
        point_votes = np.zeros([self.num_points, 9])  # make 3 votes identical
        point_votes_mask = np.zeros(self.num_points)

    target_bboxes_semcls = np.zeros((MAX_NUM_OBJ))
    target_object_ids = np.zeros((MAX_NUM_OBJ, ))  # object ids of all objects
    try:
        target_bboxes_semcls[0:num_bbox] = [
            DC.nyu40id2class[int(sem_id)]
            for sem_id in instance_bboxes[:, -2][0:num_bbox]
        ]
        target_object_ids[0:num_bbox] = instance_bboxes[:, -1][0:num_bbox]
    except KeyError:
        # Best effort: if a semantic id has no class mapping, both arrays
        # are left zero-filled.
        pass

    # category 17 is used when the object name is not in raw2label
    object_cat = self.raw2label[
        object_name] if object_name in self.raw2label else 17

    data_dict = {}
    data_dict["point_clouds"] = point_cloud.astype(
        np.float32)  # point cloud data including features
    data_dict["lang_feat"] = lang_feat.astype(
        np.float32)  # language feature vectors
    data_dict["lang_len"] = np.array(lang_len).astype(
        np.int64)  # length of each description
    data_dict["lang_ids"] = np.array(
        self.lang_ids[scene_id][str(object_id)][ann_id]).astype(np.int64)
    # all data with MAX_NUM_OBJ are mostly filled with zeros
    data_dict["center_label"] = target_bboxes.astype(
        np.float32)[:, 0:3]  # (MAX_NUM_OBJ, 3) for GT box center XYZ
    data_dict["heading_class_label"] = angle_classes.astype(
        np.int64)  # (MAX_NUM_OBJ,)
    data_dict["heading_residual_label"] = angle_residuals.astype(
        np.float32)  # (MAX_NUM_OBJ,)
    data_dict["size_class_label"] = size_classes.astype(
        np.int64)  # (MAX_NUM_OBJ,)
    data_dict["size_residual_label"] = size_residuals.astype(
        np.float32)  # (MAX_NUM_OBJ, 3)
    data_dict["num_bbox"] = np.array(num_bbox).astype(np.int64)
    data_dict["sem_cls_label"] = target_bboxes_semcls.astype(
        np.int64)  # (MAX_NUM_OBJ,) semantic class index
    data_dict["scene_object_ids"] = target_object_ids.astype(
        np.int64)  # (MAX_NUM_OBJ,) object ids of all objects
    data_dict["box_label_mask"] = target_bboxes_mask.astype(
        np.float32)  # (MAX_NUM_OBJ) as 0/1 with 1 indicating a unique box
    data_dict["vote_label"] = point_votes.astype(np.float32)  # (num_points, 9)
    data_dict["vote_label_mask"] = point_votes_mask.astype(
        np.int64)  # (num_points,)
    data_dict["dataset_idx"] = np.array(idx).astype(
        np.int64)  # object indices from self.scanrefer
    data_dict["pcl_color"] = pcl_color  # per-point colors of the sampled points
    data_dict["ref_box_label"] = ref_box_label.astype(
        np.int64)  # 0/1 reference labels for each object bbox
    data_dict["ref_center_label"] = ref_center_label.astype(np.float32)
    data_dict["ref_heading_class_label"] = np.array(
        int(ref_heading_class_label)).astype(np.int64)
    # NOTE: the residual is truncated to an integer here (kept as-is).
    data_dict["ref_heading_residual_label"] = np.array(
        int(ref_heading_residual_label)).astype(np.int64)
    data_dict["ref_size_class_label"] = np.array(
        int(ref_size_class_label)).astype(np.int64)
    data_dict["ref_size_residual_label"] = ref_size_residual_label.astype(
        np.float32)
    data_dict["ref_box_corner_label"] = ref_box_corner_label.astype(
        np.float64)  # target box corners NOTE type must be double
    data_dict["gt_box_corner_label"] = gt_box_corner_label.astype(
        np.float64)  # all GT box corners NOTE type must be double
    data_dict["gt_box_masks"] = gt_box_masks.astype(
        np.int64)  # valid bbox masks
    data_dict["gt_box_object_ids"] = gt_box_object_ids.astype(
        np.int64)  # valid bbox object ids
    data_dict["object_id"] = np.array(int(object_id)).astype(
        np.int64)  # target object_ids
    data_dict["ann_id"] = np.array(int(ann_id)).astype(np.int64)
    data_dict["object_cat"] = np.array(object_cat).astype(
        np.int64)  # target object classes
    data_dict["unique_multiple"] = np.array(
        self.unique_multiple_lookup[scene_id][str(object_id)][ann_id]).astype(
            np.int64)
    data_dict["load_time"] = time.time() - start

    return data_dict
def get_loss(data_dict,
             config,
             reference=False,
             use_lang_classifier=False,
             use_max_iou=False,
             post_processing=None):
    """Compute the total loss and store losses and metrics in data_dict.

    Args:
        data_dict: dict with network outputs and labels; updated in place
            with the individual losses and statistics
        config: dataset config instance (provides param2obb)
        reference: flag (False/True); when True the reference/language
            losses and the reference accuracy / IoU metrics are computed
        use_lang_classifier: flag; forwarded to the reference loss and
            enables the "lang_acc" statistic
        use_max_iou: flag; forwarded to the reference loss and selects the
            weight of the reference loss (0.1 instead of 0.01)
        post_processing: config dict for parse_predictions, or None to skip
            NMS-based masking

    Returns:
        loss: pytorch scalar tensor
        data_dict: dict
    """
    # Vote loss
    vote_loss = compute_vote_loss(data_dict)
    data_dict['vote_loss'] = vote_loss

    # Obj loss
    objectness_loss, objectness_label, objectness_mask, object_assignment = compute_objectness_loss(
        data_dict)
    data_dict['objectness_loss'] = objectness_loss
    data_dict['objectness_label'] = objectness_label
    data_dict['objectness_mask'] = objectness_mask
    data_dict['object_assignment'] = object_assignment
    total_num_proposal = objectness_label.shape[0] * objectness_label.shape[1]
    data_dict['pos_ratio'] = torch.sum(
        objectness_label.float().cuda()) / float(total_num_proposal)
    data_dict['neg_ratio'] = torch.sum(objectness_mask.float()) / float(
        total_num_proposal) - data_dict['pos_ratio']

    # Box loss and sem cls loss
    center_loss, heading_cls_loss, heading_reg_loss, size_cls_loss, size_reg_loss, sem_cls_loss = compute_box_and_sem_cls_loss(
        data_dict, config)
    data_dict['center_loss'] = center_loss
    data_dict['heading_cls_loss'] = heading_cls_loss
    data_dict['heading_reg_loss'] = heading_reg_loss
    data_dict['size_cls_loss'] = size_cls_loss
    data_dict['size_reg_loss'] = size_reg_loss
    data_dict['sem_cls_loss'] = sem_cls_loss
    box_loss = center_loss + 0.1 * heading_cls_loss + heading_reg_loss + 0.1 * size_cls_loss + size_reg_loss
    data_dict['box_loss'] = box_loss

    if reference:
        # Reference loss
        ref_loss, lang_loss, cluster_preds_scores, cluster_labels = compute_reference_loss(
            data_dict, config, use_lang_classifier, use_max_iou)
        data_dict["ref_loss"] = ref_loss
        data_dict["lang_loss"] = lang_loss

        objectness_preds_batch = torch.argmax(data_dict['objectness_scores'],
                                              2).long()
        objectness_labels_batch = objectness_label.long()

        if post_processing:
            _ = parse_predictions(data_dict, post_processing)
            nms_masks = torch.LongTensor(data_dict['pred_mask']).cuda()

            # construct valid mask
            pred_masks = (nms_masks * objectness_preds_batch == 1).float()
        else:
            # construct valid mask
            pred_masks = (objectness_preds_batch == 1).float()
        label_masks = (objectness_labels_batch == 1).float()

        data_dict["pred_mask"] = pred_masks
        data_dict["label_mask"] = label_masks

        # one-hot encode the best-scoring valid proposal of every sample
        cluster_preds = torch.argmax(cluster_preds_scores * pred_masks,
                                     1).long().unsqueeze(1).repeat(
                                         1, pred_masks.shape[1])
        preds = torch.zeros(pred_masks.shape).cuda()
        preds = preds.scatter_(1, cluster_preds, 1)
        cluster_preds = preds
        cluster_labels = cluster_labels.float()
        cluster_labels *= label_masks

        # compute classification scores
        corrects = torch.sum((cluster_preds == 1) * (cluster_labels == 1),
                             dim=1).float()
        labels = torch.ones(corrects.shape[0]).cuda()
        ref_acc = corrects / (labels + 1e-8)

        # store
        data_dict["ref_acc"] = ref_acc.cpu().numpy().tolist()

        # compute localization metrics
        pred_ref = torch.argmax(
            data_dict['cluster_ref'] * data_dict['pred_mask'],
            1).detach().cpu().numpy()  # (B,)
        pred_center = data_dict['center'].detach().cpu().numpy()  # (B,K,3)
        pred_heading_class = torch.argmax(data_dict['heading_scores'],
                                          -1)  # B,num_proposal
        pred_heading_residual = torch.gather(
            data_dict['heading_residuals'], 2,
            pred_heading_class.unsqueeze(-1))  # B,num_proposal,1
        pred_heading_class = pred_heading_class.detach().cpu().numpy()  # B,num_proposal
        pred_heading_residual = pred_heading_residual.squeeze(
            2).detach().cpu().numpy()  # B,num_proposal
        pred_size_class = torch.argmax(data_dict['size_scores'],
                                       -1)  # B,num_proposal
        pred_size_residual = torch.gather(
            data_dict['size_residuals'], 2,
            pred_size_class.unsqueeze(-1).unsqueeze(-1).repeat(
                1, 1, 1, 3))  # B,num_proposal,1,3
        pred_size_class = pred_size_class.detach().cpu().numpy()
        pred_size_residual = pred_size_residual.squeeze(
            2).detach().cpu().numpy()  # B,num_proposal,3

        gt_ref = torch.argmax(data_dict["ref_box_label"],
                              1).detach().cpu().numpy()
        gt_center = data_dict['center_label'].cpu().numpy()  # (B,MAX_NUM_OBJ,3)
        gt_heading_class = data_dict['heading_class_label'].cpu().numpy()  # B,K2
        gt_heading_residual = data_dict['heading_residual_label'].cpu().numpy()  # B,K2
        gt_size_class = data_dict['size_class_label'].cpu().numpy()  # B,K2
        gt_size_residual = data_dict['size_residual_label'].cpu().numpy()  # B,K2,3

        ious = []
        multiple = []
        for i in range(pred_ref.shape[0]):
            # compute the iou
            pred_ref_idx, gt_ref_idx = pred_ref[i], gt_ref[i]
            pred_obb = config.param2obb(pred_center[i, pred_ref_idx, 0:3],
                                        pred_heading_class[i, pred_ref_idx],
                                        pred_heading_residual[i, pred_ref_idx],
                                        pred_size_class[i, pred_ref_idx],
                                        pred_size_residual[i, pred_ref_idx])
            gt_obb = config.param2obb(gt_center[i, gt_ref_idx, 0:3],
                                      gt_heading_class[i, gt_ref_idx],
                                      gt_heading_residual[i, gt_ref_idx],
                                      gt_size_class[i, gt_ref_idx],
                                      gt_size_residual[i, gt_ref_idx])
            pred_bbox = get_3d_box(pred_obb[3:6], pred_obb[6], pred_obb[0:3])
            gt_bbox = get_3d_box(gt_obb[3:6], gt_obb[6], gt_obb[0:3])
            iou, _ = box3d_iou(pred_bbox, gt_bbox)
            ious.append(iou)

            # construct the multiple mask
            num_bbox = data_dict["num_bbox"][i]
            sem_cls_label = data_dict["sem_cls_label"][i]
            # NOTE: this decrements the padded entries of
            # data_dict["sem_cls_label"] in place (the indexed row aliases
            # the batch tensor), so padding cannot match a real category.
            sem_cls_label[num_bbox:] -= 1
            num_choices = torch.sum(
                data_dict["object_cat"][i] == sem_cls_label)
            if num_choices > 1:
                multiple.append(1)
            else:
                multiple.append(0)

        # store
        ious_arr = np.array(ious)
        data_dict["ref_iou"] = ious
        data_dict["ref_iou_rate_0.25"] = ious_arr[
            ious_arr >= 0.25].shape[0] / ious_arr.shape[0]
        data_dict["ref_iou_rate_0.5"] = ious_arr[
            ious_arr >= 0.5].shape[0] / ious_arr.shape[0]
        data_dict["ref_multiple_mask"] = multiple
    else:
        ref_loss = torch.zeros(1)[0].cuda()
        lang_loss = torch.zeros(1)[0].cuda()

    # Final loss function
    if use_max_iou:
        loss = vote_loss + 0.5 * objectness_loss + box_loss + 0.1 * sem_cls_loss + 0.1 * ref_loss + lang_loss
    else:
        loss = vote_loss + 0.5 * objectness_loss + box_loss + 0.1 * sem_cls_loss + 0.01 * ref_loss + lang_loss

    loss *= 10  # amplify

    data_dict['loss'] = loss

    # --------------------------------------------
    # Some other statistics
    obj_pred_val = torch.argmax(data_dict['objectness_scores'], 2)  # B,K
    obj_acc = torch.sum((obj_pred_val == objectness_label.long()).float() *
                        objectness_mask) / (torch.sum(objectness_mask) + 1e-6)
    data_dict['obj_acc'] = obj_acc
    # precision, recall, f1
    corrects = torch.sum((obj_pred_val == 1) * (objectness_label == 1),
                         dim=1).float()
    preds = torch.sum(obj_pred_val == 1, dim=1).float()
    labels = torch.sum(objectness_label == 1, dim=1).float()
    # precision = TP / predicted positives, recall = TP / actual positives
    precisions = corrects / (preds + 1e-8)
    recalls = corrects / (labels + 1e-8)
    f1s = 2 * precisions * recalls / (precisions + recalls + 1e-8)
    data_dict["objectness_precision"] = precisions.cpu().numpy().tolist()
    data_dict["objectness_recall"] = recalls.cpu().numpy().tolist()
    data_dict["objectness_f1"] = f1s.cpu().numpy().tolist()

    # lang
    if use_lang_classifier:
        data_dict["lang_acc"] = (torch.argmax(
            data_dict['lang_scores'],
            1) == data_dict["object_cat"]).float().mean()
    else:
        data_dict["lang_acc"] = torch.zeros(1)[0].cuda()

    return loss, data_dict
def get_eval(data_dict,
             config,
             reference,
             use_lang_classifier=False,
             use_oracle=False,
             use_cat_rand=False,
             use_best=False,
             post_processing=None):
    """Compute referring/detection evaluation statistics for one batch.

    All results are written into ``data_dict`` (in place) and the same dict
    is returned; no loss is computed here.

    Args:
        data_dict: dict holding the network outputs and the labels of the
            batch (e.g. ``objectness_scores``, ``cluster_ref``,
            ``cluster_labels``, ``ref_box_label``, box labels, ...).
        config: dataset config instance; its ``param2obb`` method is used to
            turn box parameters into oriented boxes.
        reference: flag (False/True); language accuracy is only computed when
            both ``reference`` and ``use_lang_classifier`` are true.
        use_lang_classifier: see ``reference``.
        use_oracle: take the box parameters from the ground-truth labels
            (gathered through ``object_assignment``) instead of the network
            predictions.
        use_cat_rand: pick, per sample, a random proposal among those whose
            assigned ground-truth category equals ``object_cat``.
        use_best: replace ``cluster_ref`` by ``cluster_labels`` before the
            referred proposal is selected.
        post_processing: config dict forwarded to ``parse_predictions``; when
            falsy, only the objectness prediction is used as validity mask.

    Returns:
        data_dict: the input dict, extended with ``ref_acc``, ``ref_iou``,
        ``ref_iou_rate_0.25``, ``ref_iou_rate_0.5``, ``ref_multiple_mask``,
        ``ref_others_mask``, ``pred_bboxes``, ``gt_bboxes``, ``lang_acc``,
        ``obj_acc``, ``sem_acc`` and the selected box parameters.
        ``sem_acc`` is NaN when no proposal is kept by ``pred_mask``.
    """
    objectness_scores = data_dict['objectness_scores']
    # Allocate every helper tensor on the device of the inputs instead of
    # assuming CUDA, so the function also runs on CPU-only setups.
    device = objectness_scores.device

    objectness_preds_batch = torch.argmax(objectness_scores, 2).long()
    objectness_labels_batch = data_dict['objectness_label'].long()

    # construct valid masks
    if post_processing:
        _ = parse_predictions(data_dict, post_processing)
        nms_masks = torch.LongTensor(data_dict['pred_mask']).to(device)
        pred_masks = (nms_masks * objectness_preds_batch == 1).float()
    else:
        pred_masks = (objectness_preds_batch == 1).float()
    label_masks = (objectness_labels_batch == 1).float()

    # one-hot encode the highest scoring valid proposal of every sample
    cluster_preds = torch.argmax(
        data_dict["cluster_ref"] * pred_masks,
        1).long().unsqueeze(1).repeat(1, pred_masks.shape[1])
    preds = torch.zeros(pred_masks.shape, device=pred_masks.device)
    preds = preds.scatter_(1, cluster_preds, 1)
    cluster_preds = preds
    cluster_labels = data_dict["cluster_labels"].float()
    # NOTE: in-place on purpose (kept from the original); when
    # ``cluster_labels`` is already a float tensor this also masks the entry
    # stored in data_dict, which the ``use_best`` path below reads.
    cluster_labels *= label_masks

    # compute classification scores
    corrects = torch.sum((cluster_preds == 1) * (cluster_labels == 1),
                         dim=1).float()
    labels = torch.ones(corrects.shape[0], device=corrects.device)
    ref_acc = corrects / (labels + 1e-8)

    # store
    data_dict["ref_acc"] = ref_acc.cpu().numpy().tolist()

    # compute localization metrics
    # NOTE(review): ``use_best`` is not exclusive with the branches below; its
    # ``cluster_ref`` is subsequently masked by ``pred_masks`` in the ``else``
    # branch (or overwritten when ``use_cat_rand`` is set). Kept as in the
    # original -- confirm this is intended before turning it into ``elif``.
    if use_best:
        pred_ref = torch.argmax(data_dict["cluster_labels"], 1)  # (B,)
        # store the calibrated predictions and masks
        data_dict['cluster_ref'] = data_dict["cluster_labels"]

    if use_cat_rand:
        cluster_preds = torch.zeros(cluster_labels.shape,
                                    device=cluster_labels.device)
        for i in range(cluster_preds.shape[0]):
            num_bbox = data_dict["num_bbox"][i]
            # Work on a copy: decrementing the labels after ``num_bbox`` in
            # place would leak into data_dict and change ``sem_acc`` below.
            sem_cls_label = data_dict["sem_cls_label"][i].clone()
            sem_cls_label[num_bbox:] -= 1
            candidate_masks = torch.gather(
                sem_cls_label == data_dict["object_cat"][i], 0,
                data_dict["object_assignment"][i])
            candidates = torch.arange(
                cluster_labels.shape[1],
                device=candidate_masks.device)[candidate_masks]
            # With no candidate nothing is selected (all-zero row), which is
            # what the original IndexError fallback amounted to.
            if candidates.shape[0] > 0:
                chosen_idx = int(torch.randperm(candidates.shape[0])[0])
                cluster_preds[i, candidates[chosen_idx]] = 1

        pred_ref = torch.argmax(cluster_preds, 1)  # (B,)
        # store the calibrated predictions and masks
        data_dict['cluster_ref'] = cluster_preds
    else:
        pred_ref = torch.argmax(data_dict['cluster_ref'] * pred_masks,
                                1)  # (B,)
        # store the calibrated predictions and masks
        data_dict['cluster_ref'] = data_dict['cluster_ref'] * pred_masks

    if use_oracle:
        assignment = data_dict["object_assignment"]
        assignment_xyz = assignment.unsqueeze(2).repeat(1, 1, 3)

        # ground-truth parameters, re-indexed per proposal
        pred_center = torch.gather(data_dict['center_label'], 1,
                                   assignment_xyz)
        pred_heading_class = torch.gather(data_dict['heading_class_label'], 1,
                                          assignment)
        pred_heading_residual = torch.gather(
            data_dict['heading_residual_label'], 1, assignment).unsqueeze(-1)
        pred_size_class = torch.gather(data_dict['size_class_label'], 1,
                                       assignment)
        pred_size_residual = torch.gather(data_dict['size_residual_label'], 1,
                                          assignment_xyz)
    else:
        pred_center = data_dict['center']  # (B,K,3)
        pred_heading_class = torch.argmax(data_dict['heading_scores'],
                                          -1)  # B,num_proposal
        pred_heading_residual = torch.gather(
            data_dict['heading_residuals'], 2,
            pred_heading_class.unsqueeze(-1))  # B,num_proposal,1
        pred_heading_residual = pred_heading_residual.squeeze(
            2)  # B,num_proposal
        pred_size_class = torch.argmax(data_dict['size_scores'],
                                       -1)  # B,num_proposal
        pred_size_residual = torch.gather(
            data_dict['size_residuals'], 2,
            pred_size_class.unsqueeze(-1).unsqueeze(-1).repeat(
                1, 1, 1, 3))  # B,num_proposal,1,3
        pred_size_residual = pred_size_residual.squeeze(
            2)  # B,num_proposal,3

    # store (note: this replaces the numpy ``pred_mask`` written by
    # parse_predictions with the float tensor mask)
    data_dict["pred_mask"] = pred_masks
    data_dict["label_mask"] = label_masks
    data_dict['pred_center'] = pred_center
    data_dict['pred_heading_class'] = pred_heading_class
    data_dict['pred_heading_residual'] = pred_heading_residual
    data_dict['pred_size_class'] = pred_size_class
    data_dict['pred_size_residual'] = pred_size_residual

    gt_ref = torch.argmax(data_dict["ref_box_label"], 1)
    gt_center = data_dict['center_label']  # (B,MAX_NUM_OBJ,3)
    gt_heading_class = data_dict['heading_class_label']  # B,K2
    gt_heading_residual = data_dict['heading_residual_label']  # B,K2
    gt_size_class = data_dict['size_class_label']  # B,K2
    gt_size_residual = data_dict['size_residual_label']  # B,K2,3

    ious = []
    multiple = []
    others = []
    pred_bboxes = []
    gt_bboxes = []
    for i in range(pred_ref.shape[0]):
        # compute the iou
        pred_ref_idx, gt_ref_idx = pred_ref[i], gt_ref[i]
        pred_obb = config.param2obb(
            pred_center[i, pred_ref_idx, 0:3].detach().cpu().numpy(),
            pred_heading_class[i, pred_ref_idx].detach().cpu().numpy(),
            pred_heading_residual[i, pred_ref_idx].detach().cpu().numpy(),
            pred_size_class[i, pred_ref_idx].detach().cpu().numpy(),
            pred_size_residual[i, pred_ref_idx].detach().cpu().numpy())
        gt_obb = config.param2obb(
            gt_center[i, gt_ref_idx, 0:3].detach().cpu().numpy(),
            gt_heading_class[i, gt_ref_idx].detach().cpu().numpy(),
            gt_heading_residual[i, gt_ref_idx].detach().cpu().numpy(),
            gt_size_class[i, gt_ref_idx].detach().cpu().numpy(),
            gt_size_residual[i, gt_ref_idx].detach().cpu().numpy())
        pred_bbox = get_3d_box(pred_obb[3:6], pred_obb[6], pred_obb[0:3])
        gt_bbox = get_3d_box(gt_obb[3:6], gt_obb[6], gt_obb[0:3])
        iou = eval_ref_one_sample(pred_bbox, gt_bbox)
        ious.append(iou)

        # NOTE: get_3d_box() will return problematic bboxes, so the boxes
        # that are stored are rebuilt from center and size only
        pred_bbox = construct_bbox_corners(pred_obb[0:3], pred_obb[3:6])
        gt_bbox = construct_bbox_corners(gt_obb[0:3], gt_obb[3:6])
        pred_bboxes.append(pred_bbox)
        gt_bboxes.append(gt_bbox)

        # construct the multiple mask
        multiple.append(data_dict["unique_multiple"][i].item())

        # construct the others mask (target category index 17)
        flag = 1 if data_dict["object_cat"][i] == 17 else 0
        others.append(flag)

    # lang
    if reference and use_lang_classifier:
        data_dict["lang_acc"] = (torch.argmax(
            data_dict['lang_scores'],
            1) == data_dict["object_cat"]).float().mean()
    else:
        data_dict["lang_acc"] = torch.zeros(1, device=device)[0]

    # store
    ious_np = np.array(ious)
    data_dict["ref_iou"] = ious
    data_dict["ref_iou_rate_0.25"] = ious_np[
        ious_np >= 0.25].shape[0] / ious_np.shape[0]
    data_dict["ref_iou_rate_0.5"] = ious_np[
        ious_np >= 0.5].shape[0] / ious_np.shape[0]
    data_dict["ref_multiple_mask"] = multiple
    data_dict["ref_others_mask"] = others
    data_dict["pred_bboxes"] = pred_bboxes
    data_dict["gt_bboxes"] = gt_bboxes

    # --------------------------------------------
    # Some other statistics
    obj_pred_val = torch.argmax(data_dict['objectness_scores'], 2)  # B,K
    obj_acc = torch.sum(
        (obj_pred_val == data_dict['objectness_label'].long()).float() *
        data_dict['objectness_mask']) / (
            torch.sum(data_dict['objectness_mask']) + 1e-6)
    data_dict['obj_acc'] = obj_acc

    # detection semantic classification
    sem_cls_label = torch.gather(
        data_dict['sem_cls_label'], 1,
        data_dict['object_assignment'])  # select (B,K) from (B,K2)
    sem_cls_pred = data_dict['sem_cls_scores'].argmax(-1)  # (B,K)
    sem_match = (sem_cls_label == sem_cls_pred).float()
    data_dict["sem_acc"] = (sem_match * data_dict["pred_mask"]
                            ).sum() / data_dict["pred_mask"].sum()

    return data_dict
def parse_predictions(end_points, config_dict):
    """Decode the network output into boxes and run non-maximum suppression.

    Args:
        end_points: dict
            {point_clouds, center, heading_scores, heading_residuals,
            size_scores, size_residuals, sem_cls_scores, objectness_scores}
        config_dict: dict
            {dataset_config, remove_empty_box, use_3d_nms, cls_nms, nms_iou,
            use_old_type_nms, conf_thresh, per_class_proposal}

    Returns:
        batch_pred_map_cls: a list of len == batch size (BS)
            [pred_list_i], i = 0, 1, ..., BS-1
            where pred_list_i = [(pred_sem_cls, box_params, box_score)_j]
            where j = 0, ..., num of valid detections - 1 from sample input i

    Side effects:
        Writes ``end_points['pred_mask']`` (numpy array, 1 for boxes kept by
        NMS) and ``end_points['batch_pred_map_cls']``.
    """
    centers = end_points['center']  # B,num_proposal,3

    # keep only the residual that belongs to the arg-max heading / size bin
    heading_cls = torch.argmax(end_points['heading_scores'],
                               -1)  # B,num_proposal
    heading_res = torch.gather(end_points['heading_residuals'], 2,
                               heading_cls.unsqueeze(-1))  # B,num_proposal,1
    heading_res.squeeze_(2)
    size_cls = torch.argmax(end_points['size_scores'], -1)  # B,num_proposal
    size_res = torch.gather(
        end_points['size_residuals'], 2,
        size_cls.unsqueeze(-1).unsqueeze(-1).repeat(1, 1, 1,
                                                    3))  # B,num_proposal,1,3
    size_res.squeeze_(2)

    pred_sem_cls = torch.argmax(end_points['sem_cls_scores'],
                                -1)  # B,num_proposal
    sem_cls_probs = softmax(
        end_points['sem_cls_scores'].detach().cpu().numpy())

    bsize, K = centers.shape[0], centers.shape[1]  # K == num_proposal

    # Eight corners per proposal. The predicted centers are used as they are;
    # no axis flip is applied before calling get_3d_box.
    centers_np = centers.detach().cpu().numpy()
    corners = np.zeros((bsize, K, 8, 3))
    for b, k in np.ndindex(bsize, K):
        angle = config_dict['dataset_config'].class2angle(
            heading_cls[b, k].detach().cpu().numpy(),
            heading_res[b, k].detach().cpu().numpy())
        extent = config_dict['dataset_config'].class2size(
            int(size_cls[b, k].detach().cpu().numpy()),
            size_res[b, k].detach().cpu().numpy())
        corners[b, k] = get_3d_box(extent, angle, centers_np[b, k, :])

    # Optionally drop boxes that contain fewer than five input points.
    nonempty_box_mask = np.ones((bsize, K))
    if config_dict['remove_empty_box']:
        batch_pc = end_points['point_clouds'].cpu().numpy()[:, :,
                                                            0:3]  # B,N,3
        for b, k in np.ndindex(bsize, K):
            pc_in_box, _ = extract_pc_in_box3d(batch_pc[b, :, :],
                                               corners[b, k, :, :])
            if len(pc_in_box) < 5:
                nonempty_box_mask[b, k] = 0

    obj_prob = softmax(
        end_points['objectness_scores'].detach().cpu().numpy())[:, :,
                                                                1]  # (B,K)

    # Select the NMS flavour once: 2D (axes 0 and 2), 3D, or 3D restricted to
    # boxes of the same predicted class.
    if not config_dict['use_3d_nms']:
        nms_mode, nms_fn = '2d', nms_2d_faster
    elif config_dict['use_3d_nms'] and (not config_dict['cls_nms']):
        nms_mode, nms_fn = '3d', nms_3d_faster
    elif config_dict['use_3d_nms'] and config_dict['cls_nms']:
        nms_mode, nms_fn = '3d_cls', nms_3d_faster_samecls

    pred_mask = np.zeros((bsize, K))
    for b in range(bsize):
        # axis-aligned extent of every box of this sample, each (K,3)
        lower = corners[b].min(axis=1)
        upper = corners[b].max(axis=1)

        if nms_mode == '2d':
            # columns: min0, min2, max0, max2, objectness probability
            boxes = np.zeros((K, 5))
            boxes[:, 0] = lower[:, 0]
            boxes[:, 1] = lower[:, 2]
            boxes[:, 2] = upper[:, 0]
            boxes[:, 3] = upper[:, 2]
            boxes[:, 4] = obj_prob[b]
        else:
            # columns: min xyz, max xyz, objectness probability[, class]
            boxes = np.zeros((K, 8 if nms_mode == '3d_cls' else 7))
            boxes[:, 0:3] = lower
            boxes[:, 3:6] = upper
            boxes[:, 6] = obj_prob[b]
            if nms_mode == '3d_cls':
                # only suppress if the two boxes are of the same class!!
                boxes[:, 7] = pred_sem_cls[b].detach().cpu().numpy()

        is_nonempty = nonempty_box_mask[b, :] == 1
        nonempty_box_inds = np.where(is_nonempty)[0]
        pick = nms_fn(boxes[is_nonempty, :], config_dict['nms_iou'],
                      config_dict['use_old_type_nms'])
        assert (len(pick) > 0)
        pred_mask[b, nonempty_box_inds[pick]] = 1
    end_points['pred_mask'] = pred_mask

    # a list (len: batch_size) of list (len: num of predictions per sample)
    # of tuples of pred_cls, pred_box and conf (0-1)
    batch_pred_map_cls = []
    for b in range(bsize):
        kept = [
            k for k in range(centers.shape[1]) if pred_mask[b, k] == 1
            and obj_prob[b, k] > config_dict['conf_thresh']
        ]
        if config_dict['per_class_proposal']:
            # every kept box is emitted once per class, scored by
            # class probability * objectness probability
            detections = [(c, corners[b, k],
                           sem_cls_probs[b, k, c] * obj_prob[b, k])
                          for c in range(
                              config_dict['dataset_config'].num_class)
                          for k in kept]
        else:
            detections = [(pred_sem_cls[b, k].item(), corners[b, k],
                           obj_prob[b, k]) for k in kept]
        batch_pred_map_cls.append(detections)
    end_points['batch_pred_map_cls'] = batch_pred_map_cls

    return batch_pred_map_cls
def get_pseudo_labels(end_points, ema_end_points, pred_center, pred_sem_cls,
                      pred_objectness, pred_heading_scores,
                      pred_heading_residuals, pred_size_scores,
                      pred_size_residuals, pred_vote_xyz, config_dict):
    """Turn predictions into pseudo ground-truth labels.

    A proposal becomes a pseudo label when its objectness probability, its
    maximal class probability and its predicted IoU all exceed the thresholds
    in ``config_dict``; at most ``MAX_NUM_OBJ`` proposals per sample are kept
    (those with the largest ``objectness * class`` score). With
    ``config_dict['use_lhs']`` the kept proposals are additionally filtered by
    ``lhs_3d_faster_samecls``.

    Args:
        end_points: dict; ``supervised_mask`` is read to find the unlabeled
            samples, and statistics are written back into it
            (``pseudo_gt_ratio`` and, with ``view_stats``, several
            ``unlabeled_*`` / ``final_*`` entries).
        ema_end_points: dict providing ``iou_scores`` (logits).
        pred_center: (B, K, 3) predicted centers.
        pred_sem_cls: (B, K, num_class) class logits.
        pred_objectness: (B, K, 2) objectness logits; index 1 is positive.
        pred_heading_scores: (B, K, num_heading_bin) heading logits.
        pred_heading_residuals: (B, K, num_heading_bin) heading residuals.
        pred_size_scores: (B, K, num_size_cluster) size logits.
        pred_size_residuals: (B, K, num_size_cluster, 3) size residuals.
        pred_vote_xyz: (B, K, 3) vote positions.
        config_dict: dict {obj_threshold, cls_threshold, iou_threshold,
            view_stats, use_lhs, and for LHS: dataset_config, nms_iou,
            use_old_type_nms}.

    Returns:
        Tuple ``(label_mask, center_label, sem_cls_label, heading_label,
        heading_residual_label, size_label, size_residual_label,
        false_center_label, iou_label)``; every entry has ``MAX_NUM_OBJ``
        slots per sample. Rejected slots of ``center_label`` (and slots of
        ``false_center_label`` whose negative objectness is not above 0.9)
        are filled with -1000.

    Note:
        Assumes K >= MAX_NUM_OBJ, as the original code did.
    """
    # Follow the device of the inputs instead of hard-coding CUDA.
    device = pred_center.device
    batch_size = pred_center.shape[0]
    label_mask = torch.zeros((batch_size, MAX_NUM_OBJ),
                             dtype=torch.long,
                             device=device)

    # obj score threshold
    pred_objectness = nn.Softmax(dim=2)(pred_objectness)
    # the second element is positive score
    pos_obj = pred_objectness[:, :, 1]
    neg_obj = pred_objectness[:, :, 0]
    objectness_mask = pos_obj > config_dict['obj_threshold']
    neg_objectness_mask = neg_obj > 0.9  # deprecated

    # cls score threshold
    pred_sem_cls = nn.Softmax(dim=2)(pred_sem_cls)
    max_cls, argmax_cls = torch.max(pred_sem_cls, dim=2)
    cls_mask = max_cls > config_dict['cls_threshold']

    supervised_mask = end_points['supervised_mask']
    unsupervised_inds = torch.nonzero(1 - supervised_mask).squeeze(1).long()

    iou_pred = nn.Sigmoid()(ema_end_points['iou_scores'][unsupervised_inds,
                                                         ...])
    if iou_pred.shape[2] > 1:
        # class-wise IoU head: use pred semantic labels
        iou_pred = torch.gather(iou_pred, 2,
                                argmax_cls.unsqueeze(-1)).squeeze(-1)
    else:
        iou_pred = iou_pred.squeeze(-1)

    if config_dict['view_stats']:
        # GT IoU labels (cheating) only for analyzing performance
        iou_labels, objectness_label, object_assignment = compute_iou_labels(
            end_points, unsupervised_inds, pred_vote_xyz, pred_center,
            pred_sem_cls, pred_objectness, pred_heading_scores,
            pred_heading_residuals, pred_size_scores, pred_size_residuals,
            config_dict)
        end_points['unlabeled_iou_labels'] = iou_labels
        end_points['unlabeled_pred_iou_value'] = torch.sum(
            iou_labels) / iou_labels.view(-1).shape[0]
        end_points['unlabeled_pred_iou_obj_value'] = torch.sum(
            iou_labels * objectness_label) / (torch.sum(objectness_label) +
                                              1e-6)

        iou_acc = torch.abs(iou_pred - iou_labels)
        end_points['unlabeled_iou_acc'] = torch.sum(iou_acc) / iou_acc.view(
            -1).shape[0]
        obj_true_num = (torch.sum(objectness_label) + 1e-6)
        end_points['unlabeled_iou_obj_acc'] = torch.sum(
            iou_acc * objectness_label) / obj_true_num

        # for coverage calculation, associates every gt with pseudo labels
        gt_to_pseudo_iou = compute_iou_labels(end_points,
                                              unsupervised_inds,
                                              pred_vote_xyz,
                                              pred_center,
                                              pred_sem_cls,
                                              pred_objectness,
                                              pred_heading_scores,
                                              pred_heading_residuals,
                                              pred_size_scores,
                                              pred_size_residuals,
                                              config_dict,
                                              reverse=True)

    iou_threshold = config_dict['iou_threshold']
    iou_mask = iou_pred > iou_threshold
    before_iou_mask = torch.logical_and(cls_mask, objectness_mask)
    final_mask = torch.logical_and(before_iou_mask, iou_mask)

    # we only keep MAX_NUM_OBJ predictions
    # however, after filtering the number can still exceed this
    # so we keep the ones with larger pos_obj * max_cls
    inds = torch.argsort(pos_obj * max_cls * final_mask,
                         dim=1,
                         descending=True)
    inds = inds[:, :MAX_NUM_OBJ].long()
    inds_xyz = inds.unsqueeze(-1).expand(-1, -1, 3)
    final_mask_sorted = torch.gather(final_mask, dim=1, index=inds)
    end_points['pseudo_gt_ratio'] = torch.sum(
        final_mask_sorted).float() / final_mask_sorted.view(-1).shape[0]

    neg_objectness_mask = torch.gather(neg_objectness_mask, dim=1, index=inds)

    argmax_size = torch.max(pred_size_scores, dim=2)[1]
    size_inds = argmax_size.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, 3)
    argmax_heading = torch.max(pred_heading_scores, dim=2)[1]
    heading_inds = argmax_heading.unsqueeze(-1)

    # now only one class residuals: (B,K) and (B,K,3)
    pred_heading_residuals = torch.gather(pred_heading_residuals,
                                          dim=2,
                                          index=heading_inds).squeeze(2)
    pred_size_residuals = torch.gather(pred_size_residuals,
                                       dim=2,
                                       index=size_inds).squeeze(2)

    if config_dict['use_lhs']:
        pred_center_ = torch.gather(pred_center, dim=1, index=inds_xyz)
        pred_heading_class_ = torch.gather(argmax_heading, dim=1, index=inds)
        pred_heading_residual_ = torch.gather(pred_heading_residuals,
                                              dim=1,
                                              index=inds)
        pred_size_class_ = torch.gather(argmax_size, dim=1, index=inds)
        pred_size_residual_ = torch.gather(pred_size_residuals,
                                           dim=1,
                                           index=inds_xyz)

        # corners of the kept proposals, built in the flipped (camera) frame
        num_kept = pred_center_.shape[1]
        pred_corners_3d_upright_camera = np.zeros(
            (batch_size, num_kept, 8, 3), dtype=np.float32)
        pred_center_upright_camera = flip_axis_to_camera(
            pred_center_.detach().cpu().numpy())
        for i, j in np.ndindex(batch_size, num_kept):
            heading_angle = config_dict['dataset_config'].class2angle(
                pred_heading_class_[i, j].detach().cpu().numpy(),
                pred_heading_residual_[i, j].detach().cpu().numpy())
            box_size = config_dict['dataset_config'].class2size(
                int(pred_size_class_[i, j].detach().cpu().numpy()),
                pred_size_residual_[i, j].detach().cpu().numpy())
            pred_corners_3d_upright_camera[i, j] = get_3d_box(
                box_size, heading_angle, pred_center_upright_camera[i, j, :])

        pos_obj_numpy = torch.gather(pos_obj, dim=1,
                                     index=inds).detach().cpu().numpy()
        pred_sem_cls_numpy = torch.gather(argmax_cls, dim=1,
                                          index=inds).detach().cpu().numpy()
        iou_numpy = torch.gather(iou_pred, dim=1,
                                 index=inds).detach().cpu().numpy()

        # pred_mask marks the slots to REMOVE: it starts as all ones and the
        # boxes picked by LHS are cleared.
        pred_mask = np.ones((batch_size, MAX_NUM_OBJ))
        for i in range(batch_size):
            # columns: min xyz, max xyz, objectness * IoU score, class
            boxes_3d_with_prob = np.zeros((MAX_NUM_OBJ, 8))
            boxes_3d_with_prob[:, 0:3] = pred_corners_3d_upright_camera[
                i].min(axis=1)
            boxes_3d_with_prob[:, 3:6] = pred_corners_3d_upright_camera[
                i].max(axis=1)
            boxes_3d_with_prob[:, 6] = pos_obj_numpy[i] * iou_numpy[i]
            # only suppress if the two boxes are of the same class!!
            boxes_3d_with_prob[:, 7] = pred_sem_cls_numpy[i]

            # here we do not consider orientation, in accordance to test time nms
            pick = lhs_3d_faster_samecls(boxes_3d_with_prob,
                                         config_dict['nms_iou'],
                                         config_dict['use_old_type_nms'])
            assert (len(pick) > 0)
            pred_mask[i, pick] = 0
        final_mask_sorted[torch.from_numpy(pred_mask).bool().to(
            final_mask_sorted.device)] = 0

    if config_dict['view_stats']:
        # ground truth coverage calculation
        selected_objectness_label = torch.gather(objectness_label,
                                                 dim=1,
                                                 index=inds)
        selected_object_assignment = torch.gather(object_assignment,
                                                  dim=1,
                                                  index=inds)
        gt_count = end_points['box_label_mask'].sum()
        picked_iou_labels = torch.gather(iou_labels, dim=1, index=inds)

        end_points['final_iou_avg_value'] = torch.sum(
            picked_iou_labels *
            final_mask_sorted).float() / (torch.sum(final_mask_sorted) + 1e-6)
        end_points['final_iou_avg_obj_value'] = torch.sum(
            picked_iou_labels * final_mask_sorted *
            selected_objectness_label).float() / (torch.sum(
                final_mask_sorted * selected_objectness_label) + 1e-6)

        selected_cls_pred = torch.gather(argmax_cls, dim=1, index=inds)
        selected_cls_gt = torch.gather(
            end_points['sem_cls_label'][unsupervised_inds, ...],
            dim=1,
            index=selected_object_assignment)
        correct_cls = selected_cls_pred == selected_cls_gt
        end_points['final_cls_value'] = torch.sum(
            correct_cls *
            final_mask_sorted).float() / (torch.sum(final_mask_sorted) + 1e-6)
        end_points['final_cls_obj_value'] = torch.sum(
            correct_cls * final_mask_sorted *
            selected_objectness_label).float() / (torch.sum(
                final_mask_sorted * selected_objectness_label) + 1e-6)

        # Broadcast the kept indices over the ground-truth axis; its size is
        # read from the tensor rather than hard-coded.
        num_gt = gt_to_pseudo_iou.shape[1]
        gt_to_pseudo_iou = torch.gather(gt_to_pseudo_iou,
                                        dim=2,
                                        index=inds.unsqueeze(1).expand(
                                            -1, num_gt, -1))
        gt_to_pseudo_iou = gt_to_pseudo_iou * final_mask_sorted.unsqueeze(1)
        gt_to_pseudo_iou = gt_to_pseudo_iou.max(dim=2)[0]
        range_25 = (gt_to_pseudo_iou > 0.25).float()
        range_5 = (gt_to_pseudo_iou > 0.5).float()
        end_points['final_coverage_0.25_value'] = torch.sum(
            range_25) / gt_count
        end_points['final_coverage_0.5_value'] = torch.sum(range_5) / gt_count

    # assemble the label tensors for the kept slots
    label_mask[final_mask_sorted] = 1
    heading_label = torch.gather(argmax_heading, dim=1, index=inds)
    heading_residual_label = torch.gather(pred_heading_residuals,
                                          dim=1,
                                          index=inds)
    size_label = torch.gather(argmax_size, dim=1, index=inds)
    size_residual_label = torch.gather(pred_size_residuals,
                                       dim=1,
                                       index=inds_xyz)
    sem_cls_label = torch.gather(argmax_cls, dim=1, index=inds)
    center_label = torch.gather(pred_center, dim=1, index=inds_xyz)
    # push rejected slots far away so they cannot match anything
    center_label[(1 - label_mask).unsqueeze(-1).expand(-1, -1,
                                                       3).bool()] = -1000
    false_center_label = torch.gather(pred_vote_xyz, dim=1, index=inds_xyz)
    false_center_label[torch.logical_not(neg_objectness_mask).unsqueeze(
        -1).expand(-1, -1, 3).bool()] = -1000

    iou_label = torch.gather(iou_pred, dim=1, index=inds)

    return label_mask, center_label, sem_cls_label, heading_label, heading_residual_label, size_label, size_residual_label, false_center_label, iou_label