def __getitem__(self, idx):
    """Build the sample dictionary for annotation ``idx``.

    Reads the annotation from ``self.scanrefer`` and the cached scene arrays
    from ``self.scene_data``, optionally pastes objects borrowed from other
    annotations into the scene (``self.cp_aug``, never on the test split),
    subsamples the points to ``self.num_points`` and assembles the box, vote
    and reference-target labels.

    Args:
        idx: index into ``self.scanrefer``.

    Returns:
        dict: label name -> numpy array, plus ``"load_time"`` (seconds spent
        in this call).
    """
    start = time.time()

    entry = self.scanrefer[idx]
    scene_id = entry["scene_id"]
    object_id = int(entry["object_id"])
    object_name = " ".join(entry["object_name"].split("_"))
    ann_id = entry["ann_id"]

    # get language features
    lang_feat = self.lang[scene_id][str(object_id)][ann_id]
    lang_len = min(len(entry["token"]), CONF.TRAIN.MAX_DES_LEN)

    # get pc
    mesh_vertices = self.scene_data[scene_id]["mesh_vertices"]
    instance_labels = self.scene_data[scene_id]["instance_labels"]
    semantic_labels = self.scene_data[scene_id]["semantic_labels"]
    instance_bboxes = self.scene_data[scene_id]["instance_bboxes"]

    point_cloud, pcl_color = self.process_pc(mesh_vertices, scene_id)

    if self.cp_aug and self.split != 'test':
        # Pad the scene up to 32 boxes with objects taken from other annotations.
        num_obj_add = 32 - instance_bboxes.shape[0]
        if len(self.scanrefer) < 2:
            # There is no other annotation to borrow from; the rejection
            # loop below would never terminate.
            num_obj_add = 0

        for _ in range(num_obj_add):
            idx_other = random.randint(0, len(self.scanrefer) - 1)
            while idx_other == idx:
                idx_other = random.randint(0, len(self.scanrefer) - 1)

            # idx_other is drawn inside the valid range, so plain indexing is safe.
            other_scene_id = self.scanrefer[idx_other]["scene_id"]
            other_object_id = int(self.scanrefer[idx_other]["object_id"])

            other_mesh_vertices = self.scene_data[other_scene_id]["mesh_vertices"]
            other_instance_labels = self.scene_data[other_scene_id]["instance_labels"]
            other_semantic_labels = self.scene_data[other_scene_id]["semantic_labels"]
            other_instance_bboxes = self.scene_data[other_scene_id]["instance_bboxes"]
            # The donor vertices must be processed with the donor's scene id.
            other_point_cloud, other_pcl_color = self.process_pc(
                other_mesh_vertices, other_scene_id)

            # Random pick object and append to the current scene.
            # NOTE(review): the points are chosen by a *random* instance label
            # while the box below is the one of `other_object_id`; confirm
            # that this pairing is intended.
            target_obj_label = random.randint(0, np.max(other_instance_labels))
            picked_labels, test_choices, _ = choose_label_pc(
                other_instance_labels, target_obj_label, 200, return_choices=True)

            # Find the right box in the other scene (first match wins).
            matches = np.flatnonzero(other_instance_bboxes[:, -1] == other_object_id)
            if matches.size == 0:
                # Skip the paste entirely so that points, colours, semantic
                # labels and instance labels stay the same length.
                print("Warning: Did not add a box from another scene, something wrong")
                continue

            # Fresh id: one more than the largest value anywhere in the box array.
            new_id = np.max(instance_bboxes) + 1
            select = other_instance_bboxes[matches[0]].copy()
            select[-1] = new_id
            pasted_labels = np.full(picked_labels.shape, new_id, dtype=np.float64)

            point_cloud = np.concatenate(
                (point_cloud, other_point_cloud[test_choices]), axis=0)
            semantic_labels = np.concatenate(
                (semantic_labels, other_semantic_labels[test_choices]), axis=0)
            pcl_color = np.concatenate(
                (pcl_color, other_pcl_color[test_choices]), axis=0)
            instance_bboxes = np.concatenate(
                (instance_bboxes, np.atleast_2d(select)), axis=0)
            instance_labels = np.concatenate(
                (instance_labels, pasted_labels), axis=0)

    point_cloud, choices = random_sampling(point_cloud, self.num_points, return_choices=True)
    instance_labels = instance_labels[choices]
    semantic_labels = semantic_labels[choices]
    pcl_color = pcl_color[choices]

    # ------------------------------- LABELS ------------------------------
    target_bboxes = np.zeros((MAX_NUM_OBJ, 6))
    target_bboxes_mask = np.zeros((MAX_NUM_OBJ))
    angle_classes = np.zeros((MAX_NUM_OBJ,))
    angle_residuals = np.zeros((MAX_NUM_OBJ,))
    size_classes = np.zeros((MAX_NUM_OBJ,))
    size_residuals = np.zeros((MAX_NUM_OBJ, 3))

    ref_box_label = np.zeros(MAX_NUM_OBJ)  # bbox label for reference target
    ref_center_label = np.zeros(3)  # bbox center for reference target
    ref_heading_class_label = 0
    ref_heading_residual_label = 0
    ref_size_class_label = 0
    ref_size_residual_label = np.zeros(3)  # bbox size residual for reference target

    if self.split != "test":
        num_bbox = min(instance_bboxes.shape[0], MAX_NUM_OBJ)
        target_bboxes_mask[0:num_bbox] = 1
        target_bboxes[0:num_bbox, :] = instance_bboxes[:MAX_NUM_OBJ, 0:6]

        point_votes = np.zeros([self.num_points, 3])
        point_votes_mask = np.zeros(self.num_points)

        # ------------------------------- DATA AUGMENTATION ------------------------------
        if self.augment and not self.debug:
            if np.random.random() > 0.5:
                # Flipping along the YZ plane
                point_cloud[:, 0] = -1 * point_cloud[:, 0]
                target_bboxes[:, 0] = -1 * target_bboxes[:, 0]

            if np.random.random() > 0.5:
                # Flipping along the XZ plane
                point_cloud[:, 1] = -1 * point_cloud[:, 1]
                target_bboxes[:, 1] = -1 * target_bboxes[:, 1]

            # Small rotations about X, Y and then the up/Z axis.
            for axis, make_rot in (("x", rotx), ("y", roty), ("z", rotz)):
                rot_angle = (np.random.random() * np.pi / 18) - np.pi / 36  # -5 ~ +5 degree
                rot_mat = make_rot(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3], np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(target_bboxes, rot_mat, axis)

            # Translation
            point_cloud, target_bboxes = self._translate(point_cloud, target_bboxes)

        # compute votes *AFTER* augmentation
        # Note: since there's no map between bbox instance labels and
        # pc instance_labels (it had been filtered in the data preparation
        # step) we compute the instance centre from the points sharing the
        # same instance label.
        for i_instance in np.unique(instance_labels):
            # find all points belonging to that instance
            ind = np.where(instance_labels == i_instance)[0]
            # only instances whose semantic label is a known class vote
            if semantic_labels[ind[0]] in DC.nyu40ids:
                x = point_cloud[ind, :3]
                center = 0.5 * (x.min(0) + x.max(0))
                point_votes[ind, :] = center - x
                point_votes_mask[ind] = 1.0
        point_votes = np.tile(point_votes, (1, 3))  # make 3 votes identical

        class_ind = [DC.nyu40id2class[int(x)] for x in instance_bboxes[:num_bbox, -2]]
        # NOTE: set size class as semantic class. Consider use size2class.
        size_classes[0:num_bbox] = class_ind
        size_residuals[0:num_bbox, :] = target_bboxes[0:num_bbox, 3:6] - DC.mean_size_arr[class_ind, :]

        # construct the reference target label for each bbox
        ref_box_label = np.zeros(MAX_NUM_OBJ)
        for i, gt_id in enumerate(instance_bboxes[:num_bbox, -1]):
            if gt_id == object_id:
                ref_box_label[i] = 1
                ref_center_label = target_bboxes[i, 0:3]
                ref_heading_class_label = angle_classes[i]
                ref_heading_residual_label = angle_residuals[i]
                ref_size_class_label = size_classes[i]
                ref_size_residual_label = size_residuals[i]
    else:
        num_bbox = 1
        point_votes = np.zeros([self.num_points, 9])  # make 3 votes identical
        point_votes_mask = np.zeros(self.num_points)

    target_bboxes_semcls = np.zeros((MAX_NUM_OBJ))
    try:
        target_bboxes_semcls[0:num_bbox] = [
            DC.nyu40id2class[int(x)] for x in instance_bboxes[:, -2][0:num_bbox]
        ]
    except KeyError:
        # Best effort: a semantic id missing from DC.nyu40id2class leaves the
        # semantic class labels at zero.
        pass

    object_cat = self.raw2label[object_name] if object_name in self.raw2label else 17

    data_dict = {}
    data_dict["point_clouds"] = point_cloud.astype(np.float32)  # point cloud data including features
    data_dict["lang_feat"] = lang_feat.astype(np.float32)  # language feature vectors
    data_dict["lang_len"] = np.array(lang_len).astype(np.int64)  # length of each description
    data_dict["center_label"] = target_bboxes.astype(np.float32)[:, 0:3]  # (MAX_NUM_OBJ, 3) for GT box center XYZ
    data_dict["heading_class_label"] = angle_classes.astype(np.int64)  # (MAX_NUM_OBJ,)
    data_dict["heading_residual_label"] = angle_residuals.astype(np.float32)  # (MAX_NUM_OBJ,)
    data_dict["size_class_label"] = size_classes.astype(np.int64)  # (MAX_NUM_OBJ,)
    data_dict["size_residual_label"] = size_residuals.astype(np.float32)  # (MAX_NUM_OBJ, 3)
    data_dict["num_bbox"] = np.array(num_bbox).astype(np.int64)
    data_dict["sem_cls_label"] = target_bboxes_semcls.astype(np.int64)  # (MAX_NUM_OBJ,) semantic class index
    data_dict["box_label_mask"] = target_bboxes_mask.astype(np.float32)  # (MAX_NUM_OBJ) as 0/1 with 1 indicating a unique box
    data_dict["vote_label"] = point_votes.astype(np.float32)
    data_dict["vote_label_mask"] = point_votes_mask.astype(np.int64)
    data_dict["scan_idx"] = np.array(idx).astype(np.int64)
    data_dict["pcl_color"] = pcl_color
    data_dict["ref_box_label"] = ref_box_label.astype(np.int64)  # 0/1 reference labels for each object bbox
    data_dict["ref_center_label"] = ref_center_label.astype(np.float32)
    data_dict["ref_heading_class_label"] = np.array(int(ref_heading_class_label)).astype(np.int64)
    data_dict["ref_heading_residual_label"] = np.array(int(ref_heading_residual_label)).astype(np.int64)
    data_dict["ref_size_class_label"] = np.array(int(ref_size_class_label)).astype(np.int64)
    data_dict["ref_size_residual_label"] = ref_size_residual_label.astype(np.float32)
    data_dict["object_id"] = np.array(int(object_id)).astype(np.int64)
    data_dict["ann_id"] = np.array(int(ann_id)).astype(np.int64)
    data_dict["object_cat"] = np.array(object_cat).astype(np.int64)
    data_dict["unique_multiple"] = np.array(
        self.unique_multiple_lookup[scene_id][str(object_id)][ann_id]).astype(np.int64)
    data_dict["load_time"] = time.time() - start

    return data_dict
def __getitem__(self, idx):
    """Build the sample dictionary for annotation ``idx``.

    Reads the annotation from ``self.scanrefer`` and the cached scene arrays
    from ``self.scene_data``, assembles the point features selected by the
    ``use_color`` / ``use_normal`` / ``use_multiview`` / ``use_height`` flags,
    subsamples to ``self.num_points`` and builds the box, vote, reference and
    box-corner labels.

    Args:
        idx: index into ``self.scanrefer``.

    Returns:
        dict: label name -> numpy array, plus ``"load_time"`` (seconds spent
        in this call).
    """
    start = time.time()

    entry = self.scanrefer[idx]
    scene_id = entry["scene_id"]
    object_id = int(entry["object_id"])
    object_name = " ".join(entry["object_name"].split("_"))
    ann_id = entry["ann_id"]

    # get language features
    lang_feat = self.lang[scene_id][str(object_id)][ann_id]
    # Token count plus two extra positions, capped at MAX_DES_LEN + 2
    # (presumably start/end markers -- TODO confirm against self.lang).
    lang_len = min(len(entry["token"]) + 2, CONF.TRAIN.MAX_DES_LEN + 2)

    # get pc
    mesh_vertices = self.scene_data[scene_id]["mesh_vertices"]
    instance_labels = self.scene_data[scene_id]["instance_labels"]
    semantic_labels = self.scene_data[scene_id]["semantic_labels"]
    instance_bboxes = self.scene_data[scene_id]["instance_bboxes"]

    if not self.use_color:
        point_cloud = mesh_vertices[:, 0:3]  # do not use color for now
        pcl_color = mesh_vertices[:, 3:6]
    else:
        # Copy before normalising: the slice is a view into the cached scene
        # array, so writing through it would re-normalise the stored colours
        # every time this scene is requested.
        point_cloud = mesh_vertices[:, 0:6].copy()
        point_cloud[:, 3:6] = (point_cloud[:, 3:6] - MEAN_COLOR_RGB) / 256.0
        pcl_color = point_cloud[:, 3:6]

    if self.use_normal:
        normals = mesh_vertices[:, 6:9]
        point_cloud = np.concatenate([point_cloud, normals], 1)

    if self.use_multiview:
        # load multiview database lazily on first use
        if self.multiview_data == {}:
            self.multiview_data = h5py.File(MULTIVIEW_DATA, "r", libver="latest")

        multiview = self.multiview_data[scene_id]
        point_cloud = np.concatenate([point_cloud, multiview], 1)

    if self.use_height:
        floor_height = np.percentile(point_cloud[:, 2], 0.99)
        height = point_cloud[:, 2] - floor_height
        point_cloud = np.concatenate([point_cloud, np.expand_dims(height, 1)], 1)

    point_cloud, choices = random_sampling(point_cloud, self.num_points, return_choices=True)
    instance_labels = instance_labels[choices]
    semantic_labels = semantic_labels[choices]
    pcl_color = pcl_color[choices]

    # ------------------------------- LABELS ------------------------------
    target_bboxes = np.zeros((MAX_NUM_OBJ, 6))
    target_bboxes_mask = np.zeros((MAX_NUM_OBJ))
    angle_classes = np.zeros((MAX_NUM_OBJ, ))
    angle_residuals = np.zeros((MAX_NUM_OBJ, ))
    size_classes = np.zeros((MAX_NUM_OBJ, ))
    size_residuals = np.zeros((MAX_NUM_OBJ, 3))

    ref_box_label = np.zeros(MAX_NUM_OBJ)  # bbox label for reference target
    ref_center_label = np.zeros(3)  # bbox center for reference target
    ref_heading_class_label = 0
    ref_heading_residual_label = 0
    ref_size_class_label = 0
    ref_size_residual_label = np.zeros(3)  # bbox size residual for reference target
    ref_box_corner_label = np.zeros((8, 3))

    # Defined up front so the test split (which has no GT boxes to fill in)
    # still returns well-formed, all-zero arrays instead of raising NameError.
    gt_box_corner_label = np.zeros((MAX_NUM_OBJ, 8, 3))
    gt_box_masks = np.zeros((MAX_NUM_OBJ, ))
    gt_box_object_ids = np.zeros((MAX_NUM_OBJ, ))

    if self.split != "test":
        num_bbox = min(instance_bboxes.shape[0], MAX_NUM_OBJ)
        target_bboxes_mask[0:num_bbox] = 1
        target_bboxes[0:num_bbox, :] = instance_bboxes[:MAX_NUM_OBJ, 0:6]

        point_votes = np.zeros([self.num_points, 3])
        point_votes_mask = np.zeros(self.num_points)

        # ------------------------------- DATA AUGMENTATION ------------------------------
        if self.augment:
            if np.random.random() > 0.5:
                # Flipping along the YZ plane
                point_cloud[:, 0] = -1 * point_cloud[:, 0]
                target_bboxes[:, 0] = -1 * target_bboxes[:, 0]

            if np.random.random() > 0.5:
                # Flipping along the XZ plane
                point_cloud[:, 1] = -1 * point_cloud[:, 1]
                target_bboxes[:, 1] = -1 * target_bboxes[:, 1]

            # Small rotations about X, Y and then the up/Z axis.
            for axis, make_rot in (("x", rotx), ("y", roty), ("z", rotz)):
                rot_angle = (np.random.random() * np.pi / 18) - np.pi / 36  # -5 ~ +5 degree
                rot_mat = make_rot(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3], np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(target_bboxes, rot_mat, axis)

            # Translation
            point_cloud, target_bboxes = self._translate(point_cloud, target_bboxes)

        # compute votes *AFTER* augmentation
        # Note: since there's no map between bbox instance labels and
        # pc instance_labels (it had been filtered in the data preparation
        # step) we compute the instance centre from the points sharing the
        # same instance label.
        for i_instance in np.unique(instance_labels):
            # find all points belonging to that instance
            ind = np.where(instance_labels == i_instance)[0]
            # only instances whose semantic label is a known class vote
            if semantic_labels[ind[0]] in DC.nyu40ids:
                x = point_cloud[ind, :3]
                center = 0.5 * (x.min(0) + x.max(0))
                point_votes[ind, :] = center - x
                point_votes_mask[ind] = 1.0
        point_votes = np.tile(point_votes, (1, 3))  # make 3 votes identical

        class_ind = [DC.nyu40id2class[int(x)] for x in instance_bboxes[:num_bbox, -2]]
        # NOTE: set size class as semantic class. Consider use size2class.
        size_classes[0:num_bbox] = class_ind
        size_residuals[0:num_bbox, :] = target_bboxes[0:num_bbox, 3:6] - DC.mean_size_arr[class_ind, :]

        # construct the reference target label for each bbox
        ref_box_label = np.zeros(MAX_NUM_OBJ)
        for i, gt_id in enumerate(instance_bboxes[:num_bbox, -1]):
            if gt_id == object_id:
                ref_box_label[i] = 1
                ref_center_label = target_bboxes[i, 0:3]
                ref_heading_class_label = angle_classes[i]
                ref_heading_residual_label = angle_residuals[i]
                ref_size_class_label = size_classes[i]
                ref_size_residual_label = size_residuals[i]

                # construct ground truth box corner coordinates
                ref_obb = DC.param2obb(ref_center_label,
                                       ref_heading_class_label,
                                       ref_heading_residual_label,
                                       ref_size_class_label,
                                       ref_size_residual_label)
                ref_box_corner_label = get_3d_box(ref_obb[3:6], ref_obb[6], ref_obb[0:3])

        # construct all GT bbox corners
        all_obb = DC.param2obb_batch(
            target_bboxes[:num_bbox, 0:3],
            angle_classes[:num_bbox].astype(np.int64),
            angle_residuals[:num_bbox],
            size_classes[:num_bbox].astype(np.int64),
            size_residuals[:num_bbox])
        all_box_corner_label = get_3d_box_batch(all_obb[:, 3:6], all_obb[:, 6], all_obb[:, 0:3])

        # store
        gt_box_corner_label[:num_bbox] = all_box_corner_label
        gt_box_masks[:num_bbox] = 1
        # Slice the source as well: a scene with more than MAX_NUM_OBJ boxes
        # would otherwise fail with a shape mismatch.
        gt_box_object_ids[:num_bbox] = instance_bboxes[:num_bbox, -1]
    else:
        num_bbox = 1
        point_votes = np.zeros([self.num_points, 9])  # make 3 votes identical
        point_votes_mask = np.zeros(self.num_points)

    target_bboxes_semcls = np.zeros((MAX_NUM_OBJ))
    target_object_ids = np.zeros((MAX_NUM_OBJ, ))  # object ids of all objects
    try:
        target_bboxes_semcls[0:num_bbox] = [
            DC.nyu40id2class[int(x)] for x in instance_bboxes[:, -2][0:num_bbox]
        ]
        target_object_ids[0:num_bbox] = instance_bboxes[:, -1][0:num_bbox]
    except KeyError:
        # Best effort: a semantic id missing from DC.nyu40id2class leaves
        # both arrays at zero.
        pass

    object_cat = self.raw2label[object_name] if object_name in self.raw2label else 17

    data_dict = {}
    data_dict["point_clouds"] = point_cloud.astype(np.float32)  # point cloud data including features
    data_dict["lang_feat"] = lang_feat.astype(np.float32)  # language feature vectors
    data_dict["lang_len"] = np.array(lang_len).astype(np.int64)  # length of each description
    data_dict["lang_ids"] = np.array(
        self.lang_ids[scene_id][str(object_id)][ann_id]).astype(np.int64)

    # all arrays sized MAX_NUM_OBJ are zero beyond the first num_bbox rows
    data_dict["center_label"] = target_bboxes.astype(np.float32)[:, 0:3]  # (MAX_NUM_OBJ, 3) for GT box center XYZ
    data_dict["heading_class_label"] = angle_classes.astype(np.int64)  # (MAX_NUM_OBJ,)
    data_dict["heading_residual_label"] = angle_residuals.astype(np.float32)  # (MAX_NUM_OBJ,)
    data_dict["size_class_label"] = size_classes.astype(np.int64)  # (MAX_NUM_OBJ,)
    data_dict["size_residual_label"] = size_residuals.astype(np.float32)  # (MAX_NUM_OBJ, 3)
    data_dict["num_bbox"] = np.array(num_bbox).astype(np.int64)
    data_dict["sem_cls_label"] = target_bboxes_semcls.astype(np.int64)  # (MAX_NUM_OBJ,) semantic class index
    data_dict["scene_object_ids"] = target_object_ids.astype(np.int64)  # (MAX_NUM_OBJ,) object ids of all objects
    data_dict["box_label_mask"] = target_bboxes_mask.astype(np.float32)  # (MAX_NUM_OBJ) as 0/1 with 1 indicating a unique box
    data_dict["vote_label"] = point_votes.astype(np.float32)
    data_dict["vote_label_mask"] = point_votes_mask.astype(np.int64)
    data_dict["dataset_idx"] = np.array(idx).astype(np.int64)  # index into self.scanrefer
    data_dict["pcl_color"] = pcl_color
    data_dict["ref_box_label"] = ref_box_label.astype(np.int64)  # 0/1 reference labels for each object bbox
    data_dict["ref_center_label"] = ref_center_label.astype(np.float32)
    data_dict["ref_heading_class_label"] = np.array(int(ref_heading_class_label)).astype(np.int64)
    data_dict["ref_heading_residual_label"] = np.array(int(ref_heading_residual_label)).astype(np.int64)
    data_dict["ref_size_class_label"] = np.array(int(ref_size_class_label)).astype(np.int64)
    data_dict["ref_size_residual_label"] = ref_size_residual_label.astype(np.float32)
    data_dict["ref_box_corner_label"] = ref_box_corner_label.astype(np.float64)  # target box corners NOTE type must be double
    data_dict["gt_box_corner_label"] = gt_box_corner_label.astype(np.float64)  # all GT box corners NOTE type must be double
    data_dict["gt_box_masks"] = gt_box_masks.astype(np.int64)  # valid bbox masks
    data_dict["gt_box_object_ids"] = gt_box_object_ids.astype(np.int64)  # valid bbox object ids
    data_dict["object_id"] = np.array(int(object_id)).astype(np.int64)  # target object id
    data_dict["ann_id"] = np.array(int(ann_id)).astype(np.int64)
    data_dict["object_cat"] = np.array(object_cat).astype(np.int64)  # target object class
    data_dict["unique_multiple"] = np.array(
        self.unique_multiple_lookup[scene_id][str(object_id)][ann_id]).astype(np.int64)
    data_dict["load_time"] = time.time() - start

    return data_dict
def __getitem__(self, idx):
    """Assemble the sample dictionary for annotation ``idx``.

    The description tokens are embedded through ``self.glove``, the scene
    arrays are read from ``CONF.PATH.SCANNET_DATA``, the points are
    subsampled to ``self.num_points`` and (outside the test split) the box
    and reference labels are filled in. Every instance with a known semantic
    class is additionally collected as its own point set, and instances of
    the target category are voxelised into ``SparseTensor`` objects.

    Args:
        idx: index into ``self.scanrefer``.

    Returns:
        dict: label name -> array / list / ``SparseTensor``.
    """
    record = self.scanrefer[idx]
    scene_id = record["scene_id"]
    object_id = int(record["object_id"])
    object_name = " ".join(record["object_name"].split("_"))
    ann_id = int(record["ann_id"])
    if object_name in self.raw2label:
        object_cat = self.raw2label[object_name]
    else:
        object_cat = 17

    # tokenize the description: one 300-d row per token position;
    # whitespace tokens keep their all-zero row
    tokens = record["token"]
    max_des_len = CONF.TRAIN.MAX_DES_LEN
    embeddings = np.zeros((max_des_len, 300))
    for position, word in enumerate(tokens[:max_des_len]):
        if word.isspace():
            continue
        embeddings[position] = self.glove[word] if word in self.glove else self.glove["unk"]

    # get language features
    lang_feat = embeddings
    lang_token = tokens
    lang_len = min(sum(1 for word in lang_token if not word.isspace()), max_des_len)

    # get pc
    scene_prefix = os.path.join(CONF.PATH.SCANNET_DATA, scene_id)
    mesh_vertices = np.load(scene_prefix + "_aligned_vert.npy")  # axis-aligned
    instance_labels = np.load(scene_prefix + "_ins_label_pg.npy")
    semantic_labels = np.load(scene_prefix + "_sem_label_pg.npy")
    instance_bboxes = np.load(scene_prefix + "_aligned_bbox.npy")

    if self.use_color:
        point_cloud = mesh_vertices[:, 0:6]
        point_cloud[:, 3:6] = (point_cloud[:, 3:6] - MEAN_COLOR_RGB) / 256.0
        pcl_color = point_cloud[:, 3:6]
    else:
        point_cloud = mesh_vertices[:, 0:3]  # do not use color for now
        pcl_color = mesh_vertices[:, 3:6]

    if self.use_normal:
        point_cloud = np.concatenate([point_cloud, mesh_vertices[:, 6:9]], 1)

    if self.use_multiview:
        # open the multiview database on first use
        if not hasattr(self, 'multiview_data'):
            self.multiview_data = h5py.File(MULTIVIEW_DATA,
                                            "r",
                                            libver="latest",
                                            swmr=True)
        multiview = np.array(self.multiview_data[scene_id])
        point_cloud = np.concatenate([point_cloud, multiview], 1)

    if self.use_height:
        floor_height = np.percentile(point_cloud[:, 2], 0.99)
        height_column = np.expand_dims(point_cloud[:, 2] - floor_height, 1)
        point_cloud = np.concatenate([point_cloud, height_column], 1)

    point_cloud, choices = random_sampling(point_cloud,
                                           self.num_points,
                                           return_choices=True)
    instance_labels = instance_labels[choices]
    semantic_labels = semantic_labels[choices]
    pcl_color = pcl_color[choices]

    # ------------------------------- LABELS ------------------------------
    target_bboxes = np.zeros((MAX_NUM_OBJ, 6))
    target_bboxes_mask = np.zeros((MAX_NUM_OBJ))
    angle_classes = np.zeros((MAX_NUM_OBJ, ))
    angle_residuals = np.zeros((MAX_NUM_OBJ, ))
    size_classes = np.zeros((MAX_NUM_OBJ, ))
    size_residuals = np.zeros((MAX_NUM_OBJ, 3))

    # defaults for the reference target, kept when no box matches
    ref_box_label = np.zeros(MAX_NUM_OBJ)
    ref_center_label = np.zeros(3)
    ref_heading_class_label = 0
    ref_heading_residual_label = 0
    ref_size_class_label = 0
    ref_size_residual_label = np.zeros(3)
    scene_points = np.zeros((1, 10))

    if self.split == "test":
        num_bbox = 1
    else:
        num_bbox = min(instance_bboxes.shape[0], MAX_NUM_OBJ)
        target_bboxes_mask[0:num_bbox] = 1
        target_bboxes[0:num_bbox, :] = instance_bboxes[:MAX_NUM_OBJ, 0:6]

        # ------------------------------- DATA AUGMENTATION ------------------------------
        if self.augment:
            # random flips along the YZ plane, then the XZ plane
            for column in (0, 1):
                if torch.rand(1).item() > 0.5:
                    point_cloud[:, column] = -1 * point_cloud[:, column]
                    target_bboxes[:, column] = -1 * target_bboxes[:, column]

            # rotations of -5 ~ +5 degree about X, Y and then the up/Z axis
            for axis_name, build_rotation in (("x", rotx), ("y", roty), ("z", rotz)):
                rot_angle = (torch.rand(1).item() * np.pi / 18) - np.pi / 36
                rot_mat = build_rotation(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                             np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(
                    target_bboxes, rot_mat, axis_name)

            # Translation
            point_cloud, target_bboxes = self._translate(point_cloud, target_bboxes)

        # NOTE: set size class as semantic class. Consider use size2class.
        class_ind = [
            DC.nyu40id2class[int(sem_id)]
            for sem_id in instance_bboxes[:num_bbox, -2]
        ]
        size_classes[0:num_bbox] = class_ind
        size_residuals[0:num_bbox, :] = (target_bboxes[0:num_bbox, 3:6]
                                         - DC.mean_size_arr[class_ind, :])

        # construct the reference target label for each bbox
        ref_box_label = np.zeros(MAX_NUM_OBJ)
        for box_index, gt_id in enumerate(instance_bboxes[:num_bbox, -1]):
            if gt_id != object_id:
                continue
            ref_box_label[box_index] = 1
            ref_center_label = target_bboxes[box_index, 0:3]
            ref_heading_class_label = angle_classes[box_index]
            ref_heading_residual_label = angle_residuals[box_index]
            ref_size_class_label = size_classes[box_index]
            ref_size_residual_label = size_residuals[box_index]

    instance_points = []
    instance_class = []
    ref_target = []
    ins_obbs = []
    pts_batch = []
    pred_obbs = []
    for i_instance in np.unique(instance_labels):
        # indices of all points carrying this instance label
        ind = np.nonzero(instance_labels == i_instance)[0]

        # the semantic label of the first point decides how the instance is handled
        ins_class = semantic_labels[ind[0]]
        if ins_class not in DC.nyu40ids:
            scene_points = point_cloud[ind]
            continue

        inst_pts = point_cloud[ind]
        ins_class = DC.nyu40id2class[int(ins_class)]
        instance_class.append(ins_class)

        # axis-aligned extent of the instance: centre, size, and a zero last entry
        xyz = inst_pts[:, :3]
        lower = xyz.min(0)
        upper = xyz.max(0)
        center = 0.5 * (lower + upper)
        size = upper - lower
        ins_obb = np.concatenate((center, size, np.array([0])))
        ins_obbs.append(ins_obb)
        inst_pts = random_sampling(inst_pts, 1024)
        instance_points.append(inst_pts)

        if ins_class == object_cat:
            coords, feats = sparse_quantize(
                inst_pts[:, :3], inst_pts, quantization_size=self.voxel_size_ap)
            pt_inst = SparseTensor(feats, coords)

            if len(ins_obb) < 2:
                continue

            pred_obbs.append(ins_obb)
            pts_batch.append(pt_inst)

            ref_target.append(1 if i_instance == (object_id + 1) else 0)

    target_bboxes_semcls = np.zeros((MAX_NUM_OBJ))
    try:
        target_bboxes_semcls[0:num_bbox] = [
            DC.nyu40id2class[int(sem_id)]
            for sem_id in instance_bboxes[:, -2][0:num_bbox]
        ]
    except KeyError:
        pass

    coords, feats = sparse_quantize(point_cloud[:, :3],
                                    point_cloud,
                                    quantization_size=self.voxel_size_glp)
    pt = SparseTensor(feats, coords)

    data_dict = {
        'lidar': pt,
        'pts_batch': pts_batch,
        'pred_obb_batch': pred_obbs,
        'scene_points': [scene_points],
        'point_min': point_cloud.min(0)[:3],
        'point_max': point_cloud.max(0)[:3],
        'instance_labels': instance_labels.astype(np.int64),
        'instance_points': instance_points,
        'instance_class': instance_class,
        'instance_obbs': ins_obbs,
        # point cloud data including features
        "point_clouds": point_cloud.astype(np.float32),
        # language feature vectors
        "lang_feat": lang_feat.astype(np.float32),
        "lang_token": lang_token,
        # length of each description
        "lang_len": np.array(lang_len).astype(np.int64),
        # (MAX_NUM_OBJ, 3) for GT box center XYZ
        "center_label": target_bboxes.astype(np.float32)[:, 0:3],
        # (MAX_NUM_OBJ,)
        "heading_class_label": angle_classes.astype(np.int64),
        # (MAX_NUM_OBJ,)
        "heading_residual_label": angle_residuals.astype(np.float32),
        # (MAX_NUM_OBJ,)
        "size_class_label": size_classes.astype(np.int64),
        # (MAX_NUM_OBJ, 3)
        "size_residual_label": size_residuals.astype(np.float32),
        "num_bbox": np.array(num_bbox).astype(np.int64),
        "scan_idx": np.array(idx).astype(np.int64),
        "pcl_color": pcl_color,
        # 0/1 reference labels for each object bbox
        "ref_box_label": ref_box_label.astype(np.int64),
        "ref_center_label": ref_center_label.astype(np.float32),
        "ref_heading_class_label": np.array(int(ref_heading_class_label)).astype(np.int64),
        "ref_heading_residual_label": np.array(int(ref_heading_residual_label)).astype(np.int64),
        "ref_size_class_label": np.array(int(ref_size_class_label)).astype(np.int64),
        "ref_size_residual_label": ref_size_residual_label.astype(np.float32),
        "object_id": np.array(int(object_id)).astype(np.int64),
        "ann_id": np.array(ann_id).astype(np.int64),
        "object_cat": np.array(object_cat).astype(np.int64),
        "unique_multiple": np.array(
            self.unique_multiple_lookup[scene_id][str(object_id)][str(ann_id)]
        ).astype(np.int64),
    }

    return data_dict
def _preprocess_sample(self, data):
    """Turn one annotation item into a model-ready sample.

    Pads/truncates the indexed description to ``self.max_len`` and wraps it
    in ``self.sos_token`` / ``self.eos_token``, subsamples the scene points to
    ``self.num_points``, optionally augments points and boxes, and extracts
    the box whose last column equals the annotated object id.

    Args:
        data: annotation dict; the keys read here are ``'scene_id'``,
            ``'object_id'``, ``'object_name'``, ``'indexed_token'`` and
            ``'description'``.

    Returns:
        dict: with keys ``'point_cloud'``, ``'object_name'``, ``'corners'``,
        ``'class_id'``, ``'instance_seg'``, ``'description'`` and
        ``'original_description'``. The dict is empty when no box in the
        scene matches the annotated object id.
    """
    ### Get the data information in annotation item.
    scene_id = data['scene_id']
    object_id = int(data['object_id'])
    object_name = " ".join(data["object_name"].split("_"))

    ### Get the referring expression
    description = data["indexed_token"]
    if len(description) > self.max_len:
        description = description[:self.max_len]
    else:
        description = description + [self.pad_token] * (self.max_len - len(description))
    description = [self.sos_token] + description + [self.eos_token]
    original_description = data['description']

    ### Get the original annotation data.
    mesh_vertices = self.scene_data[scene_id]["mesh_vertices"]
    instance_labels = self.scene_data[scene_id]["instance_labels"]
    instance_bboxes = self.scene_data[scene_id]["instance_bboxes"]

    ### Get point cloud data
    if not self.use_color:
        point_cloud = mesh_vertices[:, 0:3]  # do not use color for now
    else:
        # Copy first: the slice is a view into the cached scene array, so the
        # in-place centering and colour scaling below would otherwise be
        # applied again to already-processed data on every later access.
        point_cloud = mesh_vertices[:, 0:6].copy()
        # Point cloud centering
        # NOTE(review): the boxes are not shifted by the same mean -- confirm
        # that this is intended.
        point_cloud[:, :3] = point_cloud[:, :3] - point_cloud[:, :3].mean(
            axis=0, keepdims=True)
        # Point cloud RGB scaling
        point_cloud[:, 3:] = (point_cloud[:, 3:] -
                              self.cfg.TRAINING.MEAN_COLOR_RGB) / 255.0

    ### Sampling points
    point_cloud, choices = random_sampling(point_cloud,
                                           self.num_points,
                                           return_choices=True)
    instance_labels = instance_labels[choices]

    # Work on a copy: the flips below write into the array in place and must
    # not alter the boxes cached in self.scene_data.
    target_bboxes = instance_bboxes.copy()

    ### Data augmentation
    if self.augment and not self.debug:
        if np.random.random() > 0.5:
            # Flipping along the YZ plane
            point_cloud[:, 0] = -1 * point_cloud[:, 0]
            target_bboxes[:, 0] = -1 * target_bboxes[:, 0]

        if np.random.random() > 0.5:
            # Flipping along the XZ plane
            point_cloud[:, 1] = -1 * point_cloud[:, 1]
            target_bboxes[:, 1] = -1 * target_bboxes[:, 1]

        # Small rotations about X, Y and then the up/Z axis.
        for axis, make_rot in (("x", rotx), ("y", roty), ("z", rotz)):
            rot_angle = (np.random.random() * np.pi / 18) - np.pi / 36  # -5 ~ +5 degree
            rot_mat = make_rot(rot_angle)
            point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3], np.transpose(rot_mat))
            target_bboxes = rotate_aligned_boxes_along_axis(target_bboxes, rot_mat, axis)

        # Translation
        point_cloud, target_bboxes = self._translate(point_cloud, target_bboxes)

    # Re-append the original box rows so that the last two columns are the
    # semantic id and the object id whether or not augmentation ran.
    target_bboxes = np.concatenate([target_bboxes, instance_bboxes], axis=1)

    ### Build up referred targets' labels
    sample = {}
    for bbox in target_bboxes:
        if int(bbox[-1]) != object_id:
            continue

        gt_instance_id = bbox[-1]
        # Columns 0:3 and 3:6 are used as centre and full size: min/max = centre -/+ size / 2.
        x_min = (2 * bbox[0] - bbox[3]) / 2
        x_max = (2 * bbox[0] + bbox[3]) / 2
        y_min = (2 * bbox[1] - bbox[4]) / 2
        y_max = (2 * bbox[1] + bbox[4]) / 2
        z_min = (2 * bbox[2] - bbox[5]) / 2
        z_max = (2 * bbox[2] + bbox[5]) / 2

        sample['point_cloud'] = point_cloud
        sample['object_name'] = object_name
        sample['corners'] = np.array(
            [x_min, y_min, z_min, x_max, y_max, z_max]).astype(np.float32)
        sample['class_id'] = float(bbox[-2])

        # Binary mask over the sampled points: 1 where the instance label
        # equals the matched box id.
        instance_seg = np.zeros_like(instance_labels)
        instance_seg[instance_labels == gt_instance_id] = 1
        sample['instance_seg'] = instance_seg.astype(np.float32)
        sample['description'] = np.array(description).astype(np.float32)
        sample['original_description'] = original_description

    return sample
def __getitem__(self, idx):
    """Assemble the detection/grounding sample for one ScanRefer annotation.

    The method looks up the annotation ``self.scanrefer[idx]``, builds the
    point cloud feature matrix for its scene (XYZ plus optional colour,
    normals, multiview features and height), sub-samples it to
    ``self.num_points`` points, optionally augments points and boxes, and
    derives vote, box and reference-target labels.

    Args:
        idx: Index into ``self.scanrefer``.

    Returns:
        dict: Arrays keyed by ``point_clouds``, ``lang_feat``, ``lang_len``,
        ``center_label``, ``heading_class_label``,
        ``heading_residual_label``, ``size_class_label``,
        ``size_residual_label``, ``num_bbox``, ``sem_cls_label``,
        ``box_label_mask``, ``vote_label``, ``vote_label_mask``,
        ``scan_idx``, ``pcl_color``, ``ref_box_label``,
        ``ref_center_label``, ``ref_heading_class_label``,
        ``ref_heading_residual_label``, ``ref_size_class_label``,
        ``ref_size_residual_label``, ``object_id``, ``ann_id``,
        ``object_cat`` and ``load_time`` (seconds spent in this call).
        When the referred object is not among the first ``num_bbox`` boxes,
        all ``ref_*`` labels are zero.
    """
    start = time.time()

    entry = self.scanrefer[idx]
    scene_id = entry["scene_id"]
    object_id = int(entry["object_id"])
    object_name = " ".join(entry["object_name"].split("_"))
    ann_id = entry["ann_id"]

    # get language features
    lang_feat = self.lang[scene_id][str(object_id)][ann_id]
    # description length, clipped to the configured maximum
    lang_len = min(len(entry["token"]), CONF.TRAIN.MAX_DES_LEN)

    # get pc
    scene = self.scene_data[scene_id]
    mesh_vertices = scene["mesh_vertices"]
    instance_labels = scene["instance_labels"]
    semantic_labels = scene["semantic_labels"]
    instance_bboxes = scene["instance_bboxes"]

    if not self.use_color:
        point_cloud = mesh_vertices[:, 0:3]  # do not use color for now
        pcl_color = mesh_vertices[:, 3:6]
    else:
        # Copy before normalising: a plain slice is a view of the cached
        # scene array, so normalising in place would rewrite the cache and
        # compound the normalisation every time this scene is fetched.
        point_cloud = np.array(mesh_vertices[:, 0:6])
        point_cloud[:, 3:] = (point_cloud[:, 3:] - MEAN_COLOR_RGB) / 256.0
        pcl_color = point_cloud[:, 3:]

    if self.use_normal:
        normals = mesh_vertices[:, 6:9]
        point_cloud = np.concatenate([point_cloud, normals], 1)

    if self.use_multiview:
        # load multiview database; one read-only handle is kept per process
        pid = mp.current_process().pid
        if pid not in self.multiview_data:
            self.multiview_data[pid] = h5py.File(
                MULTIVIEW_DATA, "r", libver="latest")

        multiview = self.multiview_data[pid][scene_id]
        point_cloud = np.concatenate([point_cloud, multiview], 1)

    if self.use_height:
        # NOTE: np.percentile takes q in [0, 100], so this is the 0.99th
        # percentile of z, i.e. a value just above the lowest points.
        floor_height = np.percentile(point_cloud[:, 2], 0.99)
        height = point_cloud[:, 2] - floor_height
        point_cloud = np.concatenate(
            [point_cloud, np.expand_dims(height, 1)], 1)

    # ------------------------------- LABELS ------------------------------
    target_bboxes = np.zeros((MAX_NUM_OBJ, 6))
    target_bboxes_mask = np.zeros((MAX_NUM_OBJ))
    angle_classes = np.zeros((MAX_NUM_OBJ, ))
    angle_residuals = np.zeros((MAX_NUM_OBJ, ))
    size_classes = np.zeros((MAX_NUM_OBJ, ))
    size_residuals = np.zeros((MAX_NUM_OBJ, 3))

    # Defaults for the reference target, used when the referred object is
    # not among the kept boxes (otherwise these names would be unbound).
    ref_box_label = np.zeros(MAX_NUM_OBJ)  # bbox label for reference target
    ref_center_label = np.zeros(3)  # bbox center for reference target
    ref_heading_class_label = 0
    ref_heading_residual_label = 0
    ref_size_class_label = 0
    ref_size_residual_label = np.zeros(3)  # bbox size residual for reference target

    point_cloud, choices = random_sampling(
        point_cloud, self.num_points, return_choices=True)
    instance_labels = instance_labels[choices]
    semantic_labels = semantic_labels[choices]
    pcl_color = pcl_color[choices]

    # keep at most MAX_NUM_OBJ boxes; the remaining rows stay zero padding
    num_bbox = min(instance_bboxes.shape[0], MAX_NUM_OBJ)
    target_bboxes_mask[0:num_bbox] = 1
    target_bboxes[0:num_bbox, :] = instance_bboxes[:num_bbox, 0:6]

    # ------------------------------- DATA AUGMENTATION ------------------------------
    # NOTE(review): the in-place flips below assume random_sampling returned
    # a fresh array rather than a view of the cached scene - confirm.
    if self.augment and not self.debug:
        if np.random.random() > 0.5:
            # Flipping along the YZ plane
            point_cloud[:, 0] = -1 * point_cloud[:, 0]
            target_bboxes[:, 0] = -1 * target_bboxes[:, 0]

        if np.random.random() > 0.5:
            # Flipping along the XZ plane
            point_cloud[:, 1] = -1 * point_cloud[:, 1]
            target_bboxes[:, 1] = -1 * target_bboxes[:, 1]

        # Rotation along X-axis
        rot_angle = (np.random.random() * np.pi / 18) - np.pi / 36  # -5 ~ +5 degree
        rot_mat = rotx(rot_angle)
        point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                     np.transpose(rot_mat))
        target_bboxes = rotate_aligned_boxes_along_axis(
            target_bboxes, rot_mat, "x")

        # Rotation along Y-axis
        rot_angle = (np.random.random() * np.pi / 18) - np.pi / 36  # -5 ~ +5 degree
        rot_mat = roty(rot_angle)
        point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                     np.transpose(rot_mat))
        target_bboxes = rotate_aligned_boxes_along_axis(
            target_bboxes, rot_mat, "y")

        # Rotation along up-axis/Z-axis
        rot_angle = (np.random.random() * np.pi / 18) - np.pi / 36  # -5 ~ +5 degree
        rot_mat = rotz(rot_angle)
        point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                     np.transpose(rot_mat))
        target_bboxes = rotate_aligned_boxes_along_axis(
            target_bboxes, rot_mat, "z")

        # Translation
        point_cloud, target_bboxes = self._translate(
            point_cloud, target_bboxes)

    # compute votes *AFTER* augmentation
    # generate votes
    # Note: since there's no map between bbox instance labels and
    # pc instance_labels (it had been filtered
    # in the data preparation step) we'll compute the instance bbox
    # from the points sharing the same instance label.
    point_votes = np.zeros([self.num_points, 3])
    point_votes_mask = np.zeros(self.num_points)
    for i_instance in np.unique(instance_labels):
        # find all points belong to that instance
        ind = np.where(instance_labels == i_instance)[0]
        # find the semantic label
        if semantic_labels[ind[0]] in DC.nyu40ids:
            pts = point_cloud[ind, :3]
            # vote target is the centre of the points' axis-aligned extent
            center = 0.5 * (pts.min(0) + pts.max(0))
            point_votes[ind, :] = center - pts
            point_votes_mask[ind] = 1.0
    point_votes = np.tile(point_votes, (1, 3))  # make 3 votes identical

    class_ind = [
        DC.nyu40id2class[int(sem_id)]
        for sem_id in instance_bboxes[:num_bbox, -2]
    ]
    # NOTE: set size class as semantic class. Consider use size2class.
    size_classes[0:num_bbox] = class_ind
    size_residuals[0:num_bbox, :] = \
        target_bboxes[0:num_bbox, 3:6] - DC.mean_size_arr[class_ind, :]

    # construct the reference target label for each bbox
    for i, gt_id in enumerate(instance_bboxes[:num_bbox, -1]):
        if gt_id == object_id:
            ref_box_label[i] = 1
            ref_center_label = target_bboxes[i, 0:3]
            ref_heading_class_label = angle_classes[i]
            ref_heading_residual_label = angle_residuals[i]
            ref_size_class_label = size_classes[i]
            ref_size_residual_label = size_residuals[i]

    # semantic class index per box (same values as the size classes)
    target_bboxes_semcls = np.zeros((MAX_NUM_OBJ))
    target_bboxes_semcls[0:num_bbox] = class_ind

    data_dict = {}
    data_dict["point_clouds"] = point_cloud.astype(
        np.float32)  # point cloud data including features
    data_dict["lang_feat"] = lang_feat.astype(
        np.float32)  # language feature vectors
    data_dict["lang_len"] = np.array(lang_len).astype(
        np.int64)  # length of each description
    data_dict["center_label"] = target_bboxes.astype(
        np.float32)[:, 0:3]  # (MAX_NUM_OBJ, 3) for GT box center XYZ
    data_dict["heading_class_label"] = angle_classes.astype(
        np.int64
    )  # (MAX_NUM_OBJ,) with int values in 0,...,NUM_HEADING_BIN-1
    data_dict["heading_residual_label"] = angle_residuals.astype(
        np.float32)  # (MAX_NUM_OBJ,)
    data_dict["size_class_label"] = size_classes.astype(
        np.int64
    )  # (MAX_NUM_OBJ,) with int values in 0,...,NUM_SIZE_CLUSTER
    data_dict["size_residual_label"] = size_residuals.astype(
        np.float32)  # (MAX_NUM_OBJ, 3)
    data_dict["num_bbox"] = np.array(num_bbox).astype(np.int64)
    data_dict["sem_cls_label"] = target_bboxes_semcls.astype(
        np.int64)  # (MAX_NUM_OBJ,) semantic class index
    data_dict["box_label_mask"] = target_bboxes_mask.astype(
        np.float32)  # (MAX_NUM_OBJ) as 0/1 with 1 indicating a unique box
    data_dict["vote_label"] = point_votes.astype(np.float32)
    data_dict["vote_label_mask"] = point_votes_mask.astype(np.int64)
    data_dict["scan_idx"] = np.array(idx).astype(np.int64)
    data_dict["pcl_color"] = pcl_color
    data_dict["ref_box_label"] = ref_box_label.astype(
        np.int64)  # 0/1 reference labels for each object bbox
    data_dict["ref_center_label"] = ref_center_label.astype(np.float32)
    data_dict["ref_heading_class_label"] = np.array(
        int(ref_heading_class_label)).astype(np.int64)
    # NOTE(review): the residual is truncated to an integer here; harmless
    # while angle_residuals is all zeros, but confirm this is intended.
    data_dict["ref_heading_residual_label"] = np.array(
        int(ref_heading_residual_label)).astype(np.int64)
    data_dict["ref_size_class_label"] = np.array(
        int(ref_size_class_label)).astype(np.int64)
    data_dict["ref_size_residual_label"] = ref_size_residual_label.astype(
        np.float32)
    data_dict["object_id"] = np.array(int(object_id)).astype(np.int64)
    data_dict["ann_id"] = np.array(int(ann_id)).astype(np.int64)
    data_dict["object_cat"] = np.array(self.raw2label[object_name]).astype(
        np.int64)
    data_dict["load_time"] = time.time() - start

    return data_dict