Example 1
    def __getitem__(self, idx):
        start = time.time()
        scene_id = self.scanrefer[idx]["scene_id"]
        object_id = int(self.scanrefer[idx]["object_id"])
        object_name = " ".join(self.scanrefer[idx]["object_name"].split("_"))
        ann_id = self.scanrefer[idx]["ann_id"]
        
        # get language features
        lang_feat = self.lang[scene_id][str(object_id)][ann_id]
        lang_len = len(self.scanrefer[idx]["token"])
        lang_len = lang_len if lang_len <= CONF.TRAIN.MAX_DES_LEN else CONF.TRAIN.MAX_DES_LEN

        # get pc
        mesh_vertices = self.scene_data[scene_id]["mesh_vertices"]
        instance_labels = self.scene_data[scene_id]["instance_labels"]
        semantic_labels = self.scene_data[scene_id]["semantic_labels"]
        instance_bboxes = self.scene_data[scene_id]["instance_bboxes"]

        point_cloud, pcl_color = self.process_pc(mesh_vertices, scene_id)
        
        if self.cp_aug and self.split != 'test':
            # Copy-paste augmentation: pad the scene with objects taken from other scenes
            num_obj_add = 32 - instance_bboxes.shape[0]
            for _ in range(num_obj_add):
                idx_other = random.randint(0, len(self.scanrefer) - 1)
                while idx_other == idx:
                    idx_other = random.randint(0, len(self.scanrefer) - 1)
                try:
                    other_scene_id = self.scanrefer[idx_other]["scene_id"]
                    other_object_id = int(self.scanrefer[idx_other]["object_id"])
                    other_object_name = " ".join(self.scanrefer[idx_other]["object_name"].split("_"))
                    other_ann_id = self.scanrefer[idx_other]["ann_id"]
                except IndexError:
                    print("Index Error: Selecting an index out of range")

                # get pc
                other_mesh_vertices = self.scene_data[other_scene_id]["mesh_vertices"]
                other_instance_labels = self.scene_data[other_scene_id]["instance_labels"]
                other_semantic_labels = self.scene_data[other_scene_id]["semantic_labels"]
                other_instance_bboxes = self.scene_data[other_scene_id]["instance_bboxes"]
                other_point_cloud, other_pcl_color = self.process_pc(other_mesh_vertices, other_scene_id)

                jitter_idx = 1  # random.random() * 0.45 + 0.8 would give standard scale jittering

                # Randomly pick an object from the other scene and append it to the current one
                target_obj_label = random.randint(0, np.max(other_instance_labels))  # pick an instance id
                test_instance_labels, test_choices, flag_exceed = choose_label_pc(other_instance_labels, target_obj_label, 200, return_choices=True)
                test_instance_labels = np.empty(test_instance_labels.shape)  # placeholder; filled with the new instance id below
                
                other_point_cloud_jitter = other_point_cloud[test_choices].copy()
                # other_point_cloud_jitter[:,0:3] *= jitter_idx
                point_cloud     = np.concatenate((point_cloud,other_point_cloud_jitter),axis=0) 
                semantic_labels = np.concatenate((semantic_labels,other_semantic_labels[test_choices]),axis=0)
                pcl_color       = np.concatenate((pcl_color,other_pcl_color[test_choices]),axis=0)

                # Find the matching bbox in the other scene
                flag_add_instance = 0
                for i, gt_id in enumerate(other_instance_bboxes[:, -1]):
                    if gt_id == other_object_id:
                        select = other_instance_bboxes[i].copy()
                        new_instance_id = np.max(instance_labels) + 1  # an id not yet used in this scene
                        select[-1] = new_instance_id
                        test_instance_labels.fill(new_instance_id)
                        instance_bboxes = np.concatenate((instance_bboxes, np.atleast_2d(select)), axis=0)
                        instance_labels = np.concatenate((instance_labels, test_instance_labels), axis=0)
                        flag_add_instance = 1
                        break
                if flag_add_instance == 0:
                    print("Warning: no bbox matching the selected object was found in the other scene")

        point_cloud, choices = random_sampling(point_cloud, self.num_points, return_choices=True)
        instance_labels = instance_labels[choices]
        semantic_labels = semantic_labels[choices]
        pcl_color = pcl_color[choices]

        # ------------------------------- LABELS ------------------------------    
        target_bboxes = np.zeros((MAX_NUM_OBJ, 6))
        target_bboxes_mask = np.zeros((MAX_NUM_OBJ))    
        angle_classes = np.zeros((MAX_NUM_OBJ,))
        angle_residuals = np.zeros((MAX_NUM_OBJ,))
        size_classes = np.zeros((MAX_NUM_OBJ,))
        size_residuals = np.zeros((MAX_NUM_OBJ, 3))
        ref_box_label = np.zeros(MAX_NUM_OBJ) # bbox label for reference target
        ref_center_label = np.zeros(3) # bbox center for reference target
        ref_heading_class_label = 0
        ref_heading_residual_label = 0
        ref_size_class_label = 0
        ref_size_residual_label = np.zeros(3) # bbox size residual for reference target

        if self.split != "test":
            num_bbox = instance_bboxes.shape[0] if instance_bboxes.shape[0] < MAX_NUM_OBJ else MAX_NUM_OBJ
            target_bboxes_mask[0:num_bbox] = 1
            target_bboxes[0:num_bbox,:] = instance_bboxes[:MAX_NUM_OBJ,0:6]

            point_votes = np.zeros([self.num_points, 3])
            point_votes_mask = np.zeros(self.num_points)

            # ------------------------------- DATA AUGMENTATION ------------------------------        
            if self.augment and not self.debug:
                if np.random.random() > 0.5:
                    # Flipping along the YZ plane
                    point_cloud[:,0] = -1 * point_cloud[:,0]
                    target_bboxes[:,0] = -1 * target_bboxes[:,0]                
                    
                if np.random.random() > 0.5:
                    # Flipping along the XZ plane
                    point_cloud[:,1] = -1 * point_cloud[:,1]
                    target_bboxes[:,1] = -1 * target_bboxes[:,1]                                

                # Rotation along X-axis
                rot_angle = (np.random.random()*np.pi/18) - np.pi/36 # -5 ~ +5 degree
                rot_mat = rotx(rot_angle)
                point_cloud[:,0:3] = np.dot(point_cloud[:,0:3], np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(target_bboxes, rot_mat, "x")

                # Rotation along Y-axis
                rot_angle = (np.random.random()*np.pi/18) - np.pi/36 # -5 ~ +5 degree
                rot_mat = roty(rot_angle)
                point_cloud[:,0:3] = np.dot(point_cloud[:,0:3], np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(target_bboxes, rot_mat, "y")

                # Rotation along up-axis/Z-axis
                rot_angle = (np.random.random()*np.pi/18) - np.pi/36 # -5 ~ +5 degree
                rot_mat = rotz(rot_angle)
                point_cloud[:,0:3] = np.dot(point_cloud[:,0:3], np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(target_bboxes, rot_mat, "z")

                # Translation
                point_cloud, target_bboxes = self._translate(point_cloud, target_bboxes)

            # compute votes *AFTER* augmentation
            # generate votes
            # Note: since there's no map between bbox instance labels and
            # pc instance_labels (it was filtered out in the data preparation
            # step), we compute the instance bbox from the points sharing the
            # same instance label.
            for i_instance in np.unique(instance_labels):            
                # find all points belonging to that instance
                ind = np.where(instance_labels == i_instance)[0]
                # find the semantic label            
                if semantic_labels[ind[0]] in DC.nyu40ids:
                    x = point_cloud[ind,:3]
                    center = 0.5*(x.min(0) + x.max(0))
                    point_votes[ind, :] = center - x
                    point_votes_mask[ind] = 1.0
            point_votes = np.tile(point_votes, (1, 3)) # make 3 votes identical 
            
            class_ind = [DC.nyu40id2class[int(x)] for x in instance_bboxes[:num_bbox,-2]]
            # NOTE: set size class as semantic class. Consider using size2class.
            size_classes[0:num_bbox] = class_ind
            size_residuals[0:num_bbox, :] = target_bboxes[0:num_bbox, 3:6] - DC.mean_size_arr[class_ind,:]

            # construct the reference target label for each bbox
            ref_box_label = np.zeros(MAX_NUM_OBJ)
            for i, gt_id in enumerate(instance_bboxes[:num_bbox,-1]):
                if gt_id == object_id:
                    ref_box_label[i] = 1
                    ref_center_label = target_bboxes[i, 0:3]
                    ref_heading_class_label = angle_classes[i]
                    ref_heading_residual_label = angle_residuals[i]
                    ref_size_class_label = size_classes[i]
                    ref_size_residual_label = size_residuals[i]
        else:
            num_bbox = 1
            point_votes = np.zeros([self.num_points, 9]) # make 3 votes identical 
            point_votes_mask = np.zeros(self.num_points)

        target_bboxes_semcls = np.zeros((MAX_NUM_OBJ))
        try:
            target_bboxes_semcls[0:num_bbox] = [DC.nyu40id2class[int(x)] for x in instance_bboxes[:,-2][0:num_bbox]]
        except KeyError:
            pass

        object_cat = self.raw2label[object_name] if object_name in self.raw2label else 17

        data_dict = {}
        data_dict["point_clouds"] = point_cloud.astype(np.float32) # point cloud data including features
        data_dict["lang_feat"] = lang_feat.astype(np.float32) # language feature vectors
        data_dict["lang_len"] = np.array(lang_len).astype(np.int64) # length of each description
        data_dict["center_label"] = target_bboxes.astype(np.float32)[:,0:3] # (MAX_NUM_OBJ, 3) for GT box center XYZ
        data_dict["heading_class_label"] = angle_classes.astype(np.int64) # (MAX_NUM_OBJ,) with int values in 0,...,NUM_HEADING_BIN-1
        data_dict["heading_residual_label"] = angle_residuals.astype(np.float32) # (MAX_NUM_OBJ,)
        data_dict["size_class_label"] = size_classes.astype(np.int64) # (MAX_NUM_OBJ,) with int values in 0,...,NUM_SIZE_CLUSTER
        data_dict["size_residual_label"] = size_residuals.astype(np.float32) # (MAX_NUM_OBJ, 3)
        data_dict["num_bbox"] = np.array(num_bbox).astype(np.int64)
        data_dict["sem_cls_label"] = target_bboxes_semcls.astype(np.int64) # (MAX_NUM_OBJ,) semantic class index
        data_dict["box_label_mask"] = target_bboxes_mask.astype(np.float32) # (MAX_NUM_OBJ) as 0/1 with 1 indicating a unique box
        data_dict["vote_label"] = point_votes.astype(np.float32)
        data_dict["vote_label_mask"] = point_votes_mask.astype(np.int64)
        data_dict["scan_idx"] = np.array(idx).astype(np.int64)
        data_dict["pcl_color"] = pcl_color
        # data_dict["ref_box_label"] = ref_box_label.astype(np.int64) # 0/1 reference labels for each object bbox
        data_dict["ref_box_label"] = ref_box_label.astype(np.int64) # 0/1 reference labels for each object bbox
        data_dict["ref_center_label"] = ref_center_label.astype(np.float32)
        data_dict["ref_heading_class_label"] = np.array(int(ref_heading_class_label)).astype(np.int64)
        data_dict["ref_heading_residual_label"] = np.array(int(ref_heading_residual_label)).astype(np.int64)
        data_dict["ref_size_class_label"] = np.array(int(ref_size_class_label)).astype(np.int64)
        data_dict["ref_size_residual_label"] = ref_size_residual_label.astype(np.float32)
        data_dict["object_id"] = np.array(int(object_id)).astype(np.int64)
        data_dict["ann_id"] = np.array(int(ann_id)).astype(np.int64)
        data_dict["object_cat"] = np.array(object_cat).astype(np.int64)
        data_dict["unique_multiple"] = np.array(self.unique_multiple_lookup[scene_id][str(object_id)][ann_id]).astype(np.int64)
        # data_dict["pcl_color"] = pcl_color
        data_dict["load_time"] = time.time() - start

        # data_dict["test_point_clouds"] = test_point_cloud.astype(np.float32)
        # data_dict["test_pcl_color"] = test_pcl_color

        return data_dict
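
All five examples rely on a random_sampling helper, and Example 1 additionally on choose_label_pc, neither of which is shown. The sketches below are minimal reconstructions consistent with the call sites, not the actual implementations: random_sampling is assumed to draw a fixed number of indices (with replacement only when the cloud is too small), and choose_label_pc to return the points of one instance, capped at a requested count.

import numpy as np

def random_sampling(points, num_points, return_choices=False):
    # draw exactly num_points indices; re-use points only when the cloud
    # has fewer points than requested
    replace = points.shape[0] < num_points
    choices = np.random.choice(points.shape[0], num_points, replace=replace)
    if return_choices:
        return points[choices], choices
    return points[choices]

def choose_label_pc(instance_labels, target_label, max_points, return_choices=False):
    # indices of the points carrying target_label, capped at max_points
    # (assumed behavior; the real helper may subsample differently)
    ind = np.where(instance_labels == target_label)[0]
    flag_exceed = len(ind) > max_points
    if flag_exceed:
        ind = np.random.choice(ind, max_points, replace=False)
    if return_choices:
        return instance_labels[ind], ind, flag_exceed
    return instance_labels[ind], flag_exceed
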
Example 2
    def __getitem__(self, idx):
        start = time.time()
        scene_id = self.scanrefer[idx]["scene_id"]
        object_id = int(self.scanrefer[idx]["object_id"])
        object_name = " ".join(self.scanrefer[idx]["object_name"].split("_"))
        ann_id = self.scanrefer[idx]["ann_id"]

        # get language features
        lang_feat = self.lang[scene_id][str(object_id)][ann_id]
        lang_len = len(self.scanrefer[idx]["token"])
        lang_len = lang_len if lang_len <= CONF.TRAIN.MAX_DES_LEN else CONF.TRAIN.MAX_DES_LEN

        # get pc
        mesh_vertices = self.scene_data[scene_id]["mesh_vertices"]
        instance_labels = self.scene_data[scene_id]["instance_labels"]
        semantic_labels = self.scene_data[scene_id]["semantic_labels"]
        instance_bboxes = self.scene_data[scene_id]["instance_bboxes"]

        if not self.use_color:
            point_cloud = mesh_vertices[:, 0:3]  # do not use color for now
            pcl_color = mesh_vertices[:, 3:6]
        else:
            point_cloud = mesh_vertices[:, 0:6].copy()  # copy so the cached scene data is not modified in place
            point_cloud[:, 3:] = (point_cloud[:, 3:] - MEAN_COLOR_RGB) / 256.0
            pcl_color = point_cloud[:, 3:]

        if self.use_normal:
            normals = mesh_vertices[:, 6:9]
            point_cloud = np.concatenate([point_cloud, normals], 1)

        if self.use_multiview:
            # load multiview database
            pid = mp.current_process().pid
            if pid not in self.multiview_data:
                self.multiview_data[pid] = h5py.File(MULTIVIEW_DATA,
                                                     "r",
                                                     libver="latest")

            multiview = self.multiview_data[pid][scene_id]
            point_cloud = np.concatenate([point_cloud, multiview], 1)

        if self.use_height:
            floor_height = np.percentile(point_cloud[:, 2], 0.99)
            height = point_cloud[:, 2] - floor_height
            point_cloud = np.concatenate(
                [point_cloud, np.expand_dims(height, 1)], 1)

        # ------------------------------- LABELS ------------------------------
        target_bboxes = np.zeros((MAX_NUM_OBJ, 6))
        target_bboxes_mask = np.zeros((MAX_NUM_OBJ))
        angle_classes = np.zeros((MAX_NUM_OBJ, ))
        angle_residuals = np.zeros((MAX_NUM_OBJ, ))
        size_classes = np.zeros((MAX_NUM_OBJ, ))
        size_residuals = np.zeros((MAX_NUM_OBJ, 3))
        ref_box_label = np.zeros(
            MAX_NUM_OBJ)  # bbox label for reference target
        ref_center_label = np.zeros(3)  # bbox center for reference target
        ref_heading_class_label = 0
        ref_heading_residual_label = 0
        ref_size_class_label = 0
        ref_size_residual_label = np.zeros(
            3)  # bbox size residual for reference target

        point_cloud, choices = random_sampling(point_cloud,
                                               self.num_points,
                                               return_choices=True)
        instance_labels = instance_labels[choices]
        semantic_labels = semantic_labels[choices]
        pcl_color = pcl_color[choices]

        num_bbox = instance_bboxes.shape[
            0] if instance_bboxes.shape[0] < MAX_NUM_OBJ else MAX_NUM_OBJ
        target_bboxes_mask[0:num_bbox] = 1
        target_bboxes[0:num_bbox, :] = instance_bboxes[:MAX_NUM_OBJ, 0:6]

        # ------------------------------- DATA AUGMENTATION ------------------------------
        # (augmentation is disabled in this variant; Examples 1 and 3 show the
        # full flip / rotate / translate pipeline)

        # compute votes *AFTER* augmentation
        # generate votes
        # Note: since there's no map between bbox instance labels and
        # pc instance_labels (it was filtered out in the data preparation
        # step), we compute the instance bbox from the points sharing the
        # same instance label.
        point_votes = np.zeros([self.num_points, 3])
        point_votes_mask = np.zeros(self.num_points)
        for i_instance in np.unique(instance_labels):
            # find all points belonging to that instance
            ind = np.where(instance_labels == i_instance)[0]
            # find the semantic label
            if semantic_labels[ind[0]] in DC.nyu40ids:
                x = point_cloud[ind, :3]
                center = 0.5 * (x.min(0) + x.max(0))
                point_votes[ind, :] = center - x
                point_votes_mask[ind] = 1.0
        point_votes = np.tile(point_votes, (1, 3))  # make 3 votes identical

        class_ind = [
            DC.nyu40id2class[int(x)] for x in instance_bboxes[:num_bbox, -2]
        ]
        # NOTE: set size class as semantic class. Consider using size2class.
        size_classes[0:num_bbox] = class_ind
        size_residuals[0:num_bbox, :] = \
            target_bboxes[0:num_bbox, 3:6] - DC.mean_size_arr[class_ind,:]

        # construct the reference target label for each bbox
        ref_box_label = np.zeros(MAX_NUM_OBJ)
        for i, gt_id in enumerate(instance_bboxes[:num_bbox, -1]):
            if gt_id == object_id:
                ref_box_label[i] = 1
                ref_center_label = target_bboxes[i, 0:3]
                ref_heading_class_label = angle_classes[i]
                ref_heading_residual_label = angle_residuals[i]
                ref_size_class_label = size_classes[i]
                ref_size_residual_label = size_residuals[i]

        data_dict = {}
        data_dict['scan_name'] = scene_id
        data_dict["point_clouds"] = point_cloud.astype(
            np.float32)  # point cloud data including features
        data_dict["lang_feat"] = lang_feat.astype(
            np.float32)  # language feature vectors
        data_dict["lang_len"] = np.array(lang_len).astype(
            np.int64)  # length of each description
        data_dict["center_label"] = target_bboxes.astype(
            np.float32)[:, 0:3]  # (MAX_NUM_OBJ, 3) for GT box center XYZ
        data_dict["heading_class_label"] = angle_classes.astype(
            np.int64
        )  # (MAX_NUM_OBJ,) with int values in 0,...,NUM_HEADING_BIN-1
        data_dict["heading_residual_label"] = angle_residuals.astype(
            np.float32)  # (MAX_NUM_OBJ,)
        data_dict["size_class_label"] = size_classes.astype(
            np.int64
        )  # (MAX_NUM_OBJ,) with int values in 0,...,NUM_SIZE_CLUSTER
        data_dict["size_residual_label"] = size_residuals.astype(
            np.float32)  # (MAX_NUM_OBJ, 3)
        target_bboxes_semcls = np.zeros((MAX_NUM_OBJ))
        target_bboxes_semcls[0:num_bbox] = [
            DC.nyu40id2class[int(x)]
            for x in instance_bboxes[:, -2][0:num_bbox]
        ]
        data_dict["num_bbox"] = np.array(num_bbox).astype(np.int64)
        data_dict["sem_cls_label"] = target_bboxes_semcls.astype(
            np.int64)  # (MAX_NUM_OBJ,) semantic class index
        data_dict["box_label_mask"] = target_bboxes_mask.astype(
            np.float32)  # (MAX_NUM_OBJ) as 0/1 with 1 indicating a unique box
        data_dict["vote_label"] = point_votes.astype(np.float32)
        data_dict["vote_label_mask"] = point_votes_mask.astype(np.int64)
        data_dict["scan_idx"] = np.array(idx).astype(np.int64)
        data_dict["pcl_color"] = pcl_color
        data_dict["ref_box_label"] = ref_box_label.astype(
            np.int64)  # 0/1 reference labels for each object bbox
        data_dict["ref_box_label"] = ref_box_label.astype(
            np.int64)  # 0/1 reference labels for each object bbox
        data_dict["ref_center_label"] = ref_center_label.astype(np.float32)
        data_dict["ref_heading_class_label"] = np.array(
            int(ref_heading_class_label)).astype(np.int64)
        data_dict["ref_heading_residual_label"] = np.array(
            int(ref_heading_residual_label)).astype(np.int64)
        data_dict["ref_size_class_label"] = np.array(
            int(ref_size_class_label)).astype(np.int64)
        data_dict["ref_size_residual_label"] = ref_size_residual_label.astype(
            np.float32)
        data_dict["object_id"] = np.array(int(object_id)).astype(np.int64)
        data_dict["ann_id"] = np.array(int(ann_id)).astype(np.int64)
        data_dict["object_cat"] = np.array(self.raw2label[object_name]).astype(
            np.int64)
        data_dict["pcl_color"] = pcl_color
        data_dict["load_time"] = time.time() - start

        return data_dict
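
Example 2 keys the open multiview HDF5 handle by process id: an h5py file handle cannot safely be shared across the worker processes a PyTorch DataLoader forks, so each worker lazily opens its own. A standalone sketch of the same pattern (the MULTIVIEW_DATA path comes from the example; the module-level cache dict is an assumption):

import multiprocessing as mp
import h5py

_multiview_handles = {}

def get_multiview(path, scene_id):
    # one read-only handle per worker process; a handle opened in the
    # parent process must not be reused after a DataLoader fork
    pid = mp.current_process().pid
    if pid not in _multiview_handles:
        _multiview_handles[pid] = h5py.File(path, "r", libver="latest")
    return _multiview_handles[pid][scene_id]
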
Example 3
    def __getitem__(self, idx):
        start = time.time()
        scene_id = self.scanrefer[idx]["scene_id"]
        object_id = int(self.scanrefer[idx]["object_id"])
        object_name = " ".join(self.scanrefer[idx]["object_name"].split("_"))
        ann_id = self.scanrefer[idx]["ann_id"]

        # get language features
        lang_feat = self.lang[scene_id][str(object_id)][ann_id]
        lang_len = len(self.scanrefer[idx]["token"]) + 2
        lang_len = lang_len if lang_len <= CONF.TRAIN.MAX_DES_LEN + 2 else CONF.TRAIN.MAX_DES_LEN + 2

        # get pc
        mesh_vertices = self.scene_data[scene_id]["mesh_vertices"]
        instance_labels = self.scene_data[scene_id]["instance_labels"]
        semantic_labels = self.scene_data[scene_id]["semantic_labels"]
        instance_bboxes = self.scene_data[scene_id]["instance_bboxes"]

        if not self.use_color:
            point_cloud = mesh_vertices[:, 0:3]  # do not use color for now
            pcl_color = mesh_vertices[:, 3:6]
        else:
            point_cloud = mesh_vertices[:, 0:6].copy()  # copy so the cached scene data is not modified in place
            point_cloud[:,
                        3:6] = (point_cloud[:, 3:6] - MEAN_COLOR_RGB) / 256.0
            pcl_color = point_cloud[:, 3:6]

        if self.use_normal:
            normals = mesh_vertices[:, 6:9]
            point_cloud = np.concatenate([point_cloud, normals], 1)

        if self.use_multiview:
            # load multiview database
            if self.multiview_data == {}:
                self.multiview_data = h5py.File(MULTIVIEW_DATA,
                                                "r",
                                                libver="latest")

            multiview = self.multiview_data[scene_id]
            point_cloud = np.concatenate([point_cloud, multiview], 1)

        if self.use_height:
            floor_height = np.percentile(point_cloud[:, 2], 0.99)
            height = point_cloud[:, 2] - floor_height
            point_cloud = np.concatenate(
                [point_cloud, np.expand_dims(height, 1)], 1)

        point_cloud, choices = random_sampling(point_cloud,
                                               self.num_points,
                                               return_choices=True)
        instance_labels = instance_labels[choices]
        semantic_labels = semantic_labels[choices]
        pcl_color = pcl_color[choices]

        # ------------------------------- LABELS ------------------------------
        target_bboxes = np.zeros((MAX_NUM_OBJ, 6))
        target_bboxes_mask = np.zeros((MAX_NUM_OBJ))
        angle_classes = np.zeros((MAX_NUM_OBJ, ))
        angle_residuals = np.zeros((MAX_NUM_OBJ, ))
        size_classes = np.zeros((MAX_NUM_OBJ, ))
        size_residuals = np.zeros((MAX_NUM_OBJ, 3))

        ref_box_label = np.zeros(
            MAX_NUM_OBJ)  # bbox label for reference target
        ref_center_label = np.zeros(3)  # bbox center for reference target
        ref_heading_class_label = 0
        ref_heading_residual_label = 0
        ref_size_class_label = 0
        ref_size_residual_label = np.zeros(
            3)  # bbox size residual for reference target
        ref_box_corner_label = np.zeros((8, 3))

        if self.split != "test":
            num_bbox = instance_bboxes.shape[
                0] if instance_bboxes.shape[0] < MAX_NUM_OBJ else MAX_NUM_OBJ
            target_bboxes_mask[0:num_bbox] = 1
            target_bboxes[0:num_bbox, :] = instance_bboxes[:MAX_NUM_OBJ, 0:6]

            point_votes = np.zeros([self.num_points, 3])
            point_votes_mask = np.zeros(self.num_points)

            # ------------------------------- DATA AUGMENTATION ------------------------------
            if self.augment:
                if np.random.random() > 0.5:
                    # Flipping along the YZ plane
                    point_cloud[:, 0] = -1 * point_cloud[:, 0]
                    target_bboxes[:, 0] = -1 * target_bboxes[:, 0]

                if np.random.random() > 0.5:
                    # Flipping along the XZ plane
                    point_cloud[:, 1] = -1 * point_cloud[:, 1]
                    target_bboxes[:, 1] = -1 * target_bboxes[:, 1]

                # Rotation along X-axis
                rot_angle = (np.random.random() * np.pi /
                             18) - np.pi / 36  # -5 ~ +5 degree
                rot_mat = rotx(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                             np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(
                    target_bboxes, rot_mat, "x")

                # Rotation along Y-axis
                rot_angle = (np.random.random() * np.pi /
                             18) - np.pi / 36  # -5 ~ +5 degree
                rot_mat = roty(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                             np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(
                    target_bboxes, rot_mat, "y")

                # Rotation along up-axis/Z-axis
                rot_angle = (np.random.random() * np.pi /
                             18) - np.pi / 36  # -5 ~ +5 degree
                rot_mat = rotz(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                             np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(
                    target_bboxes, rot_mat, "z")

                # Translation
                point_cloud, target_bboxes = self._translate(
                    point_cloud, target_bboxes)

            # compute votes *AFTER* augmentation
            # generate votes
            # Note: since there's no map between bbox instance labels and
            # pc instance_labels (it was filtered out in the data preparation
            # step), we compute the instance bbox from the points sharing the
            # same instance label.
            for i_instance in np.unique(instance_labels):
                # find all points belonging to that instance
                ind = np.where(instance_labels == i_instance)[0]
                # find the semantic label
                if semantic_labels[ind[0]] in DC.nyu40ids:
                    x = point_cloud[ind, :3]
                    center = 0.5 * (x.min(0) + x.max(0))
                    point_votes[ind, :] = center - x
                    point_votes_mask[ind] = 1.0
            point_votes = np.tile(point_votes,
                                  (1, 3))  # make 3 votes identical

            class_ind = [
                DC.nyu40id2class[int(x)]
                for x in instance_bboxes[:num_bbox, -2]
            ]
            # NOTE: set size class as semantic class. Consider using size2class.
            size_classes[0:num_bbox] = class_ind
            size_residuals[0:num_bbox, :] = target_bboxes[
                0:num_bbox, 3:6] - DC.mean_size_arr[class_ind, :]

            # construct the reference target label for each bbox
            ref_box_label = np.zeros(MAX_NUM_OBJ)
            for i, gt_id in enumerate(instance_bboxes[:num_bbox, -1]):
                if gt_id == object_id:
                    ref_box_label[i] = 1
                    ref_center_label = target_bboxes[i, 0:3]
                    ref_heading_class_label = angle_classes[i]
                    ref_heading_residual_label = angle_residuals[i]
                    ref_size_class_label = size_classes[i]
                    ref_size_residual_label = size_residuals[i]

                    # construct ground truth box corner coordinates
                    ref_obb = DC.param2obb(ref_center_label,
                                           ref_heading_class_label,
                                           ref_heading_residual_label,
                                           ref_size_class_label,
                                           ref_size_residual_label)
                    ref_box_corner_label = get_3d_box(ref_obb[3:6], ref_obb[6],
                                                      ref_obb[0:3])

            # construct all GT bbox corners
            all_obb = DC.param2obb_batch(
                target_bboxes[:num_bbox,
                              0:3], angle_classes[:num_bbox].astype(np.int64),
                angle_residuals[:num_bbox],
                size_classes[:num_bbox].astype(np.int64),
                size_residuals[:num_bbox])
            all_box_corner_label = get_3d_box_batch(all_obb[:, 3:6],
                                                    all_obb[:, 6],
                                                    all_obb[:, 0:3])

            # store
            gt_box_corner_label = np.zeros((MAX_NUM_OBJ, 8, 3))
            gt_box_masks = np.zeros((MAX_NUM_OBJ, ))
            gt_box_object_ids = np.zeros((MAX_NUM_OBJ, ))

            gt_box_corner_label[:num_bbox] = all_box_corner_label
            gt_box_masks[:num_bbox] = 1
            gt_box_object_ids[:num_bbox] = instance_bboxes[:num_bbox, -1]
        else:
            num_bbox = 1
            point_votes = np.zeros([self.num_points,
                                    9])  # make 3 votes identical
            point_votes_mask = np.zeros(self.num_points)
            # initialize the GT-box outputs that the train branch fills in
            gt_box_corner_label = np.zeros((MAX_NUM_OBJ, 8, 3))
            gt_box_masks = np.zeros((MAX_NUM_OBJ, ))
            gt_box_object_ids = np.zeros((MAX_NUM_OBJ, ))

        target_bboxes_semcls = np.zeros((MAX_NUM_OBJ))
        target_object_ids = np.zeros(
            (MAX_NUM_OBJ, ))  # object ids of all objects
        try:
            target_bboxes_semcls[0:num_bbox] = [
                DC.nyu40id2class[int(x)]
                for x in instance_bboxes[:, -2][0:num_bbox]
            ]
            target_object_ids[0:num_bbox] = instance_bboxes[:, -1][0:num_bbox]
        except KeyError:
            pass

        object_cat = self.raw2label[
            object_name] if object_name in self.raw2label else 17

        data_dict = {}
        data_dict["point_clouds"] = point_cloud.astype(
            np.float32
        )  # point cloud data including features    [B,max_num_points,3]
        data_dict["lang_feat"] = lang_feat.astype(
            np.float32)  # language feature vectors     [B,32,300]
        data_dict["lang_len"] = np.array(lang_len).astype(
            np.int64)  # length of each description    [B]
        data_dict["lang_ids"] = np.array(
            self.lang_ids[scene_id][str(object_id)][ann_id]).astype(
                np.int64)  #     [B,32,300]
        #all data with MAX_NUM_OBJ are mostly filled with zeros
        data_dict["center_label"] = target_bboxes.astype(
            np.float32
        )[:, 0:3]  # (MAX_NUM_OBJ, 3) for GT box center XYZ  # [B,128,3]
        data_dict["heading_class_label"] = angle_classes.astype(
            np.int64
        )  # (MAX_NUM_OBJ,) with int values in 0,...,NUM_HEADING_BIN-1  [B,128]
        data_dict["heading_residual_label"] = angle_residuals.astype(
            np.float32)  # (MAX_NUM_OBJ,) [B,128]
        data_dict["size_class_label"] = size_classes.astype(
            np.int64
        )  # (MAX_NUM_OBJ,) with int values in 0,...,NUM_SIZE_CLUSTER  [B,128]
        data_dict["size_residual_label"] = size_residuals.astype(
            np.float32)  # (MAX_NUM_OBJ, 3) [B,128,3]
        data_dict["num_bbox"] = np.array(num_bbox).astype(np.int64)  # [B]
        data_dict["sem_cls_label"] = target_bboxes_semcls.astype(
            np.int64)  # (MAX_NUM_OBJ,) semantic class index
        data_dict["scene_object_ids"] = target_object_ids.astype(
            np.int64)  # (MAX_NUM_OBJ,) object ids of all objects
        data_dict["box_label_mask"] = target_bboxes_mask.astype(
            np.float32)  # (MAX_NUM_OBJ) as 0/1 with 1 indicating a unique box
        data_dict["vote_label"] = point_votes.astype(np.float32)  # [B,40000,9]
        data_dict["vote_label_mask"] = point_votes_mask.astype(
            np.int64)  # [B,40000]
        data_dict["dataset_idx"] = np.array(idx).astype(
            np.int64)  # [B] object indices from self.scanrefer
        data_dict["pcl_color"] = pcl_color
        data_dict["ref_box_label"] = ref_box_label.astype(
            np.int64)  # 0/1 reference labels for each object bbox
        data_dict["ref_center_label"] = ref_center_label.astype(np.float32)
        data_dict["ref_heading_class_label"] = np.array(
            int(ref_heading_class_label)).astype(np.int64)
        data_dict["ref_heading_residual_label"] = np.array(
            int(ref_heading_residual_label)).astype(np.int64)
        data_dict["ref_size_class_label"] = np.array(
            int(ref_size_class_label)).astype(np.int64)
        data_dict["ref_size_residual_label"] = ref_size_residual_label.astype(
            np.float32)
        data_dict["ref_box_corner_label"] = ref_box_corner_label.astype(
            np.float64)  # target box corners NOTE type must be
        data_dict["gt_box_corner_label"] = gt_box_corner_label.astype(
            np.float64)  # all GT box corners NOTE type must be double
        data_dict["gt_box_masks"] = gt_box_masks.astype(
            np.int64)  # valid bbox masks
        data_dict["gt_box_object_ids"] = gt_box_object_ids.astype(
            np.int64)  # valid bbox object ids
        data_dict["object_id"] = np.array(int(object_id)).astype(
            np.int64)  # [B] target object_ids
        data_dict["ann_id"] = np.array(int(ann_id)).astype(np.int64)  # [B]
        data_dict["object_cat"] = np.array(object_cat).astype(
            np.int64)  # [B] target object classes
        data_dict["unique_multiple"] = np.array(
            self.unique_multiple_lookup[scene_id][str(
                object_id)][ann_id]).astype(np.int64)
        data_dict["pcl_color"] = pcl_color  # [B,40000,3]
        data_dict["load_time"] = time.time() - start

        return data_dict
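
Example 3 turns box parameters into corner labels with get_3d_box / get_3d_box_batch, which are not listed. Below is a sketch of the single-box version, assuming the usual VoteNet-style signature (box_size, heading_angle, center) and a z-up axis convention; the actual utility may order axes differently.

import numpy as np

def get_3d_box(box_size, heading_angle, center):
    # (8, 3) corners of a box rotated by heading_angle about the up axis
    l, w, h = box_size
    c, s = np.cos(heading_angle), np.sin(heading_angle)
    rot = np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]])
    x = np.array([1, 1, -1, -1, 1, 1, -1, -1]) * l / 2
    y = np.array([1, -1, -1, 1, 1, -1, -1, 1]) * w / 2
    z = np.array([1, 1, 1, 1, -1, -1, -1, -1]) * h / 2
    corners = rot @ np.vstack([x, y, z])  # (3, 8)
    return corners.T + center
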
Example 4
    def _preprocess_sample(self, data):

        ### Get the data information in annotation item.
        scene_id = data['scene_id']
        object_id = int(data['object_id'])
        object_name = " ".join(data["object_name"].split("_"))
        ann_id = int(data["ann_id"])

        ### Get the referring expression
        description = data["indexed_token"]
        if len(description) > self.max_len:
            description = description[:self.max_len]
        else:
            description = description + [self.pad_token
                                         ] * (self.max_len - len(description))
        description = [self.sos_token] + description + [self.eos_token]

        original_description = data['description']

        ### Get the original annotation data.
        mesh_vertices = self.scene_data[scene_id]["mesh_vertices"]
        instance_labels = self.scene_data[scene_id]["instance_labels"]
        semantic_labels = self.scene_data[scene_id]["semantic_labels"]
        instance_bboxes = self.scene_data[scene_id]["instance_bboxes"]

        ### Get point cloud data
        if not self.use_color:
            point_cloud = mesh_vertices[:, 0:3]  # do not use color for now
            pcl_color = mesh_vertices[:, 3:6]
        else:
            point_cloud = mesh_vertices[:, 0:6].copy()  # copy so the cached scene data is not modified in place

            # Point cloud centering
            point_cloud[:, :3] = point_cloud[:, :3] - point_cloud[:, :3].mean(
                axis=0, keepdims=True)

            # Point cloud RGB scaling
            point_cloud[:, 3:] = (point_cloud[:, 3:] -
                                  self.cfg.TRAINING.MEAN_COLOR_RGB) / 255.0
            # point_cloud[:,3:] = point_cloud[:,3:] * 2.7 / 255.0

            pcl_color = point_cloud[:, 3:]

        ### Sampling points
        point_cloud, choices = random_sampling(point_cloud,
                                               self.num_points,
                                               return_choices=True)
        instance_labels = instance_labels[choices]
        semantic_labels = semantic_labels[choices]
        pcl_color = pcl_color[choices]

        ### Specify the number of box we need to predict and create a mask for it.
        # num_bbox = instance_bboxes.shape[0] if instance_bboxes.shape[0] < self.max_num_obj else self.max_num_obj
        # target_bboxes_mask = np.zeros((self.max_num_obj))
        # target_bboxes_mask[0:num_bbox] = 1
        # target_bboxes = instance_bboxes[:num_bbox, 0:6]
        target_bboxes = instance_bboxes.copy()  # copy: the in-place flips below must not touch the cached boxes

        ### Data augmentation (*Warning: after augmentation, target_bboxes keeps only the 6 geometric columns; the id columns are re-attached below)
        if self.augment and not self.debug:
            if np.random.random() > 0.5:
                # Flipping along the YZ plane
                point_cloud[:, 0] = -1 * point_cloud[:, 0]
                target_bboxes[:, 0] = -1 * target_bboxes[:, 0]

            if np.random.random() > 0.5:
                # Flipping along the XZ plane
                point_cloud[:, 1] = -1 * point_cloud[:, 1]
                target_bboxes[:, 1] = -1 * target_bboxes[:, 1]

            # Rotation along X-axis
            rot_angle = (np.random.random() * np.pi /
                         18) - np.pi / 36  # -5 ~ +5 degree
            rot_mat = rotx(rot_angle)
            point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                         np.transpose(rot_mat))
            target_bboxes = rotate_aligned_boxes_along_axis(
                target_bboxes, rot_mat, "x")

            # Rotation along Y-axis
            rot_angle = (np.random.random() * np.pi /
                         18) - np.pi / 36  # -5 ~ +5 degree
            rot_mat = roty(rot_angle)
            point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                         np.transpose(rot_mat))
            target_bboxes = rotate_aligned_boxes_along_axis(
                target_bboxes, rot_mat, "y")

            # Rotation along up-axis/Z-axis
            rot_angle = (np.random.random() * np.pi /
                         18) - np.pi / 36  # -5 ~ +5 degree
            rot_mat = rotz(rot_angle)
            point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                         np.transpose(rot_mat))
            target_bboxes = rotate_aligned_boxes_along_axis(
                target_bboxes, rot_mat, "z")

            # Translation
            point_cloud, target_bboxes = self._translate(
                point_cloud, target_bboxes)

            # re-attach the semantic/instance id columns that the rotation helpers drop
            target_bboxes = np.concatenate([target_bboxes, instance_bboxes],
                                           axis=1)

        ### Build up referred targets' labels
        sample = {}
        for bbox in target_bboxes:
            if int(bbox[-1]) == object_id:
                gt_instance_id = bbox[-1]
                gt_semantic_id = bbox[-2]

                x_min = bbox[0] - bbox[3] / 2
                x_max = bbox[0] + bbox[3] / 2
                y_min = bbox[1] - bbox[4] / 2
                y_max = bbox[1] + bbox[4] / 2
                z_min = bbox[2] - bbox[5] / 2
                z_max = bbox[2] + bbox[5] / 2

                sample['point_cloud'] = point_cloud
                sample['object_name'] = object_name
                sample['corners'] = np.array(
                    [x_min, y_min, z_min, x_max, y_max,
                     z_max]).astype(np.float32)
                sample['class_id'] = float(bbox[-2])

                instance_seg = np.zeros_like(instance_labels)
                instance_seg[instance_labels == gt_instance_id] = 1
                sample['instance_seg'] = instance_seg.astype(np.float32)
                sample['description'] = np.array(description).astype(
                    np.float32)
                sample['original_description'] = original_description

        return sample
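
The augmentation blocks in Examples 1, 3, 4 and 5 rotate the cloud with rotx / roty / rotz. These are the standard elementary rotation matrices; a minimal sketch:

import numpy as np

def rotx(t):
    # rotation about the x-axis by t radians
    c, s = np.cos(t), np.sin(t)
    return np.array([[1, 0, 0], [0, c, -s], [0, s, c]])

def roty(t):
    # rotation about the y-axis
    c, s = np.cos(t), np.sin(t)
    return np.array([[c, 0, s], [0, 1, 0], [-s, 0, c]])

def rotz(t):
    # rotation about the z-axis (the up axis in these scenes)
    c, s = np.cos(t), np.sin(t)
    return np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]])
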
Example 5
    def __getitem__(self, idx):
        scene_id = self.scanrefer[idx]["scene_id"]
        object_id = int(self.scanrefer[idx]["object_id"])
        object_name = " ".join(self.scanrefer[idx]["object_name"].split("_"))
        ann_id = int(self.scanrefer[idx]["ann_id"])
        object_cat = self.raw2label[
            object_name] if object_name in self.raw2label else 17

        # tokenize the description
        tokens = self.scanrefer[idx]["token"]
        embeddings = np.zeros((CONF.TRAIN.MAX_DES_LEN, 300))

        for token_id in range(CONF.TRAIN.MAX_DES_LEN):
            if token_id < len(tokens):
                token = tokens[token_id]
                if token.isspace():
                    continue
                if token in self.glove:
                    embeddings[token_id] = self.glove[token]
                else:
                    embeddings[token_id] = self.glove["unk"]
            else:
                break

        # get language features
        lang_feat = embeddings
        lang_token = tokens
        lang_len = len([token for token in lang_token if not token.isspace()])
        lang_len = lang_len if lang_len <= CONF.TRAIN.MAX_DES_LEN else CONF.TRAIN.MAX_DES_LEN

        # get pc
        mesh_vertices = np.load(
            os.path.join(CONF.PATH.SCANNET_DATA, scene_id) +
            "_aligned_vert.npy")  # axis-aligned
        instance_labels = np.load(
            os.path.join(CONF.PATH.SCANNET_DATA, scene_id) +
            "_ins_label_pg.npy")
        semantic_labels = np.load(
            os.path.join(CONF.PATH.SCANNET_DATA, scene_id) +
            "_sem_label_pg.npy")
        instance_bboxes = np.load(
            os.path.join(CONF.PATH.SCANNET_DATA, scene_id) +
            "_aligned_bbox.npy")

        if not self.use_color:
            point_cloud = mesh_vertices[:, 0:3]  # do not use color for now
            pcl_color = mesh_vertices[:, 3:6]
        else:
            point_cloud = mesh_vertices[:, 0:6]
            point_cloud[:,
                        3:6] = (point_cloud[:, 3:6] - MEAN_COLOR_RGB) / 256.0
            pcl_color = point_cloud[:, 3:6]

        if self.use_normal:
            normals = mesh_vertices[:, 6:9]
            point_cloud = np.concatenate([point_cloud, normals], 1)

        if self.use_multiview:
            # load multiview database
            if not hasattr(self, 'multiview_data'):
                self.multiview_data = h5py.File(MULTIVIEW_DATA,
                                                "r",
                                                libver="latest",
                                                swmr=True)

            multiview = np.array(self.multiview_data[scene_id])
            point_cloud = np.concatenate([point_cloud, multiview], 1)

        if self.use_height:
            floor_height = np.percentile(point_cloud[:, 2], 0.99)
            height = point_cloud[:, 2] - floor_height
            point_cloud = np.concatenate(
                [point_cloud, np.expand_dims(height, 1)], 1)

        point_cloud, choices = random_sampling(point_cloud,
                                               self.num_points,
                                               return_choices=True)
        instance_labels = instance_labels[choices]
        semantic_labels = semantic_labels[choices]
        pcl_color = pcl_color[choices]

        # ------------------------------- LABELS ------------------------------
        target_bboxes = np.zeros((MAX_NUM_OBJ, 6))
        target_bboxes_mask = np.zeros((MAX_NUM_OBJ))
        angle_classes = np.zeros((MAX_NUM_OBJ, ))
        angle_residuals = np.zeros((MAX_NUM_OBJ, ))
        size_classes = np.zeros((MAX_NUM_OBJ, ))
        size_residuals = np.zeros((MAX_NUM_OBJ, 3))
        ref_box_label = np.zeros(
            MAX_NUM_OBJ)  # bbox label for reference target
        ref_center_label = np.zeros(3)  # bbox center for reference target
        ref_heading_class_label = 0
        ref_heading_residual_label = 0
        ref_size_class_label = 0
        ref_size_residual_label = np.zeros(
            3)  # bbox size residual for reference target
        scene_points = np.zeros((1, 10))

        if self.split != "test":
            num_bbox = instance_bboxes.shape[
                0] if instance_bboxes.shape[0] < MAX_NUM_OBJ else MAX_NUM_OBJ
            target_bboxes_mask[0:num_bbox] = 1
            target_bboxes[0:num_bbox, :] = instance_bboxes[:MAX_NUM_OBJ, 0:6]

            # ------------------------------- DATA AUGMENTATION ------------------------------
            if self.augment:
                if torch.rand(1).item() > 0.5:
                    # Flipping along the YZ plane
                    point_cloud[:, 0] = -1 * point_cloud[:, 0]
                    target_bboxes[:, 0] = -1 * target_bboxes[:, 0]

                if torch.rand(1).item() > 0.5:
                    # Flipping along the XZ plane
                    point_cloud[:, 1] = -1 * point_cloud[:, 1]
                    target_bboxes[:, 1] = -1 * target_bboxes[:, 1]

                # Rotation along X-axis
                rot_angle = (torch.rand(1).item() * np.pi /
                             18) - np.pi / 36  # -5 ~ +5 degree
                rot_mat = rotx(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                             np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(
                    target_bboxes, rot_mat, "x")

                # Rotation along Y-axis
                rot_angle = (torch.rand(1).item() * np.pi /
                             18) - np.pi / 36  # -5 ~ +5 degree
                rot_mat = roty(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                             np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(
                    target_bboxes, rot_mat, "y")

                # Rotation along up-axis/Z-axis
                rot_angle = (torch.rand(1).item() * np.pi /
                             18) - np.pi / 36  # -5 ~ +5 degree
                rot_mat = rotz(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                             np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(
                    target_bboxes, rot_mat, "z")

                # Translation
                point_cloud, target_bboxes = self._translate(
                    point_cloud, target_bboxes)

            # NOTE: set size class as semantic class. Consider using size2class.
            class_ind = [
                DC.nyu40id2class[int(x)]
                for x in instance_bboxes[:num_bbox, -2]
            ]
            size_classes[0:num_bbox] = class_ind
            size_residuals[0:num_bbox, :] = target_bboxes[
                0:num_bbox, 3:6] - DC.mean_size_arr[class_ind, :]

            # construct the reference target label for each bbox
            ref_box_label = np.zeros(MAX_NUM_OBJ)
            for i, gt_id in enumerate(instance_bboxes[:num_bbox, -1]):
                if gt_id == object_id:
                    ref_box_label[i] = 1
                    ref_center_label = target_bboxes[i, 0:3]
                    ref_heading_class_label = angle_classes[i]
                    ref_heading_residual_label = angle_residuals[i]
                    ref_size_class_label = size_classes[i]
                    ref_size_residual_label = size_residuals[i]
        else:
            num_bbox = 1

        # gather per-instance point sets, classes and boxes
        instance_points = []
        instance_class = []
        ref_target = []
        ins_obbs = []
        pts_batch = []
        pred_obbs = []
        for i_instance in np.unique(instance_labels):

            # find all points belonging to that instance
            ind = np.nonzero(instance_labels == i_instance)[0]

            # find the semantic label
            ins_class = semantic_labels[ind[0]]
            if ins_class in DC.nyu40ids:
                x = point_cloud[ind]
                ins_class = DC.nyu40id2class[int(ins_class)]
                instance_class.append(ins_class)

                pc = x[:, :3]
                center = 0.5 * (pc.min(0) + pc.max(0))
                size = pc.max(0) - pc.min(0)
                ins_obb = np.concatenate((center, size, np.array([0])))
                ins_obbs.append(ins_obb)
                x = random_sampling(x, 1024)
                instance_points.append(x)

                if ins_class == object_cat:
                    pc = x[:, :3]
                    coords, feats = sparse_quantize(
                        pc, x, quantization_size=self.voxel_size_ap)
                    pt_inst = SparseTensor(feats, coords)

                    if len(ins_obb) < 2:
                        continue

                    pred_obbs.append(ins_obb)
                    pts_batch.append(pt_inst)

                if i_instance == (object_id + 1):  # instance labels appear to be 1-indexed w.r.t. object ids
                    ref_target.append(1)
                else:
                    ref_target.append(0)
            else:
                scene_points = point_cloud[ind]

        target_bboxes_semcls = np.zeros((MAX_NUM_OBJ))
        try:
            target_bboxes_semcls[0:num_bbox] = [
                DC.nyu40id2class[int(x)]
                for x in instance_bboxes[:, -2][0:num_bbox]
            ]
        except KeyError:
            pass

        pc = point_cloud[:, :3]
        # voxelize the whole scene for the global sparse branch
        coords, feats = sparse_quantize(pc,
                                        point_cloud,
                                        quantization_size=self.voxel_size_glp)
        pt = SparseTensor(feats, coords)

        data_dict = {}
        data_dict['lidar'] = pt
        data_dict['pts_batch'] = pts_batch
        data_dict['pred_obb_batch'] = pred_obbs
        data_dict['scene_points'] = [scene_points]
        data_dict['point_min'] = point_cloud.min(0)[:3]
        data_dict['point_max'] = point_cloud.max(0)[:3]
        data_dict['instance_labels'] = instance_labels.astype(np.int64)
        data_dict['instance_points'] = instance_points
        data_dict['instance_class'] = instance_class
        data_dict['instance_obbs'] = ins_obbs
        data_dict["point_clouds"] = point_cloud.astype(
            np.float32)  # point cloud data including features
        data_dict["lang_feat"] = lang_feat.astype(
            np.float32)  # language feature vectors
        data_dict["lang_token"] = lang_token
        data_dict["lang_len"] = np.array(lang_len).astype(
            np.int64)  # length of each description
        data_dict["center_label"] = target_bboxes.astype(
            np.float32)[:, 0:3]  # (MAX_NUM_OBJ, 3) for GT box center XYZ
        data_dict["heading_class_label"] = angle_classes.astype(
            np.int64
        )  # (MAX_NUM_OBJ,) with int values in 0,...,NUM_HEADING_BIN-1
        data_dict["heading_residual_label"] = angle_residuals.astype(
            np.float32)  # (MAX_NUM_OBJ,)
        data_dict["size_class_label"] = size_classes.astype(
            np.int64
        )  # (MAX_NUM_OBJ,) with int values in 0,...,NUM_SIZE_CLUSTER
        data_dict["size_residual_label"] = size_residuals.astype(
            np.float32)  # (MAX_NUM_OBJ, 3)
        data_dict["num_bbox"] = np.array(num_bbox).astype(np.int64)
        data_dict["scan_idx"] = np.array(idx).astype(np.int64)
        data_dict["pcl_color"] = pcl_color
        data_dict["ref_box_label"] = ref_box_label.astype(
            np.int64)  # 0/1 reference labels for each object bbox
        data_dict["ref_center_label"] = ref_center_label.astype(np.float32)
        data_dict["ref_heading_class_label"] = np.array(
            int(ref_heading_class_label)).astype(np.int64)
        data_dict["ref_heading_residual_label"] = np.array(
            int(ref_heading_residual_label)).astype(np.int64)
        data_dict["ref_size_class_label"] = np.array(
            int(ref_size_class_label)).astype(np.int64)
        data_dict["ref_size_residual_label"] = ref_size_residual_label.astype(
            np.float32)
        data_dict["object_id"] = np.array(int(object_id)).astype(np.int64)
        data_dict["ann_id"] = np.array(ann_id).astype(np.int64)
        data_dict["object_cat"] = np.array(object_cat).astype(np.int64)
        data_dict["unique_multiple"] = np.array(
            self.unique_multiple_lookup[scene_id][str(object_id)][str(
                ann_id)]).astype(np.int64)

        return data_dict
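
Examples 1, 3, 4 and 5 finish augmentation with self._translate, which is not listed. A plausible sketch, assuming a small random global shift applied jointly to the points and the box centers (the +/-0.5 range is an assumption):

import numpy as np

def _translate(self, point_set, bbox):
    # shift the whole scene and the box centers by one shared random offset
    factor = np.random.uniform(-0.5, 0.5, size=3)  # assumed range
    point_set[:, :3] += factor
    bbox[:, :3] += factor
    return point_set, bbox
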
    def trainMerge(self, id): 
        locs = []
        locs_float = []
        feats = []
        labels = []
        instance_labels = []

        instance_infos = []  # (N, 9)
        instance_pointnum = []  # (total_nInst), int

        target_instance_labels = []
        target_instance_pointnum = []

        lang_feats = []
        lang_lens = []
        lang_ids = []

        ann_ids = []
        object_ids = []
        object_classes = []

        batch_offsets = [0]

        total_inst_num = 0
        for i, idx in enumerate(id):

            #get object 
            scene_id = self.train_data[idx]["scene_id"]
            object_id = int(self.train_data[idx]["object_id"])
            object_name = " ".join(self.train_data[idx]["object_name"].split("_"))
            ann_id = self.train_data[idx]["ann_id"]

            #get language features
            lang_feat = self.lang[scene_id][str(object_id)][ann_id]
            lang_len = len(self.train_data[idx]["token"]) + 2
            lang_len = lang_len if lang_len <= cfg.TRAIN_MAX_DES_LEN + 2 else cfg.TRAIN_MAX_DES_LEN + 2

            #get scene data
            data_file = os.path.join(self.data_root,self.dataset,'{}_pointgroup.pth'.format(scene_id))
            xyz_origin, rgb, label, instance_label = torch.load(data_file)

            #instance_bboxes = np.load(os.path.join(self.data_root,self.dataset + '_votenet',scene_id)+'_aligned_bbox.npy')

            ### jitter / flip x / rotation
            xyz_middle = self.dataAugment(xyz_origin, True, True, True)

            ### scale
            xyz = xyz_middle * self.scale

            ### elastic
            xyz = self.elastic(xyz, 6 * self.scale // 50, 40 * self.scale / 50)
            xyz = self.elastic(xyz, 20 * self.scale // 50, 160 * self.scale / 50)

            ### offset
            xyz -= xyz.min(0)

            ### crop
            xyz, valid_idxs = random_sampling(xyz, self.max_npoint, return_choices=True)

            xyz_middle = xyz_middle[valid_idxs]
            rgb = rgb[valid_idxs]
            label = label[valid_idxs]
            instance_label = self.getCroppedInstLabel(instance_label, valid_idxs)

            ### get instance information
            inst_num, inst_infos, target_inst_pointnum = self.getInstanceInfo(xyz_middle, instance_label.astype(np.int32),object_id)
            inst_info = inst_infos["instance_info"]  # (n, 9), (cx, cy, cz, minx, miny, minz, maxx, maxy, maxz)
            inst_pointnum = inst_infos["instance_pointnum"]   # (nInst), list

            instance_label[np.where(instance_label != -100)] += total_inst_num

            #get target object information
            target_instance_id = object_id + total_inst_num
            target_instance_label = np.where(instance_label == target_instance_id, instance_label, -100) #only keep captioning target

            total_inst_num += inst_num

            ### merge the scene to the batch
            ann_ids.append(int(ann_id))
            object_ids.append(int(object_id))
            batch_offsets.append(batch_offsets[-1] + xyz.shape[0])

            locs.append(torch.cat([torch.LongTensor(xyz.shape[0], 1).fill_(i), torch.from_numpy(xyz).long()], 1))
            locs_float.append(torch.from_numpy(xyz_middle))
            
            feat = torch.from_numpy(rgb) + torch.randn(3) * 0.1  # additive color jitter
            if self.use_multiview:
                multiview = torch.from_numpy(self.multiview_data[scene_id][:])[valid_idxs]
                feat = torch.cat([feat,multiview],1)

            feats.append(feat)

            labels.append(torch.from_numpy(label))
            instance_labels.append(torch.from_numpy(instance_label))

            instance_infos.append(torch.from_numpy(inst_info))
            instance_pointnum.extend(inst_pointnum)

            target_instance_labels.append(torch.from_numpy(target_instance_label))
            target_instance_pointnum.append(target_inst_pointnum)

            lang_feats.append(torch.from_numpy(lang_feat).unsqueeze(0))
            lang_lens.append(lang_len)
            lang_ids.append(torch.from_numpy(self.lang_ids[scene_id][str(object_id)][ann_id]).unsqueeze(0))

        ### merge all the scenes in the batch
        ann_ids = torch.tensor(ann_ids, dtype=torch.long)
        object_ids = torch.tensor(object_ids, dtype=torch.long)
        batch_offsets = torch.tensor(batch_offsets, dtype=torch.int)  # int (B+1)

        locs = torch.cat(locs, 0)                                # long (N, 1 + 3), the batch item idx is put in locs[:, 0]
        locs_float = torch.cat(locs_float, 0).to(torch.float32)  # float (N, 3)
        feats = torch.cat(feats, 0)                              # float (N, C)

        labels = torch.cat(labels, 0).long()                     # long (N)
        instance_labels = torch.cat(instance_labels, 0).long()   # long (N)

        instance_infos = torch.cat(instance_infos, 0).to(torch.float32)       # float (N, 9) (meanxyz, minxyz, maxxyz)
        instance_pointnum = torch.tensor(instance_pointnum, dtype=torch.int)  # int (total_nInst)

        target_instance_labels = torch.cat(target_instance_labels,0).long()
        target_instance_pointnum = torch.tensor(target_instance_pointnum, dtype=torch.int)

        lang_feats = torch.cat(lang_feats,0).to(torch.float32)
        lang_lens = torch.tensor(lang_lens,dtype=torch.long)
        lang_ids = torch.cat(lang_ids,0).to(torch.long)

        spatial_shape = np.clip((locs.max(0)[0][1:] + 1).numpy(), self.full_scale[0], None)     # long (3)

        ### voxelize
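        # voxel_locs: unique voxel coordinates; p2v_map / v2p_map: index maps
        # between points and voxels, consumed later by pointgroup_ops.voxelization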
        voxel_locs, p2v_map, v2p_map = pointgroup_ops.voxelization_idx(locs, self.batch_size, self.mode)

        return {'locs': locs, 'voxel_locs': voxel_locs, 'p2v_map': p2v_map, 'v2p_map': v2p_map,
                'locs_float': locs_float, 'feats': feats, 'labels': labels, 'instance_labels': instance_labels,
                'instance_info': instance_infos, 'instance_pointnum': instance_pointnum,
                'target_instance_labels': target_instance_labels, 'target_instance_pointnum': target_instance_pointnum,
                'id': id, 'offsets': batch_offsets, 'spatial_shape': spatial_shape, 
                'lang_feat': lang_feats, 'lang_len': lang_lens, 'lang_ids': lang_ids, 
                'ann_id': ann_ids, 'object_id': object_ids }
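
Because this trainMerge consumes a list of raw indices and returns a single
merged dict, it is meant to be plugged into a DataLoader as the collate_fn,
PointGroup-style. A minimal wiring sketch (`dataset` stands for an instance of
the surrounding class, an assumption; the loader options are illustrative):

from torch.utils.data import DataLoader

train_loader = DataLoader(
    list(range(len(dataset.train_data))),  # items are raw sample indices
    batch_size=dataset.batch_size,
    collate_fn=dataset.trainMerge,  # merges a list of indices into one batch dict
    shuffle=True,
    num_workers=4,
    drop_last=True)  # voxelization_idx above assumes full self.batch_size batches

for batch in train_loader:
    print(batch["locs"].shape)  # (N, 4): batch index + voxel coordinates
    break
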
    def trainMerge(self, id):
        start = time.time()

        ## PointGroup Input ##
        locs = []
        locs_float = []
        feats = []
        labels = []
        instance_labels = []

        instance_infos = []  # (N, 9)
        instance_pointnum = []  # (total_nInst), int

        batch_offsets = [0]

        total_inst_num = 0

        ## ScanRefer Input ##
        lang_feats = []
        lang_lens = []
        object_cats = []
        object_ids = []

        for i, idx in enumerate(id):
            scene_id = self.scanrefer[idx]["scene_id"]
            object_id = int(self.scanrefer[idx]["object_id"])
            object_name = " ".join(
                self.scanrefer[idx]["object_name"].split("_"))
            ann_id = self.scanrefer[idx]["ann_id"]

            # get language features
            lang_feat = self.lang[scene_id][str(object_id)][ann_id]
            lang_len = len(self.scanrefer[idx]["token"])
            lang_len = lang_len if lang_len <= CONF.TRAIN.MAX_DES_LEN else CONF.TRAIN.MAX_DES_LEN

            # get pc
            mesh_vertices = self.scene_data[scene_id]["mesh_vertices"]
            instance_label = self.scene_data[scene_id]["instance_labels"]
            semantic_label = self.scene_data[scene_id]["semantic_labels"]
            instance_bboxes = self.scene_data[scene_id]["instance_bboxes"]

            if not self.use_color:
                point_cloud = mesh_vertices[:, 0:3]  # do not use color for now
                pcl_color = mesh_vertices[:, 3:6]
            else:
                point_cloud = mesh_vertices[:, 0:6]
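                # shift by the dataset mean color and scale by 256, roughly
                # centering the values around zero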
                point_cloud[:, 3:6] = (point_cloud[:, 3:6] -
                                       MEAN_COLOR_RGB) / 256.0
                pcl_color = point_cloud[:, 3:6]

            if self.use_normal:
                normals = mesh_vertices[:, 6:9]
                point_cloud = np.concatenate([point_cloud, normals], 1)

            if self.use_multiview:
                # load multiview database
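                # h5py handles are not safely shared across worker processes, so
                # each process opens its own read-only handle, keyed by pid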
                pid = mp.current_process().pid
                if pid not in self.multiview_data:
                    self.multiview_data[pid] = h5py.File(MULTIVIEW_DATA,
                                                         "r",
                                                         libver="latest")

                multiview = self.multiview_data[pid][scene_id]
                point_cloud = np.concatenate([point_cloud, multiview], 1)

            # if self.use_height:
            #     floor_height = np.percentile(point_cloud[:,2],0.99)
            #     height = point_cloud[:,2] - floor_height
            #     point_cloud = np.concatenate([point_cloud, np.expand_dims(height, 1)],1)

            # Prepare_data_inst.py (PG)
            point_cloud = np.ascontiguousarray(point_cloud -
                                               point_cloud.mean(0))
            pcl_color = np.ascontiguousarray(pcl_color) / 127.5 - 1  # rescale colors, as in PointGroup's prepare_data_inst.py
            # TODO: Random Sampling
            point_cloud, choices = random_sampling(point_cloud,
                                                   self.num_points,
                                                   return_choices=True)
            instance_label = instance_label[choices]
            semantic_label = semantic_label[choices]
            pcl_color = pcl_color[choices]

            xyz_origin = point_cloud
            label = semantic_label
            rgb = pcl_color
            if self.data_augmentation:
                # TODO: Data augmentation
                ### jitter / flip x / rotation
                xyz_middle = self.dataAugment(xyz_origin, True, True, True)

                ### scale
                xyz = xyz_middle * self.scale

                ### elastic
                xyz = self.elastic(xyz, 6 * self.scale // 50,
                                   40 * self.scale / 50)
                xyz = self.elastic(xyz, 20 * self.scale // 50,
                                   160 * self.scale / 50)

                ### offset
                xyz -= xyz.min(0)

                ### crop
                xyz, valid_idxs = self.crop(xyz)

                xyz_middle = xyz_middle[valid_idxs]
                xyz = xyz[valid_idxs]
                rgb = rgb[valid_idxs]
                label = label[valid_idxs]
                instance_label = self.getCroppedInstLabel(
                    instance_label, valid_idxs)

            if not self.data_augmentation:
                xyz_middle = self.dataAugment(xyz_origin, False, False, False)
                ### scale
                xyz = xyz_middle * self.scale
                ### offset
                xyz -= xyz.min(0)
                ### crop
                xyz, valid_idxs = self.crop(xyz)

                xyz_middle = xyz_middle[valid_idxs]
                xyz = xyz[valid_idxs]
                rgb = rgb[valid_idxs]
                label = label[valid_idxs]
                instance_label = self.getCroppedInstLabel(
                    instance_label, valid_idxs)

            ### get instance information
            inst_num, inst_infos = self.getInstanceInfo(
                xyz_middle, instance_label.astype(np.int32))
            inst_info = inst_infos[
                "instance_info"]  # (n, 9), (cx, cy, cz, minx, miny, minz, maxx, maxy, maxz)
            inst_pointnum = inst_infos["instance_pointnum"]  # (nInst), list

            instance_label[np.where(instance_label != -100)] += total_inst_num
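            # re-base the target object id with the same offset so it still
            # points at the right instance in the merged batch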
            object_id += total_inst_num
            total_inst_num += inst_num

            ### merge the scene to the batch (PG)
            batch_offsets.append(batch_offsets[-1] + xyz.shape[0])

            locs.append(
                torch.cat([
                    torch.LongTensor(xyz.shape[0], 1).fill_(i),
                    torch.from_numpy(xyz).long()
                ], 1))
            locs_float.append(torch.from_numpy(xyz_middle))
            feats.append(torch.from_numpy(rgb))  # color jitter (+ torch.randn(3) * 0.1) disabled here
            labels.append(torch.from_numpy(label))
            instance_labels.append(
                torch.from_numpy(instance_label.astype(np.int64)))

            instance_infos.append(torch.from_numpy(inst_info))
            instance_pointnum.extend(inst_pointnum)

            ### merge the scene to the batch (SR)
            # TODO: Check shape of lang_feats, lang_len, object_cats
            lang_feats.append(torch.from_numpy(lang_feat.astype(np.float32)))
            lang_lens.append(
                torch.from_numpy(np.array(lang_len).astype(np.int64)))
            object_cat = self.raw2label[
                object_name] if object_name in self.raw2label else 17
            object_cats.append(
                torch.from_numpy(np.array(object_cat).astype(np.int64)))
            object_ids.append(object_id)

        ### merge all the scenes in the batch (PG)
        batch_offsets = torch.tensor(batch_offsets,
                                     dtype=torch.int)  # int (B+1)

        locs = torch.cat(
            locs,
            0)  # long (N, 1 + 3), the batch item idx is put in locs[:, 0]
        locs_float = torch.cat(locs_float, 0).to(torch.float32)  # float (N, 3)
        feats = torch.cat(feats, 0)  # float (N, C)
        labels = torch.cat(labels, 0).long()  # long (N)
        instance_labels = torch.cat(instance_labels, 0).long()  # long (N)

        instance_infos = torch.cat(instance_infos, 0).to(
            torch.float32)  # float (N, 9) (meanxyz, minxyz, maxxyz)
        instance_pointnum = torch.tensor(instance_pointnum,
                                         dtype=torch.int)  # int (total_nInst)

        spatial_shape = np.clip((locs.max(0)[0][1:] + 1).numpy(),
                                self.full_scale[0], None)  # long (3)

        ### voxelize
        voxel_locs, p2v_map, v2p_map = pointgroup_ops.voxelization_idx(
            locs, self.batch_size, self.mode)

        ### SC
        lang_feats = torch.cat(lang_feats, 0).reshape(
            (self.batch_size, 126, 300))  # float (B, 126, 300): MAX_DES_LEN x GloVe-300 features
        lang_lens = torch.stack(lang_lens).long()  # long (B,)
        object_cats = torch.stack(object_cats).long()  # long (B,)
        object_ids = torch.tensor(object_ids, dtype=torch.int64)
        load_time = torch.from_numpy(np.array(time.time() - start))[None]  # batch-construction wall time, for profiling

        return {
            'locs': locs,
            'locs_float': locs_float,
            'voxel_locs': voxel_locs,
            'p2v_map': p2v_map,
            'v2p_map': v2p_map,
            'feats': feats,
            'labels': labels,
            'instance_labels': instance_labels,
            'spatial_shape': spatial_shape,
            'instance_info': instance_infos,
            'instance_pointnum': instance_pointnum,
            'offsets': batch_offsets,
            "lang_feat": lang_feats,
            "lang_len": lang_lens,
            'object_id': object_ids,
            "load_time": load_time,
            "object_cat": object_cats
        }
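
Downstream, the returned maps feed the sparse-convolution backbone. A rough
sketch of the first step of a forward pass; pointgroup_ops.voxelization is the
companion op to voxelization_idx used above, while `dataset` and the spconv
wiring are illustrative assumptions:

batch = dataset.trainMerge(list(range(dataset.batch_size)))

# scatter per-point features into voxels using the v2p map built above
voxel_feats = pointgroup_ops.voxelization(
    batch["feats"].cuda(), batch["v2p_map"].cuda(), dataset.mode)

# voxel_feats, batch["voxel_locs"] and batch["spatial_shape"] then form the
# sparse tensor (e.g. spconv.SparseConvTensor) that the U-Net backbone consumes
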