Ejemplo n.º 1
0
    def __getitem__(self, idx):
        """Assemble the sample dictionary for annotation ``idx``.

        Reads the annotation from ``self.scanrefer`` and the scene arrays from
        ``self.scene_data``, optionally pastes objects taken from other scenes
        into the current one (``self.cp_aug``), sub-samples
        ``self.num_points`` points and builds the box, vote and reference
        labels.

        Args:
            idx: index into ``self.scanrefer``.

        Returns:
            dict mapping field names to numpy arrays, plus ``load_time``
            (seconds spent inside this call).
        """
        start = time.time()
        annotation = self.scanrefer[idx]
        scene_id = annotation["scene_id"]
        object_id = int(annotation["object_id"])
        object_name = " ".join(annotation["object_name"].split("_"))
        ann_id = annotation["ann_id"]

        # get language features
        lang_feat = self.lang[scene_id][str(object_id)][ann_id]
        # description length, clamped to the configured maximum
        lang_len = min(len(annotation["token"]), CONF.TRAIN.MAX_DES_LEN)

        # get pc
        scene = self.scene_data[scene_id]
        mesh_vertices = scene["mesh_vertices"]
        instance_labels = scene["instance_labels"]
        semantic_labels = scene["semantic_labels"]
        instance_bboxes = scene["instance_bboxes"]

        point_cloud, pcl_color = self.process_pc(mesh_vertices, scene_id)

        # Copy-paste augmentation. It needs at least one *other* annotation to
        # draw from, otherwise the re-draw loop below could never terminate.
        if self.cp_aug and self.split != 'test' and len(self.scanrefer) > 1:
            # try to fill the scene up to 32 boxes with objects of other samples
            num_obj_add = 32 - instance_bboxes.shape[0]
            for _ in range(num_obj_add):
                idx_other = random.randint(0, len(self.scanrefer) - 1)
                while idx_other == idx:
                    idx_other = random.randint(0, len(self.scanrefer) - 1)
                try:
                    other_scene_id = self.scanrefer[idx_other]["scene_id"]
                    other_object_id = int(self.scanrefer[idx_other]["object_id"])
                except IndexError:
                    print("Index Error: Selecting an index out of range")
                    # nothing usable was selected; skip this paste attempt
                    continue

                # get pc of the donor scene
                other_scene = self.scene_data[other_scene_id]
                other_mesh_vertices = other_scene["mesh_vertices"]
                other_instance_labels = other_scene["instance_labels"]
                other_semantic_labels = other_scene["semantic_labels"]
                other_instance_bboxes = other_scene["instance_bboxes"]

                # Find the right box in the other scene *before* touching any
                # of the per-point arrays, so that points, labels and boxes
                # are always extended together and never get out of sync.
                matches = np.where(other_instance_bboxes[:, -1] == other_object_id)[0]
                if matches.size == 0:
                    print("Warning: Did not add a box from another scene, something wrong")
                    continue

                # the donor scene id (not the current one) belongs to the donor vertices
                other_point_cloud, other_pcl_color = self.process_pc(other_mesh_vertices, other_scene_id)

                # Random pick object and append to the current scene
                # NOTE(review): the pasted points come from a randomly drawn
                # instance label while the pasted box is the one of
                # other_object_id - confirm that this is intended.
                target_obj_label = random.randint(0, np.max(other_instance_labels))
                test_instance_labels, test_choices, _ = choose_label_pc(
                    other_instance_labels, target_obj_label, 200, return_choices=True)

                # fresh id: larger than every box id and every point label
                # already present, so it cannot collide with an existing instance
                new_instance_id = max(np.max(instance_bboxes[:, -1]), np.max(instance_labels)) + 1

                select = other_instance_bboxes[matches[0]].copy()
                select[-1] = new_instance_id

                point_cloud = np.concatenate((point_cloud, other_point_cloud[test_choices]), axis=0)
                semantic_labels = np.concatenate((semantic_labels, other_semantic_labels[test_choices]), axis=0)
                pcl_color = np.concatenate((pcl_color, other_pcl_color[test_choices]), axis=0)
                pasted_labels = np.full(test_instance_labels.shape, new_instance_id, dtype=np.float64)
                instance_labels = np.concatenate((instance_labels, pasted_labels), axis=0)
                instance_bboxes = np.concatenate((instance_bboxes, np.atleast_2d(select)), axis=0)

        point_cloud, choices = random_sampling(point_cloud, self.num_points, return_choices=True)
        instance_labels = instance_labels[choices]
        semantic_labels = semantic_labels[choices]
        pcl_color = pcl_color[choices]

        # ------------------------------- LABELS ------------------------------
        target_bboxes = np.zeros((MAX_NUM_OBJ, 6))
        target_bboxes_mask = np.zeros((MAX_NUM_OBJ))
        angle_classes = np.zeros((MAX_NUM_OBJ,))
        angle_residuals = np.zeros((MAX_NUM_OBJ,))
        size_classes = np.zeros((MAX_NUM_OBJ,))
        size_residuals = np.zeros((MAX_NUM_OBJ, 3))
        ref_box_label = np.zeros(MAX_NUM_OBJ) # bbox label for reference target
        ref_center_label = np.zeros(3) # bbox center for reference target
        ref_heading_class_label = 0
        ref_heading_residual_label = 0
        ref_size_class_label = 0
        ref_size_residual_label = np.zeros(3) # bbox size residual for reference target

        if self.split != "test":
            num_bbox = min(instance_bboxes.shape[0], MAX_NUM_OBJ)
            target_bboxes_mask[0:num_bbox] = 1
            target_bboxes[0:num_bbox, :] = instance_bboxes[:MAX_NUM_OBJ, 0:6]

            point_votes = np.zeros([self.num_points, 3])
            point_votes_mask = np.zeros(self.num_points)

            # ------------------------------- DATA AUGMENTATION ------------------------------
            if self.augment and not self.debug:
                if np.random.random() > 0.5:
                    # Flipping along the YZ plane
                    point_cloud[:, 0] = -1 * point_cloud[:, 0]
                    target_bboxes[:, 0] = -1 * target_bboxes[:, 0]

                if np.random.random() > 0.5:
                    # Flipping along the XZ plane
                    point_cloud[:, 1] = -1 * point_cloud[:, 1]
                    target_bboxes[:, 1] = -1 * target_bboxes[:, 1]

                # Rotation along X-axis
                rot_angle = (np.random.random() * np.pi / 18) - np.pi / 36 # -5 ~ +5 degree
                rot_mat = rotx(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3], np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(target_bboxes, rot_mat, "x")

                # Rotation along Y-axis
                rot_angle = (np.random.random() * np.pi / 18) - np.pi / 36 # -5 ~ +5 degree
                rot_mat = roty(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3], np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(target_bboxes, rot_mat, "y")

                # Rotation along up-axis/Z-axis
                rot_angle = (np.random.random() * np.pi / 18) - np.pi / 36 # -5 ~ +5 degree
                rot_mat = rotz(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3], np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(target_bboxes, rot_mat, "z")

                # Translation
                point_cloud, target_bboxes = self._translate(point_cloud, target_bboxes)

            # compute votes *AFTER* augmentation
            # generate votes
            # Note: since there's no map between bbox instance labels and
            # pc instance_labels (it had been filtered
            # in the data preparation step) we'll compute the instance bbox
            # from the points sharing the same instance label.
            for i_instance in np.unique(instance_labels):
                # find all points belong to that instance
                ind = np.where(instance_labels == i_instance)[0]
                # find the semantic label
                if semantic_labels[ind[0]] in DC.nyu40ids:
                    x = point_cloud[ind, :3]
                    center = 0.5 * (x.min(0) + x.max(0))
                    point_votes[ind, :] = center - x
                    point_votes_mask[ind] = 1.0
            point_votes = np.tile(point_votes, (1, 3)) # make 3 votes identical

            class_ind = [DC.nyu40id2class[int(x)] for x in instance_bboxes[:num_bbox, -2]]
            # NOTE: set size class as semantic class. Consider use size2class.
            size_classes[0:num_bbox] = class_ind
            size_residuals[0:num_bbox, :] = target_bboxes[0:num_bbox, 3:6] - DC.mean_size_arr[class_ind, :]

            # construct the reference target label for each bbox
            for i, gt_id in enumerate(instance_bboxes[:num_bbox, -1]):
                if gt_id == object_id:
                    ref_box_label[i] = 1
                    ref_center_label = target_bboxes[i, 0:3]
                    ref_heading_class_label = angle_classes[i]
                    ref_heading_residual_label = angle_residuals[i]
                    ref_size_class_label = size_classes[i]
                    ref_size_residual_label = size_residuals[i]
        else:
            num_bbox = 1
            point_votes = np.zeros([self.num_points, 9]) # make 3 votes identical
            point_votes_mask = np.zeros(self.num_points)

        target_bboxes_semcls = np.zeros((MAX_NUM_OBJ))
        try:
            target_bboxes_semcls[0:num_bbox] = [DC.nyu40id2class[int(x)] for x in instance_bboxes[:, -2][0:num_bbox]]
        except KeyError:
            # best effort: a class id missing from the mapping leaves the
            # semantic class labels at zero instead of aborting the sample
            pass

        # 17 is the fallback category for names missing from raw2label
        object_cat = self.raw2label[object_name] if object_name in self.raw2label else 17

        data_dict = {}
        data_dict["point_clouds"] = point_cloud.astype(np.float32) # point cloud data including features
        data_dict["lang_feat"] = lang_feat.astype(np.float32) # language feature vectors
        data_dict["lang_len"] = np.array(lang_len).astype(np.int64) # length of each description
        data_dict["center_label"] = target_bboxes.astype(np.float32)[:, 0:3] # (MAX_NUM_OBJ, 3) for GT box center XYZ
        data_dict["heading_class_label"] = angle_classes.astype(np.int64) # (MAX_NUM_OBJ,) with int values in 0,...,NUM_HEADING_BIN-1
        data_dict["heading_residual_label"] = angle_residuals.astype(np.float32) # (MAX_NUM_OBJ,)
        data_dict["size_class_label"] = size_classes.astype(np.int64) # (MAX_NUM_OBJ,) with int values in 0,...,NUM_SIZE_CLUSTER
        data_dict["size_residual_label"] = size_residuals.astype(np.float32) # (MAX_NUM_OBJ, 3)
        data_dict["num_bbox"] = np.array(num_bbox).astype(np.int64)
        data_dict["sem_cls_label"] = target_bboxes_semcls.astype(np.int64) # (MAX_NUM_OBJ,) semantic class index
        data_dict["box_label_mask"] = target_bboxes_mask.astype(np.float32) # (MAX_NUM_OBJ) as 0/1 with 1 indicating a unique box
        data_dict["vote_label"] = point_votes.astype(np.float32)
        data_dict["vote_label_mask"] = point_votes_mask.astype(np.int64)
        data_dict["scan_idx"] = np.array(idx).astype(np.int64)
        data_dict["pcl_color"] = pcl_color
        data_dict["ref_box_label"] = ref_box_label.astype(np.int64) # 0/1 reference labels for each object bbox
        data_dict["ref_center_label"] = ref_center_label.astype(np.float32)
        data_dict["ref_heading_class_label"] = np.array(int(ref_heading_class_label)).astype(np.int64)
        data_dict["ref_heading_residual_label"] = np.array(int(ref_heading_residual_label)).astype(np.int64)
        data_dict["ref_size_class_label"] = np.array(int(ref_size_class_label)).astype(np.int64)
        data_dict["ref_size_residual_label"] = ref_size_residual_label.astype(np.float32)
        data_dict["object_id"] = np.array(int(object_id)).astype(np.int64)
        data_dict["ann_id"] = np.array(int(ann_id)).astype(np.int64)
        data_dict["object_cat"] = np.array(object_cat).astype(np.int64)
        data_dict["unique_multiple"] = np.array(self.unique_multiple_lookup[scene_id][str(object_id)][ann_id]).astype(np.int64)
        data_dict["load_time"] = time.time() - start

        return data_dict
Ejemplo n.º 2
0
    def __getitem__(self, idx):
        """Assemble the sample dictionary for annotation ``idx``.

        Reads the annotation from ``self.scanrefer`` and the scene arrays from
        ``self.scene_data``, builds the per-point feature matrix according to
        the ``use_color`` / ``use_normal`` / ``use_multiview`` / ``use_height``
        flags, sub-samples ``self.num_points`` points and builds the box,
        vote, reference and box-corner labels.

        Args:
            idx: index into ``self.scanrefer``.

        Returns:
            dict mapping field names to numpy arrays, plus ``load_time``
            (seconds spent inside this call).
        """
        start = time.time()
        annotation = self.scanrefer[idx]
        scene_id = annotation["scene_id"]
        object_id = int(annotation["object_id"])
        object_name = " ".join(annotation["object_name"].split("_"))
        ann_id = annotation["ann_id"]

        # get language features
        lang_feat = self.lang[scene_id][str(object_id)][ann_id]
        # token count plus 2, clamped to the configured maximum plus 2
        # (presumably two extra special tokens - TODO confirm)
        lang_len = min(len(annotation["token"]) + 2, CONF.TRAIN.MAX_DES_LEN + 2)

        # get pc
        scene = self.scene_data[scene_id]
        mesh_vertices = scene["mesh_vertices"]
        instance_labels = scene["instance_labels"]
        semantic_labels = scene["semantic_labels"]
        instance_bboxes = scene["instance_bboxes"]

        if not self.use_color:
            point_cloud = mesh_vertices[:, 0:3]  # do not use color for now
            pcl_color = mesh_vertices[:, 3:6]
        else:
            # Copy before normalising: a basic slice is a view, so updating it
            # in place would rewrite the colors stored in self.scene_data and
            # normalise them again on every later access of this scene.
            point_cloud = mesh_vertices[:, 0:6].copy()
            point_cloud[:, 3:6] = (point_cloud[:, 3:6] - MEAN_COLOR_RGB) / 256.0
            pcl_color = point_cloud[:, 3:6]

        if self.use_normal:
            normals = mesh_vertices[:, 6:9]
            point_cloud = np.concatenate([point_cloud, normals], 1)

        if self.use_multiview:
            # load multiview database on first use
            if self.multiview_data == {}:
                self.multiview_data = h5py.File(MULTIVIEW_DATA,
                                                "r",
                                                libver="latest")

            multiview = self.multiview_data[scene_id]
            point_cloud = np.concatenate([point_cloud, multiview], 1)

        if self.use_height:
            # height of every point above a low percentile of the z values
            floor_height = np.percentile(point_cloud[:, 2], 0.99)
            height = point_cloud[:, 2] - floor_height
            point_cloud = np.concatenate(
                [point_cloud, np.expand_dims(height, 1)], 1)

        point_cloud, choices = random_sampling(point_cloud,
                                               self.num_points,
                                               return_choices=True)
        instance_labels = instance_labels[choices]
        semantic_labels = semantic_labels[choices]
        pcl_color = pcl_color[choices]

        # ------------------------------- LABELS ------------------------------
        target_bboxes = np.zeros((MAX_NUM_OBJ, 6))
        target_bboxes_mask = np.zeros((MAX_NUM_OBJ))
        angle_classes = np.zeros((MAX_NUM_OBJ, ))
        angle_residuals = np.zeros((MAX_NUM_OBJ, ))
        size_classes = np.zeros((MAX_NUM_OBJ, ))
        size_residuals = np.zeros((MAX_NUM_OBJ, 3))

        ref_box_label = np.zeros(MAX_NUM_OBJ)  # bbox label for reference target
        ref_center_label = np.zeros(3)  # bbox center for reference target
        ref_heading_class_label = 0
        ref_heading_residual_label = 0
        ref_size_class_label = 0
        ref_size_residual_label = np.zeros(3)  # bbox size residual for reference target
        ref_box_corner_label = np.zeros((8, 3))

        # Defined for every split so that the test split, which has no
        # ground-truth boxes to fill in, still returns zero-filled entries
        # instead of failing on undefined names when the dict is built.
        gt_box_corner_label = np.zeros((MAX_NUM_OBJ, 8, 3))
        gt_box_masks = np.zeros((MAX_NUM_OBJ, ))
        gt_box_object_ids = np.zeros((MAX_NUM_OBJ, ))

        if self.split != "test":
            num_bbox = min(instance_bboxes.shape[0], MAX_NUM_OBJ)
            target_bboxes_mask[0:num_bbox] = 1
            target_bboxes[0:num_bbox, :] = instance_bboxes[:MAX_NUM_OBJ, 0:6]

            point_votes = np.zeros([self.num_points, 3])
            point_votes_mask = np.zeros(self.num_points)

            # ------------------------------- DATA AUGMENTATION ------------------------------
            if self.augment:
                if np.random.random() > 0.5:
                    # Flipping along the YZ plane
                    point_cloud[:, 0] = -1 * point_cloud[:, 0]
                    target_bboxes[:, 0] = -1 * target_bboxes[:, 0]

                if np.random.random() > 0.5:
                    # Flipping along the XZ plane
                    point_cloud[:, 1] = -1 * point_cloud[:, 1]
                    target_bboxes[:, 1] = -1 * target_bboxes[:, 1]

                # Rotation along X-axis
                rot_angle = (np.random.random() * np.pi /
                             18) - np.pi / 36  # -5 ~ +5 degree
                rot_mat = rotx(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                             np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(
                    target_bboxes, rot_mat, "x")

                # Rotation along Y-axis
                rot_angle = (np.random.random() * np.pi /
                             18) - np.pi / 36  # -5 ~ +5 degree
                rot_mat = roty(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                             np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(
                    target_bboxes, rot_mat, "y")

                # Rotation along up-axis/Z-axis
                rot_angle = (np.random.random() * np.pi /
                             18) - np.pi / 36  # -5 ~ +5 degree
                rot_mat = rotz(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                             np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(
                    target_bboxes, rot_mat, "z")

                # Translation
                point_cloud, target_bboxes = self._translate(
                    point_cloud, target_bboxes)

            # compute votes *AFTER* augmentation
            # generate votes
            # Note: since there's no map between bbox instance labels and
            # pc instance_labels (it had been filtered
            # in the data preparation step) we'll compute the instance bbox
            # from the points sharing the same instance label.
            for i_instance in np.unique(instance_labels):
                # find all points belong to that instance
                ind = np.where(instance_labels == i_instance)[0]
                # find the semantic label
                if semantic_labels[ind[0]] in DC.nyu40ids:
                    x = point_cloud[ind, :3]
                    center = 0.5 * (x.min(0) + x.max(0))
                    point_votes[ind, :] = center - x
                    point_votes_mask[ind] = 1.0
            point_votes = np.tile(point_votes,
                                  (1, 3))  # make 3 votes identical

            class_ind = [
                DC.nyu40id2class[int(x)]
                for x in instance_bboxes[:num_bbox, -2]
            ]
            # NOTE: set size class as semantic class. Consider use size2class.
            size_classes[0:num_bbox] = class_ind
            size_residuals[0:num_bbox, :] = target_bboxes[
                0:num_bbox, 3:6] - DC.mean_size_arr[class_ind, :]

            # construct the reference target label for each bbox
            for i, gt_id in enumerate(instance_bboxes[:num_bbox, -1]):
                if gt_id == object_id:
                    ref_box_label[i] = 1
                    ref_center_label = target_bboxes[i, 0:3]
                    ref_heading_class_label = angle_classes[i]
                    ref_heading_residual_label = angle_residuals[i]
                    ref_size_class_label = size_classes[i]
                    ref_size_residual_label = size_residuals[i]

                    # construct ground truth box corner coordinates
                    ref_obb = DC.param2obb(ref_center_label,
                                           ref_heading_class_label,
                                           ref_heading_residual_label,
                                           ref_size_class_label,
                                           ref_size_residual_label)
                    ref_box_corner_label = get_3d_box(ref_obb[3:6], ref_obb[6],
                                                      ref_obb[0:3])

            # construct all GT bbox corners
            all_obb = DC.param2obb_batch(
                target_bboxes[:num_bbox, 0:3],
                angle_classes[:num_bbox].astype(np.int64),
                angle_residuals[:num_bbox],
                size_classes[:num_bbox].astype(np.int64),
                size_residuals[:num_bbox])
            all_box_corner_label = get_3d_box_batch(all_obb[:, 3:6],
                                                    all_obb[:, 6],
                                                    all_obb[:, 0:3])

            # store
            gt_box_corner_label[:num_bbox] = all_box_corner_label
            gt_box_masks[:num_bbox] = 1
            # slice the source as well, so scenes with more than MAX_NUM_OBJ
            # boxes do not cause a shape mismatch
            gt_box_object_ids[:num_bbox] = instance_bboxes[:num_bbox, -1]
        else:
            num_bbox = 1
            point_votes = np.zeros([self.num_points,
                                    9])  # make 3 votes identical
            point_votes_mask = np.zeros(self.num_points)

        target_bboxes_semcls = np.zeros((MAX_NUM_OBJ))
        target_object_ids = np.zeros(
            (MAX_NUM_OBJ, ))  # object ids of all objects
        try:
            target_bboxes_semcls[0:num_bbox] = [
                DC.nyu40id2class[int(x)]
                for x in instance_bboxes[:, -2][0:num_bbox]
            ]
            target_object_ids[0:num_bbox] = instance_bboxes[:, -1][0:num_bbox]
        except KeyError:
            # best effort: a class id missing from the mapping leaves both
            # arrays at zero instead of aborting the sample
            pass

        # 17 is the fallback category for names missing from raw2label
        object_cat = self.raw2label[
            object_name] if object_name in self.raw2label else 17

        data_dict = {}
        data_dict["point_clouds"] = point_cloud.astype(
            np.float32)  # point cloud data including features
        data_dict["lang_feat"] = lang_feat.astype(
            np.float32)  # language feature vectors
        data_dict["lang_len"] = np.array(lang_len).astype(
            np.int64)  # length of each description
        data_dict["lang_ids"] = np.array(
            self.lang_ids[scene_id][str(object_id)][ann_id]).astype(np.int64)
        # all data with MAX_NUM_OBJ are mostly filled with zeros
        data_dict["center_label"] = target_bboxes.astype(
            np.float32)[:, 0:3]  # (MAX_NUM_OBJ, 3) for GT box center XYZ
        data_dict["heading_class_label"] = angle_classes.astype(
            np.int64
        )  # (MAX_NUM_OBJ,) with int values in 0,...,NUM_HEADING_BIN-1
        data_dict["heading_residual_label"] = angle_residuals.astype(
            np.float32)  # (MAX_NUM_OBJ,)
        data_dict["size_class_label"] = size_classes.astype(
            np.int64
        )  # (MAX_NUM_OBJ,) with int values in 0,...,NUM_SIZE_CLUSTER
        data_dict["size_residual_label"] = size_residuals.astype(
            np.float32)  # (MAX_NUM_OBJ, 3)
        data_dict["num_bbox"] = np.array(num_bbox).astype(np.int64)
        data_dict["sem_cls_label"] = target_bboxes_semcls.astype(
            np.int64)  # (MAX_NUM_OBJ,) semantic class index
        data_dict["scene_object_ids"] = target_object_ids.astype(
            np.int64)  # (MAX_NUM_OBJ,) object ids of all objects
        data_dict["box_label_mask"] = target_bboxes_mask.astype(
            np.float32)  # (MAX_NUM_OBJ) as 0/1 with 1 indicating a unique box
        data_dict["vote_label"] = point_votes.astype(np.float32)
        data_dict["vote_label_mask"] = point_votes_mask.astype(np.int64)
        data_dict["dataset_idx"] = np.array(idx).astype(
            np.int64)  # index into self.scanrefer
        data_dict["pcl_color"] = pcl_color
        data_dict["ref_box_label"] = ref_box_label.astype(
            np.int64)  # 0/1 reference labels for each object bbox
        data_dict["ref_center_label"] = ref_center_label.astype(np.float32)
        data_dict["ref_heading_class_label"] = np.array(
            int(ref_heading_class_label)).astype(np.int64)
        data_dict["ref_heading_residual_label"] = np.array(
            int(ref_heading_residual_label)).astype(np.int64)
        data_dict["ref_size_class_label"] = np.array(
            int(ref_size_class_label)).astype(np.int64)
        data_dict["ref_size_residual_label"] = ref_size_residual_label.astype(
            np.float32)
        data_dict["ref_box_corner_label"] = ref_box_corner_label.astype(
            np.float64)  # target box corners NOTE type must be double
        data_dict["gt_box_corner_label"] = gt_box_corner_label.astype(
            np.float64)  # all GT box corners NOTE type must be double
        data_dict["gt_box_masks"] = gt_box_masks.astype(
            np.int64)  # valid bbox masks
        data_dict["gt_box_object_ids"] = gt_box_object_ids.astype(
            np.int64)  # valid bbox object ids
        data_dict["object_id"] = np.array(int(object_id)).astype(
            np.int64)  # target object id
        data_dict["ann_id"] = np.array(int(ann_id)).astype(np.int64)
        data_dict["object_cat"] = np.array(object_cat).astype(
            np.int64)  # target object class
        data_dict["unique_multiple"] = np.array(
            self.unique_multiple_lookup[scene_id][str(
                object_id)][ann_id]).astype(np.int64)
        data_dict["load_time"] = time.time() - start

        return data_dict
Ejemplo n.º 3
0
    def __getitem__(self, idx):
        scene_id = self.scanrefer[idx]["scene_id"]
        object_id = int(self.scanrefer[idx]["object_id"])
        object_name = " ".join(self.scanrefer[idx]["object_name"].split("_"))
        ann_id = int(self.scanrefer[idx]["ann_id"])
        object_cat = self.raw2label[
            object_name] if object_name in self.raw2label else 17

        # tokenize the description
        tokens = self.scanrefer[idx]["token"]
        embeddings = np.zeros((CONF.TRAIN.MAX_DES_LEN, 300))

        for token_id in range(CONF.TRAIN.MAX_DES_LEN):
            if token_id < len(tokens):
                token = tokens[token_id]
                if token.isspace():
                    continue
                if token in self.glove:
                    embeddings[token_id] = self.glove[token]
                else:
                    embeddings[token_id] = self.glove["unk"]

            else:
                break

        # get language features
        lang_feat = embeddings
        lang_token = tokens
        lang_len = len([token for token in lang_token if not token.isspace()])
        lang_len = lang_len if lang_len <= CONF.TRAIN.MAX_DES_LEN else CONF.TRAIN.MAX_DES_LEN

        # get pc
        mesh_vertices = np.load(
            os.path.join(CONF.PATH.SCANNET_DATA, scene_id) +
            "_aligned_vert.npy")  # axis-aligned
        instance_labels = np.load(
            os.path.join(CONF.PATH.SCANNET_DATA, scene_id) +
            "_ins_label_pg.npy")
        semantic_labels = np.load(
            os.path.join(CONF.PATH.SCANNET_DATA, scene_id) +
            "_sem_label_pg.npy")
        instance_bboxes = np.load(
            os.path.join(CONF.PATH.SCANNET_DATA, scene_id) +
            "_aligned_bbox.npy")

        if not self.use_color:
            point_cloud = mesh_vertices[:, 0:3]  # do not use color for now
            pcl_color = mesh_vertices[:, 3:6]
        else:
            point_cloud = mesh_vertices[:, 0:6]
            point_cloud[:,
                        3:6] = (point_cloud[:, 3:6] - MEAN_COLOR_RGB) / 256.0
            pcl_color = point_cloud[:, 3:6]

        if self.use_normal:
            normals = mesh_vertices[:, 6:9]
            point_cloud = np.concatenate([point_cloud, normals], 1)

        if self.use_multiview:
            # load multiview database
            if not hasattr(self, 'multiview_data'):
                self.multiview_data = h5py.File(MULTIVIEW_DATA,
                                                "r",
                                                libver="latest",
                                                swmr=True)

            multiview = np.array(self.multiview_data[scene_id])
            point_cloud = np.concatenate([point_cloud, multiview], 1)

        if self.use_height:
            floor_height = np.percentile(point_cloud[:, 2], 0.99)
            height = point_cloud[:, 2] - floor_height
            point_cloud = np.concatenate(
                [point_cloud, np.expand_dims(height, 1)], 1)

        point_cloud, choices = random_sampling(point_cloud,
                                               self.num_points,
                                               return_choices=True)
        instance_labels = instance_labels[choices]
        semantic_labels = semantic_labels[choices]
        pcl_color = pcl_color[choices]

        # ------------------------------- LABELS ------------------------------
        target_bboxes = np.zeros((MAX_NUM_OBJ, 6))
        target_bboxes_mask = np.zeros((MAX_NUM_OBJ))
        angle_classes = np.zeros((MAX_NUM_OBJ, ))
        angle_residuals = np.zeros((MAX_NUM_OBJ, ))
        size_classes = np.zeros((MAX_NUM_OBJ, ))
        size_residuals = np.zeros((MAX_NUM_OBJ, 3))
        ref_box_label = np.zeros(
            MAX_NUM_OBJ)  # bbox label for reference target
        ref_center_label = np.zeros(3)  # bbox center for reference target
        ref_heading_class_label = 0
        ref_heading_residual_label = 0
        ref_size_class_label = 0
        ref_size_residual_label = np.zeros(
            3)  # bbox size residual for reference target
        scene_points = np.zeros((1, 10))

        if self.split != "test":
            num_bbox = instance_bboxes.shape[
                0] if instance_bboxes.shape[0] < MAX_NUM_OBJ else MAX_NUM_OBJ
            target_bboxes_mask[0:num_bbox] = 1
            target_bboxes[0:num_bbox, :] = instance_bboxes[:MAX_NUM_OBJ, 0:6]

            # ------------------------------- DATA AUGMENTATION ------------------------------
            if self.augment:
                if torch.rand(1).item() > 0.5:
                    # Flipping along the YZ plane
                    point_cloud[:, 0] = -1 * point_cloud[:, 0]
                    target_bboxes[:, 0] = -1 * target_bboxes[:, 0]

                if torch.rand(1).item() > 0.5:
                    # Flipping along the XZ plane
                    point_cloud[:, 1] = -1 * point_cloud[:, 1]
                    target_bboxes[:, 1] = -1 * target_bboxes[:, 1]

                # Rotation along X-axis
                rot_angle = (torch.rand(1).item() * np.pi /
                             18) - np.pi / 36  # -5 ~ +5 degree
                rot_mat = rotx(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                             np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(
                    target_bboxes, rot_mat, "x")

                # Rotation along Y-axis
                rot_angle = (torch.rand(1).item() * np.pi /
                             18) - np.pi / 36  # -5 ~ +5 degree
                rot_mat = roty(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                             np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(
                    target_bboxes, rot_mat, "y")

                # Rotation along up-axis/Z-axis
                rot_angle = (torch.rand(1).item() * np.pi /
                             18) - np.pi / 36  # -5 ~ +5 degree
                rot_mat = rotz(rot_angle)
                point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                             np.transpose(rot_mat))
                target_bboxes = rotate_aligned_boxes_along_axis(
                    target_bboxes, rot_mat, "z")

                # Translation
                point_cloud, target_bboxes = self._translate(
                    point_cloud, target_bboxes)

            # NOTE: set size class as semantic class. Consider use size2class.
            class_ind = [
                DC.nyu40id2class[int(x)]
                for x in instance_bboxes[:num_bbox, -2]
            ]
            size_classes[0:num_bbox] = class_ind
            size_residuals[0:num_bbox, :] = target_bboxes[
                0:num_bbox, 3:6] - DC.mean_size_arr[class_ind, :]

            # construct the reference target label for each bbox
            ref_box_label = np.zeros(MAX_NUM_OBJ)
            for i, gt_id in enumerate(instance_bboxes[:num_bbox, -1]):
                if gt_id == object_id:
                    ref_box_label[i] = 1
                    ref_center_label = target_bboxes[i, 0:3]
                    ref_heading_class_label = angle_classes[i]
                    ref_heading_residual_label = angle_residuals[i]
                    ref_size_class_label = size_classes[i]
                    ref_size_residual_label = size_residuals[i]
        else:
            num_bbox = 1

        instance_points = []
        instance_class = []
        ref_target = []
        ins_obbs = []
        pts_batch = []
        pred_obbs = []
        for i_instance in np.unique(instance_labels):

            # find all points belong to that instance
            ind = np.nonzero(instance_labels == i_instance)[0]

            # find the semantic label
            ins_class = semantic_labels[ind[0]]
            if ins_class in DC.nyu40ids:
                x = point_cloud[ind]
                ins_class = DC.nyu40id2class[int(ins_class)]
                instance_class.append(ins_class)

                pc = x[:, :3]
                center = 0.5 * (pc.min(0) + pc.max(0))
                size = pc.max(0) - pc.min(0)
                ins_obb = np.concatenate((center, size, np.array([0])))
                ins_obbs.append(ins_obb)
                x = random_sampling(x, 1024)
                instance_points.append(x)

                if ins_class == object_cat:
                    pc = x[:, :3]
                    coords, feats = sparse_quantize(
                        pc, x, quantization_size=self.voxel_size_ap)
                    pt_inst = SparseTensor(feats, coords)

                    if len(ins_obb) < 2:
                        continue

                    pred_obbs.append(ins_obb)
                    pts_batch.append(pt_inst)

                if i_instance == (object_id + 1):
                    ref_target.append(1)
                else:
                    ref_target.append(0)
            else:
                scene_points = point_cloud[ind]

        target_bboxes_semcls = np.zeros((MAX_NUM_OBJ))
        try:
            target_bboxes_semcls[0:num_bbox] = [
                DC.nyu40id2class[int(x)]
                for x in instance_bboxes[:, -2][0:num_bbox]
            ]
        except KeyError:
            pass

        pc = point_cloud[:, :3]
        coords, feats = sparse_quantize(pc,
                                        point_cloud,
                                        quantization_size=self.voxel_size_glp)
        pt = SparseTensor(feats, coords)

        data_dict = {}
        data_dict['lidar'] = pt
        data_dict['pts_batch'] = pts_batch
        data_dict['pred_obb_batch'] = pred_obbs
        data_dict['scene_points'] = [scene_points]
        data_dict['point_min'] = point_cloud.min(0)[:3]
        data_dict['point_max'] = point_cloud.max(0)[:3]
        data_dict['instance_labels'] = instance_labels.astype(np.int64)
        data_dict['instance_points'] = instance_points
        data_dict['instance_class'] = instance_class
        data_dict['instance_obbs'] = ins_obbs
        data_dict["point_clouds"] = point_cloud.astype(
            np.float32)  # point cloud data including features
        data_dict["lang_feat"] = lang_feat.astype(
            np.float32)  # language feature vectors
        data_dict["lang_token"] = lang_token
        data_dict["lang_len"] = np.array(lang_len).astype(
            np.int64)  # length of each description
        data_dict["center_label"] = target_bboxes.astype(
            np.float32)[:, 0:3]  # (MAX_NUM_OBJ, 3) for GT box center XYZ
        data_dict["heading_class_label"] = angle_classes.astype(
            np.int64
        )  # (MAX_NUM_OBJ,) with int values in 0,...,NUM_HEADING_BIN-1
        data_dict["heading_residual_label"] = angle_residuals.astype(
            np.float32)  # (MAX_NUM_OBJ,)
        data_dict["size_class_label"] = size_classes.astype(
            np.int64
        )  # (MAX_NUM_OBJ,) with int values in 0,...,NUM_SIZE_CLUSTER
        data_dict["size_residual_label"] = size_residuals.astype(
            np.float32)  # (MAX_NUM_OBJ, 3)
        data_dict["num_bbox"] = np.array(num_bbox).astype(np.int64)
        data_dict["scan_idx"] = np.array(idx).astype(np.int64)
        data_dict["pcl_color"] = pcl_color
        data_dict["ref_box_label"] = ref_box_label.astype(
            np.int64)  # 0/1 reference labels for each object bbox
        data_dict["ref_center_label"] = ref_center_label.astype(np.float32)
        data_dict["ref_heading_class_label"] = np.array(
            int(ref_heading_class_label)).astype(np.int64)
        data_dict["ref_heading_residual_label"] = np.array(
            int(ref_heading_residual_label)).astype(np.int64)
        data_dict["ref_size_class_label"] = np.array(
            int(ref_size_class_label)).astype(np.int64)
        data_dict["ref_size_residual_label"] = ref_size_residual_label.astype(
            np.float32)
        data_dict["object_id"] = np.array(int(object_id)).astype(np.int64)
        data_dict["ann_id"] = np.array(ann_id).astype(np.int64)
        data_dict["object_cat"] = np.array(object_cat).astype(np.int64)
        data_dict["unique_multiple"] = np.array(
            self.unique_multiple_lookup[scene_id][str(object_id)][str(
                ann_id)]).astype(np.int64)

        return data_dict
Ejemplo n.º 4
0
    def _preprocess_sample(self, data):
        """Build the sample dictionary for one annotation item.

        The cached scene arrays in ``self.scene_data`` are never modified:
        the point cloud and the boxes are copied before any in-place
        normalisation or augmentation is applied to them.

        Args:
            data: annotation item; the keys read here are ``scene_id``,
                ``object_id``, ``object_name``, ``indexed_token`` and
                ``description``.

        Returns:
            dict: empty when no box row ends with ``object_id``; otherwise it
            contains ``point_cloud``, ``object_name``, ``corners``
            (min xyz followed by max xyz, float32), ``class_id``,
            ``instance_seg``, ``description`` and ``original_description``.
        """
        ### Get the data information in annotation item.
        scene_id = data['scene_id']
        object_id = int(data['object_id'])
        object_name = " ".join(data["object_name"].split("_"))

        ### Get the referring expression: truncate or pad to max_len, then
        ### wrap it with the start / end tokens.
        description = data["indexed_token"]
        if len(description) > self.max_len:
            description = description[:self.max_len]
        else:
            description = description + [self.pad_token
                                         ] * (self.max_len - len(description))
        description = [self.sos_token] + description + [self.eos_token]

        original_description = data['description']

        ### Get the original annotation data.
        scene = self.scene_data[scene_id]
        mesh_vertices = scene["mesh_vertices"]
        instance_labels = scene["instance_labels"]
        instance_bboxes = scene["instance_bboxes"]

        ### Get point cloud data
        # A column slice of mesh_vertices is a view on the cached array, so
        # it is copied before anything below writes into it in place.
        if not self.use_color:
            point_cloud = mesh_vertices[:, 0:3].copy()
        else:
            point_cloud = mesh_vertices[:, 0:6].copy()

            # Point cloud centering
            # NOTE(review): only the points are centred (and only in this
            # branch); the boxes are not shifted -- confirm this is intended.
            point_cloud[:, :3] = point_cloud[:, :3] - point_cloud[:, :3].mean(
                axis=0, keepdims=True)

            # Point cloud RGB scaling
            point_cloud[:, 3:] = (point_cloud[:, 3:] -
                                  self.cfg.TRAINING.MEAN_COLOR_RGB) / 255.0

        ### Sampling points
        point_cloud, choices = random_sampling(point_cloud,
                                               self.num_points,
                                               return_choices=True)
        instance_labels = instance_labels[choices]

        # Work on a private copy: the flips below assign into the array and
        # would otherwise mirror the cached boxes for every later sample.
        target_bboxes = instance_bboxes.copy()

        ### Data augmentation (*Warning: after augmenting, target_boxes will be left only 6 element dimension)
        if self.augment and not self.debug:
            if np.random.random() > 0.5:
                # Flipping along the YZ plane
                point_cloud[:, 0] = -1 * point_cloud[:, 0]
                target_bboxes[:, 0] = -1 * target_bboxes[:, 0]

            if np.random.random() > 0.5:
                # Flipping along the XZ plane
                point_cloud[:, 1] = -1 * point_cloud[:, 1]
                target_bboxes[:, 1] = -1 * target_bboxes[:, 1]

            # Rotation along X-axis
            rot_angle = (np.random.random() * np.pi /
                         18) - np.pi / 36  # -5 ~ +5 degree
            rot_mat = rotx(rot_angle)
            point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                         np.transpose(rot_mat))
            target_bboxes = rotate_aligned_boxes_along_axis(
                target_bboxes, rot_mat, "x")

            # Rotation along Y-axis
            rot_angle = (np.random.random() * np.pi /
                         18) - np.pi / 36  # -5 ~ +5 degree
            rot_mat = roty(rot_angle)
            point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                         np.transpose(rot_mat))
            target_bboxes = rotate_aligned_boxes_along_axis(
                target_bboxes, rot_mat, "y")

            # Rotation along up-axis/Z-axis
            rot_angle = (np.random.random() * np.pi /
                         18) - np.pi / 36  # -5 ~ +5 degree
            rot_mat = rotz(rot_angle)
            point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                         np.transpose(rot_mat))
            target_bboxes = rotate_aligned_boxes_along_axis(
                target_bboxes, rot_mat, "z")

            # Translation
            point_cloud, target_bboxes = self._translate(
                point_cloud, target_bboxes)

            # Re-attach the untouched annotation columns so that the last two
            # entries of every row are again the semantic and instance ids.
            target_bboxes = np.concatenate([target_bboxes, instance_bboxes],
                                           axis=1)

        ### Build up referred targets' labels
        sample = {}
        for bbox in target_bboxes:
            if int(bbox[-1]) != object_id:
                continue

            gt_instance_id = bbox[-1]
            center = bbox[0:3]
            extent = bbox[3:6]

            sample['point_cloud'] = point_cloud
            sample['object_name'] = object_name
            # Axis-aligned corners: (x_min, y_min, z_min, x_max, y_max, z_max)
            sample['corners'] = np.concatenate([
                (2 * center - extent) / 2,
                (2 * center + extent) / 2,
            ]).astype(np.float32)
            sample['class_id'] = float(bbox[-2])

            # NOTE(review): point instance labels are compared directly with
            # the box instance id; confirm both use the same numbering.
            instance_seg = np.zeros_like(instance_labels)
            instance_seg[instance_labels == gt_instance_id] = 1
            sample['instance_seg'] = instance_seg.astype(np.float32)
            sample['description'] = np.array(description).astype(
                np.float32)
            sample['original_description'] = original_description

        return sample
Ejemplo n.º 5
0
    def __getitem__(self, idx):
        """Assemble the point cloud, box labels and reference labels for item ``idx``.

        The cached arrays in ``self.scene_data`` are left untouched: the point
        cloud is copied before colour normalisation and augmentation write
        into it.

        Args:
            idx: index into ``self.scanrefer``.

        Returns:
            dict: numpy arrays keyed by label name (``point_clouds``,
            ``lang_feat``, ``center_label``, ``vote_label``,
            ``ref_box_label``, ...) plus ``load_time``, the seconds spent in
            this call. When no box among the first ``num_bbox`` rows carries
            ``object_id``, the ``ref_*`` entries keep their zero defaults.
        """
        start = time.time()
        entry = self.scanrefer[idx]
        scene_id = entry["scene_id"]
        object_id = int(entry["object_id"])
        object_name = " ".join(entry["object_name"].split("_"))
        ann_id = entry["ann_id"]

        # get language features
        lang_feat = self.lang[scene_id][str(object_id)][ann_id]
        lang_len = min(len(entry["token"]), CONF.TRAIN.MAX_DES_LEN)

        # get pc
        scene = self.scene_data[scene_id]
        mesh_vertices = scene["mesh_vertices"]
        instance_labels = scene["instance_labels"]
        semantic_labels = scene["semantic_labels"]
        instance_bboxes = scene["instance_bboxes"]

        # Column slices of mesh_vertices are views on the cached array, so the
        # point cloud is copied before anything below writes into it in place.
        if not self.use_color:
            point_cloud = mesh_vertices[:, 0:3].copy()
            pcl_color = mesh_vertices[:, 3:6]
        else:
            point_cloud = mesh_vertices[:, 0:6].copy()
            point_cloud[:, 3:] = (point_cloud[:, 3:] - MEAN_COLOR_RGB) / 256.0
            pcl_color = point_cloud[:, 3:]

        if self.use_normal:
            normals = mesh_vertices[:, 6:9]
            point_cloud = np.concatenate([point_cloud, normals], 1)

        if self.use_multiview:
            # load multiview database; one handle is kept per process id
            pid = mp.current_process().pid
            if pid not in self.multiview_data:
                self.multiview_data[pid] = h5py.File(MULTIVIEW_DATA,
                                                     "r",
                                                     libver="latest")

            multiview = self.multiview_data[pid][scene_id]
            point_cloud = np.concatenate([point_cloud, multiview], 1)

        if self.use_height:
            floor_height = np.percentile(point_cloud[:, 2], 0.99)
            height = point_cloud[:, 2] - floor_height
            point_cloud = np.concatenate(
                [point_cloud, np.expand_dims(height, 1)], 1)

        # ------------------------------- LABELS ------------------------------
        target_bboxes = np.zeros((MAX_NUM_OBJ, 6))
        target_bboxes_mask = np.zeros((MAX_NUM_OBJ))
        angle_classes = np.zeros((MAX_NUM_OBJ, ))
        angle_residuals = np.zeros((MAX_NUM_OBJ, ))
        size_classes = np.zeros((MAX_NUM_OBJ, ))
        size_residuals = np.zeros((MAX_NUM_OBJ, 3))

        # Reference-target defaults, used when no box matches object_id
        # (without them the dictionary assembly below would fail on
        # unassigned local variables).
        ref_box_label = np.zeros(
            MAX_NUM_OBJ)  # bbox label for reference target
        ref_center_label = np.zeros(3)  # bbox center for reference target
        ref_heading_class_label = 0
        ref_heading_residual_label = 0
        ref_size_class_label = 0
        ref_size_residual_label = np.zeros(
            3)  # bbox size residual for reference target

        point_cloud, choices = random_sampling(point_cloud,
                                               self.num_points,
                                               return_choices=True)
        instance_labels = instance_labels[choices]
        semantic_labels = semantic_labels[choices]
        pcl_color = pcl_color[choices]

        num_bbox = min(instance_bboxes.shape[0], MAX_NUM_OBJ)
        target_bboxes_mask[0:num_bbox] = 1
        target_bboxes[0:num_bbox, :] = instance_bboxes[:MAX_NUM_OBJ, 0:6]

        # ------------------------------- DATA AUGMENTATION ------------------------------
        if self.augment and not self.debug:
            if np.random.random() > 0.5:
                # Flipping along the YZ plane
                point_cloud[:, 0] = -1 * point_cloud[:, 0]
                target_bboxes[:, 0] = -1 * target_bboxes[:, 0]

            if np.random.random() > 0.5:
                # Flipping along the XZ plane
                point_cloud[:, 1] = -1 * point_cloud[:, 1]
                target_bboxes[:, 1] = -1 * target_bboxes[:, 1]

            # Rotation along X-axis
            rot_angle = (np.random.random() * np.pi /
                         18) - np.pi / 36  # -5 ~ +5 degree
            rot_mat = rotx(rot_angle)
            point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                         np.transpose(rot_mat))
            target_bboxes = rotate_aligned_boxes_along_axis(
                target_bboxes, rot_mat, "x")

            # Rotation along Y-axis
            rot_angle = (np.random.random() * np.pi /
                         18) - np.pi / 36  # -5 ~ +5 degree
            rot_mat = roty(rot_angle)
            point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                         np.transpose(rot_mat))
            target_bboxes = rotate_aligned_boxes_along_axis(
                target_bboxes, rot_mat, "y")

            # Rotation along up-axis/Z-axis
            rot_angle = (np.random.random() * np.pi /
                         18) - np.pi / 36  # -5 ~ +5 degree
            rot_mat = rotz(rot_angle)
            point_cloud[:, 0:3] = np.dot(point_cloud[:, 0:3],
                                         np.transpose(rot_mat))
            target_bboxes = rotate_aligned_boxes_along_axis(
                target_bboxes, rot_mat, "z")

            # Translation
            point_cloud, target_bboxes = self._translate(
                point_cloud, target_bboxes)

        # compute votes *AFTER* augmentation
        # generate votes
        # Note: since there's no map between bbox instance labels and
        # pc instance_labels (it had been filtered
        # in the data preparation step) we'll compute the instance bbox
        # from the points sharing the same instance label.
        point_votes = np.zeros([self.num_points, 3])
        point_votes_mask = np.zeros(self.num_points)
        for i_instance in np.unique(instance_labels):
            # find all points belong to that instance
            ind = np.where(instance_labels == i_instance)[0]
            # the semantic label of the first point decides for the instance
            if semantic_labels[ind[0]] in DC.nyu40ids:
                x = point_cloud[ind, :3]
                center = 0.5 * (x.min(0) + x.max(0))
                point_votes[ind, :] = center - x
                point_votes_mask[ind] = 1.0
        point_votes = np.tile(point_votes, (1, 3))  # make 3 votes identical

        class_ind = [
            DC.nyu40id2class[int(x)] for x in instance_bboxes[:num_bbox, -2]
        ]
        # NOTE: set size class as semantic class. Consider use size2class.
        size_classes[0:num_bbox] = class_ind
        size_residuals[0:num_bbox, :] = \
            target_bboxes[0:num_bbox, 3:6] - DC.mean_size_arr[class_ind, :]

        # The semantic class of each box is the same list as class_ind.
        target_bboxes_semcls = np.zeros((MAX_NUM_OBJ))
        target_bboxes_semcls[0:num_bbox] = class_ind

        # construct the reference target label for each bbox
        for i, gt_id in enumerate(instance_bboxes[:num_bbox, -1]):
            if gt_id == object_id:
                ref_box_label[i] = 1
                ref_center_label = target_bboxes[i, 0:3]
                ref_heading_class_label = angle_classes[i]
                ref_heading_residual_label = angle_residuals[i]
                ref_size_class_label = size_classes[i]
                ref_size_residual_label = size_residuals[i]

        data_dict = {}
        data_dict["point_clouds"] = point_cloud.astype(
            np.float32)  # point cloud data including features
        data_dict["lang_feat"] = lang_feat.astype(
            np.float32)  # language feature vectors
        data_dict["lang_len"] = np.array(lang_len).astype(
            np.int64)  # length of each description
        data_dict["center_label"] = target_bboxes.astype(
            np.float32)[:, 0:3]  # (MAX_NUM_OBJ, 3) for GT box center XYZ
        data_dict["heading_class_label"] = angle_classes.astype(
            np.int64
        )  # (MAX_NUM_OBJ,) with int values in 0,...,NUM_HEADING_BIN-1
        data_dict["heading_residual_label"] = angle_residuals.astype(
            np.float32)  # (MAX_NUM_OBJ,)
        data_dict["size_class_label"] = size_classes.astype(
            np.int64
        )  # (MAX_NUM_OBJ,) with int values in 0,...,NUM_SIZE_CLUSTER
        data_dict["size_residual_label"] = size_residuals.astype(
            np.float32)  # (MAX_NUM_OBJ, 3)
        data_dict["num_bbox"] = np.array(num_bbox).astype(np.int64)
        data_dict["sem_cls_label"] = target_bboxes_semcls.astype(
            np.int64)  # (MAX_NUM_OBJ,) semantic class index
        data_dict["box_label_mask"] = target_bboxes_mask.astype(
            np.float32)  # (MAX_NUM_OBJ) as 0/1 with 1 indicating a unique box
        data_dict["vote_label"] = point_votes.astype(np.float32)
        data_dict["vote_label_mask"] = point_votes_mask.astype(np.int64)
        data_dict["scan_idx"] = np.array(idx).astype(np.int64)
        data_dict["pcl_color"] = pcl_color
        data_dict["ref_box_label"] = ref_box_label.astype(
            np.int64)  # 0/1 reference labels for each object bbox
        data_dict["ref_center_label"] = ref_center_label.astype(np.float32)
        data_dict["ref_heading_class_label"] = np.array(
            int(ref_heading_class_label)).astype(np.int64)
        # NOTE(review): the residual goes through int() and is stored as
        # int64; kept unchanged so consumers see the same dtype as before.
        data_dict["ref_heading_residual_label"] = np.array(
            int(ref_heading_residual_label)).astype(np.int64)
        data_dict["ref_size_class_label"] = np.array(
            int(ref_size_class_label)).astype(np.int64)
        data_dict["ref_size_residual_label"] = ref_size_residual_label.astype(
            np.float32)
        data_dict["object_id"] = np.array(int(object_id)).astype(np.int64)
        data_dict["ann_id"] = np.array(int(ann_id)).astype(np.int64)
        data_dict["object_cat"] = np.array(self.raw2label[object_name]).astype(
            np.int64)
        data_dict["load_time"] = time.time() - start

        return data_dict