Example #1
def get_object_masks(nuscenes, sample_data, extents, resolution):

    # Initialize object masks
    nclass = len(DETECTION_NAMES) + 1
    grid_width = int((extents[2] - extents[0]) / resolution)
    grid_height = int((extents[3] - extents[1]) / resolution)
    masks = np.zeros((nclass, grid_height, grid_width), dtype=np.uint8)

    # Get the 2D affine transform from bev coords to map coords
    tfm = get_sensor_transform(nuscenes, sample_data)[[0, 1, 3]][:, [0, 2, 3]]
    inv_tfm = np.linalg.inv(tfm)

    for box in nuscenes.get_boxes(sample_data['token']):

        # Get the index of the class
        det_name = category_to_detection_name(box.name)
        if det_name not in DETECTION_NAMES:
            class_id = -1
        else:
            class_id = DETECTION_NAMES.index(det_name)

        # Get bounding box coordinates in the grid coordinate frame
        bbox = box.bottom_corners()[:2]
        local_bbox = np.dot(inv_tfm[:2, :2], bbox).T + inv_tfm[:2, 2]

        # Render the rotated bounding box to the mask
        render_polygon(masks[class_id], local_bbox, extents, resolution)

    return masks.astype(bool)
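
A brief aside on the shared helper: category_to_detection_name lives in the nuScenes devkit (nuscenes.eval.detection.utils) and maps a raw category string such as 'vehicle.car' to one of the ten detection-challenge classes, returning None for categories outside the challenge. A minimal sketch of that behavior (the example categories are illustrative):

from nuscenes.eval.detection.utils import category_to_detection_name

# Raw nuScenes categories collapse onto the 10 detection-challenge classes.
print(category_to_detection_name('vehicle.car'))              # -> 'car'
print(category_to_detection_name('human.pedestrian.adult'))   # -> 'pedestrian'

# Categories outside the challenge map to None and are typically skipped.
print(category_to_detection_name('movable_object.debris'))    # -> None
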
Example #2
def random_class(category_name: str) -> str:
    # Alter 10% of the valid labels.
    class_names = sorted(DETECTION_NAMES)
    tmp = category_to_detection_name(category_name)
    if tmp is not None and np.random.rand() < .9:
        return tmp
    else:
        # np.random.randint excludes the upper bound, so len(class_names) keeps every class reachable.
        return class_names[np.random.randint(0, len(class_names))]
Example #3
def random_class(category_name):
    class_names = ['barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle', 'pedestrian',
                   'traffic_cone', 'trailer', 'truck']
    tmp = category_to_detection_name(category_name)
    if tmp is not None and np.random.rand() < .9:
        return tmp
    else:
        # Upper bound of randint is exclusive; len(class_names) keeps 'truck' reachable.
        return class_names[np.random.randint(0, len(class_names))]
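
A hedged usage sketch for the self-contained variant above; the raw category names are illustrative and the seed only makes the sketch reproducible:

import numpy as np
from nuscenes.eval.detection.utils import category_to_detection_name

np.random.seed(0)  # reproducibility of the sketch only
raw_categories = ['vehicle.car', 'human.pedestrian.adult', 'vehicle.truck']
# Roughly 10% of the valid labels come back swapped for a random detection class.
print([random_class(name) for name in raw_categories])
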
Example #4
def parse_labels(
    data: NuScenes,
    boxes: List[Box],
    ego_pose: DictStrAny,
    calib_sensor: DictStrAny,
    img_size: Optional[Tuple[int, int]] = None,
) -> Optional[List[Label]]:
    """Parse NuScenes labels into sensor frame."""
    if len(boxes):
        labels = []
        # transform into the sensor coord system
        transform_boxes(boxes, ego_pose, calib_sensor)
        intrinsic_matrix: NDArrayF64 = np.array(
            calib_sensor["camera_intrinsic"], dtype=np.float64)
        for box in boxes:
            box_class = category_to_detection_name(box.name)
            in_image = True
            if img_size is not None:
                in_image = box_in_image(box, intrinsic_matrix, img_size)

            if in_image and box_class is not None:
                xyz = tuple(box.center.tolist())
                w, l, h = box.wlh
                roty = quaternion_to_yaw(box.orientation)

                box2d = None
                if img_size is not None:
                    # Project 3d box to 2d.
                    corners = box.corners()
                    corner_coords = (view_points(corners, intrinsic_matrix,
                                                 True).T[:, :2].tolist())
                    # Keep only corners that fall within the image, transform
                    box2d = xyxy_to_box2d(*post_process_coords(corner_coords))

                instance_data = data.get("sample_annotation", box.token)
                # Attributes can be retrieved via instance_data and also the
                # category is more fine-grained than box_class.
                # This information could be stored in attributes if needed in
                # the future
                label = Label(
                    id=instance_data["instance_token"],
                    category=box_class,
                    box2d=box2d,
                    box3d=Box3D(
                        location=xyz,
                        dimension=(h, w, l),
                        orientation=(0, roty, 0),
                        alpha=rotation_y_to_alpha(roty, xyz),  # type: ignore
                    ),
                )
                labels.append(label)

        return labels
    return None
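
The quaternion_to_yaw helper used above is project-specific, but the later KITTI-conversion examples compute the same quantity explicitly by rotating the unit x-axis with the box orientation. A minimal equivalent sketch, assuming a pyquaternion Quaternion expressed in the lidar/BEV frame:

import numpy as np
from pyquaternion import Quaternion

def yaw_from_quaternion(q: Quaternion) -> float:
    # Rotate the unit x-axis by the box orientation and read off the heading angle.
    v = np.dot(q.rotation_matrix, np.array([1.0, 0.0, 0.0]))
    return float(np.arctan2(v[1], v[0]))
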
Example #5
def load_gt(nusc, eval_split: str, verbose: bool = False) -> EvalBoxes:
    """ Loads ground truth boxes from DB. """

    # Init.
    attribute_map = {a['token']: a['name'] for a in nusc.attribute}

    if verbose:
        print('Loading annotations for {} split from nuScenes version: {}'.
              format(eval_split, nusc.version))
    # Read out all sample_tokens in DB.
    sample_tokens_all = [s['token'] for s in nusc.sample]
    assert len(sample_tokens_all) > 0, "Error: Database has no samples!"

    # Only keep samples from this split.
    splits = create_splits_scenes()

    # Check compatibility of split with nusc_version.
    version = nusc.version
    if eval_split in {'train', 'val', 'train_detect', 'train_track'}:
        assert version.endswith('trainval'), \
            'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
    elif eval_split in {'mini_train', 'mini_val'}:
        assert version.endswith('mini'), \
            'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
    elif eval_split == 'test':
        assert version.endswith('test'), \
            'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
    else:
        raise ValueError(
            'Error: Requested split {} which this function cannot map to the correct NuScenes version.'
            .format(eval_split))

    if eval_split == 'test':
        # Check that you aren't trying to cheat :).
        assert len(nusc.sample_annotation) > 0, \
            'Error: You are trying to evaluate on the test set but you do not have the annotations!'

    sample_tokens = []
    for sample_token in sample_tokens_all:
        scene_token = nusc.get('sample', sample_token)['scene_token']
        scene_record = nusc.get('scene', scene_token)
        if scene_record['name'] in splits[eval_split]:
            sample_tokens.append(sample_token)

    all_annotations = EvalBoxes()

    # Load annotations and filter predictions and annotations.
    for sample_token in tqdm.tqdm(sample_tokens):

        sample = nusc.get('sample', sample_token)
        sample_annotation_tokens = sample['anns']

        sample_boxes = []
        for sample_annotation_token in sample_annotation_tokens:

            # Get label name in detection task and filter unused labels.
            sample_annotation = nusc.get('sample_annotation',
                                         sample_annotation_token)
            detection_name = category_to_detection_name(
                sample_annotation['category_name'])
            if detection_name is None:
                continue

            # Get attribute_name.
            attr_tokens = sample_annotation['attribute_tokens']
            attr_count = len(attr_tokens)
            if attr_count == 0:
                attribute_name = ''
            elif attr_count == 1:
                attribute_name = attribute_map[attr_tokens[0]]
            else:
                raise Exception(
                    'Error: GT annotations must not have more than one attribute!'
                )

            sample_boxes.append(
                EvalBox(
                    sample_token=sample_token,
                    translation=sample_annotation['translation'],
                    size=sample_annotation['size'],
                    rotation=sample_annotation['rotation'],
                    velocity=nusc.box_velocity(sample_annotation['token'])[:2],
                    detection_name=detection_name,
                    detection_score=-1.0,  # GT samples do not have a score.
                    attribute_name=attribute_name,
                    num_pts=sample_annotation['num_lidar_pts'] +
                    sample_annotation['num_radar_pts']))
        all_annotations.add_boxes(sample_token, sample_boxes)

    if verbose:
        print("Loaded ground truth annotations for {} samples.".format(
            len(all_annotations.sample_tokens)))

    return all_annotations
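
A hedged call sketch for the loader above; the dataroot path is an assumption and the mini split keeps the example small:

from nuscenes.nuscenes import NuScenes

nusc = NuScenes(version='v1.0-mini', dataroot='/data/sets/nuscenes', verbose=False)
gt_boxes = load_gt(nusc, 'mini_val', verbose=True)
print('Loaded GT for {} samples'.format(len(gt_boxes.sample_tokens)))
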
Example #6
def load_gt(nusc: NuScenes,
            eval_split: str,
            box_cls,
            verbose: bool = False) -> EvalBoxes:
    """
    Loads ground truth boxes from DB.
    :param nusc: A NuScenes instance.
    :param eval_split: The evaluation split for which we load GT boxes.
    :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox.
    :param verbose: Whether to print messages to stdout.
    :return: The GT boxes.
    """
    # Init.
    if box_cls == DetectionBox:
        attribute_map = {a['token']: a['name'] for a in nusc.attribute}

    if verbose:
        print('Loading annotations for {} split from nuScenes version: {}'.
              format(eval_split, nusc.version))
    # Read out all sample_tokens in DB.
    sample_tokens_all = [s['token'] for s in nusc.sample]
    assert len(sample_tokens_all) > 0, "Error: Database has no samples!"

    # Only keep samples from this split.
    splits = create_splits_scenes()

    # Check compatibility of split with nusc_version.
    version = nusc.version
    if eval_split in {'train', 'val', 'train_detect', 'train_track'}:
        assert version.endswith('trainval'), \
            'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
    elif eval_split in {'mini_train', 'mini_val'}:
        assert version.endswith('mini'), \
            'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
    elif eval_split == 'test':
        assert version.endswith('test'), \
            'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
    else:
        raise ValueError(
            'Error: Requested split {} which this function cannot map to the correct NuScenes version.'
            .format(eval_split))

    if eval_split == 'test':
        # Check that you aren't trying to cheat :).
        assert len(nusc.sample_annotation) > 0, \
            'Error: You are trying to evaluate on the test set but you do not have the annotations!'

    sample_tokens = []
    for sample_token in sample_tokens_all:
        scene_token = nusc.get('sample', sample_token)['scene_token']
        scene_record = nusc.get('scene', scene_token)
        if scene_record['name'] in splits[eval_split]:
            sample_tokens.append(sample_token)

    all_annotations = EvalBoxes()

    # Load annotations and filter predictions and annotations.
    tracking_id_set = set()
    for sample_token in tqdm.tqdm(sample_tokens, leave=verbose):

        sample = nusc.get('sample', sample_token)
        sample_annotation_tokens = sample['anns']

        sample_boxes = []
        for sample_annotation_token in sample_annotation_tokens:

            sample_annotation = nusc.get('sample_annotation',
                                         sample_annotation_token)
            if box_cls == DetectionBox:
                # Get label name in detection task and filter unused labels.
                detection_name = category_to_detection_name(
                    sample_annotation['category_name'])
                if detection_name is None:
                    continue

                # Get attribute_name.
                attr_tokens = sample_annotation['attribute_tokens']
                attr_count = len(attr_tokens)
                if attr_count == 0:
                    attribute_name = ''
                elif attr_count == 1:
                    attribute_name = attribute_map[attr_tokens[0]]
                else:
                    raise Exception(
                        'Error: GT annotations must not have more than one attribute!'
                    )

                sample_boxes.append(
                    box_cls(
                        sample_token=sample_token,
                        translation=sample_annotation['translation'],
                        size=sample_annotation['size'],
                        rotation=sample_annotation['rotation'],
                        velocity=nusc.box_velocity(
                            sample_annotation['token'])[:2],
                        num_pts=sample_annotation['num_lidar_pts'] +
                        sample_annotation['num_radar_pts'],
                        detection_name=detection_name,
                        detection_score=-1.0,  # GT samples do not have a score.
                        attribute_name=attribute_name))
            elif box_cls == TrackingBox:
                # Use nuScenes token as tracking id.
                tracking_id = sample_annotation['instance_token']
                tracking_id_set.add(tracking_id)

                # Get label name in detection task and filter unused labels.
                tracking_name = category_to_tracking_name(
                    sample_annotation['category_name'])
                if tracking_name is None:
                    continue

                sample_boxes.append(
                    box_cls(
                        sample_token=sample_token,
                        translation=sample_annotation['translation'],
                        size=sample_annotation['size'],
                        rotation=sample_annotation['rotation'],
                        velocity=nusc.box_velocity(
                            sample_annotation['token'])[:2],
                        num_pts=sample_annotation['num_lidar_pts'] +
                        sample_annotation['num_radar_pts'],
                        tracking_id=tracking_id,
                        tracking_name=tracking_name,
                        tracking_score=-1.0  # GT samples do not have a score.
                    ))
            else:
                raise NotImplementedError('Error: Invalid box_cls %s!' %
                                          box_cls)

        all_annotations.add_boxes(sample_token, sample_boxes)

    if verbose:
        print("Loaded ground truth annotations for {} samples.".format(
            len(all_annotations.sample_tokens)))

    return all_annotations
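
A hedged call sketch for the box_cls-parameterised variant above; the dataroot is an assumption and the import paths follow the devkit layout:

from nuscenes.nuscenes import NuScenes
from nuscenes.eval.detection.data_classes import DetectionBox
from nuscenes.eval.tracking.data_classes import TrackingBox

nusc = NuScenes(version='v1.0-mini', dataroot='/data/sets/nuscenes', verbose=False)
det_gt = load_gt(nusc, 'mini_val', DetectionBox)   # detection-task GT
trk_gt = load_gt(nusc, 'mini_val', TrackingBox)    # tracking-task GT
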
Example #7
def preprocess(nusc,
               split_names,
               root_dir,
               out_dir,
               keyword=None,
               keyword_action=None,
               subset_name=None,
               location=None):
    # cannot process day/night and location at the same time
    assert not (bool(keyword) and bool(location))
    if keyword:
        assert keyword_action in ['filter', 'exclude']

    # init dict to save
    pkl_dict = {}
    for split_name in split_names:
        pkl_dict[split_name] = []

    for i, sample in enumerate(nusc.sample):
        curr_scene_name = nusc.get('scene', sample['scene_token'])['name']

        # get if the current scene is in train, val or test
        curr_split = None
        for split_name in split_names:
            if curr_scene_name in getattr(splits, split_name):
                curr_split = split_name
                break
        if curr_split is None:
            continue

        if subset_name == 'night':
            if curr_split == 'train':
                if curr_scene_name in splits.val_night:
                    curr_split = 'val'
        if subset_name == 'singapore':
            if curr_split == 'train':
                if curr_scene_name in splits.val_singapore:
                    curr_split = 'val'

        # filter for day/night
        if keyword:
            scene_description = nusc.get("scene",
                                         sample["scene_token"])["description"]
            if keyword.lower() in scene_description.lower():
                if keyword_action == 'exclude':
                    # skip sample
                    continue
            else:
                if keyword_action == 'filter':
                    # skip sample
                    continue

        if location:
            scene = nusc.get("scene", sample["scene_token"])
            if location not in nusc.get("log", scene['log_token'])['location']:
                continue

        lidar_token = sample["data"]["LIDAR_TOP"]
        cam_front_token = sample["data"]["CAM_FRONT"]
        lidar_path, boxes_lidar, _ = nusc.get_sample_data(lidar_token)
        cam_path, boxes_front_cam, cam_intrinsic = nusc.get_sample_data(
            cam_front_token)

        print('{}/{} {} {}'.format(i + 1, len(nusc.sample), curr_scene_name,
                                   lidar_path))

        sd_rec_lidar = nusc.get('sample_data', sample['data']["LIDAR_TOP"])
        cs_record_lidar = nusc.get('calibrated_sensor',
                                   sd_rec_lidar['calibrated_sensor_token'])
        pose_record_lidar = nusc.get('ego_pose',
                                     sd_rec_lidar['ego_pose_token'])
        sd_rec_cam = nusc.get('sample_data', sample['data']["CAM_FRONT"])
        cs_record_cam = nusc.get('calibrated_sensor',
                                 sd_rec_cam['calibrated_sensor_token'])
        pose_record_cam = nusc.get('ego_pose', sd_rec_cam['ego_pose_token'])

        calib_infos = {
            "lidar2ego_translation": cs_record_lidar['translation'],
            "lidar2ego_rotation": cs_record_lidar['rotation'],
            "ego2global_translation_lidar": pose_record_lidar['translation'],
            "ego2global_rotation_lidar": pose_record_lidar['rotation'],
            "ego2global_translation_cam": pose_record_cam['translation'],
            "ego2global_rotation_cam": pose_record_cam['rotation'],
            "cam2ego_translation": cs_record_cam['translation'],
            "cam2ego_rotation": cs_record_cam['rotation'],
            "cam_intrinsic": cam_intrinsic,
        }

        # load lidar points
        pts = np.fromfile(lidar_path, dtype=np.float32,
                          count=-1).reshape([-1, 5])[:, :3].T

        # map point cloud into front camera image
        pts_valid_flag, pts_cam_coord, pts_img = map_pointcloud_to_image(
            pts, (900, 1600, 3), calib_infos)
        # fliplr so that indexing is row, col and not col, row
        pts_img = np.ascontiguousarray(np.fliplr(pts_img))

        # only use lidar points in the front camera image
        pts = pts[:, pts_valid_flag]

        num_pts = pts.shape[1]
        seg_labels = np.full(num_pts,
                             fill_value=len(class_names_to_id),
                             dtype=np.uint8)
        # only use boxes that are visible in camera
        valid_box_tokens = [box.token for box in boxes_front_cam]
        boxes = [box for box in boxes_lidar if box.token in valid_box_tokens]
        for box in boxes:
            # get points that lie inside of the box
            fg_mask = points_in_box(box, pts)
            det_class = category_to_detection_name(box.name)
            if det_class is not None:
                seg_labels[fg_mask] = class_names_to_id[det_class]

        # convert to relative path
        lidar_path = lidar_path.replace(root_dir + '/', '')
        cam_path = cam_path.replace(root_dir + '/', '')

        # transpose to yield shape (num_points, 3)
        pts = pts.T

        # append data to train, val or test list in pkl_dict
        data_dict = {
            'points': pts,
            'seg_labels': seg_labels,
            'points_img': pts_img,  # row, col format, shape: (num_points, 2)
            'lidar_path': lidar_path,
            'camera_path': cam_path,
            'boxes': boxes_lidar,
            "sample_token": sample["token"],
            "scene_name": curr_scene_name,
            "calib": calib_infos
        }
        pkl_dict[curr_split].append(data_dict)

    # save to pickle file
    save_dir = osp.join(out_dir, 'preprocess')
    os.makedirs(save_dir, exist_ok=True)
    for split_name in split_names:
        save_path = osp.join(
            save_dir,
            '{}{}.pkl'.format(split_name,
                              '_' + subset_name if subset_name else ''))
        with open(save_path, 'wb') as f:
            pickle.dump(pkl_dict[split_name], f)
            print('Wrote preprocessed data to ' + save_path)
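
A hedged invocation of the preprocessing routine above; the dataroot, output directory and the day/night keyword filter are assumptions:

from nuscenes.nuscenes import NuScenes

nusc = NuScenes(version='v1.0-trainval', dataroot='/data/nuscenes', verbose=True)
preprocess(nusc,
           split_names=['train', 'val', 'test'],
           root_dir='/data/nuscenes',
           out_dir='/data/nuscenes_preprocessed',
           keyword='night',
           keyword_action='exclude')
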
Example #8
    def load_annotations(self, idx):
        """
        Loads the nuScenes GT annotations for one sample and converts them to KITTI format.
        """
        annotations = []
        kitti_to_nu_lidar = Quaternion(axis=(0, 0, 1), angle=np.pi / 2)
        kitti_to_nu_lidar_inv = kitti_to_nu_lidar.inverse

        sample_token = self.sample_tokens[idx]
        sample = self.nusc.get('sample', sample_token)
        sample_annotation_tokens = sample['anns']

        cam_front_token = sample['data'][self.cam_name]
        lidar_token = sample['data'][self.lidar_name]

        # Retrieve sensor records.
        sd_record_cam = self.nusc.get('sample_data', cam_front_token)
        sd_record_lid = self.nusc.get('sample_data', lidar_token)
        cs_record_cam = self.nusc.get('calibrated_sensor',
                                      sd_record_cam['calibrated_sensor_token'])
        cs_record_lid = self.nusc.get('calibrated_sensor',
                                      sd_record_lid['calibrated_sensor_token'])

        # Combine transformations and convert to KITTI format.
        # Note: cam uses same conventions in KITTI and nuScenes.
        lid_to_ego = transform_matrix(cs_record_lid['translation'],
                                      Quaternion(cs_record_lid['rotation']),
                                      inverse=False)
        ego_to_cam = transform_matrix(cs_record_cam['translation'],
                                      Quaternion(cs_record_cam['rotation']),
                                      inverse=True)
        velo_to_cam = np.dot(ego_to_cam, lid_to_ego)

        # Convert from KITTI to nuScenes LIDAR coordinates, where we apply velo_to_cam.
        velo_to_cam_kitti = np.dot(velo_to_cam,
                                   kitti_to_nu_lidar.transformation_matrix)

        # Currently not used.
        imu_to_velo_kitti = np.zeros((3, 4), dtype=np.float32)  # Dummy values.
        r0_rect = Quaternion(axis=[1, 0, 0], angle=0)  # Dummy values.

        # Projection matrix.
        p_left_kitti = np.zeros((3, 4), dtype=np.float32)
        p_left_kitti[:3, :3] = cs_record_cam[
            'camera_intrinsic']  # Cameras are always rectified.
        p_left_kitti = p_left_kitti[:3, :3]

        # Create KITTI style transforms.
        velo_to_cam_rot = velo_to_cam_kitti[:3, :3]
        velo_to_cam_trans = velo_to_cam_kitti[:3, 3]

        # Check that the rotation has the same format as in KITTI.
        assert (velo_to_cam_rot.round(0) == np.array([[0, -1, 0], [0, 0, -1],
                                                      [1, 0, 0]])).all()
        assert (velo_to_cam_trans[1:3] < 0).all()

        # Retrieve the token from the lidar.
        # Note that this may be confusing as the filename of the camera will include the timestamp of the lidar,
        # not the camera.
        filename_cam_full = sd_record_cam['filename']
        filename_lid_full = sd_record_lid['filename']

        if self.is_train:
            for sample_annotation_token in sample_annotation_tokens:

                sample_annotation = self.nusc.get('sample_annotation',
                                                  sample_annotation_token)
                # Convert nuScenes category to nuScenes detection challenge category.
                detection_name = category_to_detection_name(
                    sample_annotation['category_name'])
                if detection_name in self.classes:
                    # Get box in LIDAR frame.
                    _, box_lidar_nusc, _ = self.nusc.get_sample_data(
                        lidar_token,
                        box_vis_level=BoxVisibility.NONE,
                        selected_anntokens=[sample_annotation_token])
                    box_lidar_nusc = box_lidar_nusc[0]

                    # Convert from nuScenes to KITTI box format.
                    box_cam_kitti = KittiDB.box_nuscenes_to_kitti(
                        box_lidar_nusc, Quaternion(matrix=velo_to_cam_rot),
                        velo_to_cam_trans, r0_rect)

                    # Convert quaternion to yaw angle.
                    v = np.dot(box_cam_kitti.rotation_matrix,
                               np.array([1, 0, 0]))
                    yaw = -np.arctan2(v[2], v[0])

                    annotations.append({
                        "class":
                        detection_name,
                        "label":
                        TYPE_ID_CONVERSION[detection_name],
                        "dimensions": [
                            float(box_cam_kitti.wlh[2]),
                            float(box_cam_kitti.wlh[0]),
                            float(box_cam_kitti.wlh[1])
                        ],
                        "locations": [
                            float(box_cam_kitti.center[0]),
                            float(box_cam_kitti.center[1]),
                            float(box_cam_kitti.center[2])
                        ],
                        "rot_y":
                        float(yaw)
                    })

        return annotations, p_left_kitti

    def nuscenes_gt_to_kitti(self) -> None:
        """
        Converts nuScenes GT annotations to KITTI format.
        """
        kitti_to_nu_lidar = Quaternion(axis=(0, 0, 1), angle=np.pi / 2)
        kitti_to_nu_lidar_inv = kitti_to_nu_lidar.inverse
        imsize = (1600, 900)

        token_idx = 0  # Start tokens from 0.

        # Get assignment of scenes to splits.
        split_logs = create_splits_logs(self.split, self.nusc)

        # Create output folders.
        label_folder = os.path.join(self.nusc_kitti_dir, self.split, 'label_2')
        calib_folder = os.path.join(self.nusc_kitti_dir, self.split, 'calib')
        image_folder = os.path.join(self.nusc_kitti_dir, self.split, 'image_2')
        lidar_folder = os.path.join(self.nusc_kitti_dir, self.split,
                                    'velodyne')
        for folder in [label_folder, calib_folder, image_folder, lidar_folder]:
            if not os.path.isdir(folder):
                os.makedirs(folder)

        # Use only the samples from the current split.
        sample_tokens = self._split_to_samples(split_logs)
        sample_tokens = sample_tokens[:self.image_count]

        tokens = []
        for sample_token in sample_tokens:

            # Get sample data.
            sample = self.nusc.get('sample', sample_token)
            sample_annotation_tokens = sample['anns']
            cam_front_token = sample['data'][self.cam_name]
            lidar_token = sample['data'][self.lidar_name]

            # Retrieve sensor records.
            sd_record_cam = self.nusc.get('sample_data', cam_front_token)
            sd_record_lid = self.nusc.get('sample_data', lidar_token)
            cs_record_cam = self.nusc.get(
                'calibrated_sensor', sd_record_cam['calibrated_sensor_token'])
            cs_record_lid = self.nusc.get(
                'calibrated_sensor', sd_record_lid['calibrated_sensor_token'])

            # Combine transformations and convert to KITTI format.
            # Note: cam uses same conventions in KITTI and nuScenes.
            lid_to_ego = transform_matrix(cs_record_lid['translation'],
                                          Quaternion(
                                              cs_record_lid['rotation']),
                                          inverse=False)
            ego_to_cam = transform_matrix(cs_record_cam['translation'],
                                          Quaternion(
                                              cs_record_cam['rotation']),
                                          inverse=True)
            velo_to_cam = np.dot(ego_to_cam, lid_to_ego)

            # Convert from KITTI to nuScenes LIDAR coordinates, where we apply velo_to_cam.
            velo_to_cam_kitti = np.dot(velo_to_cam,
                                       kitti_to_nu_lidar.transformation_matrix)

            # Currently not used.
            imu_to_velo_kitti = np.zeros((3, 4))  # Dummy values.
            r0_rect = Quaternion(axis=[1, 0, 0], angle=0)  # Dummy values.

            # Projection matrix.
            p_left_kitti = np.zeros((3, 4))
            p_left_kitti[:3, :3] = cs_record_cam[
                'camera_intrinsic']  # Cameras are always rectified.

            # Create KITTI style transforms.
            velo_to_cam_rot = velo_to_cam_kitti[:3, :3]
            velo_to_cam_trans = velo_to_cam_kitti[:3, 3]

            # Check that the rotation has the same format as in KITTI.
            assert (velo_to_cam_rot.round(0) == np.array([[0, -1,
                                                           0], [0, 0, -1],
                                                          [1, 0, 0]])).all()
            assert (velo_to_cam_trans[1:3] < 0).all()

            # Retrieve the token from the lidar.
            # Note that this may be confusing as the filename of the camera will include the timestamp of the lidar,
            # not the camera.
            filename_cam_full = sd_record_cam['filename']
            filename_lid_full = sd_record_lid['filename']
            # token = '%06d' % token_idx # Alternative to use KITTI names.
            token_idx += 1

            # Convert image (jpg to png).
            src_im_path = os.path.join(self.nusc.dataroot, filename_cam_full)
            dst_im_path = os.path.join(image_folder, sample_token + '.png')
            if not os.path.exists(dst_im_path):
                im = Image.open(src_im_path)
                im.save(dst_im_path, "PNG")

            # Convert lidar.
            # Note that we are only using a single sweep, instead of the commonly used n sweeps.
            src_lid_path = os.path.join(self.nusc.dataroot, filename_lid_full)
            dst_lid_path = os.path.join(lidar_folder, sample_token + '.bin')
            assert not dst_lid_path.endswith('.pcd.bin')
            pcl = LidarPointCloud.from_file(src_lid_path)
            pcl.rotate(
                kitti_to_nu_lidar_inv.rotation_matrix)  # In KITTI lidar frame.
            with open(dst_lid_path, "wb") as lid_file:  # binary mode: tofile writes raw bytes
                pcl.points.T.tofile(lid_file)

            # Add to tokens.
            tokens.append(sample_token)

            # Create calibration file.
            kitti_transforms = dict()
            kitti_transforms['P0'] = np.zeros((3, 4))  # Dummy values.
            kitti_transforms['P1'] = np.zeros((3, 4))  # Dummy values.
            kitti_transforms['P2'] = p_left_kitti  # Left camera transform.
            kitti_transforms['P3'] = np.zeros((3, 4))  # Dummy values.
            kitti_transforms[
                'R0_rect'] = r0_rect.rotation_matrix  # Cameras are already rectified.
            kitti_transforms['Tr_velo_to_cam'] = np.hstack(
                (velo_to_cam_rot, velo_to_cam_trans.reshape(3, 1)))
            kitti_transforms['Tr_imu_to_velo'] = imu_to_velo_kitti
            calib_path = os.path.join(calib_folder, sample_token + '.txt')
            with open(calib_path, "w") as calib_file:
                for (key, val) in kitti_transforms.items():
                    val = val.flatten()
                    val_str = '%.12e' % val[0]
                    for v in val[1:]:
                        val_str += ' %.12e' % v
                    calib_file.write('%s: %s\n' % (key, val_str))

            # Write label file.
            label_path = os.path.join(label_folder, sample_token + '.txt')
            if os.path.exists(label_path):
                print('Skipping existing file: %s' % label_path)
                continue
            else:
                print('Writing file: %s' % label_path)
            with open(label_path, "w") as label_file:
                for sample_annotation_token in sample_annotation_tokens:
                    sample_annotation = self.nusc.get('sample_annotation',
                                                      sample_annotation_token)

                    # Get box in LIDAR frame.
                    _, box_lidar_nusc, _ = self.nusc.get_sample_data(
                        lidar_token,
                        box_vis_level=BoxVisibility.NONE,
                        selected_anntokens=[sample_annotation_token])
                    box_lidar_nusc = box_lidar_nusc[0]

                    # Truncated: Set all objects to 0 which means untruncated.
                    truncated = 0.0

                    # Occluded: Set all objects to full visibility as this information is not available in nuScenes.
                    occluded = 0

                    # Convert nuScenes category to nuScenes detection challenge category.
                    detection_name = category_to_detection_name(
                        sample_annotation['category_name'])

                    # Skip categories that are not part of the nuScenes detection challenge.
                    if detection_name is None:
                        continue

                    # Convert from nuScenes to KITTI box format.
                    box_cam_kitti = KittiDB.box_nuscenes_to_kitti(
                        box_lidar_nusc, Quaternion(matrix=velo_to_cam_rot),
                        velo_to_cam_trans, r0_rect)

                    # Project 3d box to 2d box in image, ignore box if it does not fall inside.
                    bbox_2d = KittiDB.project_kitti_box_to_image(box_cam_kitti,
                                                                 p_left_kitti,
                                                                 imsize=imsize)
                    if bbox_2d is None:
                        continue

                    # Set dummy score so we can use this file as result.
                    box_cam_kitti.score = 0

                    # Convert box to output string format.
                    output = KittiDB.box_to_string(name=detection_name,
                                                   box=box_cam_kitti,
                                                   bbox_2d=bbox_2d,
                                                   truncation=truncated,
                                                   occlusion=occluded)

                    # Write to disk.
                    label_file.write(output + '\n')
Example #10
    def nuscenes_gt_to_kitti(self) -> None:
        """
        Converts nuScenes GT annotations to KITTI format.
        """
        kitti_to_nu_lidar = Quaternion(axis=(0, 0, 1), angle=np.pi / 2)
        kitti_to_nu_lidar_inv = kitti_to_nu_lidar.inverse
        imsize = (1600, 900)

        token_idx = 0  # Start tokens from 0.

        # Get assignment of scenes to splits.
        split_logs = create_splits_logs(self.split, self.nusc)

        scene_splits = create_splits_scenes(verbose=False)
        scene_to_log = {
            scene['name']: self.nusc.get('log', scene['log_token'])['logfile']
            for scene in self.nusc.scene
        }
        logs = set()
        scenes = scene_splits[self.split]
        for scene in scenes:
            logs.add(scene_to_log[scene])
        # print(len(scenes), len(logs))

        split_mapping = {"train": "training", "val": "testing"}

        # Create output folders.
        label_folder = os.path.join(self.nusc_kitti_dir,
                                    split_mapping[self.split], 'label_2')
        calib_folder = os.path.join(self.nusc_kitti_dir,
                                    split_mapping[self.split], 'calib')
        image_folder = os.path.join(self.nusc_kitti_dir,
                                    split_mapping[self.split], 'image_2')
        lidar_folder = os.path.join(self.nusc_kitti_dir,
                                    split_mapping[self.split], 'velodyne')
        for folder in [label_folder, calib_folder, image_folder, lidar_folder]:
            if not os.path.isdir(folder):
                os.makedirs(folder)

        # Use only the samples from the current split.
        sample_tokens = self._split_to_samples(split_logs)
        # sample_tokens = sample_tokens[:self.image_count]

        # print(len(sample_tokens))
        tokens = []
        if self.split == "train":
            split_file = [
                os.path.join(self.nusc_kitti_dir, "train.txt"),
                os.path.join(self.nusc_kitti_dir, "val.txt")
            ]
        elif self.split == 'val':
            split_file = os.path.join(self.nusc_kitti_dir, "test.txt")
        # if os.path.isfile(split_file):
        #     os.remove(split_file)
        if self.split == "train":
            cnt = 0
            with open(split_file[0], "w") as f:
                for seq in list(self.sequence_mapping.keys())[:-150]:
                    for tk in self.sequence_mapping[seq]:
                        f.write("%06d" % tk + "\n")
                        cnt += 1
            # print(cnt)

            cnt = 0
            with open(split_file[1], "w") as f:
                for seq in list(self.sequence_mapping.keys())[-150:]:
                    for tk in self.sequence_mapping[seq]:
                        f.write("%06d" % tk + "\n")
                        cnt += 1
            # print(cnt)
        elif self.split == "val":
            with open(split_file, "w") as f:
                for seq in self.sequence_mapping.keys():
                    for tk in self.sequence_mapping[seq]:
                        f.write("%06d" % tk + "\n")

        for idx, sample_token in enumerate(sample_tokens):

            # Get sample data.
            sample = self.nusc.get('sample', sample_token)
            sample_annotation_tokens = sample['anns']
            cam_front_token = sample['data'][self.cam_name]
            lidar_token = sample['data'][self.lidar_name]
            sample_name = "%06d" % idx

            # Retrieve sensor records.
            sd_record_cam = self.nusc.get('sample_data', cam_front_token)
            sd_record_lid = self.nusc.get('sample_data', lidar_token)
            cs_record_cam = self.nusc.get(
                'calibrated_sensor', sd_record_cam['calibrated_sensor_token'])
            cs_record_lid = self.nusc.get(
                'calibrated_sensor', sd_record_lid['calibrated_sensor_token'])

            # Combine transformations and convert to KITTI format.
            # Note: cam uses same conventions in KITTI and nuScenes.
            lid_to_ego = transform_matrix(cs_record_lid['translation'],
                                          Quaternion(
                                              cs_record_lid['rotation']),
                                          inverse=False)
            ego_to_cam = transform_matrix(cs_record_cam['translation'],
                                          Quaternion(
                                              cs_record_cam['rotation']),
                                          inverse=True)
            velo_to_cam = np.dot(ego_to_cam, lid_to_ego)

            # Convert from KITTI to nuScenes LIDAR coordinates, where we apply velo_to_cam.
            velo_to_cam_kitti = np.dot(velo_to_cam,
                                       kitti_to_nu_lidar.transformation_matrix)

            # Currently not used.
            imu_to_velo_kitti = np.zeros((3, 4))  # Dummy values.
            r0_rect = Quaternion(axis=[1, 0, 0], angle=0)  # Dummy values.

            # Projection matrix.
            p_left_kitti = np.zeros((3, 4))
            p_left_kitti[:3, :3] = cs_record_cam[
                'camera_intrinsic']  # Cameras are always rectified.

            # Create KITTI style transforms.
            velo_to_cam_rot = velo_to_cam_kitti[:3, :3]
            velo_to_cam_trans = velo_to_cam_kitti[:3, 3]

            # Check that the rotation has the same format as in KITTI.
            assert (velo_to_cam_rot.round(0) == np.array([[0, -1,
                                                           0], [0, 0, -1],
                                                          [1, 0, 0]])).all()
            assert (velo_to_cam_trans[1:3] < 0).all()

            # Retrieve the token from the lidar.
            # Note that this may be confusing as the filename of the camera will include the timestamp of the lidar,
            # not the camera.
            filename_cam_full = sd_record_cam['filename']
            filename_lid_full = sd_record_lid['filename']
            # token = '%06d' % token_idx # Alternative to use KITTI names.
            token_idx += 1

            # Convert image (jpg to png).
            src_im_path = os.path.join(self.nusc.dataroot, filename_cam_full)
            dst_im_path = os.path.join(image_folder, sample_name + '.png')
            if not os.path.exists(dst_im_path):
                im = Image.open(src_im_path)
                im.save(dst_im_path, "PNG")

            # Convert lidar.
            # Note that we are only using a single sweep, instead of the commonly used n sweeps.
            src_lid_path = os.path.join(self.nusc.dataroot, filename_lid_full)
            dst_lid_path = os.path.join(lidar_folder, sample_name + '.bin')
            assert not dst_lid_path.endswith('.pcd.bin')
            pcl = LidarPointCloud.from_file(src_lid_path)
            # pcl, _ = LidarPointCloud.from_file_multisweep_future(self.nusc, sample, self.lidar_name, self.lidar_name, nsweeps=5)
            pcl.rotate(
                kitti_to_nu_lidar_inv.rotation_matrix)  # In KITTI lidar frame.
            with open(dst_lid_path, "wb") as lid_file:  # binary mode: tofile writes raw bytes
                pcl.points.T.tofile(lid_file)

            # Add to tokens.
            tokens.append(sample_token)

            # Create calibration file.
            kitti_transforms = dict()
            kitti_transforms['P0'] = np.zeros((3, 4))  # Dummy values.
            kitti_transforms['P1'] = np.zeros((3, 4))  # Dummy values.
            kitti_transforms['P2'] = p_left_kitti  # Left camera transform.
            kitti_transforms['P3'] = np.zeros((3, 4))  # Dummy values.
            kitti_transforms[
                'R0_rect'] = r0_rect.rotation_matrix  # Cameras are already rectified.
            kitti_transforms['Tr_velo_to_cam'] = np.hstack(
                (velo_to_cam_rot, velo_to_cam_trans.reshape(3, 1)))
            kitti_transforms['Tr_imu_to_velo'] = imu_to_velo_kitti
            calib_path = os.path.join(calib_folder, sample_name + '.txt')
            with open(calib_path, "w") as calib_file:
                for (key, val) in kitti_transforms.items():
                    val = val.flatten()
                    val_str = '%.12e' % val[0]
                    for v in val[1:]:
                        val_str += ' %.12e' % v
                    calib_file.write('%s: %s\n' % (key, val_str))

            # Write label file.
            label_path = os.path.join(label_folder, sample_name + '.txt')
            if os.path.exists(label_path):
                # print('Skipping existing file: %s' % label_path)
                continue
            # else:
            #     print('Writing file: %s' % label_path)

            objects = []
            for sample_annotation_token in sample_annotation_tokens:
                sample_annotation = self.nusc.get('sample_annotation',
                                                  sample_annotation_token)

                # Get box in LIDAR frame.
                _, box_lidar_nusc, _ = self.nusc.get_sample_data(
                    lidar_token,
                    box_vis_level=BoxVisibility.NONE,
                    selected_anntokens=[sample_annotation_token])
                box_lidar_nusc = box_lidar_nusc[0]

                # Truncated: Set all objects to 0 which means untruncated.
                truncated = 0.0

                # Occluded: Set all objects to full visibility as this information is not available in nuScenes.
                occluded = 0

                obj = dict()
                obj["occluded"] = occluded  # stored here so the label writer below can read obj["occluded"]

                # Convert nuScenes category to nuScenes detection challenge category.
                obj["detection_name"] = category_to_detection_name(
                    sample_annotation['category_name'])

                # Skip categories that are not part of the nuScenes detection challenge.
                if obj["detection_name"] is None or obj[
                        "detection_name"] not in CLASS_MAP.keys():
                    continue

                obj["detection_name"] = CLASS_MAP[obj["detection_name"]]

                # Convert from nuScenes to KITTI box format.
                obj["box_cam_kitti"] = KittiDB.box_nuscenes_to_kitti(
                    box_lidar_nusc, Quaternion(matrix=velo_to_cam_rot),
                    velo_to_cam_trans, r0_rect)

                # Project 3d box to 2d box in image, ignore box if it does not fall inside.
                bbox_2d = project_to_2d(obj["box_cam_kitti"], p_left_kitti,
                                        imsize[1], imsize[0])
                if bbox_2d is None:
                    continue
                obj["bbox_2d"] = bbox_2d["bbox"]
                obj["truncated"] = bbox_2d["truncated"]

                # Set dummy score so we can use this file as result.
                obj["box_cam_kitti"].score = 0

                v = np.dot(obj["box_cam_kitti"].rotation_matrix,
                           np.array([1, 0, 0]))
                rot_y = -np.arctan2(v[2], v[0])
                obj["alpha"] = -np.arctan2(
                    obj["box_cam_kitti"].center[0],
                    obj["box_cam_kitti"].center[2]) + rot_y
                obj["depth"] = np.linalg.norm(
                    np.array(obj["box_cam_kitti"].center[:3]))
                objects.append(obj)

            objects = postprocessing(objects, imsize[1], imsize[0])

            with open(label_path, "w") as label_file:
                for obj in objects:
                    # Convert box to output string format.
                    output = box_to_string(name=obj["detection_name"],
                                           box=obj["box_cam_kitti"],
                                           bbox_2d=obj["bbox_2d"],
                                           truncation=obj["truncated"],
                                           occlusion=obj["occluded"],
                                           alpha=obj["alpha"])
                    label_file.write(output + '\n')
Example #11
def compute_labels_image(nusc, sample, sensor, nu_to_kitti_lidar, p):
    resolution = p['pointcloud_grid_map_interface']['grids']['cartesian']['resolution']['x']
    if resolution - p['pointcloud_grid_map_interface']['grids']['cartesian']['resolution']['y'] > 0.001:
        raise ValueError('Grid Map resolution in x and y direction need to be equal')
    length = p['pointcloud_grid_map_interface']['grids']['cartesian']['range']['x']
    width = p['pointcloud_grid_map_interface']['grids']['cartesian']['range']['y']
    grid_map_origin_idx = np.array(
        [length / 2 + p['pointcloud_grid_map_interface']['grids']['cartesian']['offset']['x'],
         width / 2])
    labels_corners = []
    labels_center = []
    labels_data = []
    for annotation_token in sample['anns']:
        annotation_metadata = nusc.get('sample_annotation', annotation_token)
        if annotation_metadata['num_lidar_pts'] == 0:
            continue
        _, box_lidar, _ = nusc.get_sample_data(sample['data'][sensor], box_vis_level=BoxVisibility.NONE,
                                               selected_anntokens=[annotation_token])
        box_lidar = box_lidar[0]
        box_lidar.rotate(nu_to_kitti_lidar)
        detection_name = category_to_detection_name(annotation_metadata['category_name'])
        if detection_name is None:
            continue
        # corners_obj: 4 x 8 homogeneous matrix; each column is one corner (l, w, h) of the 3D bounding box
        corners_obj = np.array(
            [[box_lidar.wlh[1] / 2, box_lidar.wlh[1] / 2, - box_lidar.wlh[1] / 2, - box_lidar.wlh[1] / 2,
              box_lidar.wlh[1] / 2, box_lidar.wlh[1] / 2, - box_lidar.wlh[1] / 2, - box_lidar.wlh[1] / 2],
             [box_lidar.wlh[0] / 2, - box_lidar.wlh[0] / 2, - box_lidar.wlh[0] / 2, box_lidar.wlh[0] / 2,
              box_lidar.wlh[0] / 2, - box_lidar.wlh[0] / 2, -box_lidar.wlh[0] / 2, box_lidar.wlh[0] / 2],
             [box_lidar.wlh[2] / 2, box_lidar.wlh[2] / 2, box_lidar.wlh[2] / 2, box_lidar.wlh[2] / 2,
              - box_lidar.wlh[2] / 2, - box_lidar.wlh[2] / 2, - box_lidar.wlh[2] / 2, - box_lidar.wlh[2] / 2],
             [1, 1, 1, 1, 1, 1, 1, 1]])

        # tf_velo: box 3d pose affine transformation with respect to lidar origin
        tf_velo = np.array(
            [[box_lidar.rotation_matrix[0][0], box_lidar.rotation_matrix[0][1], box_lidar.rotation_matrix[0][2],
              box_lidar.center[0]],
             [box_lidar.rotation_matrix[1][0], box_lidar.rotation_matrix[1][1], box_lidar.rotation_matrix[1][2],
              box_lidar.center[1]],
             [box_lidar.rotation_matrix[2][0], box_lidar.rotation_matrix[2][1], box_lidar.rotation_matrix[2][2],
              box_lidar.center[2]],
             [0, 0, 0, 1]])

        # tf_velo_to_image: affine transformation from lidar coordinate to image coordinate
        tf_velo_to_image = np.array([[0, -1, grid_map_origin_idx[1]], [-1, 0, grid_map_origin_idx[0]], [0, 0, 1]])

        # corners_velo: corners in lidar coordinate system
        # corners_velo_x_y: corners_velo in x-y BEV plane
        corners_velo = tf_velo.dot(corners_obj)
        corners_velo_x_y = np.array([corners_velo[0], corners_velo[1], [1, 1, 1, 1, 1, 1, 1, 1]])

        # corners_image: corners in grid map with unit meter
        # corners_image_idx: 8 * 2 matrix, corners in grid map with unit pixel
        corners_image = tf_velo_to_image.dot(corners_velo_x_y)
        corners_image_idx = np.array([corners_image[0] / resolution, corners_image[1] / resolution])

        # label_t_image_normalized: 2 * 1 vector, box center in grid map with unit pixel
        label_t_velo = np.array([box_lidar.center[0], box_lidar.center[1], box_lidar.center[2], 1])
        label_t_image = tf_velo_to_image.dot(np.array([label_t_velo[0], label_t_velo[1], 1]))
        label_t_image_normalized = np.array([label_t_image[0] / width, label_t_image[1] / length])

        # Convert rotation matrix to yaw angle
        v = np.dot(box_lidar.rotation_matrix, np.array([1, 0, 0]))
        yaw = np.arctan2(v[1], v[0])

        if 0 <= min(corners_image_idx[0]) \
                and max(corners_image_idx[0]) < width / resolution \
                and 0 <= min(corners_image_idx[1]) \
                and max(corners_image_idx[1]) < length / resolution:
            labels_corners.append(corners_image_idx)
            labels_center.append(label_t_image_normalized)
            labels_data.append(Label(type=detection_name,
                                     l=box_lidar.wlh[1],
                                     w=box_lidar.wlh[0],
                                     h=box_lidar.wlh[2],
                                     rz=yaw))
    return labels_corners, labels_center, labels_data
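
A hedged call sketch for the grid-map labeler above; the grid parameters are assumptions (only the key layout is dictated by the function), and nu_to_kitti_lidar mirrors the inverse of the kitti_to_nu_lidar quaternion used in the KITTI-conversion examples:

import numpy as np
from pyquaternion import Quaternion
from nuscenes.nuscenes import NuScenes

nusc = NuScenes(version='v1.0-mini', dataroot='/data/sets/nuscenes', verbose=False)
sample = nusc.sample[0]
nu_to_kitti_lidar = Quaternion(axis=(0, 0, 1), angle=-np.pi / 2)  # assumed inverse of kitti_to_nu_lidar
p = {'pointcloud_grid_map_interface': {'grids': {'cartesian': {
    'resolution': {'x': 0.1, 'y': 0.1},   # equal x/y resolution, as the function requires
    'range': {'x': 60.0, 'y': 60.0},
    'offset': {'x': 0.0}}}}}
labels_corners, labels_center, labels_data = compute_labels_image(
    nusc, sample, 'LIDAR_TOP', nu_to_kitti_lidar, p)
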
Example #12
def load_merge_from_pkl(nusc: NuScenes,
                        pkl_path: str,
                        box_cls,
                        verbose: bool = False) -> EvalBoxes:
    # Init.
    if box_cls == DetectionBox:
        attribute_map = {a['token']: a['name'] for a in nusc.attribute}

    if verbose:
        print('Loading annotations for {} split from nuScenes version: {}'.
              format(pkl_path, nusc.version))

    import mmcv
    infos = mmcv.load(pkl_path)['infos']
    samples = []
    for info in infos:
        samples.append(nusc.get('sample', info['token']))
    all_annotations = EvalBoxes()

    # Load annotations and filter predictions and annotations.
    merge_map = dict(car='vehicle',
                     truck='vehicle',
                     bus='vehicle',
                     trailer='vehicle',
                     construction_vehicle='vehicle',
                     pedestrian='pedestrian',
                     motorcycle='bike',
                     bicycle='bike',
                     traffic_cone='traffic_boundary',
                     barrier='traffic_boundary')
    for sample in tqdm.tqdm(samples, leave=verbose):
        sample_token = sample['token']
        cam_token = sample['data']['CAM_FRONT']
        _, boxes_cam, _ = nusc.get_sample_data(cam_token)
        sample_annotation_tokens = [box.token for box in boxes_cam]

        # sample = nusc.get('sample', sample_token)
        # sample_annotation_tokens = sample['anns']

        sample_boxes = []
        for sample_annotation_token in sample_annotation_tokens:

            sample_annotation = nusc.get('sample_annotation',
                                         sample_annotation_token)
            if box_cls == DetectionBox:
                # Get label name in detection task and filter unused labels.
                detection_name = category_to_detection_name(
                    sample_annotation['category_name'])
                if detection_name is None:
                    continue
                detection_name = merge_map[detection_name]

                # Get attribute_name.
                attr_tokens = sample_annotation['attribute_tokens']
                attr_count = len(attr_tokens)
                if attr_count == 0:
                    attribute_name = ''
                elif attr_count == 1:
                    attribute_name = attribute_map[attr_tokens[0]]
                else:
                    raise Exception(
                        'Error: GT annotations must not have more than one attribute!'
                    )

                sample_boxes.append(
                    box_cls(
                        sample_token=sample_token,
                        translation=sample_annotation['translation'],
                        size=sample_annotation['size'],
                        rotation=sample_annotation['rotation'],
                        velocity=nusc.box_velocity(
                            sample_annotation['token'])[:2],
                        num_pts=sample_annotation['num_lidar_pts'] +
                        sample_annotation['num_radar_pts'],
                        detection_name=detection_name,
                        detection_score=-1.0,  # GT samples do not have a score.
                        attribute_name=attribute_name))
            else:
                raise NotImplementedError('Error: Invalid box_cls %s!' %
                                          box_cls)

        all_annotations.add_boxes(sample_token, sample_boxes)

    if verbose:
        print("Loaded ground truth annotations for {} samples.".format(
            len(all_annotations.sample_tokens)))

    return all_annotations
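
A hedged call for the merged-class loader above; the info-file path is an assumption (the function expects an mmcv-loadable dict whose 'infos' entries carry sample tokens):

from nuscenes.nuscenes import NuScenes
from nuscenes.eval.detection.data_classes import DetectionBox

nusc = NuScenes(version='v1.0-trainval', dataroot='/data/nuscenes', verbose=False)
merged_gt = load_merge_from_pkl(nusc, '/data/nuscenes/nuscenes_infos_val.pkl',
                                DetectionBox, verbose=True)
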
Example #13
def main():
    SCENE_SPLITS["mini-val"] = SCENE_SPLITS["val"]
    if not os.path.exists(DATA_PATH):
        os.mkdir(DATA_PATH)
    if not os.path.exists(OUT_PATH):
        os.mkdir(OUT_PATH)
    for split in SPLITS:
        data_path = DATA_PATH  # + '{}/'.format(SPLITS[split])
        nusc = NuScenes(version=SPLITS[split],
                        dataroot=data_path,
                        verbose=True)
        out_path = OUT_PATH + "{}.json".format(split)
        categories_info = [{
            "name": CATS[i],
            "id": i + 1
        } for i in range(len(CATS))]
        ret = {
            "images": [],
            "annotations": [],
            "categories": categories_info,
            "videos": [],
            "attributes": ATTRIBUTE_TO_ID,
        }
        num_images = 0
        num_anns = 0
        num_videos = 0

        # A "sample" in nuScenes refers to a timestamp with 6 cameras and 1 LIDAR.
        for sample in nusc.sample:
            scene_name = nusc.get("scene", sample["scene_token"])["name"]
            if split not in ["mini", "test"] and scene_name not in SCENE_SPLITS[split]:
                continue
            if sample["prev"] == "":
                print("scene_name", scene_name)
                num_videos += 1
                ret["videos"].append({
                    "id": num_videos,
                    "file_name": scene_name
                })
                frame_ids = {k: 0 for k in sample["data"]}
                track_ids = {}
            # We decompose a sample into 6 images in our case.
            for sensor_name in sample["data"]:
                if sensor_name in USED_SENSOR:
                    image_token = sample["data"][sensor_name]
                    image_data = nusc.get("sample_data", image_token)
                    num_images += 1

                    # Complex coordinate transform. This will take time to understand.
                    sd_record = nusc.get("sample_data", image_token)
                    cs_record = nusc.get("calibrated_sensor",
                                         sd_record["calibrated_sensor_token"])
                    pose_record = nusc.get("ego_pose",
                                           sd_record["ego_pose_token"])
                    global_from_car = transform_matrix(
                        pose_record["translation"],
                        Quaternion(pose_record["rotation"]),
                        inverse=False,
                    )
                    car_from_sensor = transform_matrix(
                        cs_record["translation"],
                        Quaternion(cs_record["rotation"]),
                        inverse=False,
                    )
                    trans_matrix = np.dot(global_from_car, car_from_sensor)
                    _, boxes, camera_intrinsic = nusc.get_sample_data(
                        image_token, box_vis_level=BoxVisibility.ANY)
                    calib = np.eye(4, dtype=np.float32)
                    calib[:3, :3] = camera_intrinsic
                    calib = calib[:3]
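                    # calib is the 3x4 projection matrix [K | 0] in KITTI convention,
                    # used below to project 3D points into the image.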
                    frame_ids[sensor_name] += 1

                    # image information in COCO format
                    image_info = {
                        "id": num_images,
                        "file_name": image_data["filename"],
                        "calib": calib.tolist(),
                        "video_id": num_videos,
                        "frame_id": frame_ids[sensor_name],
                        "sensor_id": SENSOR_ID[sensor_name],
                        "sample_token": sample["token"],
                        "trans_matrix": trans_matrix.tolist(),
                        "width": sd_record["width"],
                        "height": sd_record["height"],
                        "pose_record_trans": pose_record["translation"],
                        "pose_record_rot": pose_record["rotation"],
                        "cs_record_trans": cs_record["translation"],
                        "cs_record_rot": cs_record["rotation"],
                    }
                    ret["images"].append(image_info)
                    anns = []
                    for box in boxes:
                        det_name = category_to_detection_name(box.name)
                        if det_name is None:
                            continue
                        num_anns += 1
                        v = np.dot(box.rotation_matrix, np.array([1, 0, 0]))
                        yaw = -np.arctan2(v[2], v[0])
                        box.translate(np.array([0, box.wlh[2] / 2, 0]))
                        category_id = CAT_IDS[det_name]

                        amodel_center = project_to_image(
                            np.array(
                                [
                                    box.center[0],
                                    box.center[1] - box.wlh[2] / 2,
                                    box.center[2],
                                ],
                                np.float32,
                            ).reshape(1, 3),
                            calib,
                        )[0].tolist()
                        sample_ann = nusc.get("sample_annotation", box.token)
                        instance_token = sample_ann["instance_token"]
                        if not (instance_token in track_ids):
                            track_ids[instance_token] = len(track_ids) + 1
                        attribute_tokens = sample_ann["attribute_tokens"]
                        attributes = [
                            nusc.get("attribute", att_token)["name"]
                            for att_token in attribute_tokens
                        ]
                        att = "" if len(attributes) == 0 else attributes[0]
                        if len(attributes) > 1:
                            print(attributes)
                            import pdb

                            pdb.set_trace()
                        track_id = track_ids[instance_token]
                        vel = nusc.box_velocity(box.token)  # global frame
                        vel = np.dot(
                            np.linalg.inv(trans_matrix),
                            np.array([vel[0], vel[1], vel[2], 0], np.float32),
                        ).tolist()
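                        # The trailing 0 marks a direction vector, so only the rotation part
                        # of inv(trans_matrix) affects the velocity (translation is ignored).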

                        # instance information in COCO format
                        ann = {
                            "id": num_anns,
                            "image_id": num_images,
                            "category_id": category_id,
                            "dim": [box.wlh[2], box.wlh[0], box.wlh[1]],
                            "location": [box.center[0], box.center[1], box.center[2]],
                            "depth": box.center[2],
                            "occluded": 0,
                            "truncated": 0,
                            "rotation_y": yaw,
                            "amodel_center": amodel_center,
                            "iscrowd": 0,
                            "track_id": track_id,
                            "attributes": ATTRIBUTE_TO_ID[att],
                            "velocity": vel,
                        }

                        bbox = KittiDB.project_kitti_box_to_image(
                            copy.deepcopy(box),
                            camera_intrinsic,
                            imsize=(1600, 900))
                        alpha = _rot_y2alpha(
                            yaw,
                            (bbox[0] + bbox[2]) / 2,
                            camera_intrinsic[0, 2],
                            camera_intrinsic[0, 0],
                        )
                        ann["bbox"] = [
                            bbox[0],
                            bbox[1],
                            bbox[2] - bbox[0],
                            bbox[3] - bbox[1],
                        ]
                        ann["area"] = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
                        ann["alpha"] = alpha
                        anns.append(ann)

                    # Filter out boxes that are fully occluded by a nearer box
                    # (farther in depth and contained in the nearer box's 2D bbox).
                    visable_anns = []
                    for i in range(len(anns)):
                        vis = True
                        for j in range(len(anns)):
                            if anns[i]["depth"] - min(anns[i][
                                    "dim"]) / 2 > anns[j]["depth"] + max(
                                        anns[j]["dim"]) / 2 and _bbox_inside(
                                            anns[i]["bbox"], anns[j]["bbox"]):
                                vis = False
                                break
                        if vis:
                            visable_anns.append(anns[i])

                    for ann in visable_anns:
                        ret["annotations"].append(ann)
                    if DEBUG:
                        img_path = data_path + image_info["file_name"]
                        img = cv2.imread(img_path)
                        img_3d = img.copy()
                        for ann in visable_anns:
                            bbox = ann["bbox"]
                            cv2.rectangle(
                                img,
                                (int(bbox[0]), int(bbox[1])),
                                (int(bbox[2] + bbox[0]),
                                 int(bbox[3] + bbox[1])),
                                (0, 0, 255),
                                3,
                                lineType=cv2.LINE_AA,
                            )
                            box_3d = compute_box_3d(ann["dim"],
                                                    ann["location"],
                                                    ann["rotation_y"])
                            box_2d = project_to_image(box_3d, calib)
                            img_3d = draw_box_3d(img_3d, box_2d)

                            pt_3d = unproject_2d_to_3d(ann["amodel_center"],
                                                       ann["depth"], calib)
                            pt_3d[1] += ann["dim"][0] / 2
                            print("location", ann["location"])
                            print("loc model", pt_3d)
                            pt_2d = np.array(
                                [(bbox[0] + bbox[2]) / 2,
                                 (bbox[1] + bbox[3]) / 2],
                                dtype=np.float32,
                            )
                            pt_3d = unproject_2d_to_3d(pt_2d, ann["depth"],
                                                       calib)
                            pt_3d[1] += ann["dim"][0] / 2
                            print("loc      ", pt_3d)
                        cv2.imshow("img", img)
                        cv2.imshow("img_3d", img_3d)
                        cv2.waitKey()
                        nusc.render_sample_data(image_token)
                        plt.show()
        print("reordering images")
        images = ret["images"]
        video_sensor_to_images = {}
        for image_info in images:
            tmp_seq_id = image_info["video_id"] * 20 + image_info["sensor_id"]
            if tmp_seq_id in video_sensor_to_images:
                video_sensor_to_images[tmp_seq_id].append(image_info)
            else:
                video_sensor_to_images[tmp_seq_id] = [image_info]
        ret["images"] = []
        for tmp_seq_id in sorted(video_sensor_to_images):
            ret["images"] = ret["images"] + video_sensor_to_images[tmp_seq_id]

        print("{} {} images {} boxes".format(split, len(ret["images"]),
                                             len(ret["annotations"])))
        print("out_path", out_path)
        with open(out_path, "w") as f:
            json.dump(ret, f)
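
The alpha stored in each annotation above is the KITTI observation angle. The _rot_y2alpha helper is not included in this example; a minimal sketch consistent with how it is called (yaw, box-centre x in pixels, principal point cx, focal length fx) could be:

import numpy as np

def _rot_y2alpha(rot_y, x, cx, fx):
    # Observation angle: global yaw minus the azimuth of the ray through the box centre,
    # following the KITTI convention alpha = rot_y - arctan2(x - cx, fx).
    alpha = rot_y - np.arctan2(x - cx, fx)
    # Wrap into (-pi, pi].
    if alpha > np.pi:
        alpha -= 2 * np.pi
    if alpha < -np.pi:
        alpha += 2 * np.pi
    return alpha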
Exemple #14
0
def rasterize(ref_sample):

    obj_box_list = []
    obj_shadow_list = []

    ref_sample_token = ref_sample["token"]

    ref_lidar_data = nusc.get("sample_data", ref_sample["data"]["LIDAR_TOP"])
    ref_lidar_calib = nusc.get("calibrated_sensor", ref_lidar_data["calibrated_sensor_token"])
    ref_lidar_pose = nusc.get("ego_pose", ref_lidar_data["ego_pose_token"])

    ref_from_car = transform_matrix(ref_lidar_calib["translation"],
                                    Quaternion(ref_lidar_calib["rotation"]), inverse=True)
    car_from_global = transform_matrix(ref_lidar_pose["translation"],
                                       Quaternion(ref_lidar_pose["rotation"]), inverse=True)

    next_sample_tokens = traverse(nusc, "sample", "next", n_next, ref_sample_token)
    for curr_sample_token in ([ref_sample_token] + next_sample_tokens):
        curr_sample = nusc.get("sample", curr_sample_token)

        curr_sample_boxes = []

        for sample_annotation_token in curr_sample["anns"]:

            sample_annotation = nusc.get("sample_annotation", sample_annotation_token)

            detection_name = category_to_detection_name(sample_annotation["category_name"])
            if detection_name is None:  # there are certain categories we will ignore
                continue

            # print(sample_annotation["category_name"], detection_name)
            curr_sample_boxes.append(DetectionBox(
                sample_token=curr_sample_token,
                translation=sample_annotation["translation"],
                size=sample_annotation["size"],
                rotation=sample_annotation["rotation"],
                velocity=nusc.box_velocity(sample_annotation["token"])[:2],
                num_pts=sample_annotation["num_lidar_pts"] + sample_annotation["num_radar_pts"],
                detection_name=detection_name,
            ))

        # NOTE transform boxes to the *reference* frame
        curr_sample_boxes = boxes_to_sensor(curr_sample_boxes, ref_lidar_pose, ref_lidar_calib)

        # NOTE object box binary masks
        curr_obj_box = draw_obj_boxes(curr_sample_boxes, xlim, ylim)
        obj_box_list.append(curr_obj_box)

        curr_sample_data = nusc.get("sample_data", curr_sample["data"]["LIDAR_TOP"])
        curr_lidar_pose = nusc.get("ego_pose", curr_sample_data["ego_pose_token"])
        curr_lidar_calib = nusc.get("calibrated_sensor", curr_sample_data["calibrated_sensor_token"])

        global_from_car = transform_matrix(curr_lidar_pose["translation"],
                                           Quaternion(curr_lidar_pose["rotation"]), inverse=False)
        car_from_curr = transform_matrix(curr_lidar_calib["translation"],
                                         Quaternion(curr_lidar_calib["rotation"]), inverse=False)

        ref_from_curr = reduce(np.dot, [ref_from_car, car_from_global, global_from_car, car_from_curr])
        _x0, _y0, _z0 = ref_from_curr[:3, 3]

        curr_obj_shadow = draw_object_shadow(curr_sample_boxes, _x0, _y0, xlim, ylim)
        obj_shadow_list.append(curr_obj_shadow)

    obj_boxes = np.array(obj_box_list)
    obj_shadows = np.array(obj_shadow_list)

    ref_scene = nusc.get("scene", ref_sample["scene_token"])
    ref_log = nusc.get("log", ref_scene["log_token"])
    flip_flag = ref_log["location"].startswith("singapore")  # Singapore scenes are mirrored below

    if flip_flag:
        obj_boxes = obj_boxes[:, :, ::-1]
        obj_shadows = obj_shadows[:, :, ::-1]

    ref_lidar_token = ref_lidar_data["token"]

    obj_box_path = f"{obj_box_dir}/{ref_lidar_token}.bin"
    if not os.path.exists(obj_box_path):
        obj_boxes.tofile(obj_box_path)

    obj_shadow_path = f"{obj_shadow_dir}/{ref_lidar_token}.bin"
    if not os.path.exists(obj_shadow_path):
        obj_shadows.tofile(obj_shadow_path)
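
The traverse helper used above is not part of this snippet; a plausible sketch, assuming it simply follows the given pointer field ("next") up to n_next times and returns the visited sample tokens, is:

def traverse(nusc, table_name, pointer, n_steps, start_token):
    # Follow e.g. the "next" pointer of a nuScenes record up to n_steps times,
    # collecting the visited tokens (stops early at the end of the scene).
    tokens = []
    record = nusc.get(table_name, start_token)
    for _ in range(n_steps):
        next_token = record[pointer]
        if next_token == "":
            break
        tokens.append(next_token)
        record = nusc.get(table_name, next_token)
    return tokens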
Exemple #15
0
            lidar_to_ego = transform_matrix(calib_sensor['translation'], Quaternion(calib_sensor['rotation']))
            ego_to_global_prev = transform_matrix(ego_pose_prev['translation'], Quaternion(ego_pose_prev['rotation']))
            lidar_to_ego_prev = transform_matrix(calib_sensor_prev['translation'],
                                                 Quaternion(calib_sensor_prev['rotation']))
            lidar_to_global = np.dot(ego_to_global, lidar_to_ego)
            lidar_to_global_prev = np.dot(ego_to_global_prev, lidar_to_ego_prev)
            delta = inv(lidar_to_global_prev).dot(lidar_to_global)
            y_translation.append(delta[1, 3])
            delta_angles = rotationMatrixToEulerAngles(delta[0:3, 0:3]) * 180 / np.pi
            yaw.append(delta_angles[2])

        for annotation_token in sample['anns']:
            annotation_metadata = nusc.get('sample_annotation', annotation_token)
            if annotation_metadata['num_lidar_pts'] == 0:
                continue
            detection_name = category_to_detection_name(annotation_metadata['category_name'])
            if detection_name is None:
                continue
            _, box_lidar, _ = nusc.get_sample_data(sample['data'][sensor], box_vis_level=BoxVisibility.NONE,
                                                   selected_anntokens=[annotation_token])
            box_lidar = box_lidar[0]
            box_lidar.rotate(nu_to_kitti_lidar)
            if x_positions[detection_name] is None:
                x_positions[detection_name] = [box_lidar.center[0]]
                y_positions[detection_name] = [box_lidar.center[1]]
                length[detection_name] = [box_lidar.wlh[1]]
                width[detection_name] = [box_lidar.wlh[0]]
                num_lidar_points[detection_name] = [annotation_metadata['num_lidar_pts']]
            else:
                x_positions[detection_name].append(box_lidar.center[0])
                y_positions[detection_name].append(box_lidar.center[1])
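
rotationMatrixToEulerAngles is referenced above but not defined in this fragment; a standard sketch of such a decomposition (XYZ order, radians) is:

import numpy as np

def rotationMatrixToEulerAngles(R):
    # Decompose a 3x3 rotation matrix into XYZ Euler angles (radians).
    sy = np.sqrt(R[0, 0] ** 2 + R[1, 0] ** 2)
    if sy > 1e-6:
        x = np.arctan2(R[2, 1], R[2, 2])
        y = np.arctan2(-R[2, 0], sy)
        z = np.arctan2(R[1, 0], R[0, 0])
    else:  # singular (gimbal lock)
        x = np.arctan2(-R[1, 2], R[1, 1])
        y = np.arctan2(-R[2, 0], sy)
        z = 0.0
    return np.array([x, y, z])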
Exemple #16
0
    def __getitem__(self, idx):
        # set defaults here
        kitti_to_nu_lidar = Quaternion(axis=(0, 0, 1), angle=np.pi / 2)
        kitti_to_nu_lidar_inv = kitti_to_nu_lidar.inverse
        imsize = (1600, 900)

        # sample_token based on index
        sample_token = self.sample_tokens[idx]

        # Get sample data.
        sample = self.nusc.get('sample', sample_token)
        sample_annotation_tokens = sample['anns']
        cam_front_token = sample['data'][self.cam_name]
        lidar_token = sample['data'][self.lidar_name]

        # Retrieve sensor records.
        sd_record_cam = self.nusc.get('sample_data', cam_front_token)
        sd_record_lid = self.nusc.get('sample_data', lidar_token)
        cs_record_cam = self.nusc.get('calibrated_sensor',
                                      sd_record_cam['calibrated_sensor_token'])
        cs_record_lid = self.nusc.get('calibrated_sensor',
                                      sd_record_lid['calibrated_sensor_token'])

        # Combine transformations and convert to KITTI format.
        # Note: cam uses same conventions in KITTI and nuScenes.
        lid_to_ego = transform_matrix(cs_record_lid['translation'],
                                      Quaternion(cs_record_lid['rotation']),
                                      inverse=False)
        ego_to_cam = transform_matrix(cs_record_cam['translation'],
                                      Quaternion(cs_record_cam['rotation']),
                                      inverse=True)
        velo_to_cam = np.dot(ego_to_cam, lid_to_ego)

        # Convert from KITTI to nuScenes LIDAR coordinates, where we apply velo_to_cam.
        velo_to_cam_kitti = np.dot(velo_to_cam,
                                   kitti_to_nu_lidar.transformation_matrix)

        r0_rect = Quaternion(axis=[1, 0, 0], angle=0)  # Dummy values.

        # Projection matrix.
        p_left_kitti = np.zeros((3, 4))
        p_left_kitti[:3, :3] = cs_record_cam[
            'camera_intrinsic']  # Cameras are always rectified.

        # Create KITTI style transforms.
        velo_to_cam_rot = velo_to_cam_kitti[:3, :3]
        velo_to_cam_trans = velo_to_cam_kitti[:3, 3]

        # Check that the rotation has the same format as in KITTI.
        assert (velo_to_cam_rot.round(0) == np.array([[0, -1, 0], [0, 0, -1],
                                                      [1, 0, 0]])).all()
        assert (velo_to_cam_trans[1:3] < 0).all()

        # Retrieve the camera image filename.
        filename_cam_full = sd_record_cam['filename']

        # set the img variable to its data here
        src_im_path = os.path.join(self.nusc.dataroot, filename_cam_full)
        img = Image.open(src_im_path)

        # Create the 3x3 calibration matrix from the KITTI-style projection matrix.
        K = p_left_kitti[:3, :3].astype(np.float32)

        # populate the list of object annotations for this sample
        anns = []
        for sample_annotation_token in sample_annotation_tokens:
            sample_annotation = self.nusc.get('sample_annotation',
                                              sample_annotation_token)

            # Get box in LIDAR frame.
            _, box_lidar_nusc, _ = self.nusc.get_sample_data(
                lidar_token,
                box_vis_level=BoxVisibility.NONE,
                selected_anntokens=[sample_annotation_token])
            box_lidar_nusc = box_lidar_nusc[0]

            # Truncated: Set all objects to 0 which means untruncated.
            truncated = 0.0

            # Occluded: Set all objects to full visibility as this information is not available in nuScenes.
            occluded = 0

            # Convert nuScenes category to nuScenes detection challenge category.
            detection_name = category_to_detection_name(
                sample_annotation['category_name'])

            # Skip categories that are not part of the nuScenes detection challenge.
            if detection_name is None:
                continue

            # Convert from nuScenes to KITTI box format.
            box_cam_kitti = KittiDB.box_nuscenes_to_kitti(
                box_lidar_nusc, Quaternion(matrix=velo_to_cam_rot),
                velo_to_cam_trans, r0_rect)

            # Project 3d box to 2d box in image, ignore box if it does not fall inside.
            bbox_2d = KittiDB.project_kitti_box_to_image(box_cam_kitti,
                                                         p_left_kitti,
                                                         imsize=imsize)
            if bbox_2d is None:
                continue

            # Set dummy score so we can use this file as result.
            box_cam_kitti.score = 0

            # Convert box to output string format.
            output = KittiDB.box_to_string(name=detection_name,
                                           box=box_cam_kitti,
                                           bbox_2d=bbox_2d,
                                           truncation=truncated,
                                           occlusion=occluded)
            fieldnames = [
                'type', 'truncated', 'occluded', 'alpha', 'xmin', 'ymin',
                'xmax', 'ymax', 'dh', 'dw', 'dl', 'lx', 'ly', 'lz', 'ry'
            ]
            if self.is_train:
                f = StringIO(output)
                reader = csv.DictReader(f,
                                        delimiter=' ',
                                        fieldnames=fieldnames)
                for line, row in enumerate(reader):
                    if row["type"] in self.classes:
                        anns.append({
                            "class": row["type"],
                            "label": TYPE_ID_CONVERSION[row["type"]],
                            "truncation": float(row["truncated"]),
                            "occlusion": float(row["occluded"]),
                            "alpha": float(row["alpha"]),
                            "dimensions": [float(row["dl"]), float(row["dh"]), float(row["dw"])],
                            "locations": [float(row["lx"]), float(row["ly"]), float(row["lz"])],
                            "rot_y": float(row["ry"]),
                        })

        center = np.array([i / 2 for i in img.size], dtype=np.float32)
        size = np.array([i for i in img.size], dtype=np.float32)
        """
        resize, horizontal flip, and affine augmentation are performed here.
        since it is complicated to compute heatmap w.r.t transform.
        """
        flipped = False
        if (self.is_train) and (random.random() < self.flip_prob):
            flipped = True
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
            center[0] = size[0] - center[0] - 1
            K[0, 2] = size[0] - K[0, 2] - 1

        affine = False
        if (self.is_train) and (random.random() < self.aug_prob):
            affine = True
            shift, scale = self.shift_scale[0], self.shift_scale[1]
            shift_ranges = np.arange(-shift, shift + 0.1, 0.1)
            center[0] += size[0] * random.choice(shift_ranges)
            center[1] += size[1] * random.choice(shift_ranges)

            scale_ranges = np.arange(1 - scale, 1 + scale + 0.1, 0.1)
            size *= random.choice(scale_ranges)

        center_size = [center, size]
        trans_affine = get_transfrom_matrix(
            center_size, [self.input_width, self.input_height])
        trans_affine_inv = np.linalg.inv(trans_affine)
        img = img.transform(
            (self.input_width, self.input_height),
            method=Image.AFFINE,
            data=trans_affine_inv.flatten()[:6],
            resample=Image.BILINEAR,
        )

        trans_mat = get_transfrom_matrix(
            center_size, [self.output_width, self.output_height])

        if not self.is_train:
            # for inference we parametrize with original size
            target = ParamsList(image_size=size, is_train=self.is_train)
            target.add_field("trans_mat", trans_mat)
            target.add_field("K", K)
            if self.transforms is not None:
                img, target = self.transforms(img, target)

            return img, target, idx

        heat_map = np.zeros(
            [self.num_classes, self.output_height, self.output_width],
            dtype=np.float32)
        regression = np.zeros([self.max_objs, 3, 8], dtype=np.float32)
        cls_ids = np.zeros([self.max_objs], dtype=np.int32)
        proj_points = np.zeros([self.max_objs, 2], dtype=np.int32)
        p_offsets = np.zeros([self.max_objs, 2], dtype=np.float32)
        dimensions = np.zeros([self.max_objs, 3], dtype=np.float32)
        locations = np.zeros([self.max_objs, 3], dtype=np.float32)
        rotys = np.zeros([self.max_objs], dtype=np.float32)
        reg_mask = np.zeros([self.max_objs], dtype=np.uint8)
        flip_mask = np.zeros([self.max_objs], dtype=np.uint8)

        for i, a in enumerate(anns):
            a = a.copy()
            cls = a["label"]

            locs = np.array(a["locations"])
            rot_y = np.array(a["rot_y"])
            if flipped:
                locs[0] *= -1
                rot_y *= -1

            point, box2d, box3d = encode_label(K, rot_y, a["dimensions"], locs)
            point = affine_transform(point, trans_mat)
            box2d[:2] = affine_transform(box2d[:2], trans_mat)
            box2d[2:] = affine_transform(box2d[2:], trans_mat)
            box2d[[0, 2]] = box2d[[0, 2]].clip(0, self.output_width - 1)
            box2d[[1, 3]] = box2d[[1, 3]].clip(0, self.output_height - 1)
            h, w = box2d[3] - box2d[1], box2d[2] - box2d[0]

            if (0 < point[0] < self.output_width) and (0 < point[1] < self.output_height):
                point_int = point.astype(np.int32)
                p_offset = point - point_int
                radius = gaussian_radius(h, w)
                radius = max(0, int(radius))
                heat_map[cls] = draw_umich_gaussian(heat_map[cls], point_int,
                                                    radius)

                cls_ids[i] = cls
                regression[i] = box3d
                proj_points[i] = point_int
                p_offsets[i] = p_offset
                dimensions[i] = np.array(a["dimensions"])
                locations[i] = locs
                rotys[i] = rot_y
                reg_mask[i] = 1 if not affine else 0
                flip_mask[i] = 1 if not affine and flipped else 0

        target = ParamsList(image_size=img.size, is_train=self.is_train)
        target.add_field("hm", heat_map)
        target.add_field("reg", regression)
        target.add_field("cls_ids", cls_ids)
        target.add_field("proj_p", proj_points)
        target.add_field("dimensions", dimensions)
        target.add_field("locations", locations)
        target.add_field("rotys", rotys)
        target.add_field("trans_mat", trans_mat)
        target.add_field("K", K)
        target.add_field("reg_mask", reg_mask)
        target.add_field("flip_mask", flip_mask)

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target, idx
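
The heatmap encoding above relies on gaussian_radius and draw_umich_gaussian from the CenterNet-style pipeline, which are not shown here. A self-contained sketch of the peak-drawing step, deliberately named differently because it is only an illustration of the idea, could be:

import numpy as np

def draw_gaussian_peak(heatmap, center, radius):
    # Place a 2D Gaussian of the given radius at an integer centre and keep the
    # element-wise maximum with whatever is already on the heatmap.
    diameter = 2 * radius + 1
    sigma = diameter / 6.0
    ys, xs = np.ogrid[-radius:radius + 1, -radius:radius + 1]
    gaussian = np.exp(-(xs * xs + ys * ys) / (2 * sigma * sigma))

    x, y = int(center[0]), int(center[1])
    h, w = heatmap.shape
    left, right = min(x, radius), min(w - x, radius + 1)
    top, bottom = min(y, radius), min(h - y, radius + 1)

    patch = heatmap[y - top:y + bottom, x - left:x + right]
    g_patch = gaussian[radius - top:radius + bottom, radius - left:radius + right]
    if patch.shape[0] > 0 and patch.shape[1] > 0:
        np.maximum(patch, g_patch, out=patch)  # writes back into heatmap (a view)
    return heatmap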
Exemple #17
0
def main():
  SCENE_SPLITS['mini-val'] = SCENE_SPLITS['val']
  if not os.path.exists(OUT_PATH):
    os.mkdir(OUT_PATH)
  for split in SPLITS:
    data_path = DATA_PATH + '{}/'.format(SPLITS[split])
    nusc = NuScenes(
      version=SPLITS[split], dataroot=data_path, verbose=True)
    out_path = OUT_PATH + '{}.json'.format(split)
    categories_info = [{'name': CATS[i], 'id': i + 1} for i in range(len(CATS))]
    ret = {'images': [], 'annotations': [], 'categories': categories_info, 
           'videos': [], 'attributes': ATTRIBUTE_TO_ID}
    num_images = 0
    num_anns = 0
    num_videos = 0

    # A "sample" in nuScenes refers to a timestamp with 6 cameras and 1 LIDAR.
    for sample in nusc.sample:
      scene_name = nusc.get('scene', sample['scene_token'])['name']
      if not (split in ['mini', 'test']) and \
        not (scene_name in SCENE_SPLITS[split]):
        continue
      if sample['prev'] == '':
        print('scene_name', scene_name)
        num_videos += 1
        ret['videos'].append({'id': num_videos, 'file_name': scene_name})
        frame_ids = {k: 0 for k in sample['data']}
        track_ids = {}
      # We decompose a sample into 6 images in our case.
      for sensor_name in sample['data']:
        if sensor_name in USED_SENSOR:
          image_token = sample['data'][sensor_name]
          image_data = nusc.get('sample_data', image_token)
          num_images += 1

          # Coordinate transforms: build the sensor->global matrix by composing
          # sensor->ego (car_from_sensor) with ego->global (global_from_car).
          sd_record = nusc.get('sample_data', image_token)
          cs_record = nusc.get(
            'calibrated_sensor', sd_record['calibrated_sensor_token'])
          pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
          global_from_car = transform_matrix(pose_record['translation'],
            Quaternion(pose_record['rotation']), inverse=False)
          car_from_sensor = transform_matrix(
            cs_record['translation'], Quaternion(cs_record['rotation']),
            inverse=False)
          trans_matrix = np.dot(global_from_car, car_from_sensor)
          _, boxes, camera_intrinsic = nusc.get_sample_data(
            image_token, box_vis_level=BoxVisibility.ANY)
          calib = np.eye(4, dtype=np.float32)
          calib[:3, :3] = camera_intrinsic
          calib = calib[:3]
          frame_ids[sensor_name] += 1

          # image information in COCO format
          image_info = {'id': num_images,
                        'file_name': image_data['filename'],
                        'calib': calib.tolist(), 
                        'video_id': num_videos,
                        'frame_id': frame_ids[sensor_name],
                        'sensor_id': SENSOR_ID[sensor_name],
                        'sample_token': sample['token'],
                        'trans_matrix': trans_matrix.tolist(),
                        'width': sd_record['width'],
                        'height': sd_record['height'],
                        'pose_record_trans': pose_record['translation'],
                        'pose_record_rot': pose_record['rotation'],
                        'cs_record_trans': cs_record['translation'],
                        'cs_record_rot': cs_record['rotation']}
          ret['images'].append(image_info)
          anns = []
          for box in boxes:
            det_name = category_to_detection_name(box.name)
            if det_name is None:
              continue
            num_anns += 1
            v = np.dot(box.rotation_matrix, np.array([1, 0, 0]))
            yaw = -np.arctan2(v[2], v[0])
            box.translate(np.array([0, box.wlh[2] / 2, 0]))
            category_id = CAT_IDS[det_name]

            amodel_center = project_to_image(
              np.array([box.center[0], box.center[1] - box.wlh[2] / 2, box.center[2]], 
                np.float32).reshape(1, 3), calib)[0].tolist()
            sample_ann = nusc.get(
              'sample_annotation', box.token)
            instance_token = sample_ann['instance_token']
            if not (instance_token in track_ids):
              track_ids[instance_token] = len(track_ids) + 1
            attribute_tokens = sample_ann['attribute_tokens']
            attributes = [nusc.get('attribute', att_token)['name'] \
              for att_token in attribute_tokens]
            att = '' if len(attributes) == 0 else attributes[0]
            if len(attributes) > 1:
              print(attributes)
              import pdb; pdb.set_trace()
            track_id = track_ids[instance_token]
            vel = nusc.box_velocity(box.token) # global frame
            vel = np.dot(np.linalg.inv(trans_matrix), 
              np.array([vel[0], vel[1], vel[2], 0], np.float32)).tolist()
            
            # instance information in COCO format
            ann = {
              'id': num_anns,
              'image_id': num_images,
              'category_id': category_id,
              'dim': [box.wlh[2], box.wlh[0], box.wlh[1]],
              'location': [box.center[0], box.center[1], box.center[2]],
              'depth': box.center[2],
              'occluded': 0,
              'truncated': 0,
              'rotation_y': yaw,
              'amodel_center': amodel_center,
              'iscrowd': 0,
              'track_id': track_id,
              'attributes': ATTRIBUTE_TO_ID[att],
              'velocity': vel
            }

            bbox = KittiDB.project_kitti_box_to_image(
              copy.deepcopy(box), camera_intrinsic, imsize=(1600, 900))
            alpha = _rot_y2alpha(yaw, (bbox[0] + bbox[2]) / 2, 
                                 camera_intrinsic[0, 2], camera_intrinsic[0, 0])
            ann['bbox'] = [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]]
            ann['area'] = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
            ann['alpha'] = alpha
            anns.append(ann)

          # Filter out boxes that are fully occluded by a nearer box
          # (farther in depth and contained in the nearer box's 2D bbox).
          visable_anns = []
          for i in range(len(anns)):
            vis = True
            for j in range(len(anns)):
              if anns[i]['depth'] - min(anns[i]['dim']) / 2 > \
                 anns[j]['depth'] + max(anns[j]['dim']) / 2 and \
                _bbox_inside(anns[i]['bbox'], anns[j]['bbox']):
                vis = False
                break
            if vis:
              visable_anns.append(anns[i])

          for ann in visable_anns:
            ret['annotations'].append(ann)
          if DEBUG:
            img_path = data_path + image_info['file_name']
            img = cv2.imread(img_path)
            img_3d = img.copy()
            for ann in visable_anns:
              bbox = ann['bbox']
              cv2.rectangle(img, (int(bbox[0]), int(bbox[1])), 
                            (int(bbox[2] + bbox[0]), int(bbox[3] + bbox[1])), 
                            (0, 0, 255), 3, lineType=cv2.LINE_AA)
              box_3d = compute_box_3d(ann['dim'], ann['location'], ann['rotation_y'])
              box_2d = project_to_image(box_3d, calib)
              img_3d = draw_box_3d(img_3d, box_2d)

              pt_3d = unproject_2d_to_3d(ann['amodel_center'], ann['depth'], calib)
              pt_3d[1] += ann['dim'][0] / 2
              print('location', ann['location'])
              print('loc model', pt_3d)
              pt_2d = np.array([(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
                                dtype=np.float32)
              pt_3d = unproject_2d_to_3d(pt_2d, ann['depth'], calib)
              pt_3d[1] += ann['dim'][0] / 2
              print('loc      ', pt_3d)
            cv2.imshow('img', img)
            cv2.imshow('img_3d', img_3d)
            cv2.waitKey()
            nusc.render_sample_data(image_token)
            plt.show()
    print('reordering images')
    images = ret['images']
    video_sensor_to_images = {}
    for image_info in images:
      tmp_seq_id = image_info['video_id'] * 20 + image_info['sensor_id']
      if tmp_seq_id in video_sensor_to_images:
        video_sensor_to_images[tmp_seq_id].append(image_info)
      else:
        video_sensor_to_images[tmp_seq_id] = [image_info]
    ret['images'] = []
    for tmp_seq_id in sorted(video_sensor_to_images):
      ret['images'] = ret['images'] + video_sensor_to_images[tmp_seq_id]

    print('{} {} images {} boxes'.format(
      split, len(ret['images']), len(ret['annotations'])))
    print('out_path', out_path)
    with open(out_path, 'w') as f:
      json.dump(ret, f)
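
Both conversion scripts above call a _bbox_inside helper in the occlusion filter that is not included in these examples; a sketch consistent with its usage (boxes as [x, y, w, h], True when the first box lies strictly inside the second) would be:

def _bbox_inside(box1, box2):
    # Boxes are [x, y, w, h]; box1 counts as inside box2 when all four of its
    # edges fall strictly within box2.
    return (box1[0] > box2[0] and box1[0] + box1[2] < box2[0] + box2[2]
            and box1[1] > box2[1] and box1[1] + box1[3] < box2[1] + box2[3])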