def get_object_masks(nuscenes, sample_data, extents, resolution):
    # Initialize object masks
    nclass = len(DETECTION_NAMES) + 1
    grid_width = int((extents[2] - extents[0]) / resolution)
    grid_height = int((extents[3] - extents[1]) / resolution)
    masks = np.zeros((nclass, grid_height, grid_width), dtype=np.uint8)

    # Get the 2D affine transform from bev coords to map coords
    tfm = get_sensor_transform(nuscenes, sample_data)[[0, 1, 3]][:, [0, 2, 3]]
    inv_tfm = np.linalg.inv(tfm)

    for box in nuscenes.get_boxes(sample_data['token']):
        # Get the index of the class; categories outside DETECTION_NAMES go to
        # the last ("other") channel via index -1
        det_name = category_to_detection_name(box.name)
        if det_name not in DETECTION_NAMES:
            class_id = -1
        else:
            class_id = DETECTION_NAMES.index(det_name)

        # Get bounding box coordinates in the grid coordinate frame
        bbox = box.bottom_corners()[:2]
        local_bbox = np.dot(inv_tfm[:2, :2], bbox).T + inv_tfm[:2, 2]

        # Render the rotated bounding box to the mask
        render_polygon(masks[class_id], local_bbox, extents, resolution)

    # np.bool was removed in recent NumPy releases; use the builtin bool instead
    return masks.astype(bool)
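A hedged call sketch for get_object_masks above. The extents convention ([x_min, z_min, x_max, z_max] in metres) and the concrete values are illustrative assumptions, not taken from the snippet:

sample = nuscenes.get('sample', sample_token)
sample_data = nuscenes.get('sample_data', sample['data']['CAM_FRONT'])
masks = get_object_masks(nuscenes, sample_data,
                         extents=[-25.0, 1.0, 25.0, 50.0], resolution=0.25)
# masks has shape (len(DETECTION_NAMES) + 1, 196, 200): one boolean BEV grid
# per detection class plus a final channel for boxes outside DETECTION_NAMES.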
def random_class(category_name: str) -> str:
    # Alter 10% of the valid labels.
    class_names = sorted(DETECTION_NAMES)
    tmp = category_to_detection_name(category_name)
    if tmp is not None and np.random.rand() < .9:
        return tmp
    else:
        # np.random.randint excludes the upper bound, so use len(class_names)
        # as the bound; otherwise the last class can never be drawn.
        return class_names[np.random.randint(0, len(class_names))]
def random_class(category_name):
    class_names = ['barrier', 'bicycle', 'bus', 'car', 'construction_vehicle',
                   'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck']
    tmp = category_to_detection_name(category_name)
    if tmp is not None and np.random.rand() < .9:
        return tmp
    else:
        # np.random.randint excludes the upper bound; use len(class_names)
        # so 'truck' can also be drawn.
        return class_names[np.random.randint(0, len(class_names))]
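Every snippet in this section leans on the same devkit helper, so a minimal standalone sketch of its behaviour may help; the import path below is the standard nuscenes-devkit one:

from nuscenes.eval.detection.utils import category_to_detection_name

# Full nuScenes category names collapse onto the ten detection-challenge classes.
print(category_to_detection_name('vehicle.car'))             # 'car'
print(category_to_detection_name('human.pedestrian.adult'))  # 'pedestrian'
# Categories outside the challenge map to None, which is why the examples
# above and below filter on a None return value.
print(category_to_detection_name('movable_object.debris'))   # None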
def parse_labels(
    data: NuScenes,
    boxes: List[Box],
    ego_pose: DictStrAny,
    calib_sensor: DictStrAny,
    img_size: Optional[Tuple[int, int]] = None,
) -> Optional[List[Label]]:
    """Parse NuScenes labels into sensor frame."""
    if len(boxes):
        labels = []
        # transform into the sensor coord system
        transform_boxes(boxes, ego_pose, calib_sensor)
        intrinsic_matrix: NDArrayF64 = np.array(
            calib_sensor["camera_intrinsic"], dtype=np.float64)
        for box in boxes:
            box_class = category_to_detection_name(box.name)
            in_image = True
            if img_size is not None:
                in_image = box_in_image(box, intrinsic_matrix, img_size)
            if in_image and box_class is not None:
                xyz = tuple(box.center.tolist())
                w, l, h = box.wlh
                roty = quaternion_to_yaw(box.orientation)

                box2d = None
                if img_size is not None:
                    # Project 3d box to 2d.
                    corners = box.corners()
                    corner_coords = (view_points(
                        corners, intrinsic_matrix, True).T[:, :2].tolist())
                    # Keep only corners that fall within the image, transform
                    box2d = xyxy_to_box2d(*post_process_coords(corner_coords))

                instance_data = data.get("sample_annotation", box.token)
                # Attributes can be retrieved via instance_data and also the
                # category is more fine-grained than box_class.
                # This information could be stored in attributes if needed in
                # the future.
                label = Label(
                    id=instance_data["instance_token"],
                    category=box_class,
                    box2d=box2d,
                    box3d=Box3D(
                        location=xyz,
                        dimension=(h, w, l),
                        orientation=(0, roty, 0),
                        alpha=rotation_y_to_alpha(roty, xyz),  # type: ignore
                    ),
                )
                labels.append(label)
        return labels
    return None
def load_gt(nusc, eval_split: str, verbose: bool = False) -> EvalBoxes: """ Loads ground truth boxes from DB. """ # Init. attribute_map = {a['token']: a['name'] for a in nusc.attribute} if verbose: print('Loading annotations for {} split from nuScenes version: {}'. format(eval_split, nusc.version)) # Read out all sample_tokens in DB. sample_tokens_all = [s['token'] for s in nusc.sample] assert len(sample_tokens_all) > 0, "Error: Database has no samples!" # Only keep samples from this split. splits = create_splits_scenes() # Check compatibility of split with nusc_version. version = nusc.version if eval_split in {'train', 'val', 'train_detect', 'train_track'}: assert version.endswith('trainval'), \ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) elif eval_split in {'mini_train', 'mini_val'}: assert version.endswith('mini'), \ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) elif eval_split == 'test': assert version.endswith('test'), \ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) else: raise ValueError( 'Error: Requested split {} which this function cannot map to the correct NuScenes version.' .format(eval_split)) if eval_split == 'test': # Check that you aren't trying to cheat :). assert len(nusc.sample_annotation) > 0, \ 'Error: You are trying to evaluate on the test set but you do not have the annotations!' sample_tokens = [] for sample_token in sample_tokens_all: scene_token = nusc.get('sample', sample_token)['scene_token'] scene_record = nusc.get('scene', scene_token) if scene_record['name'] in splits[eval_split]: sample_tokens.append(sample_token) all_annotations = EvalBoxes() # Load annotations and filter predictions and annotations. for sample_token in tqdm.tqdm(sample_tokens): sample = nusc.get('sample', sample_token) sample_annotation_tokens = sample['anns'] sample_boxes = [] for sample_annotation_token in sample_annotation_tokens: # Get label name in detection task and filter unused labels. sample_annotation = nusc.get('sample_annotation', sample_annotation_token) detection_name = category_to_detection_name( sample_annotation['category_name']) if detection_name is None: continue # Get attribute_name. attr_tokens = sample_annotation['attribute_tokens'] attr_count = len(attr_tokens) if attr_count == 0: attribute_name = '' elif attr_count == 1: attribute_name = attribute_map[attr_tokens[0]] else: raise Exception( 'Error: GT annotations must not have more than one attribute!' ) sample_boxes.append( EvalBox( sample_token=sample_token, translation=sample_annotation['translation'], size=sample_annotation['size'], rotation=sample_annotation['rotation'], velocity=nusc.box_velocity(sample_annotation['token'])[:2], detection_name=detection_name, detection_score=-1.0, # GT samples do not have a score. attribute_name=attribute_name, num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'])) all_annotations.add_boxes(sample_token, sample_boxes) if verbose: print("Loaded ground truth annotations for {} samples.".format( len(all_annotations.sample_tokens))) return all_annotations
def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False) -> EvalBoxes: """ Loads ground truth boxes from DB. :param nusc: A NuScenes instance. :param eval_split: The evaluation split for which we load GT boxes. :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. :param verbose: Whether to print messages to stdout. :return: The GT boxes. """ # Init. if box_cls == DetectionBox: attribute_map = {a['token']: a['name'] for a in nusc.attribute} if verbose: print('Loading annotations for {} split from nuScenes version: {}'. format(eval_split, nusc.version)) # Read out all sample_tokens in DB. sample_tokens_all = [s['token'] for s in nusc.sample] assert len(sample_tokens_all) > 0, "Error: Database has no samples!" # Only keep samples from this split. splits = create_splits_scenes() # Check compatibility of split with nusc_version. version = nusc.version if eval_split in {'train', 'val', 'train_detect', 'train_track'}: assert version.endswith('trainval'), \ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) elif eval_split in {'mini_train', 'mini_val'}: assert version.endswith('mini'), \ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) elif eval_split == 'test': assert version.endswith('test'), \ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) else: raise ValueError( 'Error: Requested split {} which this function cannot map to the correct NuScenes version.' .format(eval_split)) if eval_split == 'test': # Check that you aren't trying to cheat :). assert len(nusc.sample_annotation) > 0, \ 'Error: You are trying to evaluate on the test set but you do not have the annotations!' sample_tokens = [] for sample_token in sample_tokens_all: scene_token = nusc.get('sample', sample_token)['scene_token'] scene_record = nusc.get('scene', scene_token) if scene_record['name'] in splits[eval_split]: sample_tokens.append(sample_token) all_annotations = EvalBoxes() # Load annotations and filter predictions and annotations. tracking_id_set = set() for sample_token in tqdm.tqdm(sample_tokens, leave=verbose): sample = nusc.get('sample', sample_token) sample_annotation_tokens = sample['anns'] sample_boxes = [] for sample_annotation_token in sample_annotation_tokens: sample_annotation = nusc.get('sample_annotation', sample_annotation_token) if box_cls == DetectionBox: # Get label name in detection task and filter unused labels. detection_name = category_to_detection_name( sample_annotation['category_name']) if detection_name is None: continue # Get attribute_name. attr_tokens = sample_annotation['attribute_tokens'] attr_count = len(attr_tokens) if attr_count == 0: attribute_name = '' elif attr_count == 1: attribute_name = attribute_map[attr_tokens[0]] else: raise Exception( 'Error: GT annotations must not have more than one attribute!' ) sample_boxes.append( box_cls( sample_token=sample_token, translation=sample_annotation['translation'], size=sample_annotation['size'], rotation=sample_annotation['rotation'], velocity=nusc.box_velocity( sample_annotation['token'])[:2], num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'], detection_name=detection_name, detection_score=-1.0, # GT samples do not have a score. attribute_name=attribute_name)) elif box_cls == TrackingBox: # Use nuScenes token as tracking id. 
tracking_id = sample_annotation['instance_token'] tracking_id_set.add(tracking_id) # Get label name in detection task and filter unused labels. tracking_name = category_to_tracking_name( sample_annotation['category_name']) if tracking_name is None: continue sample_boxes.append( box_cls( sample_token=sample_token, translation=sample_annotation['translation'], size=sample_annotation['size'], rotation=sample_annotation['rotation'], velocity=nusc.box_velocity( sample_annotation['token'])[:2], num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'], tracking_id=tracking_id, tracking_name=tracking_name, tracking_score=-1.0 # GT samples do not have a score. )) else: raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls) all_annotations.add_boxes(sample_token, sample_boxes) if verbose: print("Loaded ground truth annotations for {} samples.".format( len(all_annotations.sample_tokens))) return all_annotations
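A hedged usage sketch for the load_gt variant above; the dataroot, dataset version and import paths are placeholders based on the standard devkit layout:

from nuscenes.nuscenes import NuScenes
from nuscenes.eval.detection.data_classes import DetectionBox

nusc = NuScenes(version='v1.0-trainval', dataroot='/data/nuscenes', verbose=False)
gt_boxes = load_gt(nusc, 'val', DetectionBox, verbose=True)
print('samples with ground truth:', len(gt_boxes.sample_tokens))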
def preprocess(nusc, split_names, root_dir, out_dir, keyword=None, keyword_action=None, subset_name=None, location=None): # cannot process day/night and location at the same time assert not (bool(keyword) and bool(location)) if keyword: assert keyword_action in ['filter', 'exclude'] # init dict to save pkl_dict = {} for split_name in split_names: pkl_dict[split_name] = [] for i, sample in enumerate(nusc.sample): curr_scene_name = nusc.get('scene', sample['scene_token'])['name'] # get if the current scene is in train, val or test curr_split = None for split_name in split_names: if curr_scene_name in getattr(splits, split_name): curr_split = split_name break if curr_split is None: continue if subset_name == 'night': if curr_split == 'train': if curr_scene_name in splits.val_night: curr_split = 'val' if subset_name == 'singapore': if curr_split == 'train': if curr_scene_name in splits.val_singapore: curr_split = 'val' # filter for day/night if keyword: scene_description = nusc.get("scene", sample["scene_token"])["description"] if keyword.lower() in scene_description.lower(): if keyword_action == 'exclude': # skip sample continue else: if keyword_action == 'filter': # skip sample continue if location: scene = nusc.get("scene", sample["scene_token"]) if location not in nusc.get("log", scene['log_token'])['location']: continue lidar_token = sample["data"]["LIDAR_TOP"] cam_front_token = sample["data"]["CAM_FRONT"] lidar_path, boxes_lidar, _ = nusc.get_sample_data(lidar_token) cam_path, boxes_front_cam, cam_intrinsic = nusc.get_sample_data( cam_front_token) print('{}/{} {} {}'.format(i + 1, len(nusc.sample), curr_scene_name, lidar_path)) sd_rec_lidar = nusc.get('sample_data', sample['data']["LIDAR_TOP"]) cs_record_lidar = nusc.get('calibrated_sensor', sd_rec_lidar['calibrated_sensor_token']) pose_record_lidar = nusc.get('ego_pose', sd_rec_lidar['ego_pose_token']) sd_rec_cam = nusc.get('sample_data', sample['data']["CAM_FRONT"]) cs_record_cam = nusc.get('calibrated_sensor', sd_rec_cam['calibrated_sensor_token']) pose_record_cam = nusc.get('ego_pose', sd_rec_cam['ego_pose_token']) calib_infos = { "lidar2ego_translation": cs_record_lidar['translation'], "lidar2ego_rotation": cs_record_lidar['rotation'], "ego2global_translation_lidar": pose_record_lidar['translation'], "ego2global_rotation_lidar": pose_record_lidar['rotation'], "ego2global_translation_cam": pose_record_cam['translation'], "ego2global_rotation_cam": pose_record_cam['rotation'], "cam2ego_translation": cs_record_cam['translation'], "cam2ego_rotation": cs_record_cam['rotation'], "cam_intrinsic": cam_intrinsic, } # load lidar points pts = np.fromfile(lidar_path, dtype=np.float32, count=-1).reshape([-1, 5])[:, :3].T # map point cloud into front camera image pts_valid_flag, pts_cam_coord, pts_img = map_pointcloud_to_image( pts, (900, 1600, 3), calib_infos) # fliplr so that indexing is row, col and not col, row pts_img = np.ascontiguousarray(np.fliplr(pts_img)) # only use lidar points in the front camera image pts = pts[:, pts_valid_flag] num_pts = pts.shape[1] seg_labels = np.full(num_pts, fill_value=len(class_names_to_id), dtype=np.uint8) # only use boxes that are visible in camera valid_box_tokens = [box.token for box in boxes_front_cam] boxes = [box for box in boxes_lidar if box.token in valid_box_tokens] for box in boxes: # get points that lie inside of the box fg_mask = points_in_box(box, pts) det_class = category_to_detection_name(box.name) if det_class is not None: seg_labels[fg_mask] = class_names_to_id[det_class] # convert to 
relative path lidar_path = lidar_path.replace(root_dir + '/', '') cam_path = cam_path.replace(root_dir + '/', '') # transpose to yield shape (num_points, 3) pts = pts.T # append data to train, val or test list in pkl_dict data_dict = { 'points': pts, 'seg_labels': seg_labels, 'points_img': pts_img, # row, col format, shape: (num_points, 2) 'lidar_path': lidar_path, 'camera_path': cam_path, 'boxes': boxes_lidar, "sample_token": sample["token"], "scene_name": curr_scene_name, "calib": calib_infos } pkl_dict[curr_split].append(data_dict) # save to pickle file save_dir = osp.join(out_dir, 'preprocess') os.makedirs(save_dir, exist_ok=True) for split_name in split_names: save_path = osp.join( save_dir, '{}{}.pkl'.format(split_name, '_' + subset_name if subset_name else '')) with open(save_path, 'wb') as f: pickle.dump(pkl_dict[split_name], f) print('Wrote preprocessed data to ' + save_path)
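A hedged driver for preprocess above; the paths, split names and night-filter arguments are illustrative only, following the signature shown:

from nuscenes.nuscenes import NuScenes

nusc = NuScenes(version='v1.0-trainval', dataroot='/data/nuscenes', verbose=True)
# Keep the standard train/val scenes and drop scenes whose description
# mentions "night"; results land in <out_dir>/preprocess/<split>.pkl.
preprocess(nusc, ['train', 'val'],
           root_dir='/data/nuscenes', out_dir='/data/nuscenes_preprocessed',
           keyword='night', keyword_action='exclude')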
def load_annotations(self, idx): annotations = [] """ Converts nuScenes GT annotations to KITTI format. """ kitti_to_nu_lidar = Quaternion(axis=(0, 0, 1), angle=np.pi / 2) kitti_to_nu_lidar_inv = kitti_to_nu_lidar.inverse sample_token = self.sample_tokens[idx] sample = self.nusc.get('sample', sample_token) sample_annotation_tokens = sample['anns'] cam_front_token = sample['data'][self.cam_name] lidar_token = sample['data'][self.lidar_name] # Retrieve sensor records. sd_record_cam = self.nusc.get('sample_data', cam_front_token) sd_record_lid = self.nusc.get('sample_data', lidar_token) cs_record_cam = self.nusc.get('calibrated_sensor', sd_record_cam['calibrated_sensor_token']) cs_record_lid = self.nusc.get('calibrated_sensor', sd_record_lid['calibrated_sensor_token']) # Combine transformations and convert to KITTI format. # Note: cam uses same conventions in KITTI and nuScenes. lid_to_ego = transform_matrix(cs_record_lid['translation'], Quaternion(cs_record_lid['rotation']), inverse=False) ego_to_cam = transform_matrix(cs_record_cam['translation'], Quaternion(cs_record_cam['rotation']), inverse=True) velo_to_cam = np.dot(ego_to_cam, lid_to_ego) # Convert from KITTI to nuScenes LIDAR coordinates, where we apply velo_to_cam. velo_to_cam_kitti = np.dot(velo_to_cam, kitti_to_nu_lidar.transformation_matrix) # Currently not used. imu_to_velo_kitti = np.zeros((3, 4), dtype=np.float32) # Dummy values. r0_rect = Quaternion(axis=[1, 0, 0], angle=0) # Dummy values. # Projection matrix. p_left_kitti = np.zeros((3, 4), dtype=np.float32) p_left_kitti[:3, :3] = cs_record_cam[ 'camera_intrinsic'] # Cameras are always rectified. p_left_kitti = p_left_kitti[:3, :3] # Create KITTI style transforms. velo_to_cam_rot = velo_to_cam_kitti[:3, :3] velo_to_cam_trans = velo_to_cam_kitti[:3, 3] # Check that the rotation has the same format as in KITTI. assert (velo_to_cam_rot.round(0) == np.array([[0, -1, 0], [0, 0, -1], [1, 0, 0]])).all() assert (velo_to_cam_trans[1:3] < 0).all() # Retrieve the token from the lidar. # Note that this may be confusing as the filename of the camera will include the timestamp of the lidar, # not the camera. filename_cam_full = sd_record_cam['filename'] filename_lid_full = sd_record_lid['filename'] if self.is_train: for sample_annotation_token in sample_annotation_tokens: sample_annotation = self.nusc.get('sample_annotation', sample_annotation_token) # Convert nuScenes category to nuScenes detection challenge category. detection_name = category_to_detection_name( sample_annotation['category_name']) if detection_name in self.classes: # Get box in LIDAR frame. _, box_lidar_nusc, _ = self.nusc.get_sample_data( lidar_token, box_vis_level=BoxVisibility.NONE, selected_anntokens=[sample_annotation_token]) box_lidar_nusc = box_lidar_nusc[0] # Convert from nuScenes to KITTI box format. box_cam_kitti = KittiDB.box_nuscenes_to_kitti( box_lidar_nusc, Quaternion(matrix=velo_to_cam_rot), velo_to_cam_trans, r0_rect) # Convert quaternion to yaw angle. v = np.dot(box_cam_kitti.rotation_matrix, np.array([1, 0, 0])) yaw = -np.arctan2(v[2], v[0]) annotations.append({ "class": detection_name, "label": TYPE_ID_CONVERSION[detection_name], "dimensions": [ float(box_cam_kitti.wlh[2]), float(box_cam_kitti.wlh[0]), float(box_cam_kitti.wlh[1]) ], "locations": [ float(box_cam_kitti.center[0]), float(box_cam_kitti.center[1]), float(box_cam_kitti.center[2]) ], "rot_y": float(yaw) }) return annotations, p_left_kitti
def nuscenes_gt_to_kitti(self) -> None: """ Converts nuScenes GT annotations to KITTI format. """ kitti_to_nu_lidar = Quaternion(axis=(0, 0, 1), angle=np.pi / 2) kitti_to_nu_lidar_inv = kitti_to_nu_lidar.inverse imsize = (1600, 900) token_idx = 0 # Start tokens from 0. # Get assignment of scenes to splits. split_logs = create_splits_logs(self.split, self.nusc) # Create output folders. label_folder = os.path.join(self.nusc_kitti_dir, self.split, 'label_2') calib_folder = os.path.join(self.nusc_kitti_dir, self.split, 'calib') image_folder = os.path.join(self.nusc_kitti_dir, self.split, 'image_2') lidar_folder = os.path.join(self.nusc_kitti_dir, self.split, 'velodyne') for folder in [label_folder, calib_folder, image_folder, lidar_folder]: if not os.path.isdir(folder): os.makedirs(folder) # Use only the samples from the current split. sample_tokens = self._split_to_samples(split_logs) sample_tokens = sample_tokens[:self.image_count] tokens = [] for sample_token in sample_tokens: # Get sample data. sample = self.nusc.get('sample', sample_token) sample_annotation_tokens = sample['anns'] cam_front_token = sample['data'][self.cam_name] lidar_token = sample['data'][self.lidar_name] # Retrieve sensor records. sd_record_cam = self.nusc.get('sample_data', cam_front_token) sd_record_lid = self.nusc.get('sample_data', lidar_token) cs_record_cam = self.nusc.get( 'calibrated_sensor', sd_record_cam['calibrated_sensor_token']) cs_record_lid = self.nusc.get( 'calibrated_sensor', sd_record_lid['calibrated_sensor_token']) # Combine transformations and convert to KITTI format. # Note: cam uses same conventions in KITTI and nuScenes. lid_to_ego = transform_matrix(cs_record_lid['translation'], Quaternion( cs_record_lid['rotation']), inverse=False) ego_to_cam = transform_matrix(cs_record_cam['translation'], Quaternion( cs_record_cam['rotation']), inverse=True) velo_to_cam = np.dot(ego_to_cam, lid_to_ego) # Convert from KITTI to nuScenes LIDAR coordinates, where we apply velo_to_cam. velo_to_cam_kitti = np.dot(velo_to_cam, kitti_to_nu_lidar.transformation_matrix) # Currently not used. imu_to_velo_kitti = np.zeros((3, 4)) # Dummy values. r0_rect = Quaternion(axis=[1, 0, 0], angle=0) # Dummy values. # Projection matrix. p_left_kitti = np.zeros((3, 4)) p_left_kitti[:3, :3] = cs_record_cam[ 'camera_intrinsic'] # Cameras are always rectified. # Create KITTI style transforms. velo_to_cam_rot = velo_to_cam_kitti[:3, :3] velo_to_cam_trans = velo_to_cam_kitti[:3, 3] # Check that the rotation has the same format as in KITTI. assert (velo_to_cam_rot.round(0) == np.array([[0, -1, 0], [0, 0, -1], [1, 0, 0]])).all() assert (velo_to_cam_trans[1:3] < 0).all() # Retrieve the token from the lidar. # Note that this may be confusing as the filename of the camera will include the timestamp of the lidar, # not the camera. filename_cam_full = sd_record_cam['filename'] filename_lid_full = sd_record_lid['filename'] # token = '%06d' % token_idx # Alternative to use KITTI names. token_idx += 1 # Convert image (jpg to png). src_im_path = os.path.join(self.nusc.dataroot, filename_cam_full) dst_im_path = os.path.join(image_folder, sample_token + '.png') if not os.path.exists(dst_im_path): im = Image.open(src_im_path) im.save(dst_im_path, "PNG") # Convert lidar. # Note that we are only using a single sweep, instead of the commonly used n sweeps. 
src_lid_path = os.path.join(self.nusc.dataroot, filename_lid_full) dst_lid_path = os.path.join(lidar_folder, sample_token + '.bin') assert not dst_lid_path.endswith('.pcd.bin') pcl = LidarPointCloud.from_file(src_lid_path) pcl.rotate( kitti_to_nu_lidar_inv.rotation_matrix) # In KITTI lidar frame. with open(dst_lid_path, "w") as lid_file: pcl.points.T.tofile(lid_file) # Add to tokens. tokens.append(sample_token) # Create calibration file. kitti_transforms = dict() kitti_transforms['P0'] = np.zeros((3, 4)) # Dummy values. kitti_transforms['P1'] = np.zeros((3, 4)) # Dummy values. kitti_transforms['P2'] = p_left_kitti # Left camera transform. kitti_transforms['P3'] = np.zeros((3, 4)) # Dummy values. kitti_transforms[ 'R0_rect'] = r0_rect.rotation_matrix # Cameras are already rectified. kitti_transforms['Tr_velo_to_cam'] = np.hstack( (velo_to_cam_rot, velo_to_cam_trans.reshape(3, 1))) kitti_transforms['Tr_imu_to_velo'] = imu_to_velo_kitti calib_path = os.path.join(calib_folder, sample_token + '.txt') with open(calib_path, "w") as calib_file: for (key, val) in kitti_transforms.items(): val = val.flatten() val_str = '%.12e' % val[0] for v in val[1:]: val_str += ' %.12e' % v calib_file.write('%s: %s\n' % (key, val_str)) # Write label file. label_path = os.path.join(label_folder, sample_token + '.txt') if os.path.exists(label_path): print('Skipping existing file: %s' % label_path) continue else: print('Writing file: %s' % label_path) with open(label_path, "w") as label_file: for sample_annotation_token in sample_annotation_tokens: sample_annotation = self.nusc.get('sample_annotation', sample_annotation_token) # Get box in LIDAR frame. _, box_lidar_nusc, _ = self.nusc.get_sample_data( lidar_token, box_vis_level=BoxVisibility.NONE, selected_anntokens=[sample_annotation_token]) box_lidar_nusc = box_lidar_nusc[0] # Truncated: Set all objects to 0 which means untruncated. truncated = 0.0 # Occluded: Set all objects to full visibility as this information is not available in nuScenes. occluded = 0 # Convert nuScenes category to nuScenes detection challenge category. detection_name = category_to_detection_name( sample_annotation['category_name']) # Skip categories that are not part of the nuScenes detection challenge. if detection_name is None: continue # Convert from nuScenes to KITTI box format. box_cam_kitti = KittiDB.box_nuscenes_to_kitti( box_lidar_nusc, Quaternion(matrix=velo_to_cam_rot), velo_to_cam_trans, r0_rect) # Project 3d box to 2d box in image, ignore box if it does not fall inside. bbox_2d = KittiDB.project_kitti_box_to_image(box_cam_kitti, p_left_kitti, imsize=imsize) if bbox_2d is None: continue # Set dummy score so we can use this file as result. box_cam_kitti.score = 0 # Convert box to output string format. output = KittiDB.box_to_string(name=detection_name, box=box_cam_kitti, bbox_2d=bbox_2d, truncation=truncated, occlusion=occluded) # Write to disk. label_file.write(output + '\n')
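A hedged sketch of driving the exporter above; the wrapper class name and constructor arguments mirror the devkit's export_kitti.py but should be treated as assumptions:

converter = KittiConverter(nusc_kitti_dir='/data/nusc_kitti', split='val')  # assumed wrapper class
converter.nuscenes_gt_to_kitti()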
def nuscenes_gt_to_kitti(self) -> None: """ Converts nuScenes GT annotations to KITTI format. """ kitti_to_nu_lidar = Quaternion(axis=(0, 0, 1), angle=np.pi / 2) kitti_to_nu_lidar_inv = kitti_to_nu_lidar.inverse imsize = (1600, 900) token_idx = 0 # Start tokens from 0. # Get assignment of scenes to splits. split_logs = create_splits_logs(self.split, self.nusc) scene_splits = create_splits_scenes(verbose=False) scene_to_log = { scene['name']: self.nusc.get('log', scene['log_token'])['logfile'] for scene in self.nusc.scene } logs = set() scenes = scene_splits[self.split] for scene in scenes: logs.add(scene_to_log[scene]) # print(len(scenes), len(logs)) split_mapping = {"train": "training", "val": "testing"} # Create output folders. label_folder = os.path.join(self.nusc_kitti_dir, split_mapping[self.split], 'label_2') calib_folder = os.path.join(self.nusc_kitti_dir, split_mapping[self.split], 'calib') image_folder = os.path.join(self.nusc_kitti_dir, split_mapping[self.split], 'image_2') lidar_folder = os.path.join(self.nusc_kitti_dir, split_mapping[self.split], 'velodyne') for folder in [label_folder, calib_folder, image_folder, lidar_folder]: if not os.path.isdir(folder): os.makedirs(folder) # Use only the samples from the current split. sample_tokens = self._split_to_samples(split_logs) # sample_tokens = sample_tokens[:self.image_count] # print(len(sample_tokens)) tokens = [] if self.split == "train": split_file = [ os.path.join(self.nusc_kitti_dir, "train.txt"), os.path.join(self.nusc_kitti_dir, "val.txt") ] elif self.split == 'val': split_file = os.path.join(self.nusc_kitti_dir, "test.txt") # if os.path.isfile(split_file): # os.remove(split_file) if self.split == "train": cnt = 0 with open(split_file[0], "w") as f: for seq in list(self.sequence_mapping.keys())[:-150]: for tk in self.sequence_mapping[seq]: f.write("%06d" % tk + "\n") cnt += 1 # print(cnt) cnt = 0 with open(split_file[1], "w") as f: for seq in list(self.sequence_mapping.keys())[-150:]: for tk in self.sequence_mapping[seq]: f.write("%06d" % tk + "\n") cnt += 1 # print(cnt) elif self.split == "val": with open(split_file, "w") as f: for seq in self.sequence_mapping.keys(): for tk in self.sequence_mapping[seq]: f.write("%06d" % tk + "\n") for idx, sample_token in enumerate(sample_tokens): # Get sample data. sample = self.nusc.get('sample', sample_token) sample_annotation_tokens = sample['anns'] cam_front_token = sample['data'][self.cam_name] lidar_token = sample['data'][self.lidar_name] sample_name = "%06d" % idx # Retrieve sensor records. sd_record_cam = self.nusc.get('sample_data', cam_front_token) sd_record_lid = self.nusc.get('sample_data', lidar_token) cs_record_cam = self.nusc.get( 'calibrated_sensor', sd_record_cam['calibrated_sensor_token']) cs_record_lid = self.nusc.get( 'calibrated_sensor', sd_record_lid['calibrated_sensor_token']) # Combine transformations and convert to KITTI format. # Note: cam uses same conventions in KITTI and nuScenes. lid_to_ego = transform_matrix(cs_record_lid['translation'], Quaternion( cs_record_lid['rotation']), inverse=False) ego_to_cam = transform_matrix(cs_record_cam['translation'], Quaternion( cs_record_cam['rotation']), inverse=True) velo_to_cam = np.dot(ego_to_cam, lid_to_ego) # Convert from KITTI to nuScenes LIDAR coordinates, where we apply velo_to_cam. velo_to_cam_kitti = np.dot(velo_to_cam, kitti_to_nu_lidar.transformation_matrix) # Currently not used. imu_to_velo_kitti = np.zeros((3, 4)) # Dummy values. r0_rect = Quaternion(axis=[1, 0, 0], angle=0) # Dummy values. 
# Projection matrix. p_left_kitti = np.zeros((3, 4)) p_left_kitti[:3, :3] = cs_record_cam[ 'camera_intrinsic'] # Cameras are always rectified. # Create KITTI style transforms. velo_to_cam_rot = velo_to_cam_kitti[:3, :3] velo_to_cam_trans = velo_to_cam_kitti[:3, 3] # Check that the rotation has the same format as in KITTI. assert (velo_to_cam_rot.round(0) == np.array([[0, -1, 0], [0, 0, -1], [1, 0, 0]])).all() assert (velo_to_cam_trans[1:3] < 0).all() # Retrieve the token from the lidar. # Note that this may be confusing as the filename of the camera will include the timestamp of the lidar, # not the camera. filename_cam_full = sd_record_cam['filename'] filename_lid_full = sd_record_lid['filename'] # token = '%06d' % token_idx # Alternative to use KITTI names. token_idx += 1 # Convert image (jpg to png). src_im_path = os.path.join(self.nusc.dataroot, filename_cam_full) dst_im_path = os.path.join(image_folder, sample_name + '.png') if not os.path.exists(dst_im_path): im = Image.open(src_im_path) im.save(dst_im_path, "PNG") # Convert lidar. # Note that we are only using a single sweep, instead of the commonly used n sweeps. src_lid_path = os.path.join(self.nusc.dataroot, filename_lid_full) dst_lid_path = os.path.join(lidar_folder, sample_name + '.bin') assert not dst_lid_path.endswith('.pcd.bin') pcl = LidarPointCloud.from_file(src_lid_path) # pcl, _ = LidarPointCloud.from_file_multisweep_future(self.nusc, sample, self.lidar_name, self.lidar_name, nsweeps=5) pcl.rotate( kitti_to_nu_lidar_inv.rotation_matrix) # In KITTI lidar frame. with open(dst_lid_path, "w") as lid_file: pcl.points.T.tofile(lid_file) # Add to tokens. tokens.append(sample_token) # Create calibration file. kitti_transforms = dict() kitti_transforms['P0'] = np.zeros((3, 4)) # Dummy values. kitti_transforms['P1'] = np.zeros((3, 4)) # Dummy values. kitti_transforms['P2'] = p_left_kitti # Left camera transform. kitti_transforms['P3'] = np.zeros((3, 4)) # Dummy values. kitti_transforms[ 'R0_rect'] = r0_rect.rotation_matrix # Cameras are already rectified. kitti_transforms['Tr_velo_to_cam'] = np.hstack( (velo_to_cam_rot, velo_to_cam_trans.reshape(3, 1))) kitti_transforms['Tr_imu_to_velo'] = imu_to_velo_kitti calib_path = os.path.join(calib_folder, sample_name + '.txt') with open(calib_path, "w") as calib_file: for (key, val) in kitti_transforms.items(): val = val.flatten() val_str = '%.12e' % val[0] for v in val[1:]: val_str += ' %.12e' % v calib_file.write('%s: %s\n' % (key, val_str)) # Write label file. label_path = os.path.join(label_folder, sample_name + '.txt') if os.path.exists(label_path): # print('Skipping existing file: %s' % label_path) continue # else: # print('Writing file: %s' % label_path) objects = [] for sample_annotation_token in sample_annotation_tokens: sample_annotation = self.nusc.get('sample_annotation', sample_annotation_token) # Get box in LIDAR frame. _, box_lidar_nusc, _ = self.nusc.get_sample_data( lidar_token, box_vis_level=BoxVisibility.NONE, selected_anntokens=[sample_annotation_token]) box_lidar_nusc = box_lidar_nusc[0] # Truncated: Set all objects to 0 which means untruncated. truncated = 0.0 # Occluded: Set all objects to full visibility as this information is not available in nuScenes. occluded = 0 obj = dict() # Convert nuScenes category to nuScenes detection challenge category. obj["detection_name"] = category_to_detection_name( sample_annotation['category_name']) # Skip categories that are not part of the nuScenes detection challenge. 
if obj["detection_name"] is None or obj[ "detection_name"] not in CLASS_MAP.keys(): continue obj["detection_name"] = CLASS_MAP[obj["detection_name"]] # Convert from nuScenes to KITTI box format. obj["box_cam_kitti"] = KittiDB.box_nuscenes_to_kitti( box_lidar_nusc, Quaternion(matrix=velo_to_cam_rot), velo_to_cam_trans, r0_rect) # Project 3d box to 2d box in image, ignore box if it does not fall inside. bbox_2d = project_to_2d(obj["box_cam_kitti"], p_left_kitti, imsize[1], imsize[0]) if bbox_2d is None: continue obj["bbox_2d"] = bbox_2d["bbox"] obj["truncated"] = bbox_2d["truncated"] # Set dummy score so we can use this file as result. obj["box_cam_kitti"].score = 0 v = np.dot(obj["box_cam_kitti"].rotation_matrix, np.array([1, 0, 0])) rot_y = -np.arctan2(v[2], v[0]) obj["alpha"] = -np.arctan2( obj["box_cam_kitti"].center[0], obj["box_cam_kitti"].center[2]) + rot_y obj["depth"] = np.linalg.norm( np.array(obj["box_cam_kitti"].center[:3])) objects.append(obj) objects = postprocessing(objects, imsize[1], imsize[0]) with open(label_path, "w") as label_file: for obj in objects: # Convert box to output string format. output = box_to_string(name=obj["detection_name"], box=obj["box_cam_kitti"], bbox_2d=obj["bbox_2d"], truncation=obj["truncated"], occlusion=obj["occluded"], alpha=obj["alpha"]) label_file.write(output + '\n')
def compute_labels_image(nusc, sample, sensor, nu_to_kitti_lidar, p):
    resolution = p['pointcloud_grid_map_interface']['grids']['cartesian']['resolution']['x']
    if resolution - p['pointcloud_grid_map_interface']['grids']['cartesian']['resolution']['y'] > 0.001:
        raise ValueError('Grid Map resolution in x and y direction need to be equal')
    length = p['pointcloud_grid_map_interface']['grids']['cartesian']['range']['x']
    width = p['pointcloud_grid_map_interface']['grids']['cartesian']['range']['y']
    grid_map_origin_idx = np.array(
        [length / 2 + p['pointcloud_grid_map_interface']['grids']['cartesian']['offset']['x'],
         width / 2])
    labels_corners = []
    labels_center = []
    labels_data = []
    for annotation_token in sample['anns']:
        annotation_metadata = nusc.get('sample_annotation', annotation_token)
        if annotation_metadata['num_lidar_pts'] == 0:
            continue
        _, box_lidar, _ = nusc.get_sample_data(sample['data'][sensor],
                                               box_vis_level=BoxVisibility.NONE,
                                               selected_anntokens=[annotation_token])
        box_lidar = box_lidar[0]
        box_lidar.rotate(nu_to_kitti_lidar)
        detection_name = category_to_detection_name(annotation_metadata['category_name'])
        if detection_name is None:
            continue
        # corners_obj: 4 * 8 matrix, each column is a corner (l, w, h) of the 3d bounding box
        corners_obj = np.array(
            [[box_lidar.wlh[1] / 2, box_lidar.wlh[1] / 2, -box_lidar.wlh[1] / 2, -box_lidar.wlh[1] / 2,
              box_lidar.wlh[1] / 2, box_lidar.wlh[1] / 2, -box_lidar.wlh[1] / 2, -box_lidar.wlh[1] / 2],
             [box_lidar.wlh[0] / 2, -box_lidar.wlh[0] / 2, -box_lidar.wlh[0] / 2, box_lidar.wlh[0] / 2,
              box_lidar.wlh[0] / 2, -box_lidar.wlh[0] / 2, -box_lidar.wlh[0] / 2, box_lidar.wlh[0] / 2],
             [box_lidar.wlh[2] / 2, box_lidar.wlh[2] / 2, box_lidar.wlh[2] / 2, box_lidar.wlh[2] / 2,
              -box_lidar.wlh[2] / 2, -box_lidar.wlh[2] / 2, -box_lidar.wlh[2] / 2, -box_lidar.wlh[2] / 2],
             [1, 1, 1, 1, 1, 1, 1, 1]])
        # tf_velo: box 3d pose affine transformation with respect to lidar origin
        tf_velo = np.array(
            [[box_lidar.rotation_matrix[0][0], box_lidar.rotation_matrix[0][1],
              box_lidar.rotation_matrix[0][2], box_lidar.center[0]],
             [box_lidar.rotation_matrix[1][0], box_lidar.rotation_matrix[1][1],
              box_lidar.rotation_matrix[1][2], box_lidar.center[1]],
             [box_lidar.rotation_matrix[2][0], box_lidar.rotation_matrix[2][1],
              box_lidar.rotation_matrix[2][2], box_lidar.center[2]],
             [0, 0, 0, 1]])
        # tf_velo_to_image: affine transformation from lidar coordinate to image coordinate
        tf_velo_to_image = np.array([[0, -1, grid_map_origin_idx[1]],
                                     [-1, 0, grid_map_origin_idx[0]],
                                     [0, 0, 1]])
        # corners_velo: corners in lidar coordinate system
        # corners_velo_x_y: corners_velo in x-y BEV plane
        corners_velo = tf_velo.dot(corners_obj)
        corners_velo_x_y = np.array([corners_velo[0], corners_velo[1], [1, 1, 1, 1, 1, 1, 1, 1]])
        # corners_image: corners in grid map with unit meter
        # corners_image_idx: 8 * 2 matrix, corners in grid map with unit pixel
        corners_image = tf_velo_to_image.dot(corners_velo_x_y)
        corners_image_idx = np.array([corners_image[0] / resolution, corners_image[1] / resolution])
        # label_t_image_normalized: 2-vector, box center normalized by the grid map extent
        label_t_velo = np.array([box_lidar.center[0], box_lidar.center[1], box_lidar.center[2], 1])
        label_t_image = tf_velo_to_image.dot(np.array([label_t_velo[0], label_t_velo[1], 1]))
        label_t_image_normalized = np.array([label_t_image[0] / width, label_t_image[1] / length])
        # Convert rotation matrix to yaw angle
        v = np.dot(box_lidar.rotation_matrix, np.array([1, 0, 0]))
        yaw = np.arctan2(v[1], v[0])
        if 0 <= min(corners_image_idx[0]) \
                and max(corners_image_idx[0]) < width / resolution \
                and 0 <= min(corners_image_idx[1]) \
                and max(corners_image_idx[1]) < length / resolution:
            labels_corners.append(corners_image_idx)
            labels_center.append(label_t_image_normalized)
            labels_data.append(Label(type=detection_name, l=box_lidar.wlh[1], w=box_lidar.wlh[0],
                                     h=box_lidar.wlh[2], rz=yaw))
    return labels_corners, labels_center, labels_data
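A small standalone check of the yaw-extraction idiom used above (and in several of the other converters): the heading is read off the rotated x-axis. pyquaternion is assumed to be installed, as in the snippets themselves:

import numpy as np
from pyquaternion import Quaternion

q = Quaternion(axis=[0, 0, 1], angle=0.3)           # a pure yaw rotation
v = np.dot(q.rotation_matrix, np.array([1, 0, 0]))
yaw = np.arctan2(v[1], v[0])
print(np.isclose(yaw, 0.3))                          # True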
def load_merge_from_pkl(nusc: NuScenes, pkl_path: str, box_cls, verbose: bool = False) -> EvalBoxes: # Init. if box_cls == DetectionBox: attribute_map = {a['token']: a['name'] for a in nusc.attribute} if verbose: print('Loading annotations for {} split from nuScenes version: {}'. format(pkl_path, nusc.version)) import mmcv infos = mmcv.load(pkl_path)['infos'] samples = [] for info in infos: samples.append(nusc.get('sample', info['token'])) all_annotations = EvalBoxes() # Load annotations and filter predictions and annotations. merge_map = dict(car='vehicle', truck='vehicle', bus='vehicle', trailer='vehicle', construction_vehicle='vehicle', pedestrian='pedestrian', motorcycle='bike', bicycle='bike', traffic_cone='traffic_boundary', barrier='traffic_boundary') for sample in tqdm.tqdm(samples, leave=verbose): sample_token = sample['token'] cam_token = sample['data']['CAM_FRONT'] _, boxes_cam, _ = nusc.get_sample_data(cam_token) sample_annotation_tokens = [box.token for box in boxes_cam] # sample = nusc.get('sample', sample_token) # sample_annotation_tokens = sample['anns'] sample_boxes = [] for sample_annotation_token in sample_annotation_tokens: sample_annotation = nusc.get('sample_annotation', sample_annotation_token) if box_cls == DetectionBox: # Get label name in detection task and filter unused labels. detection_name = category_to_detection_name( sample_annotation['category_name']) if detection_name is None: continue detection_name = merge_map[detection_name] # Get attribute_name. attr_tokens = sample_annotation['attribute_tokens'] attr_count = len(attr_tokens) if attr_count == 0: attribute_name = '' elif attr_count == 1: attribute_name = attribute_map[attr_tokens[0]] else: raise Exception( 'Error: GT annotations must not have more than one attribute!' ) sample_boxes.append( box_cls( sample_token=sample_token, translation=sample_annotation['translation'], size=sample_annotation['size'], rotation=sample_annotation['rotation'], velocity=nusc.box_velocity( sample_annotation['token'])[:2], num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'], detection_name=detection_name, detection_score=-1.0, # GT samples do not have a score. attribute_name=attribute_name)) else: raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls) all_annotations.add_boxes(sample_token, sample_boxes) if verbose: print("Loaded ground truth annotations for {} samples.".format( len(all_annotations.sample_tokens))) return all_annotations
def main(): SCENE_SPLITS["mini-val"] = SCENE_SPLITS["val"] if not os.path.exists(DATA_PATH): os.mkdir(DATA_PATH) if not os.path.exists(OUT_PATH): os.mkdir(OUT_PATH) for split in SPLITS: data_path = DATA_PATH # + '{}/'.format(SPLITS[split]) nusc = NuScenes(version=SPLITS[split], dataroot=data_path, verbose=True) out_path = OUT_PATH + "{}.json".format(split) categories_info = [{ "name": CATS[i], "id": i + 1 } for i in range(len(CATS))] ret = { "images": [], "annotations": [], "categories": categories_info, "videos": [], "attributes": ATTRIBUTE_TO_ID, } num_images = 0 num_anns = 0 num_videos = 0 # A "sample" in nuScenes refers to a timestamp with 6 cameras and 1 LIDAR. for sample in nusc.sample: scene_name = nusc.get("scene", sample["scene_token"])["name"] if not (split in ["mini", "test" ]) and not (scene_name in SCENE_SPLITS[split]): continue if sample["prev"] == "": print("scene_name", scene_name) num_videos += 1 ret["videos"].append({ "id": num_videos, "file_name": scene_name }) frame_ids = {k: 0 for k in sample["data"]} track_ids = {} # We decompose a sample into 6 images in our case. for sensor_name in sample["data"]: if sensor_name in USED_SENSOR: image_token = sample["data"][sensor_name] image_data = nusc.get("sample_data", image_token) num_images += 1 # Complex coordinate transform. This will take time to understand. sd_record = nusc.get("sample_data", image_token) cs_record = nusc.get("calibrated_sensor", sd_record["calibrated_sensor_token"]) pose_record = nusc.get("ego_pose", sd_record["ego_pose_token"]) global_from_car = transform_matrix( pose_record["translation"], Quaternion(pose_record["rotation"]), inverse=False, ) car_from_sensor = transform_matrix( cs_record["translation"], Quaternion(cs_record["rotation"]), inverse=False, ) trans_matrix = np.dot(global_from_car, car_from_sensor) _, boxes, camera_intrinsic = nusc.get_sample_data( image_token, box_vis_level=BoxVisibility.ANY) calib = np.eye(4, dtype=np.float32) calib[:3, :3] = camera_intrinsic calib = calib[:3] frame_ids[sensor_name] += 1 # image information in COCO format image_info = { "id": num_images, "file_name": image_data["filename"], "calib": calib.tolist(), "video_id": num_videos, "frame_id": frame_ids[sensor_name], "sensor_id": SENSOR_ID[sensor_name], "sample_token": sample["token"], "trans_matrix": trans_matrix.tolist(), "width": sd_record["width"], "height": sd_record["height"], "pose_record_trans": pose_record["translation"], "pose_record_rot": pose_record["rotation"], "cs_record_trans": cs_record["translation"], "cs_record_rot": cs_record["rotation"], } ret["images"].append(image_info) anns = [] for box in boxes: det_name = category_to_detection_name(box.name) if det_name is None: continue num_anns += 1 v = np.dot(box.rotation_matrix, np.array([1, 0, 0])) yaw = -np.arctan2(v[2], v[0]) box.translate(np.array([0, box.wlh[2] / 2, 0])) category_id = CAT_IDS[det_name] amodel_center = project_to_image( np.array( [ box.center[0], box.center[1] - box.wlh[2] / 2, box.center[2], ], np.float32, ).reshape(1, 3), calib, )[0].tolist() sample_ann = nusc.get("sample_annotation", box.token) instance_token = sample_ann["instance_token"] if not (instance_token in track_ids): track_ids[instance_token] = len(track_ids) + 1 attribute_tokens = sample_ann["attribute_tokens"] attributes = [ nusc.get("attribute", att_token)["name"] for att_token in attribute_tokens ] att = "" if len(attributes) == 0 else attributes[0] if len(attributes) > 1: print(attributes) import pdb pdb.set_trace() track_id = track_ids[instance_token] vel = 
nusc.box_velocity(box.token) # global frame vel = np.dot( np.linalg.inv(trans_matrix), np.array([vel[0], vel[1], vel[2], 0], np.float32), ).tolist() # instance information in COCO format ann = { "id": num_anns, "image_id": num_images, "category_id": category_id, "dim": [box.wlh[2], box.wlh[0], box.wlh[1]], "location": [box.center[0], box.center[1], box.center[2]], "depth": box.center[2], "occluded": 0, "truncated": 0, "rotation_y": yaw, "amodel_center": amodel_center, "iscrowd": 0, "track_id": track_id, "attributes": ATTRIBUTE_TO_ID[att], "velocity": vel, } bbox = KittiDB.project_kitti_box_to_image( copy.deepcopy(box), camera_intrinsic, imsize=(1600, 900)) alpha = _rot_y2alpha( yaw, (bbox[0] + bbox[2]) / 2, camera_intrinsic[0, 2], camera_intrinsic[0, 0], ) ann["bbox"] = [ bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1], ] ann["area"] = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) ann["alpha"] = alpha anns.append(ann) # Filter out bounding boxes outside the image visable_anns = [] for i in range(len(anns)): vis = True for j in range(len(anns)): if anns[i]["depth"] - min(anns[i][ "dim"]) / 2 > anns[j]["depth"] + max( anns[j]["dim"]) / 2 and _bbox_inside( anns[i]["bbox"], anns[j]["bbox"]): vis = False break if vis: visable_anns.append(anns[i]) else: pass for ann in visable_anns: ret["annotations"].append(ann) if DEBUG: img_path = data_path + image_info["file_name"] img = cv2.imread(img_path) img_3d = img.copy() for ann in visable_anns: bbox = ann["bbox"] cv2.rectangle( img, (int(bbox[0]), int(bbox[1])), (int(bbox[2] + bbox[0]), int(bbox[3] + bbox[1])), (0, 0, 255), 3, lineType=cv2.LINE_AA, ) box_3d = compute_box_3d(ann["dim"], ann["location"], ann["rotation_y"]) box_2d = project_to_image(box_3d, calib) img_3d = draw_box_3d(img_3d, box_2d) pt_3d = unproject_2d_to_3d(ann["amodel_center"], ann["depth"], calib) pt_3d[1] += ann["dim"][0] / 2 print("location", ann["location"]) print("loc model", pt_3d) pt_2d = np.array( [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32, ) pt_3d = unproject_2d_to_3d(pt_2d, ann["depth"], calib) pt_3d[1] += ann["dim"][0] / 2 print("loc ", pt_3d) cv2.imshow("img", img) cv2.imshow("img_3d", img_3d) cv2.waitKey() nusc.render_sample_data(image_token) plt.show() print("reordering images") images = ret["images"] video_sensor_to_images = {} for image_info in images: tmp_seq_id = image_info["video_id"] * 20 + image_info["sensor_id"] if tmp_seq_id in video_sensor_to_images: video_sensor_to_images[tmp_seq_id].append(image_info) else: video_sensor_to_images[tmp_seq_id] = [image_info] ret["images"] = [] for tmp_seq_id in sorted(video_sensor_to_images): ret["images"] = ret["images"] + video_sensor_to_images[tmp_seq_id] print("{} {} images {} boxes".format(split, len(ret["images"]), len(ret["annotations"]))) print("out_path", out_path) json.dump(ret, open(out_path, "w"))
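A minimal standalone sketch (my own, not from the converter above) of the velocity transform used there: appending a 0 as the homogeneous component means the inverse sensor-to-global matrix only rotates the vector, which is the correct treatment for a velocity rather than a point:

import numpy as np

trans_matrix = np.eye(4)                 # placeholder global-from-sensor transform
vel_global = np.array([5.0, 0.0, 0.0])   # m/s in the global frame
vel_h = np.append(vel_global, 0.0)       # 0 -> direction, so translation is ignored
vel_sensor = np.dot(np.linalg.inv(trans_matrix), vel_h)[:3]
print(vel_sensor)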
def rasterize(ref_sample): obj_box_list = [] obj_shadow_list = [] ref_sample_token = ref_sample["token"] ref_lidar_data = nusc.get("sample_data", ref_sample["data"]["LIDAR_TOP"]) ref_lidar_calib = nusc.get("calibrated_sensor", ref_lidar_data["calibrated_sensor_token"]) ref_lidar_pose = nusc.get("ego_pose", ref_lidar_data["ego_pose_token"]) ref_from_car = transform_matrix(ref_lidar_calib["translation"], Quaternion(ref_lidar_calib["rotation"]), inverse=True) car_from_global = transform_matrix(ref_lidar_pose["translation"], Quaternion(ref_lidar_pose["rotation"]), inverse=True) next_sample_tokens = traverse(nusc, "sample", "next", n_next, ref_sample_token) for curr_sample_token in ([ref_sample_token] + next_sample_tokens): curr_sample = nusc.get("sample", curr_sample_token) curr_sample_boxes = [] for sample_annotation_token in curr_sample["anns"]: sample_annotation = nusc.get("sample_annotation", sample_annotation_token) detection_name = category_to_detection_name(sample_annotation["category_name"]) if detection_name is None: # there are certain categories we will ignore continue # print(sample_annotation["category_name"], detection_name) curr_sample_boxes.append(DetectionBox( sample_token=curr_sample_token, translation=sample_annotation["translation"], size=sample_annotation["size"], rotation=sample_annotation["rotation"], velocity=nusc.box_velocity(sample_annotation["token"])[:2], num_pts=sample_annotation["num_lidar_pts"] + sample_annotation["num_radar_pts"], detection_name=detection_name, )) # NOTE transform boxes to the *reference* frame curr_sample_boxes = boxes_to_sensor(curr_sample_boxes, ref_lidar_pose, ref_lidar_calib) # NOTE object box binary masks curr_obj_box = draw_obj_boxes(curr_sample_boxes, xlim, ylim) obj_box_list.append(curr_obj_box) curr_sample_data = nusc.get("sample_data", curr_sample["data"]["LIDAR_TOP"]) curr_lidar_pose = nusc.get("ego_pose", curr_sample_data["ego_pose_token"]) curr_lidar_calib = nusc.get("calibrated_sensor", curr_sample_data["calibrated_sensor_token"]) global_from_car = transform_matrix(curr_lidar_pose["translation"], Quaternion(curr_lidar_pose["rotation"]), inverse=False) car_from_curr = transform_matrix(curr_lidar_calib["translation"], Quaternion(curr_lidar_calib["rotation"]), inverse=False) ref_from_curr = reduce(np.dot, [ref_from_car, car_from_global, global_from_car, car_from_curr]) _x0, _y0, _z0 = ref_from_curr[:3, 3] curr_obj_shadow = draw_object_shadow(curr_sample_boxes, _x0, _y0, xlim, ylim) obj_shadow_list.append(curr_obj_shadow) obj_boxes = np.array(obj_box_list) obj_shadows = np.array(obj_shadow_list) ref_scene = nusc.get("scene", ref_sample["scene_token"]) ref_log = nusc.get("log", ref_scene["log_token"]) flip_flag = True if ref_log["location"].startswith("singapore") else False if flip_flag: obj_boxes = obj_boxes[:, :, ::-1] obj_shadows = obj_shadows[:, :, ::-1] ref_lidar_token = ref_lidar_data["token"] obj_box_path = f"{obj_box_dir}/{ref_lidar_token}.bin" if not os.path.exists(obj_box_path): obj_boxes.tofile(obj_box_path) obj_shadow_path = f"{obj_shadow_dir}/{ref_lidar_token}.bin" if not os.path.exists(obj_shadow_path): obj_shadows.tofile(obj_shadow_path)
lidar_to_ego = transform_matrix(calib_sensor['translation'],
                                Quaternion(calib_sensor['rotation']))
ego_to_global_prev = transform_matrix(ego_pose_prev['translation'],
                                      Quaternion(ego_pose_prev['rotation']))
lidar_to_ego_prev = transform_matrix(calib_sensor_prev['translation'],
                                     Quaternion(calib_sensor_prev['rotation']))

lidar_to_global = np.dot(ego_to_global, lidar_to_ego)
lidar_to_global_prev = np.dot(ego_to_global_prev, lidar_to_ego_prev)

delta = inv(lidar_to_global_prev).dot(lidar_to_global)
y_translation.append(delta[1, 3])
delta_angles = rotationMatrixToEulerAngles(delta[0:3, 0:3]) * 180 / np.pi
yaw.append(delta_angles[2])

for annotation_token in sample['anns']:
    annotation_metadata = nusc.get('sample_annotation', annotation_token)
    if annotation_metadata['num_lidar_pts'] == 0:
        continue
    detection_name = category_to_detection_name(annotation_metadata['category_name'])
    if detection_name is None:
        continue
    _, box_lidar, _ = nusc.get_sample_data(sample['data'][sensor],
                                           box_vis_level=BoxVisibility.NONE,
                                           selected_anntokens=[annotation_token])
    box_lidar = box_lidar[0]
    box_lidar.rotate(nu_to_kitti_lidar)
    if x_positions[detection_name] is None:
        x_positions[detection_name] = [box_lidar.center[0]]
        y_positions[detection_name] = [box_lidar.center[1]]
        length[detection_name] = [box_lidar.wlh[1]]
        width[detection_name] = [box_lidar.wlh[0]]
        num_lidar_points[detection_name] = [annotation_metadata['num_lidar_pts']]
    else:
        x_positions[detection_name].append(box_lidar.center[0])
        y_positions[detection_name].append(box_lidar.center[1])
def __getitem__(self, idx):
    # Set defaults here.
    kitti_to_nu_lidar = Quaternion(axis=(0, 0, 1), angle=np.pi / 2)
    kitti_to_nu_lidar_inv = kitti_to_nu_lidar.inverse
    imsize = (1600, 900)

    # sample_token based on index.
    sample_token = self.sample_tokens[idx]

    # Get sample data.
    sample = self.nusc.get('sample', sample_token)
    sample_annotation_tokens = sample['anns']
    cam_front_token = sample['data'][self.cam_name]
    lidar_token = sample['data'][self.lidar_name]

    # Retrieve sensor records.
    sd_record_cam = self.nusc.get('sample_data', cam_front_token)
    sd_record_lid = self.nusc.get('sample_data', lidar_token)
    cs_record_cam = self.nusc.get('calibrated_sensor',
                                  sd_record_cam['calibrated_sensor_token'])
    cs_record_lid = self.nusc.get('calibrated_sensor',
                                  sd_record_lid['calibrated_sensor_token'])

    # Combine transformations and convert to KITTI format.
    # Note: cam uses the same conventions in KITTI and nuScenes.
    lid_to_ego = transform_matrix(cs_record_lid['translation'],
                                  Quaternion(cs_record_lid['rotation']),
                                  inverse=False)
    ego_to_cam = transform_matrix(cs_record_cam['translation'],
                                  Quaternion(cs_record_cam['rotation']),
                                  inverse=True)
    velo_to_cam = np.dot(ego_to_cam, lid_to_ego)

    # First rotate from KITTI to nuScenes lidar coordinates, then apply velo_to_cam.
    velo_to_cam_kitti = np.dot(velo_to_cam, kitti_to_nu_lidar.transformation_matrix)

    r0_rect = Quaternion(axis=[1, 0, 0], angle=0)  # Dummy values.

    # Projection matrix.
    p_left_kitti = np.zeros((3, 4))
    p_left_kitti[:3, :3] = cs_record_cam['camera_intrinsic']  # Cameras are always rectified.

    # Create KITTI style transforms.
    velo_to_cam_rot = velo_to_cam_kitti[:3, :3]
    velo_to_cam_trans = velo_to_cam_kitti[:3, 3]

    # Check that the rotation has the same format as in KITTI.
    assert (velo_to_cam_rot.round(0) == np.array([[0, -1, 0],
                                                  [0, 0, -1],
                                                  [1, 0, 0]])).all()
    assert (velo_to_cam_trans[1:3] < 0).all()

    # Retrieve the image filename from the camera record.
    # Note that this may be confusing as the filename of the camera will include the
    # timestamp of the lidar, not the camera.
    filename_cam_full = sd_record_cam['filename']

    # Load the camera image.
    src_im_path = os.path.join(self.nusc.dataroot, filename_cam_full)
    img = Image.open(src_im_path)

    # Create the calibration matrix. The camera is already rectified, so the 3x3
    # intrinsics can be taken directly from the projection matrix; copy so that the
    # flip augmentation below does not modify p_left_kitti in place.
    K = p_left_kitti[:3, :3].astype(np.float32).copy()

    # Populate the list of object annotations for this sample.
    anns = []
    for sample_annotation_token in sample_annotation_tokens:
        sample_annotation = self.nusc.get('sample_annotation',
                                          sample_annotation_token)

        # Get box in LIDAR frame.
        _, box_lidar_nusc, _ = self.nusc.get_sample_data(
            lidar_token,
            box_vis_level=BoxVisibility.NONE,
            selected_anntokens=[sample_annotation_token])
        box_lidar_nusc = box_lidar_nusc[0]

        # Truncated: set all objects to 0, which means untruncated.
        truncated = 0.0

        # Occluded: set all objects to fully visible, as this information is
        # not available in nuScenes.
        occluded = 0

        # Convert the nuScenes category to a nuScenes detection challenge category.
        detection_name = category_to_detection_name(
            sample_annotation['category_name'])

        # Skip categories that are not part of the nuScenes detection challenge.
        if detection_name is None:
            continue

        # Convert from nuScenes to KITTI box format.
        box_cam_kitti = KittiDB.box_nuscenes_to_kitti(
            box_lidar_nusc, Quaternion(matrix=velo_to_cam_rot),
            velo_to_cam_trans, r0_rect)

        # Project the 3D box to a 2D box in the image; skip the box if it does
        # not fall inside the image.
        bbox_2d = KittiDB.project_kitti_box_to_image(box_cam_kitti, p_left_kitti,
                                                     imsize=imsize)
        if bbox_2d is None:
            continue

        # Set a dummy score so we can use this file as a result file.
        box_cam_kitti.score = 0

        # Convert the box to the KITTI label string format.
        output = KittiDB.box_to_string(name=detection_name,
                                       box=box_cam_kitti,
                                       bbox_2d=bbox_2d,
                                       truncation=truncated,
                                       occlusion=occluded)

        fieldnames = ['type', 'truncated', 'occluded', 'alpha',
                      'xmin', 'ymin', 'xmax', 'ymax',
                      'dh', 'dw', 'dl', 'lx', 'ly', 'lz', 'ry']
        if self.is_train:
            f = StringIO(output)
            reader = csv.DictReader(f, delimiter=' ', fieldnames=fieldnames)
            for line, row in enumerate(reader):
                if row["type"] in self.classes:
                    anns.append({
                        "class": row["type"],
                        "label": TYPE_ID_CONVERSION[row["type"]],
                        "truncation": float(row["truncated"]),
                        "occlusion": float(row["occluded"]),
                        "alpha": float(row["alpha"]),
                        "dimensions": [float(row['dl']),
                                       float(row['dh']),
                                       float(row['dw'])],
                        "locations": [float(row['lx']),
                                      float(row['ly']),
                                      float(row['lz'])],
                        "rot_y": float(row["ry"])
                    })

    center = np.array([i / 2 for i in img.size], dtype=np.float32)
    size = np.array([i for i in img.size], dtype=np.float32)

    # Resize, horizontal flip and affine augmentation are performed here, because
    # it is complicated to compute the heatmap w.r.t. the transform afterwards.
    flipped = False
    if self.is_train and random.random() < self.flip_prob:
        flipped = True
        img = img.transpose(Image.FLIP_LEFT_RIGHT)
        center[0] = size[0] - center[0] - 1
        K[0, 2] = size[0] - K[0, 2] - 1

    affine = False
    if self.is_train and random.random() < self.aug_prob:
        affine = True
        shift, scale = self.shift_scale[0], self.shift_scale[1]
        shift_ranges = np.arange(-shift, shift + 0.1, 0.1)
        center[0] += size[0] * random.choice(shift_ranges)
        center[1] += size[1] * random.choice(shift_ranges)
        scale_ranges = np.arange(1 - scale, 1 + scale + 0.1, 0.1)
        size *= random.choice(scale_ranges)

    center_size = [center, size]
    trans_affine = get_transfrom_matrix(center_size,
                                        [self.input_width, self.input_height])
    trans_affine_inv = np.linalg.inv(trans_affine)
    img = img.transform(
        (self.input_width, self.input_height),
        method=Image.AFFINE,
        data=trans_affine_inv.flatten()[:6],
        resample=Image.BILINEAR,
    )

    trans_mat = get_transfrom_matrix(center_size,
                                     [self.output_width, self.output_height])

    if not self.is_train:
        # For inference we parametrize with the original image size.
        target = ParamsList(image_size=size, is_train=self.is_train)
        target.add_field("trans_mat", trans_mat)
        target.add_field("K", K)
        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target, idx

    heat_map = np.zeros([self.num_classes, self.output_height, self.output_width],
                        dtype=np.float32)
    regression = np.zeros([self.max_objs, 3, 8], dtype=np.float32)
    cls_ids = np.zeros([self.max_objs], dtype=np.int32)
    proj_points = np.zeros([self.max_objs, 2], dtype=np.int32)
    p_offsets = np.zeros([self.max_objs, 2], dtype=np.float32)
    dimensions = np.zeros([self.max_objs, 3], dtype=np.float32)
    locations = np.zeros([self.max_objs, 3], dtype=np.float32)
    rotys = np.zeros([self.max_objs], dtype=np.float32)
    reg_mask = np.zeros([self.max_objs], dtype=np.uint8)
    flip_mask = np.zeros([self.max_objs], dtype=np.uint8)

    for i, a in enumerate(anns):
        a = a.copy()
        cls = a["label"]
        locs = np.array(a["locations"])
        rot_y = np.array(a["rot_y"])
        if flipped:
            locs[0] *= -1
            rot_y *= -1

        point, box2d, box3d = encode_label(K, rot_y, a["dimensions"], locs)
        point = affine_transform(point, trans_mat)
        box2d[:2] = affine_transform(box2d[:2], trans_mat)
        box2d[2:] = affine_transform(box2d[2:], trans_mat)
        box2d[[0, 2]] = box2d[[0, 2]].clip(0, self.output_width - 1)
        box2d[[1, 3]] = box2d[[1, 3]].clip(0, self.output_height - 1)
        h, w = box2d[3] - box2d[1], box2d[2] - box2d[0]

        if (0 < point[0] < self.output_width) and (0 < point[1] < self.output_height):
            point_int = point.astype(np.int32)
            p_offset = point - point_int
            radius = gaussian_radius(h, w)
            radius = max(0, int(radius))
            heat_map[cls] = draw_umich_gaussian(heat_map[cls], point_int, radius)

            cls_ids[i] = cls
            regression[i] = box3d
            proj_points[i] = point_int
            p_offsets[i] = p_offset
            dimensions[i] = np.array(a["dimensions"])
            locations[i] = locs
            rotys[i] = rot_y
            reg_mask[i] = 1 if not affine else 0
            flip_mask[i] = 1 if not affine and flipped else 0

    target = ParamsList(image_size=img.size, is_train=self.is_train)
    target.add_field("hm", heat_map)
    target.add_field("reg", regression)
    target.add_field("cls_ids", cls_ids)
    target.add_field("proj_p", proj_points)
    target.add_field("dimensions", dimensions)
    target.add_field("locations", locations)
    target.add_field("rotys", rotys)
    target.add_field("trans_mat", trans_mat)
    target.add_field("K", K)
    target.add_field("reg_mask", reg_mask)
    target.add_field("flip_mask", flip_mask)

    if self.transforms is not None:
        img, target = self.transforms(img, target)

    return img, target, idx
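Because `__getitem__` returns an `(img, target, idx)` triple, the dataset can be consumed by a standard PyTorch `DataLoader`. The snippet below is a minimal, hypothetical usage sketch: the class name `NuScenesKittiDataset` and its constructor arguments are assumptions made for illustration (they are not defined in this file), and it assumes `self.transforms` either is `None` or leaves the image as a PIL image of fixed input size.

import numpy as np
import torch
from torch.utils.data import DataLoader

# Hypothetical construction; the real class name and arguments may differ.
dataset = NuScenesKittiDataset(root="data/nuscenes", split="train", is_train=True)

def collate(batch):
    # Images share the same size after the affine warp, so they can be stacked;
    # the ParamsList targets are kept as a plain Python list.
    imgs, targets, idxs = zip(*batch)
    imgs = torch.stack([torch.as_tensor(np.array(img)).permute(2, 0, 1).float()
                        for img in imgs])
    return imgs, list(targets), list(idxs)

loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate)
for imgs, targets, idxs in loader:
    print(imgs.shape, len(targets))
    break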
def main():
    SCENE_SPLITS['mini-val'] = SCENE_SPLITS['val']
    if not os.path.exists(OUT_PATH):
        os.mkdir(OUT_PATH)
    for split in SPLITS:
        data_path = DATA_PATH + '{}/'.format(SPLITS[split])
        nusc = NuScenes(version=SPLITS[split], dataroot=data_path, verbose=True)
        out_path = OUT_PATH + '{}.json'.format(split)
        categories_info = [{'name': CATS[i], 'id': i + 1} for i in range(len(CATS))]
        ret = {'images': [], 'annotations': [], 'categories': categories_info,
               'videos': [], 'attributes': ATTRIBUTE_TO_ID}
        num_images = 0
        num_anns = 0
        num_videos = 0

        # A "sample" in nuScenes refers to a timestamp with 6 cameras and 1 LIDAR.
        for sample in nusc.sample:
            scene_name = nusc.get('scene', sample['scene_token'])['name']
            if split not in ['mini', 'test'] and \
               scene_name not in SCENE_SPLITS[split]:
                continue
            if sample['prev'] == '':
                print('scene_name', scene_name)
                num_videos += 1
                ret['videos'].append({'id': num_videos, 'file_name': scene_name})
                frame_ids = {k: 0 for k in sample['data']}
                track_ids = {}

            # We decompose a sample into 6 images in our case.
            for sensor_name in sample['data']:
                if sensor_name not in USED_SENSOR:
                    continue
                image_token = sample['data'][sensor_name]
                image_data = nusc.get('sample_data', image_token)
                num_images += 1

                # Complex coordinate transform; this takes some time to understand.
                sd_record = nusc.get('sample_data', image_token)
                cs_record = nusc.get('calibrated_sensor',
                                     sd_record['calibrated_sensor_token'])
                pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
                global_from_car = transform_matrix(pose_record['translation'],
                                                   Quaternion(pose_record['rotation']),
                                                   inverse=False)
                car_from_sensor = transform_matrix(cs_record['translation'],
                                                   Quaternion(cs_record['rotation']),
                                                   inverse=False)
                trans_matrix = np.dot(global_from_car, car_from_sensor)
                _, boxes, camera_intrinsic = nusc.get_sample_data(
                    image_token, box_vis_level=BoxVisibility.ANY)
                calib = np.eye(4, dtype=np.float32)
                calib[:3, :3] = camera_intrinsic
                calib = calib[:3]
                frame_ids[sensor_name] += 1

                # Image information in COCO format.
                image_info = {'id': num_images,
                              'file_name': image_data['filename'],
                              'calib': calib.tolist(),
                              'video_id': num_videos,
                              'frame_id': frame_ids[sensor_name],
                              'sensor_id': SENSOR_ID[sensor_name],
                              'sample_token': sample['token'],
                              'trans_matrix': trans_matrix.tolist(),
                              'width': sd_record['width'],
                              'height': sd_record['height'],
                              'pose_record_trans': pose_record['translation'],
                              'pose_record_rot': pose_record['rotation'],
                              'cs_record_trans': cs_record['translation'],
                              'cs_record_rot': cs_record['rotation']}
                ret['images'].append(image_info)

                anns = []
                for box in boxes:
                    det_name = category_to_detection_name(box.name)
                    if det_name is None:
                        continue
                    num_anns += 1
                    v = np.dot(box.rotation_matrix, np.array([1, 0, 0]))
                    yaw = -np.arctan2(v[2], v[0])
                    box.translate(np.array([0, box.wlh[2] / 2, 0]))
                    category_id = CAT_IDS[det_name]

                    amodel_center = project_to_image(
                        np.array([box.center[0],
                                  box.center[1] - box.wlh[2] / 2,
                                  box.center[2]], np.float32).reshape(1, 3),
                        calib)[0].tolist()
                    sample_ann = nusc.get('sample_annotation', box.token)
                    instance_token = sample_ann['instance_token']
                    if instance_token not in track_ids:
                        track_ids[instance_token] = len(track_ids) + 1
                    attribute_tokens = sample_ann['attribute_tokens']
                    attributes = [nusc.get('attribute', att_token)['name']
                                  for att_token in attribute_tokens]
                    att = '' if len(attributes) == 0 else attributes[0]
                    if len(attributes) > 1:
                        print(attributes)
                        import pdb; pdb.set_trace()
                    track_id = track_ids[instance_token]
                    vel = nusc.box_velocity(box.token)  # Velocity in the global frame.
                    vel = np.dot(np.linalg.inv(trans_matrix),
                                 np.array([vel[0], vel[1], vel[2], 0],
                                          np.float32)).tolist()

                    # Instance information in COCO format.
                    ann = {'id': num_anns,
                           'image_id': num_images,
                           'category_id': category_id,
                           'dim': [box.wlh[2], box.wlh[0], box.wlh[1]],
                           'location': [box.center[0], box.center[1], box.center[2]],
                           'depth': box.center[2],
                           'occluded': 0,
                           'truncated': 0,
                           'rotation_y': yaw,
                           'amodel_center': amodel_center,
                           'iscrowd': 0,
                           'track_id': track_id,
                           'attributes': ATTRIBUTE_TO_ID[att],
                           'velocity': vel}

                    bbox = KittiDB.project_kitti_box_to_image(
                        copy.deepcopy(box), camera_intrinsic, imsize=(1600, 900))
                    alpha = _rot_y2alpha(yaw, (bbox[0] + bbox[2]) / 2,
                                         camera_intrinsic[0, 2],
                                         camera_intrinsic[0, 0])
                    ann['bbox'] = [bbox[0], bbox[1],
                                   bbox[2] - bbox[0], bbox[3] - bbox[1]]
                    ann['area'] = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
                    ann['alpha'] = alpha
                    anns.append(ann)

                # Filter out boxes that are fully occluded: a box is dropped if it
                # lies entirely behind another box that covers it in the image.
                visable_anns = []
                for i in range(len(anns)):
                    vis = True
                    for j in range(len(anns)):
                        if anns[i]['depth'] - min(anns[i]['dim']) / 2 > \
                           anns[j]['depth'] + max(anns[j]['dim']) / 2 and \
                           _bbox_inside(anns[i]['bbox'], anns[j]['bbox']):
                            vis = False
                            break
                    if vis:
                        visable_anns.append(anns[i])

                for ann in visable_anns:
                    ret['annotations'].append(ann)

                if DEBUG:
                    img_path = data_path + image_info['file_name']
                    img = cv2.imread(img_path)
                    img_3d = img.copy()
                    for ann in visable_anns:
                        bbox = ann['bbox']
                        cv2.rectangle(img, (int(bbox[0]), int(bbox[1])),
                                      (int(bbox[2] + bbox[0]), int(bbox[3] + bbox[1])),
                                      (0, 0, 255), 3, lineType=cv2.LINE_AA)
                        box_3d = compute_box_3d(ann['dim'], ann['location'],
                                                ann['rotation_y'])
                        box_2d = project_to_image(box_3d, calib)
                        img_3d = draw_box_3d(img_3d, box_2d)

                        pt_3d = unproject_2d_to_3d(ann['amodel_center'],
                                                   ann['depth'], calib)
                        pt_3d[1] += ann['dim'][0] / 2
                        print('location', ann['location'])
                        print('loc model', pt_3d)
                        pt_2d = np.array([(bbox[0] + bbox[2]) / 2,
                                          (bbox[1] + bbox[3]) / 2],
                                         dtype=np.float32)
                        pt_3d = unproject_2d_to_3d(pt_2d, ann['depth'], calib)
                        pt_3d[1] += ann['dim'][0] / 2
                        print('loc ', pt_3d)
                    cv2.imshow('img', img)
                    cv2.imshow('img_3d', img_3d)
                    cv2.waitKey()
                    nusc.render_sample_data(image_token)
                    plt.show()

        # Reorder images so that frames of the same video and sensor are consecutive.
        print('reordering images')
        images = ret['images']
        video_sensor_to_images = {}
        for image_info in images:
            tmp_seq_id = image_info['video_id'] * 20 + image_info['sensor_id']
            if tmp_seq_id in video_sensor_to_images:
                video_sensor_to_images[tmp_seq_id].append(image_info)
            else:
                video_sensor_to_images[tmp_seq_id] = [image_info]
        ret['images'] = []
        for tmp_seq_id in sorted(video_sensor_to_images):
            ret['images'] = ret['images'] + video_sensor_to_images[tmp_seq_id]

        print('{} {} images {} boxes'.format(
            split, len(ret['images']), len(ret['annotations'])))
        print('out_path', out_path)
        json.dump(ret, open(out_path, 'w'))
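Since `main()` writes one COCO-style JSON file per split, a quick sanity check is to reload the file and count images, videos, and per-category annotations. The snippet below is a minimal sketch; the path 'nuscenes_coco/train.json' is a placeholder for whatever `OUT_PATH` and split name the converter above actually produced.

import json
from collections import Counter

# Placeholder path: substitute OUT_PATH + '<split>.json' from the converter above.
with open('nuscenes_coco/train.json') as f:
    coco = json.load(f)

# Map category ids back to names and count annotations per detection class.
id_to_name = {c['id']: c['name'] for c in coco['categories']}
counts = Counter(id_to_name[a['category_id']] for a in coco['annotations'])

print('images:     ', len(coco['images']))
print('videos:     ', len(coco['videos']))
print('annotations:', len(coco['annotations']))
for name, n in counts.most_common():
    print('  {:20s} {}'.format(name, n))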