def generateXYZRGB(ds: LyftDataset, sample_token: str, config: dict, folder: str):
    sample = ds.get("sample", sample_token)
    sample_lidar_token = sample["data"]["LIDAR_TOP"]
    lidar_data = ds.get("sample_data", sample_lidar_token)
    lidar_filepath = ds.get_sample_data_path(sample_lidar_token)
    ego_pose = ds.get("ego_pose", lidar_data["ego_pose_token"])
    calibrated_sensor = ds.get("calibrated_sensor", lidar_data["calibrated_sensor_token"])

    # Homogeneous transformation matrices: car -> world and sensor -> car.
    global_from_car = transform_matrix(ego_pose['translation'],
                                       Quaternion(ego_pose['rotation']), inverse=False)
    car_from_sensor = transform_matrix(calibrated_sensor['translation'],
                                       Quaternion(calibrated_sensor['rotation']), inverse=False)

    try:
        lidar_pointcloud = LidarPointCloud.from_file(lidar_filepath)
        pc = copy.deepcopy(lidar_pointcloud)
        lidar_pointcloud.transform(car_from_sensor)
    except Exception as e:
        print("Failed to load Lidar Pointcloud for {}: {}".format(sample_token, e))
        return  # without a point cloud there is nothing to save

    XYZRGB = getXYZRGB(ds, pc, sample, lidar_data)
    # Pass the array to savez; calling it with only a filename saves an empty archive.
    np.savez(folder + sample_lidar_token + ".xyzrgb", XYZRGB)
def transform_bounding_box_to_sensor_coord_and_get_corners(box: Box, sample_data_token: str,
                                                           lyftd: LyftDataset,
                                                           frustum_pointnet_convention=False):
    """Transform a bounding box to the sensor coordinate frame and get its corners.

    :param box: bounding box in world coordinates
    :param sample_data_token: camera data token
    :param lyftd: LyftDataset instance
    :return: (corners projected onto the image, corners in camera coordinates)
    """
    transformed_box = transform_box_from_world_to_sensor_coordinates(box, sample_data_token, lyftd)
    sd_record = lyftd.get("sample_data", sample_data_token)
    cs_record = lyftd.get("calibrated_sensor", sd_record["calibrated_sensor_token"])
    sensor_record = lyftd.get("sensor", cs_record["sensor_token"])

    if sensor_record['modality'] == 'camera':
        cam_intrinsic_mtx = np.array(cs_record["camera_intrinsic"])
    else:
        cam_intrinsic_mtx = None

    box_corners_on_cam_coord = transformed_box.corners()

    # Rearrange the corners to conform to Frustum-PointNet's convention
    if frustum_pointnet_convention:
        rearranged_idx = [0, 3, 7, 4, 1, 2, 6, 5]
        box_corners_on_cam_coord = box_corners_on_cam_coord[:, rearranged_idx]

        # Corners 0 and 6 are diagonally opposite, so their midpoint is the box center.
        assert np.allclose((box_corners_on_cam_coord[:, 0] + box_corners_on_cam_coord[:, 6]) / 2,
                           np.array(transformed_box.center))

    # For perspective projection, normalization must be set to True
    box_corners_on_image = view_points(box_corners_on_cam_coord, view=cam_intrinsic_mtx, normalize=True)

    return box_corners_on_image, box_corners_on_cam_coord
def transform_box_from_world_to_flat_sensor_coordinates(first_train_sample_box: Box,
                                                        sample_data_token: str,
                                                        lyftd: LyftDataset):
    sample_box = first_train_sample_box.copy()
    sd_record = lyftd.get("sample_data", sample_data_token)
    cs_record = lyftd.get("calibrated_sensor", sd_record["calibrated_sensor_token"])
    sensor_record = lyftd.get("sensor", cs_record["sensor_token"])
    pose_record = lyftd.get("ego_pose", sd_record["ego_pose_token"])

    # Move box to the ego vehicle coord system kept parallel to the world z plane
    ypr = Quaternion(pose_record["rotation"]).yaw_pitch_roll
    yaw = ypr[0]
    sample_box.translate(-np.array(pose_record["translation"]))
    sample_box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse)

    # Move box to the sensor coord system kept parallel to the world z plane.
    # Two steps are needed because the camera coordinate frame is x(right), z(front), y(down).
    inv_ypr = Quaternion(cs_record["rotation"]).inverse.yaw_pitch_roll
    angz = inv_ypr[0]
    sample_box.translate(-np.array(cs_record['translation']))
    # rotate around the z-axis
    sample_box.rotate(Quaternion(scalar=np.cos(angz / 2), vector=[0, 0, np.sin(angz / 2)]))
    # rotate around the x-axis by 90 degrees; the half-angle formula expects radians,
    # so use pi/2 rather than the literal 90
    angx = np.pi / 2
    sample_box.rotate(Quaternion(scalar=np.cos(angx / 2), vector=[np.sin(angx / 2), 0, 0]))

    return sample_box
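# A minimal sketch verifying the half-angle construction used above (and in the
# other flat-coordinate transforms): the manual scalar/vector quaternion equals
# pyquaternion's axis-angle constructor. The yaw value is hypothetical.
import numpy as np
from pyquaternion import Quaternion

yaw = 0.3  # radians
q_manual = Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)])
q_axis_angle = Quaternion(axis=[0, 0, 1], angle=yaw)
assert np.allclose(q_manual.elements, q_axis_angle.elements)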
def transform_pc_to_camera_coord(cam: dict, pointsensor: dict, point_cloud_3d: LidarPointCloud,
                                 lyftd: LyftDataset):
    warnings.warn("The point cloud is transformed to camera coordinates in place", UserWarning)

    # Points live in the point sensor frame, so they need to be transformed via global to the image plane.
    # First step: transform the point cloud to the ego vehicle frame for the timestamp of the sweep.
    cs_record = lyftd.get("calibrated_sensor", pointsensor["calibrated_sensor_token"])
    point_cloud_3d.rotate(Quaternion(cs_record["rotation"]).rotation_matrix)
    point_cloud_3d.translate(np.array(cs_record["translation"]))

    # Second step: transform to the global frame.
    poserecord = lyftd.get("ego_pose", pointsensor["ego_pose_token"])
    point_cloud_3d.rotate(Quaternion(poserecord["rotation"]).rotation_matrix)
    point_cloud_3d.translate(np.array(poserecord["translation"]))

    # Third step: transform into the ego vehicle frame for the timestamp of the image.
    poserecord = lyftd.get("ego_pose", cam["ego_pose_token"])
    point_cloud_3d.translate(-np.array(poserecord["translation"]))
    point_cloud_3d.rotate(Quaternion(poserecord["rotation"]).rotation_matrix.T)

    # Fourth step: transform into the camera frame.
    cs_record = lyftd.get("calibrated_sensor", cam["calibrated_sensor_token"])
    point_cloud_3d.translate(-np.array(cs_record["translation"]))
    point_cloud_3d.rotate(Quaternion(cs_record["rotation"]).rotation_matrix.T)

    # Take the actual picture (matrix multiplication with the camera matrix + renormalization).
    point_cloud_2d = view_points(point_cloud_3d.points[:3, :],
                                 np.array(cs_record["camera_intrinsic"]), normalize=True)

    return point_cloud_3d, point_cloud_2d
def transform_world_to_image_coordinate(world_coord_array, camera_token: str, lyftd: LyftDataset):
    sd_record = lyftd.get("sample_data", camera_token)
    cs_record = lyftd.get("calibrated_sensor", sd_record["calibrated_sensor_token"])
    sensor_record = lyftd.get("sensor", cs_record["sensor_token"])
    pose_record = lyftd.get("ego_pose", sd_record["ego_pose_token"])

    cam_intrinsic_mtx = np.array(cs_record["camera_intrinsic"])

    pose_to_sense_rot_mtx = Quaternion(cs_record['rotation']).rotation_matrix.T
    world_to_pose_rot_mtx = Quaternion(pose_record['rotation']).rotation_matrix.T

    # world -> ego: remove the ego translation before rotating into the ego frame
    t = np.array(pose_record['translation'])
    ego_coord_array = np.dot(world_to_pose_rot_mtx, world_coord_array - t.reshape(3, 1))

    # ego -> sensor: remove the sensor translation before rotating into the sensor frame
    t = np.array(cs_record['translation'])
    sense_coord_array = np.dot(pose_to_sense_rot_mtx, ego_coord_array - t.reshape(3, 1))

    return view_points(sense_coord_array, cam_intrinsic_mtx, normalize=True)
def create_lyft_infos(root_path=Path("/media/zzhou/data/data-lyft-3D-objects/")):
    # ====================================================================
    # A. Creating an index and splitting into train and validation scenes
    # ====================================================================
    # root_path is a Path, so join with "/" rather than string concatenation
    level5data = LyftDataset(data_path=root_path, json_path=root_path / 'train_data', verbose=True)
    classes = ["car", "motorcycle", "bus", "bicycle", "truck", "pedestrian",
               "other_vehicle", "animal", "emergency_vehicle"]

    records = [(level5data.get('sample', record['first_sample_token'])['timestamp'], record)
               for record in level5data.scene]

    entries = []
    for start_time, record in sorted(records):
        start_time = level5data.get('sample', record['first_sample_token'])['timestamp'] / 1000000
        token = record['token']
        name = record['name']
        date = datetime.utcfromtimestamp(start_time)
        host = "-".join(record['name'].split("-")[:2])
        first_sample_token = record["first_sample_token"]
        entries.append((host, name, date, token, first_sample_token))

    df = pd.DataFrame(entries, columns=["host", "scene_name", "date", "scene_token",
                                        "first_sample_token"])

    validation_hosts = ["host-a007", "host-a008", "host-a009"]
    validation_df = df[df["host"].isin(validation_hosts)]
    vi = validation_df.index
    train_df = df[~df.index.isin(vi)]
    print(len(train_df), len(validation_df), "train/validation split scene counts")

    # ====================================================================
    # B. build SECOND database
    # ====================================================================
    metadata = {
        "version": f"{len(train_df)} train / {len(validation_df)} validation split scene counts",
    }
    train_lyft_infos = _scan_sample_token(level5data, train_df)
    data = {
        "infos": train_lyft_infos,
        "metadata": metadata,
    }
    with open(root_path / "infos_train.pkl", 'wb') as f:
        pickle.dump(data, f)

    val_lyft_infos = _scan_sample_token(level5data, validation_df)
    data["infos"] = val_lyft_infos
    with open(root_path / "infos_val.pkl", 'wb') as f:
        pickle.dump(data, f)
def transform_box_from_world_to_ego_coordinates(first_train_sample_box: Box, sample_data_token: str,
                                                lyftd: LyftDataset):
    sample_box = first_train_sample_box.copy()
    sd_record = lyftd.get("sample_data", sample_data_token)
    cs_record = lyftd.get("calibrated_sensor", sd_record["calibrated_sensor_token"])
    sensor_record = lyftd.get("sensor", cs_record["sensor_token"])
    pose_record = lyftd.get("ego_pose", sd_record["ego_pose_token"])

    # Move box to the ego vehicle coord system
    sample_box.translate(-np.array(pose_record["translation"]))
    sample_box.rotate(Quaternion(pose_record["rotation"]).inverse)

    return sample_box
def get_sensor_to_world_transform_matrix_from_sample_data_token(sample_data_token, lyftd: LyftDataset):
    sd_record = lyftd.get("sample_data", sample_data_token)
    cs_record = lyftd.get("calibrated_sensor", sd_record["calibrated_sensor_token"])
    sensor_record = lyftd.get("sensor", cs_record["sensor_token"])
    pose_record = lyftd.get("ego_pose", sd_record["ego_pose_token"])

    sensor_to_ego_mtx = transform_matrix(translation=np.array(cs_record["translation"]),
                                         rotation=Quaternion(cs_record["rotation"]))
    ego_to_world_mtx = transform_matrix(translation=np.array(pose_record["translation"]),
                                        rotation=Quaternion(pose_record["rotation"]))

    return np.dot(ego_to_world_mtx, sensor_to_ego_mtx)
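# Usage sketch (assumes lyftd is a loaded LyftDataset): compose the 4x4
# sensor->world matrix for the first sample's top lidar and apply it to a
# 3xN point array by padding a homogeneous row. The points are placeholders.
lidar_token = lyftd.sample[0]["data"]["LIDAR_TOP"]
sensor_to_world = get_sensor_to_world_transform_matrix_from_sample_data_token(lidar_token, lyftd)
points_sensor = np.zeros((3, 100))  # placeholder 3xN points in the sensor frame
points_h = np.vstack([points_sensor, np.ones((1, points_sensor.shape[1]))])
points_world = np.dot(sensor_to_world, points_h)[:3, :]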
def convert_box_to_world_coord_with_sample_data_token(input_sample_box, sample_data_token,
                                                      lyftd: LyftDataset):
    sample_box = input_sample_box.copy()
    sd_record = lyftd.get("sample_data", sample_data_token)
    cs_record = lyftd.get("calibrated_sensor", sd_record["calibrated_sensor_token"])
    sensor_record = lyftd.get("sensor", cs_record["sensor_token"])
    pose_record = lyftd.get("ego_pose", sd_record["ego_pose_token"])

    # Move box from sensor to vehicle ego coord system
    sample_box.rotate(Quaternion(cs_record["rotation"]))
    sample_box.translate(np.array(cs_record["translation"]))

    # Move box from ego vehicle to world coord system
    sample_box.rotate(Quaternion(pose_record["rotation"]))
    sample_box.translate(np.array(pose_record["translation"]))

    return sample_box
def read_frustum_pointnet_output(ldt: LyftDataset, inference_pickle_file, token_pickle_file,
                                 from_rgb_detection: bool):
    with open(inference_pickle_file, 'rb') as fp:
        ps_list = pickle.load(fp)
        if not from_rgb_detection:
            seg_list = pickle.load(fp)
        segp_list = pickle.load(fp)
        center_list = pickle.load(fp)
        heading_cls_list = pickle.load(fp)
        heading_res_list = pickle.load(fp)
        size_cls_list = pickle.load(fp)
        size_res_list = pickle.load(fp)
        rot_angle_list = pickle.load(fp)
        score_list = pickle.load(fp)
        if from_rgb_detection:
            onehot_list = pickle.load(fp)

    with open(token_pickle_file, 'rb') as fp:
        sample_token_list = pickle.load(fp)
        annotation_token_list = pickle.load(fp)
        camera_data_token_list = pickle.load(fp)
        type_list = pickle.load(fp)

    ldt.get('sample', sample_token_list[0])
    if annotation_token_list:
        ldt.get('sample_annotation', annotation_token_list[0])

    print("length of sample token list:", len(sample_token_list))
    print("length of ps list:", len(ps_list))
    assert len(sample_token_list) == len(ps_list)

    boxes = []
    gt_boxes = []
    for data_idx in range(len(ps_list)):
        inferred_box = get_box_from_inference(lyftd=ldt,
                                              heading_cls=heading_cls_list[data_idx],
                                              heading_res=heading_res_list[data_idx],
                                              rot_angle=rot_angle_list[data_idx],
                                              size_cls=size_cls_list[data_idx],
                                              size_res=size_res_list[data_idx],
                                              center_coord=center_list[data_idx],
                                              sample_data_token=camera_data_token_list[data_idx],
                                              score=score_list[data_idx])
        inferred_box.name = type_list[data_idx]
        boxes.append(inferred_box)
        if not from_rgb_detection:
            gt_boxes.append(ldt.get_box(annotation_token_list[data_idx]))

    return boxes, gt_boxes, sample_token_list
def transform_box_from_world_to_flat_vehicle_coordinates(first_train_sample_box: Box,
                                                         sample_data_token: str,
                                                         lyftd: LyftDataset):
    sample_box = first_train_sample_box.copy()
    sd_record = lyftd.get("sample_data", sample_data_token)
    cs_record = lyftd.get("calibrated_sensor", sd_record["calibrated_sensor_token"])
    sensor_record = lyftd.get("sensor", cs_record["sensor_token"])
    pose_record = lyftd.get("ego_pose", sd_record["ego_pose_token"])

    # Move box to the ego vehicle coord system kept parallel to the world z plane
    ypr = Quaternion(pose_record["rotation"]).yaw_pitch_roll
    yaw = ypr[0]
    sample_box.translate(-np.array(pose_record["translation"]))
    sample_box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse)

    return sample_box
def select_annotation_boxes(sample_token, lyftd: LyftDataset,
                            box_vis_level: BoxVisibility = BoxVisibility.ALL,
                            camera_type=['CAM_FRONT', 'CAM_BACK', 'CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT',
                                         'CAM_BACK_RIGHT', 'CAM_BACK_LEFT']) -> (str, str, Box):
    """
    Select annotations that are visible in a camera image at the given box_vis_level.

    :param sample_token:
    :param box_vis_level: BoxVisibility.ALL or BoxVisibility.ANY
    :param camera_type: a list of cameras that the tokens should be selected from
    :return: yields (sample_token, cam_token, box_in_world_coord)
    """
    sample_record = lyftd.get('sample', sample_token)
    cams = [key for key in sample_record["data"].keys() if "CAM" in key]
    cams = [cam for cam in cams if cam in camera_type]
    for ann_token in sample_record['anns']:
        for cam in cams:
            cam_token = sample_record["data"][cam]
            _, boxes_in_sensor_coord, cam_intrinsic = lyftd.get_sample_data(
                cam_token, box_vis_level=box_vis_level, selected_anntokens=[ann_token])
            if len(boxes_in_sensor_coord) > 0:
                assert len(boxes_in_sensor_coord) == 1
                box_in_world_coord = lyftd.get_box(boxes_in_sensor_coord[0].token)
                yield sample_token, cam_token, box_in_world_coord
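# Usage sketch (assumes lyftd is a loaded LyftDataset): iterate the annotations
# of the first sample that are fully visible in at least one camera.
first_sample_token = lyftd.sample[0]['token']
for s_token, cam_token, world_box in select_annotation_boxes(first_sample_token, lyftd,
                                                             box_vis_level=BoxVisibility.ALL):
    print(cam_token, world_box.name, world_box.center)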
def convert_box_to_world_coord(box: Box, sample_token, sensor_type, lyftd: LyftDataset):
    sample_box = box.copy()
    sample_record = lyftd.get('sample', sample_token)
    sample_data_token = sample_record['data'][sensor_type]

    # Pass lyftd through; the callee needs it to look up the sensor and pose records.
    converted_sample_box = convert_box_to_world_coord_with_sample_data_token(sample_box,
                                                                             sample_data_token,
                                                                             lyftd)
    return converted_sample_box
def transform_image_to_world_coordinate(image_array: np.array, camera_token: str, lyftd: LyftDataset):
    """
    :param image_array: 3xN np.ndarray
    :param camera_token:
    :return: 3xN array in world coordinates
    """
    sd_record = lyftd.get("sample_data", camera_token)
    cs_record = lyftd.get("calibrated_sensor", sd_record["calibrated_sensor_token"])
    sensor_record = lyftd.get("sensor", cs_record["sensor_token"])
    pose_record = lyftd.get("ego_pose", sd_record["ego_pose_token"])

    # invert the viewpoint transformation
    cam_intrinsic_mtx = np.array(cs_record["camera_intrinsic"])
    image_in_cam_coord = np.dot(np.linalg.inv(cam_intrinsic_mtx), image_array)
    print(image_in_cam_coord)
    # TODO: think of how to do normalization properly
    # image_in_cam_coord = image_in_cam_coord / image_in_cam_coord[3:].ravel()

    # homogeneous coordinate to non-homogeneous one
    image_in_cam_coord = image_in_cam_coord[0:3, :]

    # camera -> ego: rotate, then add the sensor translation to the rotated coordinates
    sens_to_pose_rot_mtx = Quaternion(cs_record['rotation']).rotation_matrix
    image_in_pose_coord = np.dot(sens_to_pose_rot_mtx, image_in_cam_coord)
    t = np.array(cs_record['translation'])
    for i in range(3):
        image_in_pose_coord[i, :] = image_in_pose_coord[i, :] + t[i]
    print(cs_record)
    print("in pose record:", image_in_pose_coord)

    # ego -> world: rotate, then add the ego translation
    pose_to_world_rot_mtx = Quaternion(pose_record['rotation']).rotation_matrix
    image_in_world_coord = np.dot(pose_to_world_rot_mtx, image_in_pose_coord)
    t = np.array(pose_record['translation'])
    for i in range(3):
        image_in_world_coord[i, :] = image_in_world_coord[i, :] + t[i]

    return image_in_world_coord
def extract_single_box(train_objects, idx, lyftd: LyftDataset) -> Box:
    first_train_id, first_train_sample_box = get_train_data_sample_token_and_box(idx, train_objects)

    first_train_sample = lyftd.get('sample', first_train_id)
    sample_data_token = first_train_sample['data']['LIDAR_TOP']

    first_train_sample_box = transform_box_from_world_to_sensor_coordinates(first_train_sample_box,
                                                                            sample_data_token, lyftd)
    return first_train_sample_box, sample_data_token
def transform_image_to_cam_coordinate(image_array_p: np.array, camera_token: str, lyftd: LyftDataset):
    sd_record = lyftd.get("sample_data", camera_token)
    cs_record = lyftd.get("calibrated_sensor", sd_record["calibrated_sensor_token"])
    sensor_record = lyftd.get("sensor", cs_record["sensor_token"])
    pose_record = lyftd.get("ego_pose", sd_record["ego_pose_token"])

    # invert the viewpoint transformation
    def normalization(input_array):
        # undo the perspective division: multiply x and y back by the depth in row 2
        input_array[0:2, :] = input_array[0:2, :] * input_array[2:3, :].repeat(2, 0).reshape(
            2, input_array.shape[1])
        return input_array

    image_array = normalization(np.copy(image_array_p))
    # Note: ravel + reshape(4, 1) assumes a single 3x1 point; append the homogeneous 1.
    image_array = np.concatenate((image_array.ravel(), np.array([1])))
    image_array = image_array.reshape(4, 1)

    cam_intrinsic_mtx = np.array(cs_record["camera_intrinsic"])
    view = cam_intrinsic_mtx
    viewpad = np.eye(4)
    viewpad[:view.shape[0], :view.shape[1]] = view
    image_in_cam_coord = np.dot(np.linalg.inv(viewpad), image_array)

    return image_in_cam_coord[0:3, :]
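# Round-trip sketch (assumes lyftd is a loaded LyftDataset; the 3D point is
# hypothetical): view_points with normalize=True divides by depth, so the depth
# must be re-attached in row 2 before this inverse can recover the camera-frame point.
camera_token = lyftd.sample[0]["data"]["CAM_FRONT"]
sd_record = lyftd.get("sample_data", camera_token)
cs_record = lyftd.get("calibrated_sensor", sd_record["calibrated_sensor_token"])
cam_intrinsic_mtx = np.array(cs_record["camera_intrinsic"])

cam_point = np.array([[2.0], [1.0], [10.0]])  # x, y, z in the camera frame
pixel = view_points(cam_point, cam_intrinsic_mtx, normalize=True)
pixel[2, :] = cam_point[2, :]  # re-attach the depth before inverting
recovered = transform_image_to_cam_coordinate(pixel, camera_token, lyftd)
assert np.allclose(recovered, cam_point)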
def read_frustum_pointnet_output_v2(ldt: LyftDataset, inference_tfrecord_file):
    raw_dataset = tf.data.TFRecordDataset(inference_tfrecord_file)

    for raw_record in raw_dataset:
        example = parse_inference_record(raw_record)
        inferred_box = get_box_from_inference(
            lyftd=ldt,
            heading_cls=example['rot_heading_angle_class'].numpy(),
            heading_res=example['rot_heading_angle_residual'].numpy(),
            rot_angle=example['frustum_angle'].numpy(),
            size_cls=example['size_class'].numpy(),
            size_res=example['size_residual'].numpy(),
            center_coord=example['rot_box_center'].numpy(),
            sample_data_token=example['camera_token'].numpy().decode('utf8'),
            score=example['score'].numpy(),
            type_name=example['type_name'].numpy().decode('utf8'))

        pc_record = ldt.get("sample_data", example['camera_token'].numpy().decode('utf8'))
        sample_of_pc_record = ldt.get("sample", pc_record['sample_token'])

        yield inferred_box, pc_record['sample_token']
def select_annotation_boxes(sample_token, lyftd: LyftDataset,
                            box_vis_level: BoxVisibility = BoxVisibility.ALL,
                            camera_type=['CAM_FRONT', 'CAM_BACK', 'CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT',
                                         'CAM_BACK_RIGHT', 'CAM_BACK_LEFT']) -> (str, str, Box):
    """
    Select annotations that are visible in a camera image at the given box_vis_level.

    :param sample_token:
    :param box_vis_level: BoxVisibility.ALL or BoxVisibility.ANY
    :param camera_type: a list of cameras that the tokens should be selected from
    :return: yields (image_filepath, cam_token, corner_list, boxes_in_sensor_coord, img_width, img_height)
    """
    sample_record = lyftd.get('sample', sample_token)
    cams = [key for key in sample_record["data"].keys() if "CAM" in key]
    cams = [cam for cam in cams if cam in camera_type]
    for cam in cams:
        # This step selects all the annotations in a camera image that match box_vis_level
        cam_token = sample_record["data"][cam]
        image_filepath, boxes_in_sensor_coord, cam_intrinsic = lyftd.get_sample_data(
            cam_token, box_vis_level=box_vis_level, selected_anntokens=sample_record['anns'])

        sd_record = lyftd.get('sample_data', cam_token)
        img_width = sd_record['width']  # typically 1920
        img_height = sd_record['height']  # typically 1080

        CORNER_NUMBER = 4
        corner_list = np.empty((len(boxes_in_sensor_coord), CORNER_NUMBER))
        for idx, box_in_sensor_coord in enumerate(boxes_in_sensor_coord):
            # For perspective projection, normalization must be set to True
            box_corners_on_image = view_points(box_in_sensor_coord.corners(), view=cam_intrinsic,
                                               normalize=True)
            corners_2d = get_2d_corners_from_projected_box_coordinates(box_corners_on_image)
            corner_list[idx, :] = corners_2d

        yield image_filepath, cam_token, corner_list, boxes_in_sensor_coord, img_width, img_height
def get_all_image_paths_in_single_scene(scene_number, ldf: LyftDataset):
    start_sample_token = ldf.scene[scene_number]['first_sample_token']
    sample_token = start_sample_token
    counter = 0
    while sample_token != "":
        if counter % 10 == 0:
            logging.info("Processing {} token {}".format(scene_number, counter))
        counter += 1
        sample_record = ldf.get('sample', sample_token)
        fg = FrustumGenerator(sample_token, ldf)
        for image_path in fg.generate_image_paths():
            yield image_path
        next_sample_token = sample_record['next']
        sample_token = next_sample_token
def get_all_boxes_in_single_scene(scene_number, from_rgb_detection, ldf: LyftDataset, classifier=None):
    results = []
    start_sample_token = ldf.scene[scene_number]['first_sample_token']
    sample_token = start_sample_token
    while sample_token != "":
        sample_record = ldf.get('sample', sample_token)
        if not from_rgb_detection:
            for tks in select_annotation_boxes(sample_token, lyftd=ldf):
                results.append(tks)
        else:
            # a 2D object detector must be passed in for RGB detection
            for tks in select_2d_annotation_boxes(ldf, classifier=classifier,
                                                  sample_token=sample_token):
                results.append(tks)
        next_sample_token = sample_record['next']
        sample_token = next_sample_token
    return results
def select_2d_annotation_boxes(ldf: LyftDataset, classifier, sample_token,
                               camera_type=['CAM_FRONT', 'CAM_BACK', 'CAM_FRONT_LEFT',
                                            'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT',
                                            'CAM_BACK_LEFT']) -> (str, str, np.ndarray):
    sample_record = ldf.get('sample', sample_token)
    cams = [key for key in sample_record["data"].keys() if "CAM" in key]
    cams = [cam for cam in cams if cam in camera_type]
    for cam in cams:
        cam_token = sample_record["data"][cam]
        image_file_path = ldf.get_sample_data_path(cam_token)
        image_array = imread(image_file_path)
        det_result = classifier.detect_multi_object(image_array, score_threshold=[0.4, 0.4, 0.4])
        for i in range(det_result.shape[0]):
            bounding_2d_box = det_result[i, 0:4]
            score = det_result[i, 4]
            class_idx = det_result[i, 5]
            yield sample_token, cam_token, bounding_2d_box, score, class_idx
def get_all_boxes_in_single_scene(scene_number, from_rgb_detection, ldf: LyftDataset, use_multisweep,
                                  object_classifier=None):
    start_sample_token = ldf.scene[scene_number]['first_sample_token']
    sample_token = start_sample_token
    counter = 0
    while sample_token != "":
        if counter % 10 == 0:
            logging.info("Processing {} token {}".format(scene_number, counter))
        counter += 1
        sample_record = ldf.get('sample', sample_token)
        fg = FrustumGenerator(sample_token, ldf, use_multisweep=use_multisweep)
        if not from_rgb_detection:
            for fp in fg.generate_frustums():
                yield fp
        else:
            # reserved for rgb detection data
            for fp in fg.generate_frustums_from_2d(object_classifier):
                yield fp
        next_sample_token = sample_record['next']
        sample_token = next_sample_token
class LyftDataClass(Dataset):
    def __init__(self, data_path, json_path, verbose=True, map_resolution=0.1):
        self._lyftdataset = LyftDataset(data_path, json_path, verbose=verbose,
                                        map_resolution=map_resolution)

    def __len__(self):
        # index over samples, to match __getitem__
        return len(self._lyftdataset.sample)

    def __getitem__(self, idx):
        # lyftdataset.sample is a list of records; extract the token before the lookup
        sample_token = self._lyftdataset.sample[idx]['token']
        sample_record = self.getFromLyft("sample", sample_token)
        return Sample(self, sample_record)

    def getFromLyft(self, table_name, token):
        return self._lyftdataset.get(table_name, token)

    def getFromLyftDatapath(self):
        return self._lyftdataset.data_path

    def getFromLyftBoxes(self, sample_data_token):
        return self._lyftdataset.get_boxes(sample_data_token)
def project_point_clouds_to_image(camera_token: str, pointsensor_token: str, lyftd: LyftDataset,
                                  use_multisweep=False):
    """
    :param camera_token:
    :param pointsensor_token:
    :return: (image array, 3d point cloud transformed to camera coordinates, 2d point cloud array)
    """
    cam = lyftd.get("sample_data", camera_token)
    pointsensor = lyftd.get("sample_data", pointsensor_token)
    pcl_path = lyftd.data_path / pointsensor["filename"]
    assert pointsensor["sensor_modality"] == "lidar"
    if use_multisweep:
        sample_of_pc_record = lyftd.get("sample", cam['sample_token'])
        pc, _ = LidarPointCloud.from_file_multisweep(lyftd, sample_of_pc_record, chan='LIDAR_TOP',
                                                     ref_chan='LIDAR_TOP', num_sweeps=5)
    else:
        pc = LidarPointCloud.from_file(pcl_path)

    im = Image.open(str(lyftd.data_path / cam["filename"]))

    # Points live in the point sensor frame, so they need to be transformed via global to the image plane.
    # First step: transform the point cloud to the ego vehicle frame for the timestamp of the sweep.
    cs_record = lyftd.get("calibrated_sensor", pointsensor["calibrated_sensor_token"])
    pc.rotate(Quaternion(cs_record["rotation"]).rotation_matrix)
    pc.translate(np.array(cs_record["translation"]))

    # Second step: transform to the global frame.
    poserecord = lyftd.get("ego_pose", pointsensor["ego_pose_token"])
    pc.rotate(Quaternion(poserecord["rotation"]).rotation_matrix)
    pc.translate(np.array(poserecord["translation"]))

    # Third step: transform into the ego vehicle frame for the timestamp of the image.
    poserecord = lyftd.get("ego_pose", cam["ego_pose_token"])
    pc.translate(-np.array(poserecord["translation"]))
    pc.rotate(Quaternion(poserecord["rotation"]).rotation_matrix.T)

    # Fourth step: transform into the camera frame.
    cs_record = lyftd.get("calibrated_sensor", cam["calibrated_sensor_token"])
    pc.translate(-np.array(cs_record["translation"]))
    pc.rotate(Quaternion(cs_record["rotation"]).rotation_matrix.T)

    # Take the actual picture (matrix multiplication with the camera matrix + renormalization).
    point_cloud_2d = view_points(pc.points[:3, :], np.array(cs_record["camera_intrinsic"]),
                                 normalize=True)

    return im, pc, point_cloud_2d
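# Usage sketch (assumes lyftd is a loaded LyftDataset): project the top lidar of
# the first sample onto the front camera, then keep only points in front of the
# camera that land inside the image.
sample = lyftd.get("sample", lyftd.sample[0]['token'])
im, pc_cam, pc_2d = project_point_clouds_to_image(camera_token=sample["data"]["CAM_FRONT"],
                                                  pointsensor_token=sample["data"]["LIDAR_TOP"],
                                                  lyftd=lyftd)
mask = pc_cam.points[2, :] > 1.0  # at least 1 m in front of the camera
mask &= (pc_2d[0, :] > 0) & (pc_2d[0, :] < im.size[0])  # PIL size is (width, height)
mask &= (pc_2d[1, :] > 0) & (pc_2d[1, :] < im.size[1])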
    # (tail of the train() function; its head is truncated in the source)
    # fit model
    history = model.fit(x=trainPoints, y=[outClass, outRegress], batch_size=1, verbose=1,
                        epochs=1, steps_per_epoch=180)
    print(history.history)
    model.save(save_path)


if __name__ == '__main__':
    # load dataset
    level5Data = LyftDataset(
        data_path='E:\\CS539 Machine Learning\\3d-object-detection-for-autonomous-vehicles',
        json_path='E:\\CS539 Machine Learning\\3d-object-detection-for-autonomous-vehicles\\train_data',
        verbose=True)
    save_path = 'C:\\Users\\snkim\\Desktop\\poject\\models\\180SampleEpoch0.h5'

    # select the samples you want, then call the train function
    samples = []
    for scene in level5Data.scene:
        samples.append(level5Data.get('sample', scene['first_sample_token']))
    print('Training on ' + str(len(samples)) + ' samples')
    train(samples[:], level5Data, save_path)
## load data, you only have to do it once
DATA_PATH = r'C:\Users\storm\Documents\3d-object-detection-for-autonomous-vehicles\train_data' + os.sep
train = pd.read_csv(DATA_PATH + 'train.csv')
sample_submission = pd.read_csv(DATA_PATH + 'sample_submission.csv')
lyftdata = LyftDataset(data_path=DATA_PATH, json_path=DATA_PATH + 'data')

## show point cloud interactively
cat_token = lyftdata.category[0]['token']
token0 = train.iloc[0]['Id']
my_sample = lyftdata.get('sample', token0)
lyftdata.render_sample_3d_interactive(my_sample['token'], render_sample=False)

## sensor information
sensor = 'CAM_FRONT'
cam_front = lyftdata.get('sample_data', my_sample['data'][sensor])
img = Image.open(DATA_PATH + cam_front['filename'])
lyftdata.render_sample_data(cam_front['token'], with_anns=False)

## look at the LIDAR data associated with my_sample
lidar_top = lyftdata.get('sample_data',
                         my_sample['data']['LIDAR_TOP'])  # selecting LIDAR_TOP out of all LIDARs
pc = LidarPointCloud.from_file(Path(DATA_PATH + lidar_top['filename']))
class LyftLEVEL5_utils():
    def __init__(self, phase):
        assert phase in ['train', 'test']
        self.phase = phase
        self.prep_list_of_sessions()

    def load_PC(self, session_ind, cloud_ind, cache_file=None):
        assert session_ind < self.num_sessions
        assert cloud_ind < self.session_lengths[session_ind], \
            f"Requested cloud {cloud_ind}, but session {session_ind} only contains {self.session_lengths[session_ind]} clouds"
        lidar_token = self.cloud_tokens[session_ind][cloud_ind]
        cloud = self.load_cloud_raw(lidar_token)
        if cache_file is not None:
            np.save(cache_file, cloud)
        return cloud

    def get_relative_motion_A_to_B(self, session_ind, cloud_ind_A, cloud_ind_B):
        token_A = self.cloud_tokens[session_ind][cloud_ind_A]
        posA = self.load_position_raw(token_A)
        token_B = self.cloud_tokens[session_ind][cloud_ind_B]
        posB = self.load_position_raw(token_B)
        mot_A_to_B = np.linalg.inv(posB) @ posA
        return mot_A_to_B

    def prep_list_of_sessions(self):
        # DATASET_ROOT is a module-level constant pointing at the dataset folder
        self.level5data = LyftDataset(json_path=DATASET_ROOT + "/%s_data" % self.phase,
                                      data_path=DATASET_ROOT, verbose=True)
        self.num_sessions = len(self.level5data.scene)
        self.cloud_tokens = []
        self.session_lengths = []

        for session_ind in range(self.num_sessions):
            record = self.level5data.scene[session_ind]
            session_token = record['token']
            sample_token = record["first_sample_token"]
            sample = self.level5data.get("sample", sample_token)
            lidar_token = sample["data"]["LIDAR_TOP"]
            cur_lidar_tokens = []
            # follow the linked list of sample_data records until 'next' is empty
            while len(lidar_token) > 0:
                cur_lidar_tokens.append(lidar_token)
                lidar_data = self.level5data.get("sample_data", lidar_token)
                lidar_token = lidar_data["next"]
            self.cloud_tokens.append(cur_lidar_tokens)
            self.session_lengths.append(len(cur_lidar_tokens))

    def load_position_raw(self, sample_lidar_token):
        lidar_data = self.level5data.get("sample_data", sample_lidar_token)
        ego_pose = self.level5data.get("ego_pose", lidar_data["ego_pose_token"])
        calibrated_sensor = self.level5data.get("calibrated_sensor",
                                                lidar_data["calibrated_sensor_token"])

        # Homogeneous transformation matrix from car frame to world frame.
        global_from_car = transform_matrix(ego_pose['translation'],
                                           Quaternion(ego_pose['rotation']), inverse=False)
        return global_from_car

    def load_cloud_raw(self, sample_lidar_token):
        lidar_data = self.level5data.get("sample_data", sample_lidar_token)
        lidar_filepath = self.level5data.get_sample_data_path(sample_lidar_token)
        ego_pose = self.level5data.get("ego_pose", lidar_data["ego_pose_token"])
        calibrated_sensor = self.level5data.get("calibrated_sensor",
                                                lidar_data["calibrated_sensor_token"])

        # Homogeneous transformation matrix from the sensor coordinate frame to the ego car frame.
        car_from_sensor = transform_matrix(calibrated_sensor['translation'],
                                           Quaternion(calibrated_sensor['rotation']), inverse=False)

        lidar_pointcloud = LidarPointCloud.from_file(lidar_filepath)

        # The lidar point cloud is defined in the sensor's reference frame.
        # We want it in the car's reference frame, so we transform each point.
        lidar_pointcloud.transform(car_from_sensor)

        return lidar_pointcloud.points[:3, :].T
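# Usage sketch (assumes DATASET_ROOT is defined; indices are illustrative):
# bring cloud A into the car frame of cloud B with the relative motion matrix.
utils = LyftLEVEL5_utils(phase='train')
cloud_a = utils.load_PC(session_ind=0, cloud_ind=0)      # Nx3 points, car frame at time A
mot_a_to_b = utils.get_relative_motion_A_to_B(0, 0, 1)   # 4x4 homogeneous matrix
cloud_a_h = np.hstack([cloud_a, np.ones((cloud_a.shape[0], 1))])
cloud_a_in_b = (mot_a_to_b @ cloud_a_h.T).T[:, :3]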
        # NOTE: the head of this function is truncated in the source; the line
        # below originally packed one point's x, y, z plus its label and object index.
        point_cloud[0, i], point_cloud[1, i], point_cloud[2, i], input_label[i], input_object[i]
        count += 1
    return input_data


# Read the metadata from the dataset
train = pd.read_csv('./3d-object-detection-for-autonomous-vehicles/train.csv')

# Save the data to the predefined path
count = 0
for i in range(1):
    if i % 5 == 0 or i == train.shape[0]:
        print("Starting %dth sample... at %.4f%%" % (i, i / train.shape[0]))
    token = train.iloc[i]['Id']
    my_sample = lyftdata.get('sample', token)

    # Three different LiDAR datasets for one scene
    if 'LIDAR_TOP' in my_sample['data']:
        temp = lyftdata.get('sample_data', my_sample['data']['LIDAR_TOP'])
        lidar_token = my_sample['data']['LIDAR_TOP']
        a, b, c = get_data(lidar_token)
        print(b)
        d = add_label(a, b, c)
        path = './traindata/train_' + str(count)
        np.save(path, d)
        count += 1

    if 'LIDAR_FRONT_LEFT' in my_sample['data']:
        temp = lyftdata.get('sample_data', my_sample['data']['LIDAR_FRONT_LEFT'])
        lidar_token = my_sample['data']['LIDAR_FRONT_LEFT']
class KittiConverter:
    def __init__(self, store_dir: str = "~/lyft_kitti/train/"):
        """
        Args:
            store_dir: Where to write the KITTI-style annotations.
        """
        self.store_dir = Path(store_dir).expanduser()

        # Create store_dir.
        if not self.store_dir.is_dir():
            self.store_dir.mkdir(parents=True)

    def nuscenes_gt_to_kitti(
        self,
        lyft_dataroot: str,
        table_folder: str,
        store_dataroot: str,
        lidar_name: str = "LIDAR_TOP",
        get_all_detections: bool = False,
        parallel_n_jobs: int = 8,
        samples_count: Optional[int] = None,
    ) -> None:
        """Converts nuScenes GT formatted annotations to KITTI format.

        Args:
            lyft_dataroot: Folder with the dataset itself (images, lidar, maps).
            table_folder: Folder with tables (json files).
            store_dataroot: Where to write the KITTI-style output.
            lidar_name: Name of the lidar sensor. Only one lidar is allowed at this moment.
            get_all_detections: If True, will write all bboxes in PointCloud and use only FrontCamera.
            parallel_n_jobs: Number of threads for parallel processing.
            samples_count: Number of samples to convert.
        """
        self.lyft_dataroot = lyft_dataroot
        self.table_folder = table_folder
        self.lidar_name = lidar_name
        self.get_all_detections = get_all_detections
        self.samples_count = samples_count
        self.parallel_n_jobs = parallel_n_jobs
        self.store_dir = Path(store_dataroot)
        if not self.store_dir.is_dir():
            self.store_dir.mkdir(parents=True)

        # Select subset of the data to look at.
        self.lyft_ds = LyftDataset(self.lyft_dataroot, self.table_folder)

        self.kitti_to_nu_lidar = Quaternion(axis=(0, 0, 1), angle=np.pi)
        self.kitti_to_nu_lidar_inv = self.kitti_to_nu_lidar.inverse

        # Get assignment of scenes to splits.
        split_logs = [self.lyft_ds.get("log", scene["log_token"])["logfile"]
                      for scene in self.lyft_ds.scene]
        if self.get_all_detections:
            self.cams_to_see = ["CAM_FRONT"]
        else:
            self.cams_to_see = [
                "CAM_FRONT",
                "CAM_FRONT_LEFT",
                "CAM_FRONT_RIGHT",
                "CAM_BACK",
                "CAM_BACK_LEFT",
                "CAM_BACK_RIGHT",
            ]

        # Create output folders.
        self.label_folder = self.store_dir.joinpath("label_2")
        self.calib_folder = self.store_dir.joinpath("calib")
        self.image_folder = self.store_dir.joinpath("image_2")
        self.lidar_folder = self.store_dir.joinpath("velodyne")
        for folder in [self.label_folder, self.calib_folder, self.image_folder, self.lidar_folder]:
            if not folder.is_dir():
                folder.mkdir(parents=True)

        # Use only the samples from the current split.
        sample_tokens = self._split_to_samples(split_logs)
        if self.samples_count is not None:
            sample_tokens = sample_tokens[:self.samples_count]

        with parallel_backend("threading", n_jobs=self.parallel_n_jobs):
            Parallel()(delayed(self.process_token_to_kitti)(sample_token)
                       for sample_token in tqdm(sample_tokens))

    def process_token_to_kitti(self, sample_token: str) -> None:
        # Get sample data.
        sample = self.lyft_ds.get("sample", sample_token)
        sample_annotation_tokens = sample["anns"]

        lidar_token = sample["data"][self.lidar_name]
        sd_record_lid = self.lyft_ds.get("sample_data", lidar_token)
        cs_record_lid = self.lyft_ds.get("calibrated_sensor",
                                         sd_record_lid["calibrated_sensor_token"])

        for cam_name in self.cams_to_see:
            cam_front_token = sample["data"][cam_name]
            if self.get_all_detections:
                token_to_write = sample_token
            else:
                token_to_write = cam_front_token

            # Retrieve sensor records.
            sd_record_cam = self.lyft_ds.get("sample_data", cam_front_token)
            cs_record_cam = self.lyft_ds.get("calibrated_sensor",
                                             sd_record_cam["calibrated_sensor_token"])
            cam_height = sd_record_cam["height"]
            cam_width = sd_record_cam["width"]
            imsize = (cam_width, cam_height)

            # Combine transformations and convert to KITTI format.
            # Note: cam uses the same conventions in KITTI and nuScenes.
            lid_to_ego = transform_matrix(cs_record_lid["translation"],
                                          Quaternion(cs_record_lid["rotation"]), inverse=False)
            ego_to_cam = transform_matrix(cs_record_cam["translation"],
                                          Quaternion(cs_record_cam["rotation"]), inverse=True)
            velo_to_cam = np.dot(ego_to_cam, lid_to_ego)

            # Convert from KITTI to nuScenes LIDAR coordinates, where we apply velo_to_cam.
            velo_to_cam_kitti = np.dot(velo_to_cam, self.kitti_to_nu_lidar.transformation_matrix)

            # Currently not used.
            imu_to_velo_kitti = np.zeros((3, 4))  # Dummy values.
            r0_rect = Quaternion(axis=[1, 0, 0], angle=0)  # Dummy values.

            # Projection matrix.
            p_left_kitti = np.zeros((3, 4))
            # Cameras are always rectified.
            p_left_kitti[:3, :3] = cs_record_cam["camera_intrinsic"]

            # Create KITTI style transforms.
            velo_to_cam_rot = velo_to_cam_kitti[:3, :3]
            velo_to_cam_trans = velo_to_cam_kitti[:3, 3]

            # Check that the rotation has the same format as in KITTI.
            if self.lyft_ds.get("sensor", cs_record_cam["sensor_token"])["channel"] == "CAM_FRONT":
                expected_kitti_velo_to_cam_rot = np.array([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
                assert (velo_to_cam_rot.round(0) == expected_kitti_velo_to_cam_rot).all(), \
                    velo_to_cam_rot.round(0)
            assert (velo_to_cam_trans[1:3] < 0).all()

            # Retrieve the token from the lidar.
            # Note that this may be confusing as the filename of the camera will
            # include the timestamp of the lidar, not the camera.
            filename_cam_full = sd_record_cam["filename"]
            filename_lid_full = sd_record_lid["filename"]

            # Convert image (jpg to png).
            src_im_path = self.lyft_ds.data_path.joinpath(filename_cam_full)
            dst_im_path = self.image_folder.joinpath(f"{token_to_write}.png")
            if not dst_im_path.exists():
                im = Image.open(src_im_path)
                im.save(dst_im_path, "PNG")

            # Convert lidar.
            # Note that we are only using a single sweep, instead of the commonly used n sweeps.
            src_lid_path = self.lyft_ds.data_path.joinpath(filename_lid_full)
            dst_lid_path = self.lidar_folder.joinpath(f"{token_to_write}.bin")
            pcl = LidarPointCloud.from_file(Path(src_lid_path))
            # In KITTI lidar frame.
            pcl.rotate(self.kitti_to_nu_lidar_inv.rotation_matrix)
            # Open in binary mode: KITTI velodyne files are raw float32 binaries.
            with open(dst_lid_path, "wb") as lid_file:
                pcl.points.T.tofile(lid_file)

            # Add to tokens.
            # tokens.append(token_to_write)

            # Create calibration file.
            kitti_transforms = dict()
            kitti_transforms["P0"] = np.zeros((3, 4))  # Dummy values.
            kitti_transforms["P1"] = np.zeros((3, 4))  # Dummy values.
            kitti_transforms["P2"] = p_left_kitti  # Left camera transform.
            kitti_transforms["P3"] = np.zeros((3, 4))  # Dummy values.
            # Cameras are already rectified.
            kitti_transforms["R0_rect"] = r0_rect.rotation_matrix
            kitti_transforms["Tr_velo_to_cam"] = np.hstack((velo_to_cam_rot,
                                                            velo_to_cam_trans.reshape(3, 1)))
            kitti_transforms["Tr_imu_to_velo"] = imu_to_velo_kitti
            calib_path = self.calib_folder.joinpath(f"{token_to_write}.txt")
            with open(calib_path, "w") as calib_file:
                for (key, val) in kitti_transforms.items():
                    val = val.flatten()
                    val_str = "%.12e" % val[0]
                    for v in val[1:]:
                        val_str += " %.12e" % v
                    calib_file.write("%s: %s\n" % (key, val_str))

            # Write label file.
            label_path = self.label_folder.joinpath(f"{token_to_write}.txt")
            if label_path.exists():
                print("Skipping existing file: %s" % label_path)
                continue

            with open(label_path, "w") as label_file:
                for sample_annotation_token in sample_annotation_tokens:
                    sample_annotation = self.lyft_ds.get("sample_annotation",
                                                         sample_annotation_token)

                    # Get box in LIDAR frame.
                    _, box_lidar_nusc, _ = self.lyft_ds.get_sample_data(
                        lidar_token, box_vis_level=BoxVisibility.NONE,
                        selected_anntokens=[sample_annotation_token])
                    box_lidar_nusc = box_lidar_nusc[0]

                    # Truncated: set all objects to 0, which means untruncated.
                    truncated = 0.0

                    # Occluded: set all objects to full visibility, as this information
                    # is not available in nuScenes.
                    occluded = 0

                    detection_name = sample_annotation["category_name"]

                    # Convert from nuScenes to KITTI box format.
                    box_cam_kitti = KittiDB.box_nuscenes_to_kitti(
                        box_lidar_nusc, Quaternion(matrix=velo_to_cam_rot),
                        velo_to_cam_trans, r0_rect)

                    # Project the 3d box to a 2d box in the image; ignore the box
                    # if it does not fall inside.
                    bbox_2d = KittiDB.project_kitti_box_to_image(box_cam_kitti, p_left_kitti,
                                                                 imsize=imsize)
                    if bbox_2d is None and not self.get_all_detections:
                        continue
                    elif bbox_2d is None and self.get_all_detections:
                        # default KITTI bbox
                        bbox_2d = (-1.0, -1.0, -1.0, -1.0)

                    # Set a dummy score so we can use this file as a result.
                    box_cam_kitti.score = 0

                    # Convert box to output string format.
                    output = KittiDB.box_to_string(name=detection_name, box=box_cam_kitti,
                                                   bbox_2d=bbox_2d, truncation=truncated,
                                                   occlusion=occluded)

                    # Write to disk.
                    label_file.write(output + "\n")

    def render_kitti(self, render_2d: bool = False, store_dataroot: str = "~/lyft_kitti/train/"):
        """Renders the annotations in the KITTI dataset from a lidar and a camera view.

        Args:
            render_2d: Whether to render 2d boxes (only works for camera data).
        """
        if render_2d:
            print("Rendering 2d boxes from KITTI format")
        else:
            print("Rendering 3d boxes projected from 3d KITTI format")

        self.store_dir = Path(store_dataroot)
        # Load the KITTI dataset.
        kitti = KittiDB(root=self.store_dir)

        # Create output folder.
        render_dir = self.store_dir.joinpath("render")
        if not render_dir.is_dir():
            render_dir.mkdir(parents=True)

        # Render each image.
        tokens = kitti.tokens
        i = 0
        # currently supports only single-threaded processing
        for token in tqdm(tokens):
            for sensor in ["lidar", "camera"]:
                out_path = render_dir.joinpath(f"{token}_{sensor}.png")
                kitti.render_sample_data(token, sensor_modality=sensor, out_path=out_path,
                                         render_2d=render_2d)
                # Close the windows to avoid a warning of too many open windows.
                plt.close()
            if i > 10:
                break
            i += 1

    def _split_to_samples(self, split_logs: List[str]) -> List[str]:
        """Convenience function to get the samples in a particular split.

        Args:
            split_logs: A list of the log names in this split.

        Returns:
            The list of samples.
        """
        samples = []
        for sample in self.lyft_ds.sample:
            scene = self.lyft_ds.get("scene", sample["scene_token"])
            log = self.lyft_ds.get("log", scene["log_token"])
            logfile = log["logfile"]
            if logfile in split_logs:
                samples.append(sample["token"])
        return samples
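# Usage sketch (paths are hypothetical): convert a handful of samples to KITTI
# format and render the result, using the converter methods defined above.
converter = KittiConverter(store_dir="~/lyft_kitti/train/")
converter.nuscenes_gt_to_kitti(lyft_dataroot="/data/lyft",            # hypothetical path
                               table_folder="/data/lyft/train_data",  # hypothetical path
                               store_dataroot="~/lyft_kitti/train/",
                               samples_count=10)
converter.render_kitti(render_2d=False, store_dataroot="~/lyft_kitti/train/")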
# This script walks through the information contained in all the files. Each JSON file
# can be viewed as a table in a database, linked to the others via primary/foreign keys.
classes = ["car", "motorcycle", "bus", "bicycle", "truck", "pedestrian",
           "other_vehicle", "animal", "emergency_vehicle"]

print(level5data.scene[1])
# https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110257#latest-636505
# shows the structure of a scene record and the keys it contains; it matches the output:
# {'description': '', 'first_sample_token': '3b30673b9d944ec6058ef5a8debb4c0a6fe075bca7076e06cf42a2bc10dc446e', 'log_token': '0a6839d6ee6804113bb5591ed99cc70ad883d0cff396e3aec5e76e718771b30e', 'name': 'host-a006-lidar0-1236037883198113706-1236037908098879296', 'token': '0a6839d6ee6804113bb5591ed99cc70ad883d0cff396e3aec5e76e718771b30e', 'last_sample_token': '5ec01e4634ca91751311eaafb45a9196ba8616bf05edc85593aff158db653a34', 'nbr_samples': 126}

# get() returns a record from a table; 'sample' is the table name, corresponding to sample.json.
# Use each scene's first_sample_token to fetch its timestamp, and collect
# (timestamp, scene record) pairs in a list.
records = [(level5data.get('sample', record['first_sample_token'])['timestamp'], record)
           for record in level5data.scene]

print(len(records))  # 180 scenes
print(records[:2])
# [(1557858039302414.8, {'log_token': 'da4ed9e02f64c544f4f1f10c6738216dcb0e6b0d50952e158e5589854af9f100', 'first_sample_token': '24b0962e44420e6322de3f25d9e4e5cc3c7a348ec00bfa69db21517e4ca92cc8', 'name': 'host-a101-lidar0-1241893239199111666-1241893264098084346', 'description': '', 'last_sample_token': '2346756c83f6ae8c4d1adec62b4d0d31b62116d2e1819e96e9512667d15e7cec', 'nbr_samples': 126, 'token': 'da4ed9e02f64c544f4f1f10c6738216dcb0e6b0d50952e158e5589854af9f100'}),
#  (1552002683300887.5, {'description': '', 'first_sample_token': '3b30673b9d944ec6058ef5a8debb4c0a6fe075bca7076e06cf42a2bc10dc446e', 'log_token': '0a6839d6ee6804113bb5591ed99cc70ad883d0cff396e3aec5e76e718771b30e', 'name': 'host-a006-lidar0-1236037883198113706-1236037908098879296', 'token': '0a6839d6ee6804113bb5591ed99cc70ad883d0cff396e3aec5e76e718771b30e', 'last_sample_token': '5ec01e4634ca91751311eaafb45a9196ba8616bf05edc85593aff158db653a34', 'nbr_samples': 126})]

# Next, take a look at train.csv: it contains 638,179 annotated objects. A single image
# may contain several objects, one row per object.
# train = pd.read_csv(DATA_PATH + 'train.csv')
# sample_submission = pd.read_csv(DATA_PATH + 'sample_submission.csv')
#
# # Group data by object category
#