def setup_scene(self, obj_infos):
    labels = [obj['name'] for obj in obj_infos]
    bodies = self.body_cache.get_bodies_by_labels(labels)
    for (obj_info, body) in zip(obj_infos, bodies):
        TWO = Transform(obj_info['TWO'])
        body.pose = TWO
        q = obj_info.get('joints', None)
        if q is not None:
            body.q = q
        color = obj_info.get('color', None)
        if color is not None:
            # Use this renderer's simulator client rather than a hard-coded
            # client id 0.
            pb.changeVisualShape(body.body_id, -1,
                                 physicsClientId=self.client_id,
                                 rgbaColor=color)
    return bodies
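# Example input (a sketch, not from the original code): setup_scene expects a
# list of dicts with the keys read above; 'name' and 'TWO' are required,
# 'joints' and 'color' are optional. The label and pose below are hypothetical,
# and `renderer` stands for an instance of the class defining setup_scene.
obj_infos = [
    dict(name='obj_000001',              # label known to self.body_cache
         TWO=np.eye(4),                  # 4x4 object pose in the world frame
         color=(1.0, 0.5, 0.0, 1.0)),    # optional RGBA visual override
]
bodies = renderer.setup_scene(obj_infos)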
def render_images(self, cam_infos, render_depth=False, render_mask=True):
    cam_obs = []
    for cam_info in cam_infos:
        K = cam_info['K']
        TWC = Transform(cam_info['TWC'])
        resolution = cam_info['resolution']
        cam = Camera(resolution=resolution, client_id=self.client_id)
        cam.set_intrinsic_K(K)
        cam.set_extrinsic_T(TWC)
        cam_obs_ = cam.get_state()
        if self.background_color is not None:
            im = cam_obs_['rgb']
            mask = cam_obs_['mask']
            im[np.logical_or(mask < 0, mask == 255)] = self.background_color
            if render_depth:
                depth = cam_obs_['depth']
                near, far = cam_obs_['near'], cam_obs_['far']
                # Linearize the OpenGL z-buffer to metric depth.
                z_n = 2 * depth - 1
                z_e = 2 * near * far / (far + near - z_n * (far - near))
                z_e[np.logical_or(mask < 0, mask == 255)] = 0.
                cam_obs_['depth'] = z_e
        cam_obs.append(cam_obs_)
    return cam_obs
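# Example usage (a sketch; `renderer` is a hypothetical instance of the class
# defining render_images). Each cam_info entry must provide the 'K', 'TWC' and
# 'resolution' keys read above; the intrinsics here are made up. With
# render_depth=True, the z-buffer value z_b in [0, 1] is linearized to metric
# depth via z_n = 2*z_b - 1, z_e = 2*near*far / (far + near - z_n*(far - near)).
K = np.array([[320., 0., 160.],
              [0., 320., 120.],
              [0., 0., 1.]])
cam_infos = [dict(K=K, TWC=np.eye(4), resolution=(320, 240))]
cam_obs = renderer.render_images(cam_infos, render_depth=True)
rgb, depth = cam_obs[0]['rgb'], cam_obs[0]['depth']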
def __getitem__(self, frame_id):
    row = self.frame_index.iloc[frame_id]
    rgb_path = self.ds_dir / row.rgb_path
    rgb = np.asarray(Image.open(rgb_path))[..., :3]
    h, w = rgb.shape[:2]
    mask = None
    bbox = None
    # NOTE: np.float is a removed NumPy alias; use the builtin float.
    keypoints_2d = np.zeros((17, 2), dtype=float) * float('nan')

    if self.annotation_types['segmentation']:
        mask = np.asarray(Image.open(
            rgb_path.parent.parent / 'seg' / rgb_path.with_suffix('.png').name))
        mask = (mask[..., 0] == 0).astype(np.uint8) * 255
        ids = np.where(mask)
        x1, y1, x2, y2 = np.min(ids[1]), np.min(ids[0]), np.max(ids[1]), np.max(ids[0])
        bbox = np.array([x1, y1, x2, y2])

    if self.annotation_types['keypoints_2d']:
        infos = json.loads((rgb_path.parent.parent / 'd3_preds' /
                            rgb_path.with_suffix('.json').name).read_text())
        x1, x2, y1, y2 = np.array(infos['bbox']).flatten()
        bbox = np.array([x1, y1, x2, y2])
        mask = make_masks_from_det(bbox[None], h, w).numpy().astype(np.uint8)[0] * 255
        pts = np.transpose(np.array(infos['reprojection']))
        if 'visibility' in infos:
            # Convert to a boolean array before negating; np.invert on a
            # Python list of ints would be a bitwise NOT.
            visibility = np.asarray(infos['visibility'][:-2], dtype=bool)
            pts[~visibility, :] = -1.0
        keypoints_2d = pts
    elif self.annotation_types['keypoints_3d']:
        cam_infos = json.loads((rgb_path.parent.parent / 'caminfo' /
                                rgb_path.with_suffix('.json').name).read_text())
        vertex_infos = json.loads((rgb_path.parent.parent.parent / 'vertex' /
                                   rgb_path.with_suffix('.json').name).read_text())
        joint_infos = json.loads((rgb_path.parent.parent.parent / 'joint' /
                                  rgb_path.with_suffix('.json').name).read_text())
        keypoints_2d = make_2d_keypoints(vertex_infos, joint_infos, cam_infos)

    if self.annotation_types['cam_info']:
        infos = json.loads((rgb_path.parent.parent / 'caminfo' /
                            rgb_path.with_suffix('.json').name).read_text())
        H, W = infos['FilmHeight'], infos['FilmWidth']
        assert rgb.shape[:2] == (H, W)
        fov = infos['Fov'] * np.pi / 180
        f = W / (2 * np.tan(fov / 2))
        # From UE, differs from pybullet.
        trans = np.array([infos['Location'][k] for k in ('X', 'Y', 'Z')]) * 0.001
        pitch, yaw, roll = np.array(
            [infos['Rotation'][k] for k in ('Pitch', 'Yaw', 'Roll')])
        trans[1] *= -1
        yaw = -180 + yaw
        roll = -roll
        pitch = -pitch
        pitch, yaw, roll = np.array([pitch, yaw, roll]) * np.pi / 180
        R = np.asarray(make_rotation(pitch, -yaw, -roll))
        R_NORMAL_UE = np.array([
            [0, -1, 0],
            [0, 0, -1],
            [1, 0, 0],
        ])
        R = R @ R_NORMAL_UE.T
        T0C = Transform(R, trans).toHomogeneousMatrix()
    else:
        f = self.focal_length_when_unknown
        T0C = np.zeros((4, 4))

    K = np.array([[f, 0, w / 2], [0, f, h / 2], [0, 0, 1]])
    cam = dict(K=K, T0C=T0C, TWC=T0C, resolution=(w, h))

    angles = None
    if self.annotation_types['angles']:
        angles = json.loads((rgb_path.parent.parent.parent / 'angles' /
                             rgb_path.with_suffix('.json').name).read_text())
        q = np.array(angles)[:4] * np.pi / 180
        q[0] *= -1
    else:
        q = np.array([np.nan for _ in range(4)])
    q = {joint_name: q[n] for n, joint_name in enumerate(
        ['Model_Rotation', 'Rotation_Base', 'Base_Elbow', 'Elbow_Wrist'])}

    obs = dict()
    TWO = Transform((0, 0, 0, 1), (0, 0, 0)).toHomogeneousMatrix()
    robot = dict(joints=q, name='owi535', id_in_segm=255, bbox=bbox,
                 TWO=TWO, keypoints_2d=keypoints_2d)
    obs['objects'] = [robot]
    obs['camera'] = cam
    obs['frame_info'] = row.to_dict()
    return rgb, mask, obs
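# Example usage (a sketch; `ds` stands for an instance of this dataset class):
rgb, mask, obs = ds[0]
robot = obs['objects'][0]
print(robot['name'], robot['joints'])   # 'owi535', four joint angles in radians
print(obs['camera']['K'])               # pinhole K with f = W / (2 * tan(fov / 2))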
class Camera:

    def __init__(self, resolution=(320, 240), near=0.01, far=10, client_id=0):
        assert client_id >= 0, 'Please provide a client id (0 by default)'
        h, w = min(resolution), max(resolution)
        self._client_id = client_id
        self._near = near
        self._far = far
        self._shape = (h, w)
        self._rgba = np.zeros(self._shape + (4, ), dtype=np.uint8)
        self._mask = np.zeros(self._shape, dtype=np.uint8)
        self._depth = np.zeros(self._shape, dtype=np.float32)
        self._render_options = dict()
        self._render_flags = 0
        self.mask_link_index(True)
        self.casts_shadow(True)

        # Transform between the standard camera frame (z forward) and the
        # OpenGL camera frame.
        wxyz = transforms3d.euler.euler2quat(np.pi / 2, 0, 0, axes='rxyz')
        xyzw = [*wxyz[1:], wxyz[0]]
        self.TCCGL = Transform(xyzw, (0, 0, 0))

        # Set some default parameters.
        self.set_extrinsic_bullet(target=(0, 0, 0), distance=1.6,
                                  yaw=90, pitch=-35, roll=0)
        self.set_intrinsic_fov(90)

    def set_extrinsic_bullet(self, target, distance, yaw, pitch, roll):
        """
        Angles in *degrees*.
        """
        up = 'z'
        self._view_params = dict(yaw=yaw, pitch=pitch, roll=roll,
                                 target=target, distance=distance)
        self._view_mat = pb.computeViewMatrixFromYawPitchRoll(
            target, distance, yaw, pitch, roll, 'xyz'.index(up))

    def set_extrinsic_T(self, TWC):
        TWC = Transform(TWC)
        TWCGL = TWC * self.TCCGL
        xyzw = TWCGL.quaternion.coeffs()
        wxyz = [xyzw[-1], *xyzw[:-1]]
        pitch, roll, yaw = transforms3d.euler.quat2euler(wxyz, axes='sxyz')
        yaw = yaw * 180 / np.pi
        pitch = pitch * 180 / np.pi
        roll = roll * 180 / np.pi
        yaw = (yaw % 360 + 360) % 360
        distance = 0.0001
        self.set_extrinsic_bullet(target=TWCGL.translation, distance=distance,
                                  pitch=pitch, roll=roll, yaw=yaw)

    def set_extrinsic_spherical(self, target=(0, 0, 0), rho=0.6,
                                theta=np.pi / 4, phi=0, roll=0):
        """
        Angles in *radians*.
        https://fr.wikipedia.org/wiki/Coordonn%C3%A9es_sph%C3%A9riques#/media/Fichier:Spherical_Coordinates_(Colatitude,_Longitude)_(b).svg
        """
        x = rho * np.sin(theta) * np.cos(phi)
        y = rho * np.sin(theta) * np.sin(phi)
        z = rho * np.cos(theta)
        t = np.array([x, y, z])
        R = transforms3d.euler.euler2mat(np.pi, theta, phi, axes='sxyz')
        R = R @ transforms3d.euler.euler2mat(0, 0, -np.pi / 2 + roll, axes='sxyz')
        t += np.array(target)
        TWC = Transform(R, t)
        self.set_extrinsic_T(TWC)

    def set_intrinsic_K(self, K):
        h, w = self._shape
        proj_mat = proj_from_K(K, near=self._near, far=self._far, h=h, w=w).flatten()
        assert np.allclose(proj_mat[11], -1)
        self._proj_mat = proj_mat
        self._K = K
        self._proj_params = None

    def set_intrinsic_fov(self, fov):
        h, w = self._shape
        self._proj_params = dict(fov=fov)
        self._proj_mat = pb.computeProjectionMatrixFOV(fov=fov, aspect=w / h,
                                                       nearVal=self._near,
                                                       farVal=self._far)
        self._K = None

    def set_intrinsic_f(self, *args):
        if len(args) == 2:
            fx, fy = args
            raise NotImplementedError
        else:
            assert len(args) == 1
            fy = args[0]
        h, w = self._shape
        fov_y = np.arctan(h * 0.5 / fy) * 180 / np.pi
        fov = fov_y * 2
        self.set_intrinsic_fov(fov)

    def get_state(self):
        obs = dict()
        # Get images.
        rgba, mask_uint8, mask_int, depth = self._shot()
        rgb = rgba[..., :3]
        obs.update(rgb=rgb, mask=mask_uint8, depth=depth, mask_int=mask_int)

        # Get intrinsic and extrinsic parameters with standard conventions.
        h, w = self._shape
        if self._K is not None:
            K = self._K
        else:
            K = K_from_fov(self._proj_params['fov'])
        trans = self._view_params['target']
        orn = euler2quat([self._view_params[k] * np.pi / 180
                          for k in ('pitch', 'roll', 'yaw')], axes='sxyz')
        TWCGL = Transform(orn, trans)
        TWC = TWCGL * self.TCCGL.inverse()
        obs.update(TWC=TWC.toHomogeneousMatrix(), K=K,
                   resolution=(self._shape[1], self._shape[0]),
                   proj_mat=self._proj_mat, near=self._near, far=self._far)
        return obs

    def _shot(self):
        """
        Computes an RGB image, a depth buffer, and a segmentation mask buffer
        with the body unique ids of the objects visible at each pixel.
        """
        h, w = self._shape
        renderer = pb.ER_BULLET_HARDWARE_OPENGL
        w, h, rgba, depth, mask = pb.getCameraImage(
            width=w, height=h,
            projectionMatrix=self._proj_mat,
            viewMatrix=self._view_mat,
            renderer=renderer,
            flags=self._render_flags,
            **self._render_options,
            physicsClientId=self._client_id)
        rgba = np.asarray(rgba, dtype=np.uint8).reshape((h, w, 4))
        depth = np.asarray(depth, dtype=np.float32).reshape((h, w))
        mask_uint8 = np.asarray(mask, dtype=np.uint8).reshape((h, w))
        # np.int is a removed NumPy alias; int64 keeps the full id+link value.
        mask_int = np.asarray(mask, dtype=np.int64).reshape((h, w))
        return rgba, mask_uint8, mask_int, depth

    def _project(self, fov, near, far):
        """
        Apply the camera projection matrix.
        Args:
            fov (float): Field of view.
            near (float): Near plane distance.
            far (float): Far plane distance.
        """
        self.near = near
        self.far = far
        h, w = self._shape
        self._proj_mat = pb.computeProjectionMatrixFOV(fov=fov, aspect=w / h,
                                                       nearVal=near, farVal=far)

    def mask_link_index(self, flag):
        """
        If enabled, the mask combines the object unique id and link index as:
        value = objectUniqueId + ((linkIndex + 1) << 24).
        """
        if flag:
            self._render_flags |= pb.ER_SEGMENTATION_MASK_OBJECT_AND_LINKINDEX
        else:
            self._render_flags &= ~pb.ER_SEGMENTATION_MASK_OBJECT_AND_LINKINDEX

    def casts_shadow(self, flag):
        """
        1 for shadows, 0 for no shadows.
        """
        self._render_options['shadow'] = 1 if flag else 0
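# Example usage (a minimal sketch, not from the original code; assumes a
# pybullet client with id 0 is already connected and objects are loaded):
cam = Camera(resolution=(320, 240), near=0.01, far=10, client_id=0)
cam.set_intrinsic_fov(90)            # or cam.set_intrinsic_K(K) for a pinhole K
cam.set_extrinsic_T(np.eye(4))       # TWC in the standard (z forward) convention
obs = cam.get_state()
rgb, depth, mask = obs['rgb'], obs['depth'], obs['mask']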
def __getitem__(self, idx):
    row = self.frame_index.iloc[idx]
    rgb_path = Path(row.rgb_path)
    rgb = np.asarray(Image.open(rgb_path))
    assert rgb.shape[-1] == 3, rgb_path
    mask = None
    annotations = json.loads(
        rgb_path.with_suffix('').with_suffix('.json').read_text())

    # Camera
    TWC = np.eye(4)
    camera_infos_path = self.base_dir / '_camera_settings.json'
    h, w = rgb.shape[0], rgb.shape[1]
    if camera_infos_path.exists():
        cam_infos = json.loads(camera_infos_path.read_text())
        assert len(cam_infos['camera_settings']) == 1
        cam_infos = cam_infos['camera_settings'][0]['intrinsic_settings']
        fx, fy, cx, cy = [cam_infos[k] for k in ('fx', 'fy', 'cx', 'cy')]
    else:
        fx, fy = 320, 320
        cx, cy = w / 2, h / 2
    K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
    camera = dict(TWC=TWC, resolution=(w, h), K=K)

    label = self.label

    # Objects
    obj_data = annotations['objects'][0]
    if 'quaternion_xyzw' in obj_data:
        TWO = Transform(np.array(obj_data['quaternion_xyzw']),
                        np.array(obj_data['location']) * self.scale).toHomogeneousMatrix()
        R_NORMAL_UE = np.array([
            [0, -1, 0],
            [0, 0, -1],
            [1, 0, 0],
        ])
        TWO[:3, :3] = TWO[:3, :3] @ R_NORMAL_UE
    else:
        TWO = np.eye(4) * float('nan')
    TWO = Transform(TWO)

    joints = annotations['sim_state']['joints']
    joints = OrderedDict(
        {d['name'].split('/')[-1]: d['position'] for d in joints})
    if self.label == 'iiwa7':
        joints = {k.replace('iiwa7_', 'iiwa_'): v for k, v in joints.items()}

    keypoints_2d = obj_data['keypoints']
    keypoints_2d = np.concatenate(
        [np.array(kp['projected_location'])[None] for kp in keypoints_2d], axis=0)
    keypoints_2d = np.unique(keypoints_2d, axis=0)
    valid = np.logical_and(
        np.logical_and(keypoints_2d[:, 0] >= 0, keypoints_2d[:, 0] <= w - 1),
        np.logical_and(keypoints_2d[:, 1] >= 0, keypoints_2d[:, 1] <= h - 1))
    keypoints_2d = keypoints_2d[valid]

    det_valid = len(keypoints_2d) >= 2
    if det_valid:
        bbox = np.concatenate([np.min(keypoints_2d, axis=0),
                               np.max(keypoints_2d, axis=0)])
        dx, dy = bbox[[2, 3]] - bbox[[0, 1]]
        if dx <= 20 or dy <= 20:
            det_valid = False

    if not det_valid or self.image_bbox:
        m = 1 / 5
        keypoints_2d = np.array([[w * m, h * m], [w - w * m, h - h * m]])
        bbox = np.concatenate([np.min(keypoints_2d, axis=0),
                               np.max(keypoints_2d, axis=0)])

    mask = make_masks_from_det(bbox[None], h, w).numpy().astype(np.uint8)[0] * 1

    keypoints = obj_data['keypoints']
    TCO_keypoints_3d = {kp['name']: np.array(kp['location']) * self.scale
                        for kp in keypoints}
    TCO_keypoints_3d = np.array(
        [TCO_keypoints_3d.get(k, np.nan) for k in self.keypoint_names])
    keypoints_2d = {kp['name']: kp['projected_location'] for kp in keypoints}
    keypoints_2d = np.array(
        [keypoints_2d.get(k, np.nan) for k in self.keypoint_names])

    robot = dict(label=label, name=label, joints=joints,
                 TWO=TWO.toHomogeneousMatrix(), bbox=bbox, id_in_segm=1,
                 keypoints_2d=keypoints_2d, TCO_keypoints_3d=TCO_keypoints_3d)
    state = dict(objects=[robot], camera=camera, frame_info=row.to_dict())
    return rgb, mask, state
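# Worked example of the detection-validity rule above (hypothetical values):
# a detection is kept only if at least 2 keypoints fall inside the image and
# the resulting box is more than 20 px wide and tall; otherwise a centered
# box covering the middle 3/5 of the image is substituted.
kps = np.array([[100., 80.], [300., 260.]])
bbox = np.concatenate([kps.min(axis=0), kps.max(axis=0)])  # [100., 80., 300., 260.]
dx, dy = bbox[[2, 3]] - bbox[[0, 1]]                       # 200., 180. -> valid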
@property
def pose(self):
    # Exposed as a property: setup_scene above assigns `body.pose = TWO`,
    # which implies a matching setter elsewhere in the class.
    pos, orn = self._client.getBasePositionAndOrientation(self._body_id)
    return Transform(orn, pos).toHomogeneousMatrix()