import numpy as np
import pybullet as pb
import transforms3d

# `Transform`, `proj_from_K`, `K_from_fov` and `euler2quat` are project-local
# helpers assumed to be importable from the surrounding package.


class Camera:

    def __init__(self, resolution=(320, 240), near=0.01, far=10, client_id=0):
        assert client_id >= 0, 'Please provide a client id (0 by default)'

        h, w = min(resolution), max(resolution)
        self._client_id = client_id
        self._near = near
        self._far = far
        self._shape = (h, w)
        self._rgba = np.zeros(self._shape + (4, ), dtype=np.uint8)
        self._mask = np.zeros(self._shape, dtype=np.uint8)
        self._depth = np.zeros(self._shape, dtype=np.float32)
        self._render_options = dict()
        self._render_flags = 0
        self.mask_link_index(True)
        self.casts_shadow(True)

        # Transform between the standard camera frame (z forward)
        # and the OpenGL camera frame.
        wxyz = transforms3d.euler.euler2quat(np.pi / 2, 0, 0, axes='rxyz')
        xyzw = [*wxyz[1:], wxyz[0]]
        self.TCCGL = Transform(xyzw, (0, 0, 0))

        # Set some default parameters.
        self.set_extrinsic_bullet(target=(0, 0, 0), distance=1.6,
                                  yaw=90, pitch=-35, roll=0)
        self.set_intrinsic_fov(90)

    def set_extrinsic_bullet(self, target, distance, yaw, pitch, roll):
        """
        Angles are in *degrees*.
        """
        up = 'z'
        self._view_params = dict(yaw=yaw, pitch=pitch, roll=roll,
                                 target=target, distance=distance)
        self._view_mat = pb.computeViewMatrixFromYawPitchRoll(
            target, distance, yaw, pitch, roll, 'xyz'.index(up))

    def set_extrinsic_T(self, TWC):
        TWC = Transform(TWC)
        TWCGL = TWC * self.TCCGL
        xyzw = TWCGL.quaternion.coeffs()
        wxyz = [xyzw[-1], *xyzw[:-1]]
        pitch, roll, yaw = transforms3d.euler.quat2euler(wxyz, axes='sxyz')
        yaw = yaw * 180 / np.pi
        pitch = pitch * 180 / np.pi
        roll = roll * 180 / np.pi
        yaw = (yaw % 360 + 360) % 360
        # Use a tiny distance so the bullet "target" coincides with the
        # desired camera position.
        distance = 0.0001
        self.set_extrinsic_bullet(target=TWCGL.translation, distance=distance,
                                  pitch=pitch, roll=roll, yaw=yaw)

    def set_extrinsic_spherical(self, target=(0, 0, 0), rho=0.6,
                                theta=np.pi / 4, phi=0, roll=0):
        """
        Angles are in *radians*.
        https://fr.wikipedia.org/wiki/Coordonn%C3%A9es_sph%C3%A9riques#/media/Fichier:Spherical_Coordinates_(Colatitude,_Longitude)_(b).svg
        """
        x = rho * np.sin(theta) * np.cos(phi)
        y = rho * np.sin(theta) * np.sin(phi)
        z = rho * np.cos(theta)
        t = np.array([x, y, z])
        R = transforms3d.euler.euler2mat(np.pi, theta, phi, axes='sxyz')
        R = R @ transforms3d.euler.euler2mat(0, 0, -np.pi / 2 + roll,
                                             axes='sxyz')
        t += np.array(target)
        TWC = Transform(R, t)
        self.set_extrinsic_T(TWC)

    def set_intrinsic_K(self, K):
        h, w = self._shape
        proj_mat = proj_from_K(K, near=self._near, far=self._far,
                               h=h, w=w).flatten()
        assert np.allclose(proj_mat[11], -1)
        self._proj_mat = proj_mat
        self._K = K
        self._proj_params = None

    def set_intrinsic_fov(self, fov):
        h, w = self._shape
        self._proj_params = dict(fov=fov)
        self._proj_mat = pb.computeProjectionMatrixFOV(fov=fov, aspect=w / h,
                                                       nearVal=self._near,
                                                       farVal=self._far)
        self._K = None

    def set_intrinsic_f(self, *args):
        if len(args) == 2:
            # Distinct focal lengths on x and y are not supported.
            fx, fy = args
            raise NotImplementedError
        else:
            assert len(args) == 1
            fy = args[0]
        h, w = self._shape
        # Vertical field of view (degrees) from the vertical focal length
        # (pixels): fov_y = 2 * arctan((h / 2) / fy).
        fov_y = np.arctan(h * 0.5 / fy) * 180 / np.pi
        fov = fov_y * 2
        self.set_intrinsic_fov(fov)

    def get_state(self):
        obs = dict()

        # Get images.
        rgba, mask, depth = self._shot()
        rgb = rgba[..., :3]
        obs.update(rgb=rgb, mask=mask, depth=depth)

        # Get intrinsic and extrinsic parameters with standard conventions.
        h, w = self._shape
        if self._K is not None:
            K = self._K
        else:
            K = K_from_fov(self._proj_params['fov'])
        trans = self._view_params['target']
        orn = euler2quat([
            self._view_params[k] * np.pi / 180
            for k in ('pitch', 'roll', 'yaw')
        ], axes='sxyz')
        TWCGL = Transform(orn, trans)
        TWC = TWCGL * self.TCCGL.inverse()
        obs.update(TWC=TWC.toHomogeneousMatrix(),
                   K=K,
                   resolution=(self._shape[1], self._shape[0]),
                   proj_mat=self._proj_mat,
                   near=self._near,
                   far=self._far)
        return obs

    def _shot(self):
        """
        Computes an RGB image, a depth buffer and a segmentation mask buffer
        with the body unique ids of visible objects for each pixel.
        """
        h, w = self._shape
        renderer = pb.ER_BULLET_HARDWARE_OPENGL
        w, h, rgba, depth, mask = pb.getCameraImage(
            width=w,
            height=h,
            projectionMatrix=self._proj_mat,
            viewMatrix=self._view_mat,
            renderer=renderer,
            flags=self._render_flags,
            **self._render_options,
            physicsClientId=self._client_id)
        rgba = np.asarray(rgba, dtype=np.uint8).reshape((h, w, 4))
        depth = np.asarray(depth, dtype=np.float32).reshape((h, w))
        # int32 rather than uint8: with mask_link_index enabled, segmentation
        # values combine body id and link index and can exceed 255.
        mask = np.asarray(mask, dtype=np.int32).reshape((h, w))
        return rgba, mask, depth

    def _project(self, fov, near, far):
        """
        Apply the camera projection matrix.

        Args:
            fov (float): Field of view.
            near (float): Near plane distance.
            far (float): Far plane distance.
        """
        self._near = near
        self._far = far
        h, w = self._shape
        self._proj_mat = pb.computeProjectionMatrixFOV(fov=fov, aspect=w / h,
                                                       nearVal=near,
                                                       farVal=far)

    def mask_link_index(self, flag):
        """
        If enabled, the mask combines the object unique id and link index
        as follows: value = objectUniqueId + ((linkIndex + 1) << 24).
        """
        if flag:
            self._render_flags |= pb.ER_SEGMENTATION_MASK_OBJECT_AND_LINKINDEX
        else:
            self._render_flags &= ~pb.ER_SEGMENTATION_MASK_OBJECT_AND_LINKINDEX

    def casts_shadow(self, flag):
        """
        1 for shadows, 0 for no shadows.
        """
        self._render_options['shadow'] = 1 if flag else 0
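# -----------------------------------------------------------------------------
# Minimal usage sketch for the Camera class above (not part of the class).
# It assumes a local pybullet server can be started; the GUI connection is
# used because `_shot` requests the ER_BULLET_HARDWARE_OPENGL renderer.
# All variable names below are illustrative.
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    client_id = pb.connect(pb.GUI)

    cam = Camera(resolution=(640, 480), near=0.01, far=10,
                 client_id=client_id)
    # Place the camera on a sphere of radius 0.8 m around the origin,
    # looking at the target (angles in radians).
    cam.set_extrinsic_spherical(target=(0, 0, 0), rho=0.8,
                                theta=np.pi / 4, phi=0.0)
    # 60 deg vertical fov on a 480 px image corresponds to fy ~ 415 px.
    cam.set_intrinsic_fov(60)

    state = cam.get_state()
    print(state['rgb'].shape)    # (480, 640, 3)
    print(state['depth'].shape)  # (480, 640)
    print(state['K'])            # intrinsics, here derived via K_from_fov

    pb.disconnect(client_id)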
    # Assumed module-level imports for this method: numpy as np, torch,
    # PIL.Image, the project-local `Transform`, and the BOP toolkit's `inout`
    # module for depth loading.
    def __getitem__(self, frame_id):
        row = self.frame_index.iloc[frame_id]
        scene_id, view_id = row.scene_id, row.view_id
        view_id = int(view_id)
        view_id_str = f'{view_id:06d}'
        scene_id_str = f'{int(scene_id):06d}'
        scene_dir = self.base_dir / scene_id_str

        # Load the RGB (or grayscale) image, trying the possible directory
        # names and file extensions used by BOP datasets.
        rgb_dir = scene_dir / 'rgb'
        if not rgb_dir.exists():
            rgb_dir = scene_dir / 'gray'
        rgb_path = rgb_dir / f'{view_id_str}.png'
        if not rgb_path.exists():
            rgb_path = rgb_path.with_suffix('.jpg')
        if not rgb_path.exists():
            rgb_path = rgb_path.with_suffix('.tif')
        rgb = np.array(Image.open(rgb_path))
        if rgb.ndim == 2:
            rgb = np.repeat(rgb[..., None], 3, axis=-1)
        rgb = rgb[..., :3]
        h, w = rgb.shape[:2]
        rgb = torch.as_tensor(rgb)

        # Camera annotation: world-to-camera pose (translations converted
        # from millimeters to meters) and intrinsics.
        cam_annotation = self.annotations[scene_id_str]['scene_camera'][str(
            view_id)]
        if 'cam_R_w2c' in cam_annotation:
            RC0 = np.array(cam_annotation['cam_R_w2c']).reshape(3, 3)
            tC0 = np.array(cam_annotation['cam_t_w2c']) * 0.001
            TC0 = Transform(RC0, tC0)
        else:
            TC0 = Transform(np.eye(3), np.zeros(3))
        K = np.array(cam_annotation['cam_K']).reshape(3, 3)
        T0C = TC0.inverse()
        T0C_mat = T0C.toHomogeneousMatrix()
        camera = dict(T0C=T0C_mat, K=K, TWC=T0C_mat,
                      resolution=rgb.shape[:2])

        objects = []
        mask = np.zeros((h, w), dtype=np.uint8)
        if 'scene_gt_info' in self.annotations[scene_id_str]:
            annotation = self.annotations[scene_id_str]['scene_gt'][str(
                view_id)]
            n_objects = len(annotation)
            visib = self.annotations[scene_id_str]['scene_gt_info'][str(
                view_id)]
            for n in range(n_objects):
                RCO = np.array(annotation[n]['cam_R_m2c']).reshape(3, 3)
                tCO = np.array(annotation[n]['cam_t_m2c']) * 0.001
                TCO = Transform(RCO, tCO)
                T0O = T0C * TCO
                T0O = T0O.toHomogeneousMatrix()
                obj_id = annotation[n]['obj_id']
                name = f'obj_{int(obj_id):06d}'
                # Visible bounding box, converted from (x, y, width, height)
                # to (x1, y1, x2, y2). Local names bw, bh avoid shadowing the
                # image size h, w above.
                x, y, bw, bh = np.array(visib[n]['bbox_visib'])
                x1, y1, x2, y2 = x, y, x + bw, y + bh
                obj = dict(label=name,
                           name=name,
                           TWO=T0O,
                           T0O=T0O,
                           visib_fract=visib[n]['visib_fract'],
                           id_in_segm=n + 1,
                           bbox=[x1, y1, x2, y2])
                objects.append(obj)

            # Visible-surface segmentation: prefer the combined mask if it
            # exists, otherwise merge the per-object masks.
            mask_path = scene_dir / 'mask_visib' / f'{view_id_str}_all.png'
            if mask_path.exists():
                mask = np.array(Image.open(mask_path))
            else:
                for n in range(n_objects):
                    mask_n = np.array(
                        Image.open(scene_dir / 'mask_visib' /
                                   f'{view_id_str}_{n:06d}.png'))
                    mask[mask_n == 255] = n + 1
        mask = torch.as_tensor(mask)

        if self.load_depth:
            depth_path = scene_dir / 'depth' / f'{view_id_str}.png'
            if not depth_path.exists():
                depth_path = depth_path.with_suffix('.tif')
            depth = np.array(inout.load_depth(depth_path))
            # depth_scale converts raw values to millimeters; divide by 1000
            # to obtain meters.
            camera['depth'] = depth * cam_annotation['depth_scale'] / 1000

        obs = dict(
            objects=objects,
            camera=camera,
            frame_info=row.to_dict(),
        )
        return rgb, mask, obs
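    # -------------------------------------------------------------------------
    # Usage sketch for the method above. The class name `BOPDataset` and its
    # constructor arguments are illustrative assumptions; the surrounding
    # class is expected to provide `base_dir`, `frame_index`, `annotations`
    # and `load_depth`, and to follow the torch Dataset __getitem__ protocol:
    #
    #   ds = BOPDataset(base_dir='datasets/bop/ycbv', split='test',
    #                   load_depth=True)
    #   rgb, mask, obs = ds[0]
    #   print(rgb.shape, mask.shape)   # image and segmentation tensors
    #   print(obs['camera']['K'])      # 3x3 intrinsics
    #   for obj in obs['objects']:
    #       print(obj['label'], obj['visib_fract'], obj['bbox'])
    # -------------------------------------------------------------------------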