def __init__(self, render_mode: Optional[str] = None, goal_velocity=0):
    self.min_position = -1.2
    self.max_position = 0.6
    self.max_speed = 0.07
    self.goal_position = 0.5
    self.goal_velocity = goal_velocity

    self.force = 0.001
    self.gravity = 0.0025

    self.low = np.array([self.min_position, -self.max_speed], dtype=np.float32)
    self.high = np.array([self.max_position, self.max_speed], dtype=np.float32)

    assert render_mode is None or render_mode in self.metadata["render_modes"]
    self.render_mode = render_mode
    self.renderer = Renderer(self.render_mode, self._render)

    self.screen_width = 600
    self.screen_height = 400
    self.screen = None
    self.clock = None
    self.isopen = True

    self.action_space = spaces.Discrete(3)
    self.observation_space = spaces.Box(self.low, self.high, dtype=np.float32)
def __init__(self, render_mode: Optional[str] = None):
    assert render_mode is None or render_mode in self.metadata["render_modes"]
    self.render_mode = render_mode
    self.renderer = Renderer(self.render_mode, self._render)
    self.screen = None
    self.clock = None
    self.isopen = True
    high = np.array(
        [1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2], dtype=np.float32
    )
    low = -high
    self.observation_space = spaces.Box(low=low, high=high, dtype=np.float32)
    self.action_space = spaces.Discrete(3)
    self.state = None
def __init__(self, render_mode: Optional[str] = None, natural=False, sab=False):
    self.action_space = spaces.Discrete(2)
    self.observation_space = spaces.Tuple(
        (spaces.Discrete(32), spaces.Discrete(11), spaces.Discrete(2))
    )

    # Flag to payout 1.5 on a "natural" blackjack win, like casino rules
    # Ref: http://www.bicyclecards.com/how-to-play/blackjack/
    self.natural = natural

    # Flag for full agreement with the (Sutton and Barto, 2018) definition. Overrides self.natural
    self.sab = sab

    assert render_mode is None or render_mode in self.metadata["render_modes"]
    self.render_mode = render_mode
    self.renderer = Renderer(self.render_mode, self._render)
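A minimal sketch of constructing the Blackjack environment defined by this `__init__`. The `"Blackjack-v1"` ID is an assumption based on the standard gym registry and is not stated in this file; the observation tuple built above is conventionally (player sum, dealer's showing card, usable-ace flag).

```python
import gym

# Hypothetical registry ID; natural/sab map to the flags stored in __init__ above.
env = gym.make("Blackjack-v1", natural=False, sab=True)

# Observations come from Tuple((Discrete(32), Discrete(11), Discrete(2))).
print(env.observation_space.sample())  # e.g. (14, 7, 0)
print(env.action_space.n)              # 2 discrete actions: stick or hit
```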
class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
    """
    ### Description

    This environment corresponds to the version of the cart-pole problem described by Barto, Sutton, and Anderson in
    ["Neuronlike Adaptive Elements That Can Solve Difficult Learning Control Problems"](https://ieeexplore.ieee.org/document/6313077).
    A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track.
    The pendulum is placed upright on the cart and the goal is to balance the pole by applying forces
    in the left and right direction on the cart.

    ### Action Space

    The action is a `ndarray` with shape `(1,)` which can take values `{0, 1}` indicating the direction
    of the fixed force the cart is pushed with.

    | Num | Action                 |
    |-----|------------------------|
    | 0   | Push cart to the left  |
    | 1   | Push cart to the right |

    **Note**: The velocity that is reduced or increased by the applied force is not fixed and it depends on the angle
    the pole is pointing. The center of gravity of the pole varies the amount of energy needed to move the cart underneath it.

    ### Observation Space

    The observation is a `ndarray` with shape `(4,)` with the values corresponding to the following positions and velocities:

    | Num | Observation           | Min                 | Max               |
    |-----|-----------------------|---------------------|-------------------|
    | 0   | Cart Position         | -4.8                | 4.8               |
    | 1   | Cart Velocity         | -Inf                | Inf               |
    | 2   | Pole Angle            | ~ -0.418 rad (-24°) | ~ 0.418 rad (24°) |
    | 3   | Pole Angular Velocity | -Inf                | Inf               |

    **Note:** While the ranges above denote the possible values for the observation space of each element,
    it is not reflective of the allowed values of the state space in an unterminated episode. Particularly:
    - The cart x-position (index 0) can take values between `(-4.8, 4.8)`, but the episode terminates
      if the cart leaves the `(-2.4, 2.4)` range.
    - The pole angle can be observed between `(-.418, .418)` radians (or **±24°**), but the episode terminates
      if the pole angle is not in the range `(-.2095, .2095)` (or **±12°**)

    ### Rewards

    Since the goal is to keep the pole upright for as long as possible, a reward of `+1` for every step taken,
    including the termination step, is allotted. The threshold for rewards is 475 for v1.

    ### Starting State

    All observations are assigned a uniformly random value in `(-0.05, 0.05)`

    ### Episode Termination

    The episode terminates if any one of the following occurs:
    1. Pole Angle is greater than ±12°
    2. Cart Position is greater than ±2.4 (center of the cart reaches the edge of the display)
    3. Episode length is greater than 500 (200 for v0)

    ### Arguments

    ```
    gym.make('CartPole-v1')
    ```

    No additional arguments are currently supported.
    """

    metadata = {
        "render_modes": ["human", "rgb_array", "single_rgb_array"],
        "render_fps": 50,
    }

    def __init__(self, render_mode: Optional[str] = None):
        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = self.masspole + self.masscart
        self.length = 0.5  # actually half the pole's length
        self.polemass_length = self.masspole * self.length
        self.force_mag = 10.0
        self.tau = 0.02  # seconds between state updates
        self.kinematics_integrator = "euler"

        # Angle at which to fail the episode
        self.theta_threshold_radians = 12 * 2 * math.pi / 360
        self.x_threshold = 2.4

        # Angle limit set to 2 * theta_threshold_radians so failing observation
        # is still within bounds.
high = np.array( [ self.x_threshold * 2, np.finfo(np.float32).max, self.theta_threshold_radians * 2, np.finfo(np.float32).max, ], dtype=np.float32, ) self.action_space = spaces.Discrete(2) self.observation_space = spaces.Box(-high, high, dtype=np.float32) assert render_mode is None or render_mode in self.metadata[ "render_modes"] self.render_mode = render_mode self.renderer = Renderer(self.render_mode, self._render) self.screen_width = 600 self.screen_height = 400 self.screen = None self.clock = None self.isopen = True self.state = None self.steps_beyond_done = None def step(self, action): err_msg = f"{action!r} ({type(action)}) invalid" assert self.action_space.contains(action), err_msg assert self.state is not None, "Call reset before using step method." x, x_dot, theta, theta_dot = self.state force = self.force_mag if action == 1 else -self.force_mag costheta = math.cos(theta) sintheta = math.sin(theta) # For the interested reader: # https://coneural.org/florian/papers/05_cart_pole.pdf temp = (force + self.polemass_length * theta_dot**2 * sintheta) / self.total_mass thetaacc = (self.gravity * sintheta - costheta * temp) / ( self.length * (4.0 / 3.0 - self.masspole * costheta**2 / self.total_mass)) xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass if self.kinematics_integrator == "euler": x = x + self.tau * x_dot x_dot = x_dot + self.tau * xacc theta = theta + self.tau * theta_dot theta_dot = theta_dot + self.tau * thetaacc else: # semi-implicit euler x_dot = x_dot + self.tau * xacc x = x + self.tau * x_dot theta_dot = theta_dot + self.tau * thetaacc theta = theta + self.tau * theta_dot self.state = (x, x_dot, theta, theta_dot) done = bool(x < -self.x_threshold or x > self.x_threshold or theta < -self.theta_threshold_radians or theta > self.theta_threshold_radians) if not done: reward = 1.0 elif self.steps_beyond_done is None: # Pole just fell! self.steps_beyond_done = 0 reward = 1.0 else: if self.steps_beyond_done == 0: logger.warn( "You are calling 'step()' even though this " "environment has already returned done = True. 
You " "should always call 'reset()' once you receive 'done = " "True' -- any further steps are undefined behavior.") self.steps_beyond_done += 1 reward = 0.0 self.renderer.render_step() return np.array(self.state, dtype=np.float32), reward, done, {} def reset( self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None, ): super().reset(seed=seed) self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4, )) self.steps_beyond_done = None self.renderer.reset() self.renderer.render_step() if not return_info: return np.array(self.state, dtype=np.float32) else: return np.array(self.state, dtype=np.float32), {} def render(self, mode="human"): if self.render_mode is not None: return self.renderer.get_renders() else: return self._render(mode) def _render(self, mode="human"): assert mode in self.metadata["render_modes"] try: import pygame from pygame import gfxdraw except ImportError: raise DependencyNotInstalled( "pygame is not installed, run `pip install gym[classic_control]`" ) if self.screen is None: pygame.init() if mode == "human": pygame.display.init() self.screen = pygame.display.set_mode( (self.screen_width, self.screen_height)) else: # mode in {"rgb_array", "single_rgb_array"} self.screen = pygame.Surface( (self.screen_width, self.screen_height)) if self.clock is None: self.clock = pygame.time.Clock() world_width = self.x_threshold * 2 scale = self.screen_width / world_width polewidth = 10.0 polelen = scale * (2 * self.length) cartwidth = 50.0 cartheight = 30.0 if self.state is None: return None x = self.state self.surf = pygame.Surface((self.screen_width, self.screen_height)) self.surf.fill((255, 255, 255)) l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2 axleoffset = cartheight / 4.0 cartx = x[0] * scale + self.screen_width / 2.0 # MIDDLE OF CART carty = 100 # TOP OF CART cart_coords = [(l, b), (l, t), (r, t), (r, b)] cart_coords = [(c[0] + cartx, c[1] + carty) for c in cart_coords] gfxdraw.aapolygon(self.surf, cart_coords, (0, 0, 0)) gfxdraw.filled_polygon(self.surf, cart_coords, (0, 0, 0)) l, r, t, b = ( -polewidth / 2, polewidth / 2, polelen - polewidth / 2, -polewidth / 2, ) pole_coords = [] for coord in [(l, b), (l, t), (r, t), (r, b)]: coord = pygame.math.Vector2(coord).rotate_rad(-x[2]) coord = (coord[0] + cartx, coord[1] + carty + axleoffset) pole_coords.append(coord) gfxdraw.aapolygon(self.surf, pole_coords, (202, 152, 101)) gfxdraw.filled_polygon(self.surf, pole_coords, (202, 152, 101)) gfxdraw.aacircle( self.surf, int(cartx), int(carty + axleoffset), int(polewidth / 2), (129, 132, 203), ) gfxdraw.filled_circle( self.surf, int(cartx), int(carty + axleoffset), int(polewidth / 2), (129, 132, 203), ) gfxdraw.hline(self.surf, 0, self.screen_width, carty, (0, 0, 0)) self.surf = pygame.transform.flip(self.surf, False, True) self.screen.blit(self.surf, (0, 0)) if mode == "human": pygame.event.pump() self.clock.tick(self.metadata["render_fps"]) pygame.display.flip() elif mode in {"rgb_array", "single_rgb_array"}: return np.transpose(np.array(pygame.surfarray.pixels3d( self.screen)), axes=(1, 0, 2)) def close(self): if self.screen is not None: import pygame pygame.display.quit() pygame.quit() self.isopen = False
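The class above uses the older step/reset API: `reset()` returns only the observation by default and `step()` returns a 4-tuple ending in a single `done` flag. A minimal random-rollout sketch against that API, assuming gym is installed and the `CartPole-v1` ID from the docstring is registered:

```python
import gym

# No extra constructor arguments are supported for CartPole-v1.
env = gym.make("CartPole-v1")

obs = env.reset()  # observation only, since return_info defaults to False
total_reward = 0.0
done = False
while not done:
    action = env.action_space.sample()  # random left/right push (0 or 1)
    obs, reward, done, info = env.step(action)
    total_reward += reward
env.close()
print(f"episode return: {total_reward}")
```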
class BipedalWalker(gym.Env, EzPickle): """ ### Description This is a simple 4-joint walker robot environment. There are two versions: - Normal, with slightly uneven terrain. - Hardcore, with ladders, stumps, pitfalls. To solve the normal version, you need to get 300 points in 1600 time steps. To solve the hardcore version, you need 300 points in 2000 time steps. A heuristic is provided for testing. It's also useful to get demonstrations to learn from. To run the heuristic: ``` python gym/envs/box2d/bipedal_walker.py ``` ### Action Space Actions are motor speed values in the [-1, 1] range for each of the 4 joints at both hips and knees. ### Observation Space State consists of hull angle speed, angular velocity, horizontal speed, vertical speed, position of joints and joints angular speed, legs contact with ground, and 10 lidar rangefinder measurements. There are no coordinates in the state vector. ### Rewards Reward is given for moving forward, totaling 300+ points up to the far end. If the robot falls, it gets -100. Applying motor torque costs a small amount of points. A more optimal agent will get a better score. ### Starting State The walker starts standing at the left end of the terrain with the hull horizontal, and both legs in the same position with a slight knee angle. ### Episode Termination The episode will terminate if the hull gets in contact with the ground or if the walker exceeds the right end of the terrain length. ### Arguments To use to the _hardcore_ environment, you need to specify the `hardcore=True` argument like below: ```python import gym env = gym.make("BipedalWalker-v3", hardcore=True) ``` ### Version History - v3: returns closest lidar trace instead of furthest; faster video recording - v2: Count energy spent - v1: Legs now report contact with ground; motors have higher torque and speed; ground has higher friction; lidar rendered less nervously. - v0: Initial version <!-- ### References --> ### Credits Created by Oleg Klimov """ metadata = { "render_modes": ["human", "rgb_array", "single_rgb_array"], "render_fps": FPS, } def __init__(self, render_mode: Optional[str] = None, hardcore: bool = False): EzPickle.__init__(self) self.isopen = True self.world = Box2D.b2World() self.terrain = None self.hull = None self.prev_shaping = None self.hardcore = hardcore self.fd_polygon = fixtureDef( shape=polygonShape(vertices=[(0, 0), (1, 0), (1, -1), (0, -1)]), friction=FRICTION, ) self.fd_edge = fixtureDef( shape=edgeShape(vertices=[(0, 0), (1, 1)]), friction=FRICTION, categoryBits=0x0001, ) # we use 5.0 to represent the joints moving at maximum # 5 x the rated speed due to impulses from ground contact etc. low = np.array( [ -math.pi, -5.0, -5.0, -5.0, -math.pi, -5.0, -math.pi, -5.0, -0.0, -math.pi, -5.0, -math.pi, -5.0, -0.0, ] + [-1.0] * 10 ).astype(np.float32) high = np.array( [ math.pi, 5.0, 5.0, 5.0, math.pi, 5.0, math.pi, 5.0, 5.0, math.pi, 5.0, math.pi, 5.0, 5.0, ] + [1.0] * 10 ).astype(np.float32) self.action_space = spaces.Box( np.array([-1, -1, -1, -1]).astype(np.float32), np.array([1, 1, 1, 1]).astype(np.float32), ) self.observation_space = spaces.Box(low, high) # state = [ # self.hull.angle, # Normal angles up to 0.5 here, but sure more is possible. 
# 2.0 * self.hull.angularVelocity / FPS, # 0.3 * vel.x * (VIEWPORT_W / SCALE) / FPS, # Normalized to get -1..1 range # 0.3 * vel.y * (VIEWPORT_H / SCALE) / FPS, # self.joints[ # 0 # ].angle, # This will give 1.1 on high up, but it's still OK (and there should be spikes on hiting the ground, that's normal too) # self.joints[0].speed / SPEED_HIP, # self.joints[1].angle + 1.0, # self.joints[1].speed / SPEED_KNEE, # 1.0 if self.legs[1].ground_contact else 0.0, # self.joints[2].angle, # self.joints[2].speed / SPEED_HIP, # self.joints[3].angle + 1.0, # self.joints[3].speed / SPEED_KNEE, # 1.0 if self.legs[3].ground_contact else 0.0, # ] # state += [l.fraction for l in self.lidar] assert render_mode is None or render_mode in self.metadata["render_modes"] self.render_mode = render_mode self.renderer = Renderer(self.render_mode, self._render) self.screen = None self.clock = None def _destroy(self): if not self.terrain: return self.world.contactListener = None for t in self.terrain: self.world.DestroyBody(t) self.terrain = [] self.world.DestroyBody(self.hull) self.hull = None for leg in self.legs: self.world.DestroyBody(leg) self.legs = [] self.joints = [] def _generate_terrain(self, hardcore): GRASS, STUMP, STAIRS, PIT, _STATES_ = range(5) state = GRASS velocity = 0.0 y = TERRAIN_HEIGHT counter = TERRAIN_STARTPAD oneshot = False self.terrain = [] self.terrain_x = [] self.terrain_y = [] for i in range(TERRAIN_LENGTH): x = i * TERRAIN_STEP self.terrain_x.append(x) if state == GRASS and not oneshot: velocity = 0.8 * velocity + 0.01 * np.sign(TERRAIN_HEIGHT - y) if i > TERRAIN_STARTPAD: velocity += self.np_random.uniform(-1, 1) / SCALE # 1 y += velocity elif state == PIT and oneshot: counter = self.np_random.integers(3, 5) poly = [ (x, y), (x + TERRAIN_STEP, y), (x + TERRAIN_STEP, y - 4 * TERRAIN_STEP), (x, y - 4 * TERRAIN_STEP), ] self.fd_polygon.shape.vertices = poly t = self.world.CreateStaticBody(fixtures=self.fd_polygon) t.color1, t.color2 = (255, 255, 255), (153, 153, 153) self.terrain.append(t) self.fd_polygon.shape.vertices = [ (p[0] + TERRAIN_STEP * counter, p[1]) for p in poly ] t = self.world.CreateStaticBody(fixtures=self.fd_polygon) t.color1, t.color2 = (255, 255, 255), (153, 153, 153) self.terrain.append(t) counter += 2 original_y = y elif state == PIT and not oneshot: y = original_y if counter > 1: y -= 4 * TERRAIN_STEP elif state == STUMP and oneshot: counter = self.np_random.integers(1, 3) poly = [ (x, y), (x + counter * TERRAIN_STEP, y), (x + counter * TERRAIN_STEP, y + counter * TERRAIN_STEP), (x, y + counter * TERRAIN_STEP), ] self.fd_polygon.shape.vertices = poly t = self.world.CreateStaticBody(fixtures=self.fd_polygon) t.color1, t.color2 = (255, 255, 255), (153, 153, 153) self.terrain.append(t) elif state == STAIRS and oneshot: stair_height = +1 if self.np_random.random() > 0.5 else -1 stair_width = self.np_random.integers(4, 5) stair_steps = self.np_random.integers(3, 5) original_y = y for s in range(stair_steps): poly = [ ( x + (s * stair_width) * TERRAIN_STEP, y + (s * stair_height) * TERRAIN_STEP, ), ( x + ((1 + s) * stair_width) * TERRAIN_STEP, y + (s * stair_height) * TERRAIN_STEP, ), ( x + ((1 + s) * stair_width) * TERRAIN_STEP, y + (-1 + s * stair_height) * TERRAIN_STEP, ), ( x + (s * stair_width) * TERRAIN_STEP, y + (-1 + s * stair_height) * TERRAIN_STEP, ), ] self.fd_polygon.shape.vertices = poly t = self.world.CreateStaticBody(fixtures=self.fd_polygon) t.color1, t.color2 = (255, 255, 255), (153, 153, 153) self.terrain.append(t) counter = stair_steps * stair_width elif 
state == STAIRS and not oneshot: s = stair_steps * stair_width - counter - stair_height n = s / stair_width y = original_y + (n * stair_height) * TERRAIN_STEP oneshot = False self.terrain_y.append(y) counter -= 1 if counter == 0: counter = self.np_random.integers(TERRAIN_GRASS / 2, TERRAIN_GRASS) if state == GRASS and hardcore: state = self.np_random.integers(1, _STATES_) oneshot = True else: state = GRASS oneshot = True self.terrain_poly = [] for i in range(TERRAIN_LENGTH - 1): poly = [ (self.terrain_x[i], self.terrain_y[i]), (self.terrain_x[i + 1], self.terrain_y[i + 1]), ] self.fd_edge.shape.vertices = poly t = self.world.CreateStaticBody(fixtures=self.fd_edge) color = (76, 255 if i % 2 == 0 else 204, 76) t.color1 = color t.color2 = color self.terrain.append(t) color = (102, 153, 76) poly += [(poly[1][0], 0), (poly[0][0], 0)] self.terrain_poly.append((poly, color)) self.terrain.reverse() def _generate_clouds(self): # Sorry for the clouds, couldn't resist self.cloud_poly = [] for i in range(TERRAIN_LENGTH // 20): x = self.np_random.uniform(0, TERRAIN_LENGTH) * TERRAIN_STEP y = VIEWPORT_H / SCALE * 3 / 4 poly = [ ( x + 15 * TERRAIN_STEP * math.sin(3.14 * 2 * a / 5) + self.np_random.uniform(0, 5 * TERRAIN_STEP), y + 5 * TERRAIN_STEP * math.cos(3.14 * 2 * a / 5) + self.np_random.uniform(0, 5 * TERRAIN_STEP), ) for a in range(5) ] x1 = min(p[0] for p in poly) x2 = max(p[0] for p in poly) self.cloud_poly.append((poly, x1, x2)) def reset( self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None, ): super().reset(seed=seed) self._destroy() self.world.contactListener_bug_workaround = ContactDetector(self) self.world.contactListener = self.world.contactListener_bug_workaround self.game_over = False self.prev_shaping = None self.scroll = 0.0 self.lidar_render = 0 self._generate_terrain(self.hardcore) self._generate_clouds() init_x = TERRAIN_STEP * TERRAIN_STARTPAD / 2 init_y = TERRAIN_HEIGHT + 2 * LEG_H self.hull = self.world.CreateDynamicBody( position=(init_x, init_y), fixtures=HULL_FD ) self.hull.color1 = (127, 51, 229) self.hull.color2 = (76, 76, 127) self.hull.ApplyForceToCenter( (self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM), 0), True ) self.legs = [] self.joints = [] for i in [-1, +1]: leg = self.world.CreateDynamicBody( position=(init_x, init_y - LEG_H / 2 - LEG_DOWN), angle=(i * 0.05), fixtures=LEG_FD, ) leg.color1 = (153 - i * 25, 76 - i * 25, 127 - i * 25) leg.color2 = (102 - i * 25, 51 - i * 25, 76 - i * 25) rjd = revoluteJointDef( bodyA=self.hull, bodyB=leg, localAnchorA=(0, LEG_DOWN), localAnchorB=(0, LEG_H / 2), enableMotor=True, enableLimit=True, maxMotorTorque=MOTORS_TORQUE, motorSpeed=i, lowerAngle=-0.8, upperAngle=1.1, ) self.legs.append(leg) self.joints.append(self.world.CreateJoint(rjd)) lower = self.world.CreateDynamicBody( position=(init_x, init_y - LEG_H * 3 / 2 - LEG_DOWN), angle=(i * 0.05), fixtures=LOWER_FD, ) lower.color1 = (153 - i * 25, 76 - i * 25, 127 - i * 25) lower.color2 = (102 - i * 25, 51 - i * 25, 76 - i * 25) rjd = revoluteJointDef( bodyA=leg, bodyB=lower, localAnchorA=(0, -LEG_H / 2), localAnchorB=(0, LEG_H / 2), enableMotor=True, enableLimit=True, maxMotorTorque=MOTORS_TORQUE, motorSpeed=1, lowerAngle=-1.6, upperAngle=-0.1, ) lower.ground_contact = False self.legs.append(lower) self.joints.append(self.world.CreateJoint(rjd)) self.drawlist = self.terrain + self.legs + [self.hull] class LidarCallback(Box2D.b2.rayCastCallback): def ReportFixture(self, fixture, point, normal, fraction): if 
(fixture.filterData.categoryBits & 1) == 0: return -1 self.p2 = point self.fraction = fraction return fraction self.lidar = [LidarCallback() for _ in range(10)] self.renderer.reset() if not return_info: return self.step(np.array([0, 0, 0, 0]))[0] else: return self.step(np.array([0, 0, 0, 0]))[0], {} def step(self, action: np.ndarray): # self.hull.ApplyForceToCenter((0, 20), True) -- Uncomment this to receive a bit of stability help control_speed = False # Should be easier as well if control_speed: self.joints[0].motorSpeed = float(SPEED_HIP * np.clip(action[0], -1, 1)) self.joints[1].motorSpeed = float(SPEED_KNEE * np.clip(action[1], -1, 1)) self.joints[2].motorSpeed = float(SPEED_HIP * np.clip(action[2], -1, 1)) self.joints[3].motorSpeed = float(SPEED_KNEE * np.clip(action[3], -1, 1)) else: self.joints[0].motorSpeed = float(SPEED_HIP * np.sign(action[0])) self.joints[0].maxMotorTorque = float( MOTORS_TORQUE * np.clip(np.abs(action[0]), 0, 1) ) self.joints[1].motorSpeed = float(SPEED_KNEE * np.sign(action[1])) self.joints[1].maxMotorTorque = float( MOTORS_TORQUE * np.clip(np.abs(action[1]), 0, 1) ) self.joints[2].motorSpeed = float(SPEED_HIP * np.sign(action[2])) self.joints[2].maxMotorTorque = float( MOTORS_TORQUE * np.clip(np.abs(action[2]), 0, 1) ) self.joints[3].motorSpeed = float(SPEED_KNEE * np.sign(action[3])) self.joints[3].maxMotorTorque = float( MOTORS_TORQUE * np.clip(np.abs(action[3]), 0, 1) ) self.world.Step(1.0 / FPS, 6 * 30, 2 * 30) pos = self.hull.position vel = self.hull.linearVelocity for i in range(10): self.lidar[i].fraction = 1.0 self.lidar[i].p1 = pos self.lidar[i].p2 = ( pos[0] + math.sin(1.5 * i / 10.0) * LIDAR_RANGE, pos[1] - math.cos(1.5 * i / 10.0) * LIDAR_RANGE, ) self.world.RayCast(self.lidar[i], self.lidar[i].p1, self.lidar[i].p2) state = [ self.hull.angle, # Normal angles up to 0.5 here, but sure more is possible. 
2.0 * self.hull.angularVelocity / FPS, 0.3 * vel.x * (VIEWPORT_W / SCALE) / FPS, # Normalized to get -1..1 range 0.3 * vel.y * (VIEWPORT_H / SCALE) / FPS, self.joints[0].angle, # This will give 1.1 on high up, but it's still OK (and there should be spikes on hiting the ground, that's normal too) self.joints[0].speed / SPEED_HIP, self.joints[1].angle + 1.0, self.joints[1].speed / SPEED_KNEE, 1.0 if self.legs[1].ground_contact else 0.0, self.joints[2].angle, self.joints[2].speed / SPEED_HIP, self.joints[3].angle + 1.0, self.joints[3].speed / SPEED_KNEE, 1.0 if self.legs[3].ground_contact else 0.0, ] state += [l.fraction for l in self.lidar] assert len(state) == 24 self.scroll = pos.x - VIEWPORT_W / SCALE / 5 shaping = ( 130 * pos[0] / SCALE ) # moving forward is a way to receive reward (normalized to get 300 on completion) shaping -= 5.0 * abs( state[0] ) # keep head straight, other than that and falling, any behavior is unpunished reward = 0 if self.prev_shaping is not None: reward = shaping - self.prev_shaping self.prev_shaping = shaping for a in action: reward -= 0.00035 * MOTORS_TORQUE * np.clip(np.abs(a), 0, 1) # normalized to about -50.0 using heuristic, more optimal agent should spend less done = False if self.game_over or pos[0] < 0: reward = -100 done = True if pos[0] > (TERRAIN_LENGTH - TERRAIN_GRASS) * TERRAIN_STEP: done = True self.renderer.render_step() return np.array(state, dtype=np.float32), reward, done, {} def render(self, mode: str = "human"): if self.render_mode is not None: return self.renderer.get_renders() else: return self._render(mode) def _render(self, mode: str = "human"): assert mode in self.metadata["render_modes"] try: import pygame from pygame import gfxdraw except ImportError: raise DependencyNotInstalled( "pygame is not installed, run `pip install gym[box2d]`" ) if self.screen is None and mode == "human": pygame.init() pygame.display.init() self.screen = pygame.display.set_mode((VIEWPORT_W, VIEWPORT_H)) if self.clock is None: self.clock = pygame.time.Clock() self.surf = pygame.Surface((VIEWPORT_W + self.scroll * SCALE, VIEWPORT_H)) pygame.transform.scale(self.surf, (SCALE, SCALE)) pygame.draw.polygon( self.surf, color=(215, 215, 255), points=[ (self.scroll * SCALE, 0), (self.scroll * SCALE + VIEWPORT_W, 0), (self.scroll * SCALE + VIEWPORT_W, VIEWPORT_H), (self.scroll * SCALE, VIEWPORT_H), ], ) for poly, x1, x2 in self.cloud_poly: if x2 < self.scroll / 2: continue if x1 > self.scroll / 2 + VIEWPORT_W / SCALE: continue pygame.draw.polygon( self.surf, color=(255, 255, 255), points=[ (p[0] * SCALE + self.scroll * SCALE / 2, p[1] * SCALE) for p in poly ], ) gfxdraw.aapolygon( self.surf, [(p[0] * SCALE + self.scroll * SCALE / 2, p[1] * SCALE) for p in poly], (255, 255, 255), ) for poly, color in self.terrain_poly: if poly[1][0] < self.scroll: continue if poly[0][0] > self.scroll + VIEWPORT_W / SCALE: continue scaled_poly = [] for coord in poly: scaled_poly.append([coord[0] * SCALE, coord[1] * SCALE]) pygame.draw.polygon(self.surf, color=color, points=scaled_poly) gfxdraw.aapolygon(self.surf, scaled_poly, color) self.lidar_render = (self.lidar_render + 1) % 100 i = self.lidar_render if i < 2 * len(self.lidar): single_lidar = ( self.lidar[i] if i < len(self.lidar) else self.lidar[len(self.lidar) - i - 1] ) if hasattr(single_lidar, "p1") and hasattr(single_lidar, "p2"): pygame.draw.line( self.surf, color=(255, 0, 0), start_pos=(single_lidar.p1[0] * SCALE, single_lidar.p1[1] * SCALE), end_pos=(single_lidar.p2[0] * SCALE, single_lidar.p2[1] * SCALE), width=1, ) for obj 
in self.drawlist: for f in obj.fixtures: trans = f.body.transform if type(f.shape) is circleShape: pygame.draw.circle( self.surf, color=obj.color1, center=trans * f.shape.pos * SCALE, radius=f.shape.radius * SCALE, ) pygame.draw.circle( self.surf, color=obj.color2, center=trans * f.shape.pos * SCALE, radius=f.shape.radius * SCALE, ) else: path = [trans * v * SCALE for v in f.shape.vertices] if len(path) > 2: pygame.draw.polygon(self.surf, color=obj.color1, points=path) gfxdraw.aapolygon(self.surf, path, obj.color1) path.append(path[0]) pygame.draw.polygon( self.surf, color=obj.color2, points=path, width=1 ) gfxdraw.aapolygon(self.surf, path, obj.color2) else: pygame.draw.aaline( self.surf, start_pos=path[0], end_pos=path[1], color=obj.color1, ) flagy1 = TERRAIN_HEIGHT * SCALE flagy2 = flagy1 + 50 x = TERRAIN_STEP * 3 * SCALE pygame.draw.aaline( self.surf, color=(0, 0, 0), start_pos=(x, flagy1), end_pos=(x, flagy2) ) f = [ (x, flagy2), (x, flagy2 - 10), (x + 25, flagy2 - 5), ] pygame.draw.polygon(self.surf, color=(230, 51, 0), points=f) pygame.draw.lines( self.surf, color=(0, 0, 0), points=f + [f[0]], width=1, closed=False ) self.surf = pygame.transform.flip(self.surf, False, True) if mode == "human": self.screen.blit(self.surf, (-self.scroll * SCALE, 0)) pygame.event.pump() self.clock.tick(self.metadata["render_fps"]) pygame.display.flip() elif mode in {"rgb_array", "single_rgb_array"}: return np.transpose( np.array(pygame.surfarray.pixels3d(self.surf)), axes=(1, 0, 2) ) def close(self): if self.screen is not None: import pygame pygame.display.quit() pygame.quit() self.isopen = False
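A short usage sketch for the walker defined above, assuming the Box2D extras are installed (`pip install gym[box2d]`, as the code's error message suggests) and using the `BipedalWalker-v3` ID and `hardcore=True` argument shown in the docstring:

```python
import gym
import numpy as np

# Hardcore terrain (stumps, pits, stairs) is enabled exactly as in the docstring.
env = gym.make("BipedalWalker-v3", hardcore=True)

obs = env.reset()  # 24-dimensional state: hull, joints, contacts, 10 lidar readings
done = False
while not done:
    # Actions are 4 motor speed values in [-1, 1] for the hip and knee joints.
    action = np.random.uniform(-1.0, 1.0, size=4).astype(np.float32)
    obs, reward, done, info = env.step(action)
env.close()
```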
def __init__(
    self,
    model_path,
    frame_skip,
    render_mode: Optional[str] = None,
    mujoco_bindings="mujoco",
):
    if model_path.startswith("/"):
        fullpath = model_path
    else:
        fullpath = path.join(path.dirname(__file__), "assets", model_path)
    if not path.exists(fullpath):
        raise OSError(f"File {fullpath} does not exist")

    if mujoco_bindings == "mujoco_py":
        logger.warn(
            "This version of the mujoco environments depends "
            "on the mujoco-py bindings, which are no longer maintained "
            "and may stop working. Please upgrade to the v4 versions of "
            "the environments (which depend on the mujoco python bindings instead), unless "
            "you are trying to precisely replicate previous works)."
        )
        try:
            import mujoco_py

            self._mujoco_bindings = mujoco_py
        except ImportError as e:
            raise error.DependencyNotInstalled(
                "{}. (HINT: you need to install mujoco_py, and also perform the setup instructions "
                "here: https://github.com/openai/mujoco-py/.)".format(e)
            )

        self.model = self._mujoco_bindings.load_model_from_path(fullpath)
        self.sim = self._mujoco_bindings.MjSim(self.model)
        self.data = self.sim.data
    elif mujoco_bindings == "mujoco":
        try:
            import mujoco

            self._mujoco_bindings = mujoco
        except ImportError as e:
            raise error.DependencyNotInstalled(
                f"{e}. (HINT: you need to install mujoco)"
            )
        self.model = self._mujoco_bindings.MjModel.from_xml_path(fullpath)
        self.data = self._mujoco_bindings.MjData(self.model)

    self.init_qpos = self.data.qpos.ravel().copy()
    self.init_qvel = self.data.qvel.ravel().copy()
    self._viewers = {}

    self.frame_skip = frame_skip

    self.viewer = None

    self.metadata = {
        "render_modes": [
            "human",
            "rgb_array",
            "depth_array",
            "single_rgb_array",
            "single_depth_array",
        ],
        "render_fps": int(np.round(1.0 / self.dt)),
    }

    self._set_action_space()

    assert render_mode is None or render_mode in self.metadata["render_modes"]
    self.render_mode = render_mode
    self.renderer = Renderer(self.render_mode, self._render)

    action = self.action_space.sample()
    observation, _reward, done, _info = self.step(action)
    assert not done

    self._set_observation_space(observation)
class FrozenLakeEnv(Env): """ Frozen lake involves crossing a frozen lake from Start(S) to Goal(G) without falling into any Holes(H) by walking over the Frozen(F) lake. The agent may not always move in the intended direction due to the slippery nature of the frozen lake. ### Action Space The agent takes a 1-element vector for actions. The action space is `(dir)`, where `dir` decides direction to move in which can be: - 0: LEFT - 1: DOWN - 2: RIGHT - 3: UP ### Observation Space The observation is a value representing the agent's current position as current_row * nrows + current_col (where both the row and col start at 0). For example, the goal position in the 4x4 map can be calculated as follows: 3 * 4 + 3 = 15. The number of possible observations is dependent on the size of the map. For example, the 4x4 map has 16 possible observations. ### Rewards Reward schedule: - Reach goal(G): +1 - Reach hole(H): 0 - Reach frozen(F): 0 ### Arguments ``` gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True) ``` `desc`: Used to specify custom map for frozen lake. For example, desc=["SFFF", "FHFH", "FFFH", "HFFG"]. A random generated map can be specified by calling the function `generate_random_map`. For example, ``` from gym.envs.toy_text.frozen_lake import generate_random_map gym.make('FrozenLake-v1', desc=generate_random_map(size=8)) ``` `map_name`: ID to use any of the preloaded maps. "4x4":[ "SFFF", "FHFH", "FFFH", "HFFG" ] "8x8": [ "SFFFFFFF", "FFFFFFFF", "FFFHFFFF", "FFFFFHFF", "FFFHFFFF", "FHHFFFHF", "FHFFHFHF", "FFFHFFFG", ] `is_slippery`: True/False. If True will move in intended direction with probability of 1/3 else will move in either perpendicular direction with equal probability of 1/3 in both directions. For example, if action is left and is_slippery is True, then: - P(move left)=1/3 - P(move up)=1/3 - P(move down)=1/3 ### Version History * v1: Bug fixes to rewards * v0: Initial versions release (1.0.0) """ metadata = { "render_modes": ["human", "ansi", "rgb_array", "single_rgb_array"], "render_fps": 4, } def __init__( self, render_mode: Optional[str] = None, desc=None, map_name="4x4", is_slippery=True, ): if desc is None and map_name is None: desc = generate_random_map() elif desc is None: desc = MAPS[map_name] self.desc = desc = np.asarray(desc, dtype="c") self.nrow, self.ncol = nrow, ncol = desc.shape self.reward_range = (0, 1) nA = 4 nS = nrow * ncol self.initial_state_distrib = np.array( desc == b"S").astype("float64").ravel() self.initial_state_distrib /= self.initial_state_distrib.sum() self.P = {s: {a: [] for a in range(nA)} for s in range(nS)} def to_s(row, col): return row * ncol + col def inc(row, col, a): if a == LEFT: col = max(col - 1, 0) elif a == DOWN: row = min(row + 1, nrow - 1) elif a == RIGHT: col = min(col + 1, ncol - 1) elif a == UP: row = max(row - 1, 0) return (row, col) def update_probability_matrix(row, col, action): newrow, newcol = inc(row, col, action) newstate = to_s(newrow, newcol) newletter = desc[newrow, newcol] done = bytes(newletter) in b"GH" reward = float(newletter == b"G") return newstate, reward, done for row in range(nrow): for col in range(ncol): s = to_s(row, col) for a in range(4): li = self.P[s][a] letter = desc[row, col] if letter in b"GH": li.append((1.0, s, 0, True)) else: if is_slippery: for b in [(a - 1) % 4, a, (a + 1) % 4]: li.append( (1.0 / 3.0, *update_probability_matrix(row, col, b))) else: li.append( (1.0, *update_probability_matrix(row, col, a))) self.observation_space = spaces.Discrete(nS) self.action_space = 
spaces.Discrete(nA) assert render_mode is None or render_mode in self.metadata[ "render_modes"] self.render_mode = render_mode self.renderer = Renderer(self.render_mode, self._render) # pygame utils self.window_size = (min(64 * ncol, 512), min(64 * nrow, 512)) self.window_surface = None self.clock = None self.hole_img = None self.cracked_hole_img = None self.ice_img = None self.elf_images = None self.goal_img = None self.start_img = None def step(self, a): transitions = self.P[self.s][a] i = categorical_sample([t[0] for t in transitions], self.np_random) p, s, r, d = transitions[i] self.s = s self.lastaction = a self.renderer.render_step() return (int(s), r, d, {"prob": p}) def reset( self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None, ): super().reset(seed=seed) self.s = categorical_sample(self.initial_state_distrib, self.np_random) self.lastaction = None self.renderer.reset() self.renderer.render_step() if not return_info: return int(self.s) else: return int(self.s), {"prob": 1} def render(self, mode="human"): if self.render_mode is not None: return self.renderer.get_renders() else: return self._render(mode) def _render(self, mode="human"): assert mode in self.metadata["render_modes"] if mode == "ansi": return self._render_text() elif mode in {"human", "rgb_array", "single_rgb_array"}: return self._render_gui(mode) def _render_gui(self, mode): try: import pygame except ImportError: raise DependencyNotInstalled( "pygame is not installed, run `pip install gym[toy_text]`") if self.window_surface is None: pygame.init() pygame.display.init() pygame.display.set_caption("Frozen Lake") if mode == "human": self.window_surface = pygame.display.set_mode(self.window_size) elif mode in {"rgb_array", "single_rgb_array"}: self.window_surface = pygame.Surface(self.window_size) if self.clock is None: self.clock = pygame.time.Clock() if self.hole_img is None: file_name = path.join(path.dirname(__file__), "img/hole.png") self.hole_img = pygame.image.load(file_name) if self.cracked_hole_img is None: file_name = path.join(path.dirname(__file__), "img/cracked_hole.png") self.cracked_hole_img = pygame.image.load(file_name) if self.ice_img is None: file_name = path.join(path.dirname(__file__), "img/ice.png") self.ice_img = pygame.image.load(file_name) if self.goal_img is None: file_name = path.join(path.dirname(__file__), "img/goal.png") self.goal_img = pygame.image.load(file_name) if self.start_img is None: file_name = path.join(path.dirname(__file__), "img/stool.png") self.start_img = pygame.image.load(file_name) if self.elf_images is None: elfs = [ path.join(path.dirname(__file__), "img/elf_left.png"), path.join(path.dirname(__file__), "img/elf_down.png"), path.join(path.dirname(__file__), "img/elf_right.png"), path.join(path.dirname(__file__), "img/elf_up.png"), ] self.elf_images = [pygame.image.load(f_name) for f_name in elfs] cell_width = self.window_size[0] // self.ncol cell_height = self.window_size[1] // self.nrow smaller_cell_scale = 0.6 small_cell_w = int(smaller_cell_scale * cell_width) small_cell_h = int(smaller_cell_scale * cell_height) # prepare images last_action = self.lastaction if self.lastaction is not None else 1 elf_img = self.elf_images[last_action] elf_scale = min( small_cell_w / elf_img.get_width(), small_cell_h / elf_img.get_height(), ) elf_dims = ( elf_img.get_width() * elf_scale, elf_img.get_height() * elf_scale, ) elf_img = pygame.transform.scale(elf_img, elf_dims) hole_img = pygame.transform.scale(self.hole_img, (cell_width, cell_height)) 
cracked_hole_img = pygame.transform.scale(self.cracked_hole_img, (cell_width, cell_height)) ice_img = pygame.transform.scale(self.ice_img, (cell_width, cell_height)) goal_img = pygame.transform.scale(self.goal_img, (cell_width, cell_height)) start_img = pygame.transform.scale(self.start_img, (small_cell_w, small_cell_h)) desc = self.desc.tolist() for y in range(self.nrow): for x in range(self.ncol): rect = (x * cell_width, y * cell_height, cell_width, cell_height) if desc[y][x] == b"H": self.window_surface.blit(hole_img, (rect[0], rect[1])) elif desc[y][x] == b"G": self.window_surface.blit(ice_img, (rect[0], rect[1])) goal_rect = self._center_small_rect( rect, goal_img.get_size()) self.window_surface.blit(goal_img, goal_rect) elif desc[y][x] == b"S": self.window_surface.blit(ice_img, (rect[0], rect[1])) stool_rect = self._center_small_rect( rect, start_img.get_size()) self.window_surface.blit(start_img, stool_rect) else: self.window_surface.blit(ice_img, (rect[0], rect[1])) pygame.draw.rect(self.window_surface, (180, 200, 230), rect, 1) # paint the elf bot_row, bot_col = self.s // self.ncol, self.s % self.ncol cell_rect = ( bot_col * cell_width, bot_row * cell_height, cell_width, cell_height, ) if desc[bot_row][bot_col] == b"H": self.window_surface.blit(cracked_hole_img, (cell_rect[0], cell_rect[1])) else: elf_rect = self._center_small_rect(cell_rect, elf_img.get_size()) self.window_surface.blit(elf_img, elf_rect) if mode == "human": pygame.event.pump() pygame.display.update() self.clock.tick(self.metadata["render_fps"]) elif mode in {"rgb_array", "single_rgb_array"}: return np.transpose(np.array( pygame.surfarray.pixels3d(self.window_surface)), axes=(1, 0, 2)) @staticmethod def _center_small_rect(big_rect, small_dims): offset_w = (big_rect[2] - small_dims[0]) / 2 offset_h = (big_rect[3] - small_dims[1]) / 2 return ( big_rect[0] + offset_w, big_rect[1] + offset_h, ) def _render_text(self): desc = self.desc.tolist() outfile = StringIO() row, col = self.s // self.ncol, self.s % self.ncol desc = [[c.decode("utf-8") for c in line] for line in desc] desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True) if self.lastaction is not None: outfile.write( f" ({['Left', 'Down', 'Right', 'Up'][self.lastaction]})\n") else: outfile.write("\n") outfile.write("\n".join("".join(line) for line in desc) + "\n") with closing(outfile): return outfile.getvalue() def close(self): if self.window_surface is not None: import pygame pygame.display.quit() pygame.quit()
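A short sketch of the randomly generated map usage mentioned in the Arguments section of the FrozenLake docstring, assuming the `gym[toy_text]` extras are installed:

```python
import gym
from gym.envs.toy_text.frozen_lake import generate_random_map

# Random 8x8 layout, slippery dynamics as described in the docstring.
env = gym.make("FrozenLake-v1", desc=generate_random_map(size=8), is_slippery=True)

state = env.reset()
done = False
while not done:
    state, reward, done, info = env.step(env.action_space.sample())
env.close()
print("reached goal" if reward == 1.0 else "fell in a hole or timed out")
```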
class CliffWalkingEnv(Env):
    """
    This is a simple implementation of the Gridworld Cliff
    reinforcement learning task.

    Adapted from Example 6.6 (page 106) from [Reinforcement Learning: An Introduction
    by Sutton and Barto](http://incompleteideas.net/book/bookdraft2018jan1.pdf).

    With inspiration from:
    https://github.com/dennybritz/reinforcement-learning/blob/master/lib/envs/cliff_walking.py

    ### Description
    The board is a 4x12 matrix, with (using NumPy matrix indexing):
    - [3, 0] as the start at bottom-left
    - [3, 11] as the goal at bottom-right
    - [3, 1..10] as the cliff at bottom-center

    If the agent steps on the cliff, it returns to the start.
    An episode terminates when the agent reaches the goal.

    ### Actions
    There are 4 discrete deterministic actions:
    - 0: move up
    - 1: move right
    - 2: move down
    - 3: move left

    ### Observations
    There are 3x12 + 1 possible states. The agent cannot be at the cliff, nor at the goal
    (as this results in the end of the episode); what remains are all the positions of the
    first 3 rows plus the bottom-left cell. The observation is simply the current position
    encoded as a [flattened index](https://numpy.org/doc/stable/reference/generated/numpy.unravel_index.html).

    ### Reward
    Each time step incurs -1 reward, and stepping into the cliff incurs -100 reward.

    ### Arguments

    ```
    gym.make('CliffWalking-v0')
    ```

    ### Version History
    - v0: Initial version release
    """

    metadata = {"render_modes": ["human", "ansi"], "render_fps": 4}

    def __init__(self, render_mode: Optional[str] = None):
        self.shape = (4, 12)
        self.start_state_index = np.ravel_multi_index((3, 0), self.shape)

        self.nS = np.prod(self.shape)
        self.nA = 4

        # Cliff Location
        self._cliff = np.zeros(self.shape, dtype=bool)
        self._cliff[3, 1:-1] = True

        # Calculate transition probabilities and rewards
        self.P = {}
        for s in range(self.nS):
            position = np.unravel_index(s, self.shape)
            self.P[s] = {a: [] for a in range(self.nA)}
            self.P[s][UP] = self._calculate_transition_prob(position, [-1, 0])
            self.P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1])
            self.P[s][DOWN] = self._calculate_transition_prob(position, [1, 0])
            self.P[s][LEFT] = self._calculate_transition_prob(position, [0, -1])

        # Calculate initial state distribution
        # We always start in state (3, 0)
        self.initial_state_distrib = np.zeros(self.nS)
        self.initial_state_distrib[self.start_state_index] = 1.0

        self.observation_space = spaces.Discrete(self.nS)
        self.action_space = spaces.Discrete(self.nA)

        assert render_mode is None or render_mode in self.metadata["render_modes"]
        self.render_mode = render_mode
        self.renderer = Renderer(self.render_mode, self._render)

    def _limit_coordinates(self, coord: np.ndarray) -> np.ndarray:
        """Prevent the agent from falling out of the grid world."""
        coord[0] = min(coord[0], self.shape[0] - 1)
        coord[0] = max(coord[0], 0)
        coord[1] = min(coord[1], self.shape[1] - 1)
        coord[1] = max(coord[1], 0)
        return coord

    def _calculate_transition_prob(self, current, delta):
        """Determine the outcome for an action. Transition Prob is always 1.0.
Args: current: Current position on the grid as (row, col) delta: Change in position for transition Returns: Tuple of ``(1.0, new_state, reward, done)`` """ new_position = np.array(current) + np.array(delta) new_position = self._limit_coordinates(new_position).astype(int) new_state = np.ravel_multi_index(tuple(new_position), self.shape) if self._cliff[tuple(new_position)]: return [(1.0, self.start_state_index, -100, False)] terminal_state = (self.shape[0] - 1, self.shape[1] - 1) is_done = tuple(new_position) == terminal_state return [(1.0, new_state, -1, is_done)] def step(self, a): transitions = self.P[self.s][a] i = categorical_sample([t[0] for t in transitions], self.np_random) p, s, r, d = transitions[i] self.s = s self.lastaction = a self.renderer.render_step() return (int(s), r, d, {"prob": p}) def reset(self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None): super().reset(seed=seed) self.s = categorical_sample(self.initial_state_distrib, self.np_random) self.lastaction = None self.renderer.reset() self.renderer.render_step() if not return_info: return int(self.s) else: return int(self.s), {"prob": 1} def render(self, mode="human"): if self.render_mode is not None: return self.renderer.get_renders() else: return self._render(mode) def _render(self, mode): assert mode in self.metadata["render_modes"] outfile = StringIO() if mode == "ansi" else sys.stdout for s in range(self.nS): position = np.unravel_index(s, self.shape) if self.s == s: output = " x " # Print terminal state elif position == (3, 11): output = " T " elif self._cliff[position]: output = " C " else: output = " o " if position[1] == 0: output = output.lstrip() if position[1] == self.shape[1] - 1: output = output.rstrip() output += "\n" outfile.write(output) outfile.write("\n") # No need to return anything for human if mode != "human": with closing(outfile): return outfile.getvalue()
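Since the constructor precomputes a deterministic transition table `P[state][action]` of `(probability, next_state, reward, done)` tuples, it can be inspected directly. A small sketch, assuming the `CliffWalking-v0` ID from the docstring and using `env.unwrapped` to reach the raw environment behind gym's wrappers:

```python
import gym

env = gym.make("CliffWalking-v0").unwrapped

# Moving RIGHT (action 1) from the start cell (3, 0) steps onto the cliff,
# so the agent is sent back to the start with a -100 reward.
start = env.start_state_index
print(env.P[start][1])  # [(1.0, start, -100, False)]
```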
class PendulumEnv(gym.Env):
    """
    ### Description

    The inverted pendulum swingup problem is based on the classic problem in control theory.
    The system consists of a pendulum attached at one end to a fixed point, and the other end being free.
    The pendulum starts in a random position and the goal is to apply torque on the free end to swing it
    into an upright position, with its center of gravity right above the fixed point.

    The diagram below specifies the coordinate system used for the implementation of the pendulum's
    dynamic equations.

    ![Pendulum Coordinate System](./diagrams/pendulum.png)

    - `x-y`: cartesian coordinates of the pendulum's end in meters.
    - `theta`: angle in radians.
    - `tau`: torque in `N m`. Defined as positive _counter-clockwise_.

    ### Action Space

    The action is a `ndarray` with shape `(1,)` representing the torque applied to the free end of the pendulum.

    | Num | Action | Min  | Max |
    |-----|--------|------|-----|
    | 0   | Torque | -2.0 | 2.0 |

    ### Observation Space

    The observation is a `ndarray` with shape `(3,)` representing the x-y coordinates of the pendulum's free
    end and its angular velocity.

    | Num | Observation      | Min  | Max |
    |-----|------------------|------|-----|
    | 0   | x = cos(theta)   | -1.0 | 1.0 |
    | 1   | y = sin(theta)   | -1.0 | 1.0 |
    | 2   | Angular Velocity | -8.0 | 8.0 |

    ### Rewards

    The reward function is defined as:

    *r = -(theta<sup>2</sup> + 0.1 * theta_dt<sup>2</sup> + 0.001 * torque<sup>2</sup>)*

    where `$\theta$` is the pendulum's angle normalized between *[-pi, pi]* (with 0 being in the upright position).
    Based on the above equation, the minimum reward that can be obtained is
    *-(pi<sup>2</sup> + 0.1 * 8<sup>2</sup> + 0.001 * 2<sup>2</sup>) = -16.2736044*,
    while the maximum reward is zero (pendulum is upright with zero velocity and no torque applied).

    ### Starting State

    The starting state is a random angle in *[-pi, pi]* and a random angular velocity in *[-1, 1]*.

    ### Episode Termination

    The episode terminates at 200 time steps.

    ### Arguments

    - `g`: acceleration of gravity measured in *(m s<sup>-2</sup>)* used to calculate the pendulum dynamics.
      The default value is g = 10.0.

    ```
    gym.make('Pendulum-v1', g=9.81)
    ```

    ### Version History

    * v1: Simplify the math equations, no difference in behavior.
    * v0: Initial versions release (1.0.0)
    """

    metadata = {
        "render_modes": ["human", "rgb_array", "single_rgb_array"],
        "render_fps": 30,
    }

    def __init__(self, render_mode: Optional[str] = None, g=10.0):
        self.max_speed = 8
        self.max_torque = 2.0
        self.dt = 0.05
        self.g = g
        self.m = 1.0
        self.l = 1.0

        assert render_mode is None or render_mode in self.metadata["render_modes"]
        self.render_mode = render_mode
        self.renderer = Renderer(self.render_mode, self._render)

        self.screen_dim = 500
        self.screen = None
        self.clock = None
        self.isopen = True

        high = np.array([1.0, 1.0, self.max_speed], dtype=np.float32)
        # This will throw a warning in tests/envs/test_envs in utils/env_checker.py as the space is not symmetric
        #   or normalised as max_torque == 2 by default.
Ignoring the issue here as the default settings are too old # to update to follow the openai gym api self.action_space = spaces.Box(low=-self.max_torque, high=self.max_torque, shape=(1, ), dtype=np.float32) self.observation_space = spaces.Box(low=-high, high=high, dtype=np.float32) def step(self, u): th, thdot = self.state # th := theta g = self.g m = self.m l = self.l dt = self.dt u = np.clip(u, -self.max_torque, self.max_torque)[0] self.last_u = u # for rendering costs = angle_normalize(th)**2 + 0.1 * thdot**2 + 0.001 * (u**2) newthdot = thdot + (3 * g / (2 * l) * np.sin(th) + 3.0 / (m * l**2) * u) * dt newthdot = np.clip(newthdot, -self.max_speed, self.max_speed) newth = th + newthdot * dt self.state = np.array([newth, newthdot]) self.renderer.render_step() return self._get_obs(), -costs, False, {} def reset(self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None): super().reset(seed=seed) high = np.array([np.pi, 1]) self.state = self.np_random.uniform(low=-high, high=high) self.last_u = None self.renderer.reset() self.renderer.render_step() if not return_info: return self._get_obs() else: return self._get_obs(), {} def _get_obs(self): theta, thetadot = self.state return np.array([np.cos(theta), np.sin(theta), thetadot], dtype=np.float32) def render(self, mode="human"): if self.render_mode is not None: return self.renderer.get_renders() else: return self._render(mode) def _render(self, mode="human"): assert mode in self.metadata["render_modes"] try: import pygame from pygame import gfxdraw except ImportError: raise DependencyNotInstalled( "pygame is not installed, run `pip install gym[classic_control]`" ) if self.screen is None: pygame.init() if mode == "human": pygame.display.init() self.screen = pygame.display.set_mode( (self.screen_dim, self.screen_dim)) else: # mode in {"rgb_array", "single_rgb_array"} self.screen = pygame.Surface( (self.screen_dim, self.screen_dim)) if self.clock is None: self.clock = pygame.time.Clock() self.surf = pygame.Surface((self.screen_dim, self.screen_dim)) self.surf.fill((255, 255, 255)) bound = 2.2 scale = self.screen_dim / (bound * 2) offset = self.screen_dim // 2 rod_length = 1 * scale rod_width = 0.2 * scale l, r, t, b = 0, rod_length, rod_width / 2, -rod_width / 2 coords = [(l, b), (l, t), (r, t), (r, b)] transformed_coords = [] for c in coords: c = pygame.math.Vector2(c).rotate_rad(self.state[0] + np.pi / 2) c = (c[0] + offset, c[1] + offset) transformed_coords.append(c) gfxdraw.aapolygon(self.surf, transformed_coords, (204, 77, 77)) gfxdraw.filled_polygon(self.surf, transformed_coords, (204, 77, 77)) gfxdraw.aacircle(self.surf, offset, offset, int(rod_width / 2), (204, 77, 77)) gfxdraw.filled_circle(self.surf, offset, offset, int(rod_width / 2), (204, 77, 77)) rod_end = (rod_length, 0) rod_end = pygame.math.Vector2(rod_end).rotate_rad(self.state[0] + np.pi / 2) rod_end = (int(rod_end[0] + offset), int(rod_end[1] + offset)) gfxdraw.aacircle(self.surf, rod_end[0], rod_end[1], int(rod_width / 2), (204, 77, 77)) gfxdraw.filled_circle(self.surf, rod_end[0], rod_end[1], int(rod_width / 2), (204, 77, 77)) fname = path.join(path.dirname(__file__), "assets/clockwise.png") img = pygame.image.load(fname) if self.last_u is not None: scale_img = pygame.transform.smoothscale( img, (scale * np.abs(self.last_u) / 2, scale * np.abs(self.last_u) / 2), ) is_flip = bool(self.last_u > 0) scale_img = pygame.transform.flip(scale_img, is_flip, True) self.surf.blit( scale_img, ( offset - scale_img.get_rect().centerx, offset - 
scale_img.get_rect().centery, ), ) # drawing axle gfxdraw.aacircle(self.surf, offset, offset, int(0.05 * scale), (0, 0, 0)) gfxdraw.filled_circle(self.surf, offset, offset, int(0.05 * scale), (0, 0, 0)) self.surf = pygame.transform.flip(self.surf, False, True) self.screen.blit(self.surf, (0, 0)) if mode == "human": pygame.event.pump() self.clock.tick(self.metadata["render_fps"]) pygame.display.flip() else: # mode == "rgb_array": return np.transpose(np.array(pygame.surfarray.pixels3d( self.screen)), axes=(1, 0, 2)) def close(self): if self.screen is not None: import pygame pygame.display.quit() pygame.quit() self.isopen = False
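The reward described in the docstring maps directly onto the `step` code above: the cost is built from the wrapped angle, the angular velocity, and the clipped torque. The sketch below is not part of the environment source; it assumes the gym API used in this file (`reset(seed=...)` returns only the observation and `step` returns a 4-tuple) and recomputes the reward from the observation.

```python
import numpy as np
import gym

def pendulum_cost(theta, theta_dot, torque):
    # Wrap theta into [-pi, pi], mirroring angle_normalize in the source above.
    theta = ((theta + np.pi) % (2 * np.pi)) - np.pi
    return theta ** 2 + 0.1 * theta_dot ** 2 + 0.001 * torque ** 2

env = gym.make("Pendulum-v1")
obs = env.reset(seed=0)                     # obs = [cos(theta), sin(theta), theta_dot]
theta = np.arctan2(obs[1], obs[0])
theta_dot = obs[2]
action = np.array([1.0], dtype=np.float32)  # 1 N m of torque, well inside [-2, 2]
_, reward, _, _ = env.step(action)
# The returned reward is the negative cost of the *pre-step* state and the applied torque.
print(reward, -pendulum_cost(theta, theta_dot, action[0]))
env.close()
```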
class Continuous_MountainCarEnv(gym.Env):
    """
    ### Description

    The Mountain Car MDP is a deterministic MDP that consists of a car placed stochastically at the bottom of
    a sinusoidal valley, with the only possible actions being the accelerations that can be applied to the car
    in either direction. The goal of the MDP is to strategically accelerate the car to reach the goal state on
    top of the right hill. There are two versions of the mountain car domain in gym: one with discrete actions
    and one with continuous. This version is the one with continuous actions.

    This MDP first appeared in [Andrew Moore's PhD Thesis (1990)](https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-209.pdf)

    ```
    @TECHREPORT{Moore90efficientmemory-based,
        author = {Andrew William Moore},
        title = {Efficient Memory-based Learning for Robot Control},
        institution = {University of Cambridge},
        year = {1990}
    }
    ```

    ### Observation Space

    The observation is a `ndarray` with shape `(2,)` where the elements correspond to the following:

    | Num | Observation                          | Min   | Max  | Unit           |
    |-----|--------------------------------------|-------|------|----------------|
    | 0   | position of the car along the x-axis | -1.2  | 0.6  | position (m)   |
    | 1   | velocity of the car                  | -0.07 | 0.07 | velocity (m/s) |

    ### Action Space

    The action is a `ndarray` with shape `(1,)`, representing the directional force applied on the car.
    The action is clipped in the range `[-1,1]` and multiplied by a power of 0.0015.

    ### Transition Dynamics

    Given an action, the mountain car follows the following transition dynamics:

    *velocity<sub>t+1</sub> = velocity<sub>t</sub> + force * power - 0.0025 * cos(3 * position<sub>t</sub>)*

    *position<sub>t+1</sub> = position<sub>t</sub> + velocity<sub>t+1</sub>*

    where force is the action clipped to the range `[-1,1]` and power is a constant 0.0015.
    The collisions at either end are inelastic with the velocity set to 0 upon collision with the wall.
    The position is clipped to the range [-1.2, 0.6] and velocity is clipped to the range [-0.07, 0.07].

    ### Reward

    A negative reward of *-0.1 * action<sup>2</sup>* is received at each timestep to penalise for taking actions
    of large magnitude. If the mountain car reaches the goal then a positive reward of +100 is added to the
    negative reward for that timestep.

    ### Starting State

    The position of the car is assigned a uniform random value in `[-0.6, -0.4]`.
    The starting velocity of the car is always assigned to 0.

    ### Episode Termination

    The episode terminates if either of the following happens:
    1. The position of the car is greater than or equal to 0.45 (the goal position on top of the right hill)
    2. The length of the episode is 999.
### Arguments ``` gym.make('MountainCarContinuous-v0') ``` ### Version History * v0: Initial versions release (1.0.0) """ metadata = { "render_modes": ["human", "rgb_array", "single_rgb_array"], "render_fps": 30, } def __init__(self, render_mode: Optional[str] = None, goal_velocity=0): self.min_action = -1.0 self.max_action = 1.0 self.min_position = -1.2 self.max_position = 0.6 self.max_speed = 0.07 self.goal_position = ( 0.45 # was 0.5 in gym, 0.45 in Arnaud de Broissia's version ) self.goal_velocity = goal_velocity self.power = 0.0015 self.low_state = np.array([self.min_position, -self.max_speed], dtype=np.float32) self.high_state = np.array([self.max_position, self.max_speed], dtype=np.float32) assert render_mode is None or render_mode in self.metadata[ "render_modes"] self.render_mode = render_mode self.renderer = Renderer(self.render_mode, self._render) self.screen_width = 600 self.screen_height = 400 self.screen = None self.clock = None self.isopen = True self.action_space = spaces.Box(low=self.min_action, high=self.max_action, shape=(1, ), dtype=np.float32) self.observation_space = spaces.Box(low=self.low_state, high=self.high_state, dtype=np.float32) def step(self, action: np.ndarray): position = self.state[0] velocity = self.state[1] force = min(max(action[0], self.min_action), self.max_action) velocity += force * self.power - 0.0025 * math.cos(3 * position) if velocity > self.max_speed: velocity = self.max_speed if velocity < -self.max_speed: velocity = -self.max_speed position += velocity if position > self.max_position: position = self.max_position if position < self.min_position: position = self.min_position if position == self.min_position and velocity < 0: velocity = 0 # Convert a possible numpy bool to a Python bool. done = bool(position >= self.goal_position and velocity >= self.goal_velocity) reward = 0 if done: reward = 100.0 reward -= math.pow(action[0], 2) * 0.1 self.state = np.array([position, velocity], dtype=np.float32) self.renderer.render_step() return self.state, reward, done, {} def reset(self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None): super().reset(seed=seed) self.state = np.array([self.np_random.uniform(low=-0.6, high=-0.4), 0]) self.renderer.reset() self.renderer.render_step() if not return_info: return np.array(self.state, dtype=np.float32) else: return np.array(self.state, dtype=np.float32), {} def _height(self, xs): return np.sin(3 * xs) * 0.45 + 0.55 def render(self, mode="human"): if self.render_mode is not None: return self.renderer.get_renders() else: return self._render(mode) def _render(self, mode="human"): assert mode in self.metadata["render_modes"] try: import pygame from pygame import gfxdraw except ImportError: raise DependencyNotInstalled( "pygame is not installed, run `pip install gym[classic_control]`" ) if self.screen is None: pygame.init() if mode == "human": pygame.display.init() self.screen = pygame.display.set_mode( (self.screen_width, self.screen_height)) else: # mode in {"rgb_array", "single_rgb_array"} self.screen = pygame.Surface( (self.screen_width, self.screen_height)) if self.clock is None: self.clock = pygame.time.Clock() world_width = self.max_position - self.min_position scale = self.screen_width / world_width carwidth = 40 carheight = 20 self.surf = pygame.Surface((self.screen_width, self.screen_height)) self.surf.fill((255, 255, 255)) pos = self.state[0] xs = np.linspace(self.min_position, self.max_position, 100) ys = self._height(xs) xys = list(zip((xs - self.min_position) * 
scale, ys * scale)) pygame.draw.aalines(self.surf, points=xys, closed=False, color=(0, 0, 0)) clearance = 10 l, r, t, b = -carwidth / 2, carwidth / 2, carheight, 0 coords = [] for c in [(l, b), (l, t), (r, t), (r, b)]: c = pygame.math.Vector2(c).rotate_rad(math.cos(3 * pos)) coords.append(( c[0] + (pos - self.min_position) * scale, c[1] + clearance + self._height(pos) * scale, )) gfxdraw.aapolygon(self.surf, coords, (0, 0, 0)) gfxdraw.filled_polygon(self.surf, coords, (0, 0, 0)) for c in [(carwidth / 4, 0), (-carwidth / 4, 0)]: c = pygame.math.Vector2(c).rotate_rad(math.cos(3 * pos)) wheel = ( int(c[0] + (pos - self.min_position) * scale), int(c[1] + clearance + self._height(pos) * scale), ) gfxdraw.aacircle(self.surf, wheel[0], wheel[1], int(carheight / 2.5), (128, 128, 128)) gfxdraw.filled_circle(self.surf, wheel[0], wheel[1], int(carheight / 2.5), (128, 128, 128)) flagx = int((self.goal_position - self.min_position) * scale) flagy1 = int(self._height(self.goal_position) * scale) flagy2 = flagy1 + 50 gfxdraw.vline(self.surf, flagx, flagy1, flagy2, (0, 0, 0)) gfxdraw.aapolygon( self.surf, [(flagx, flagy2), (flagx, flagy2 - 10), (flagx + 25, flagy2 - 5)], (204, 204, 0), ) gfxdraw.filled_polygon( self.surf, [(flagx, flagy2), (flagx, flagy2 - 10), (flagx + 25, flagy2 - 5)], (204, 204, 0), ) self.surf = pygame.transform.flip(self.surf, False, True) self.screen.blit(self.surf, (0, 0)) if mode == "human": pygame.event.pump() self.clock.tick(self.metadata["render_fps"]) pygame.display.flip() elif mode in {"rgb_array", "single_rgb_array"}: return np.transpose(np.array(pygame.surfarray.pixels3d( self.screen)), axes=(1, 0, 2)) def close(self): if self.screen is not None: import pygame pygame.display.quit() pygame.quit() self.isopen = False
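The transition dynamics spelled out in the docstring can be reproduced outside the environment. The following sketch is an illustration rather than part of gym (it assumes the 4-tuple `step` API used in this file); it applies one update of those equations and compares the result with `env.step`.

```python
import math
import numpy as np
import gym

def mcc_step(position, velocity, action, power=0.0015):
    """One transition of the dynamics described in the docstring above."""
    force = float(np.clip(action, -1.0, 1.0))
    velocity += force * power - 0.0025 * math.cos(3 * position)
    velocity = float(np.clip(velocity, -0.07, 0.07))
    position += velocity
    position = float(np.clip(position, -1.2, 0.6))
    if position == -1.2 and velocity < 0:  # inelastic collision with the left wall
        velocity = 0.0
    return position, velocity

env = gym.make("MountainCarContinuous-v0")
position, velocity = env.reset(seed=0)
obs, reward, done, info = env.step(np.array([0.5], dtype=np.float32))
print(mcc_step(position, velocity, 0.5))  # should match obs up to float32 rounding
env.close()
```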
class AcrobotEnv(core.Env): """ ### Description The Acrobot environment is based on Sutton's work in ["Generalization in Reinforcement Learning: Successful Examples Using Sparse Coarse Coding"](https://papers.nips.cc/paper/1995/hash/8f1d43620bc6bb580df6e80b0dc05c48-Abstract.html) and [Sutton and Barto's book](http://www.incompleteideas.net/book/the-book-2nd.html). The system consists of two links connected linearly to form a chain, with one end of the chain fixed. The joint between the two links is actuated. The goal is to apply torques on the actuated joint to swing the free end of the linear chain above a given height while starting from the initial state of hanging downwards. As seen in the **Gif**: two blue links connected by two green joints. The joint in between the two links is actuated. The goal is to swing the free end of the outer-link to reach the target height (black horizontal line above system) by applying torque on the actuator. ### Action Space The action is discrete, deterministic, and represents the torque applied on the actuated joint between the two links. | Num | Action | Unit | |-----|---------------------------------------|--------------| | 0 | apply -1 torque to the actuated joint | torque (N m) | | 1 | apply 0 torque to the actuated joint | torque (N m) | | 2 | apply 1 torque to the actuated joint | torque (N m) | ### Observation Space The observation is a `ndarray` with shape `(6,)` that provides information about the two rotational joint angles as well as their angular velocities: | Num | Observation | Min | Max | |-----|------------------------------|---------------------|-------------------| | 0 | Cosine of `theta1` | -1 | 1 | | 1 | Sine of `theta1` | -1 | 1 | | 2 | Cosine of `theta2` | -1 | 1 | | 3 | Sine of `theta2` | -1 | 1 | | 4 | Angular velocity of `theta1` | ~ -12.567 (-4 * pi) | ~ 12.567 (4 * pi) | | 5 | Angular velocity of `theta2` | ~ -28.274 (-9 * pi) | ~ 28.274 (9 * pi) | where - `theta1` is the angle of the first joint, where an angle of 0 indicates the first link is pointing directly downwards. - `theta2` is ***relative to the angle of the first link.*** An angle of 0 corresponds to having the same angle between the two links. The angular velocities of `theta1` and `theta2` are bounded at ±4π, and ±9π rad/s respectively. A state of `[1, 0, 1, 0, ..., ...]` indicates that both links are pointing downwards. ### Rewards The goal is to have the free end reach a designated target height in as few steps as possible, and as such all steps that do not reach the goal incur a reward of -1. Achieving the target height results in termination with a reward of 0. The reward threshold is -100. ### Starting State Each parameter in the underlying state (`theta1`, `theta2`, and the two angular velocities) is initialized uniformly between -0.1 and 0.1. This means both links are pointing downwards with some initial stochasticity. ### Episode Termination The episode terminates if one of the following occurs: 1. The free end reaches the target height, which is constructed as: `-cos(theta1) - cos(theta2 + theta1) > 1.0` 2. Episode length is greater than 500 (200 for v0) ### Arguments No additional arguments are currently supported. ``` env = gym.make('Acrobot-v1') ``` By default, the dynamics of the acrobot follow those described in Sutton and Barto's book [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/11/node4.html). 
However, a `book_or_nips` parameter can be modified to change the pendulum dynamics to those described in the original [NeurIPS paper](https://papers.nips.cc/paper/1995/hash/8f1d43620bc6bb580df6e80b0dc05c48-Abstract.html). ``` # To change the dynamics as described above env.env.book_or_nips = 'nips' ``` See the following note and the [implementation](https://github.com/openai/gym/blob/master/gym/envs/classic_control/acrobot.py) for details: > The dynamics equations were missing some terms in the NIPS paper which are present in the book. R. Sutton confirmed in personal correspondence that the experimental results shown in the paper and the book were generated with the equations shown in the book. However, there is the option to run the domain with the paper equations by setting `book_or_nips = 'nips'` ### Version History - v1: Maximum number of steps increased from 200 to 500. The observation space for v0 provided direct readings of `theta1` and `theta2` in radians, having a range of `[-pi, pi]`. The v1 observation space as described here provides the sine and cosine of each angle instead. - v0: Initial versions release (1.0.0) (removed from gym for v1) ### References - Sutton, R. S. (1996). Generalization in Reinforcement Learning: Successful Examples Using Sparse Coarse Coding. In D. Touretzky, M. C. Mozer, & M. Hasselmo (Eds.), Advances in Neural Information Processing Systems (Vol. 8). MIT Press. https://proceedings.neurips.cc/paper/1995/file/8f1d43620bc6bb580df6e80b0dc05c48-Paper.pdf - Sutton, R. S., Barto, A. G. (2018 ). Reinforcement Learning: An Introduction. The MIT Press. """ metadata = { "render_modes": ["human", "rgb_array", "single_rgb_array"], "render_fps": 15, } dt = 0.2 LINK_LENGTH_1 = 1.0 # [m] LINK_LENGTH_2 = 1.0 # [m] LINK_MASS_1 = 1.0 #: [kg] mass of link 1 LINK_MASS_2 = 1.0 #: [kg] mass of link 2 LINK_COM_POS_1 = 0.5 #: [m] position of the center of mass of link 1 LINK_COM_POS_2 = 0.5 #: [m] position of the center of mass of link 2 LINK_MOI = 1.0 #: moments of inertia for both links MAX_VEL_1 = 4 * pi MAX_VEL_2 = 9 * pi AVAIL_TORQUE = [-1.0, 0.0, +1] torque_noise_max = 0.0 SCREEN_DIM = 500 #: use dynamics equations from the nips paper or the book book_or_nips = "book" action_arrow = None domain_fig = None actions_num = 3 def __init__(self, render_mode: Optional[str] = None): assert render_mode is None or render_mode in self.metadata["render_modes"] self.render_mode = render_mode self.renderer = Renderer(self.render_mode, self._render) self.screen = None self.clock = None self.isopen = True high = np.array( [1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2], dtype=np.float32 ) low = -high self.observation_space = spaces.Box(low=low, high=high, dtype=np.float32) self.action_space = spaces.Discrete(3) self.state = None def reset( self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None ): super().reset(seed=seed) self.state = self.np_random.uniform(low=-0.1, high=0.1, size=(4,)).astype( np.float32 ) self.renderer.reset() self.renderer.render_step() if not return_info: return self._get_ob() else: return self._get_ob(), {} def step(self, a): s = self.state assert s is not None, "Call reset before using AcrobotEnv object." 
torque = self.AVAIL_TORQUE[a] # Add noise to the force action if self.torque_noise_max > 0: torque += self.np_random.uniform( -self.torque_noise_max, self.torque_noise_max ) # Now, augment the state with our force action so it can be passed to # _dsdt s_augmented = np.append(s, torque) ns = rk4(self._dsdt, s_augmented, [0, self.dt]) ns[0] = wrap(ns[0], -pi, pi) ns[1] = wrap(ns[1], -pi, pi) ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1) ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2) self.state = ns terminal = self._terminal() reward = -1.0 if not terminal else 0.0 self.renderer.render_step() return self._get_ob(), reward, terminal, {} def _get_ob(self): s = self.state assert s is not None, "Call reset before using AcrobotEnv object." return np.array( [cos(s[0]), sin(s[0]), cos(s[1]), sin(s[1]), s[2], s[3]], dtype=np.float32 ) def _terminal(self): s = self.state assert s is not None, "Call reset before using AcrobotEnv object." return bool(-cos(s[0]) - cos(s[1] + s[0]) > 1.0) def _dsdt(self, s_augmented): m1 = self.LINK_MASS_1 m2 = self.LINK_MASS_2 l1 = self.LINK_LENGTH_1 lc1 = self.LINK_COM_POS_1 lc2 = self.LINK_COM_POS_2 I1 = self.LINK_MOI I2 = self.LINK_MOI g = 9.8 a = s_augmented[-1] s = s_augmented[:-1] theta1 = s[0] theta2 = s[1] dtheta1 = s[2] dtheta2 = s[3] d1 = ( m1 * lc1**2 + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2)) + I1 + I2 ) d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2 phi2 = m2 * lc2 * g * cos(theta1 + theta2 - pi / 2.0) phi1 = ( -m2 * l1 * lc2 * dtheta2**2 * sin(theta2) - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2) + (m1 * lc1 + m2 * l1) * g * cos(theta1 - pi / 2) + phi2 ) if self.book_or_nips == "nips": # the following line is consistent with the description in the # paper ddtheta2 = (a + d2 / d1 * phi1 - phi2) / (m2 * lc2**2 + I2 - d2**2 / d1) else: # the following line is consistent with the java implementation and the # book ddtheta2 = ( a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2 ) / (m2 * lc2**2 + I2 - d2**2 / d1) ddtheta1 = -(d2 * ddtheta2 + phi1) / d1 return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0 def render(self, mode="human"): if self.render_mode is not None: return self.renderer.get_renders() else: return self._render(mode) def _render(self, mode="human"): assert mode in self.metadata["render_modes"] try: import pygame from pygame import gfxdraw except ImportError: raise DependencyNotInstalled( "pygame is not installed, run `pip install gym[classic_control]`" ) if self.screen is None: pygame.init() if mode == "human": pygame.display.init() self.screen = pygame.display.set_mode( (self.SCREEN_DIM, self.SCREEN_DIM) ) else: # mode in {"rgb_array", "single_rgb_array"} self.screen = pygame.Surface((self.SCREEN_DIM, self.SCREEN_DIM)) if self.clock is None: self.clock = pygame.time.Clock() surf = pygame.Surface((self.SCREEN_DIM, self.SCREEN_DIM)) surf.fill((255, 255, 255)) s = self.state bound = self.LINK_LENGTH_1 + self.LINK_LENGTH_2 + 0.2 # 2.2 for default scale = self.SCREEN_DIM / (bound * 2) offset = self.SCREEN_DIM / 2 if s is None: return None p1 = [ -self.LINK_LENGTH_1 * cos(s[0]) * scale, self.LINK_LENGTH_1 * sin(s[0]) * scale, ] p2 = [ p1[0] - self.LINK_LENGTH_2 * cos(s[0] + s[1]) * scale, p1[1] + self.LINK_LENGTH_2 * sin(s[0] + s[1]) * scale, ] xys = np.array([[0, 0], p1, p2])[:, ::-1] thetas = [s[0] - pi / 2, s[0] + s[1] - pi / 2] link_lengths = [self.LINK_LENGTH_1 * scale, self.LINK_LENGTH_2 * scale] pygame.draw.line( surf, start_pos=(-2.2 * scale + offset, 1 * scale + offset), end_pos=(2.2 * scale + 
offset, 1 * scale + offset), color=(0, 0, 0), ) for ((x, y), th, llen) in zip(xys, thetas, link_lengths): x = x + offset y = y + offset l, r, t, b = 0, llen, 0.1 * scale, -0.1 * scale coords = [(l, b), (l, t), (r, t), (r, b)] transformed_coords = [] for coord in coords: coord = pygame.math.Vector2(coord).rotate_rad(th) coord = (coord[0] + x, coord[1] + y) transformed_coords.append(coord) gfxdraw.aapolygon(surf, transformed_coords, (0, 204, 204)) gfxdraw.filled_polygon(surf, transformed_coords, (0, 204, 204)) gfxdraw.aacircle(surf, int(x), int(y), int(0.1 * scale), (204, 204, 0)) gfxdraw.filled_circle(surf, int(x), int(y), int(0.1 * scale), (204, 204, 0)) surf = pygame.transform.flip(surf, False, True) self.screen.blit(surf, (0, 0)) if mode == "human": pygame.event.pump() self.clock.tick(self.metadata["render_fps"]) pygame.display.flip() elif mode in {"rgb_array", "single_rgb_array"}: return np.transpose( np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2) )
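The termination rule quoted in the docstring, `-cos(theta1) - cos(theta2 + theta1) > 1.0`, can be checked from the 6-dimensional observation alone, since both angles are recoverable from their sine/cosine pairs. A small sketch, illustrative only and assuming the 4-tuple `step` API used in this file:

```python
import numpy as np
import gym

def free_end_above_target(obs):
    """Height test from the docstring, given obs = [cos t1, sin t1, cos t2, sin t2, w1, w2]."""
    theta1 = np.arctan2(obs[1], obs[0])
    theta2 = np.arctan2(obs[3], obs[2])
    return -np.cos(theta1) - np.cos(theta2 + theta1) > 1.0

env = gym.make("Acrobot-v1")
obs = env.reset(seed=0)
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
# True if the episode ended by reaching the target height rather than by the 500-step limit.
print(free_end_above_target(obs))
env.close()
```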
class LunarLander(gym.Env, EzPickle):
    """
    ### Description

    This environment is a classic rocket trajectory optimization problem.
    According to Pontryagin's maximum principle, it is optimal to fire the engine at full throttle or
    turn it off. This is the reason why this environment has discrete actions: engine on or off.

    There are two environment versions: discrete or continuous.
    The landing pad is always at coordinates (0,0). The coordinates are the first two numbers in the state vector.
    Landing outside of the landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land
    on its first attempt.

    To see a heuristic landing, run:
    ```
    python gym/envs/box2d/lunar_lander.py
    ```
    <!-- To play yourself, run: -->
    <!-- python examples/agents/keyboard_agent.py LunarLander-v2 -->

    ### Action Space

    There are four discrete actions available: do nothing, fire left orientation engine, fire main engine,
    fire right orientation engine.

    ### Observation Space

    The state is an 8-dimensional vector: the coordinates of the lander in `x` & `y`, its linear velocities
    in `x` & `y`, its angle, its angular velocity, and two booleans that represent whether each leg is in
    contact with the ground or not.

    ### Rewards

    Reward for moving from the top of the screen to the landing pad and coming to rest is about 100-140 points.
    If the lander moves away from the landing pad, it loses reward.
    If the lander crashes, it receives an additional -100 points. If it comes to rest, it receives an additional
    +100 points. Each leg with ground contact is +10 points.
    Firing the main engine is -0.3 points each frame. Firing the side engine is -0.03 points each frame.
    Solved is 200 points.

    ### Starting State

    The lander starts at the top center of the viewport with a random initial force applied to its center of mass.

    ### Episode Termination

    The episode finishes if:
    1) the lander crashes (the lander body gets in contact with the moon);
    2) the lander gets outside of the viewport (`x` coordinate is greater than 1);
    3) the lander is not awake. From the [Box2D docs](https://box2d.org/documentation/md__d_1__git_hub_box2d_docs_dynamics.html#autotoc_md61),
       a body which is not awake is a body which doesn't move and doesn't collide with any other body:
    > When Box2D determines that a body (or group of bodies) has come to rest,
    > the body enters a sleep state which has very little CPU overhead. If a
    > body is awake and collides with a sleeping body, then the sleeping body
    > wakes up. Bodies will also wake up if a joint or contact attached to
    > them is destroyed.

    ### Arguments

    To use the _continuous_ environment, you need to specify the `continuous=True` argument like below:
    ```python
    import gym
    env = gym.make(
        "LunarLander-v2",
        continuous=True,
        gravity=-10.0,
        enable_wind=False,
        wind_power=15.0,
        turbulence_power=1.5,
    )
    ```

    If `continuous=True` is passed, continuous actions (corresponding to the throttle of the engines) will be
    used and the action space will be `Box(-1, +1, (2,), dtype=np.float32)`.
    The first coordinate of an action determines the throttle of the main engine, while the second coordinate
    specifies the throttle of the lateral boosters.
    Given an action `np.array([main, lateral])`, the main engine will be turned off completely if `main < 0`
    and the throttle scales affinely from 50% to 100% for `0 <= main <= 1` (in particular, the main engine
    doesn't work with less than 50% power).
    Similarly, if `-0.5 < lateral < 0.5`, the lateral boosters will not fire at all.
If `lateral < -0.5`, the left booster will fire, and if `lateral > 0.5`, the right booster will fire. Again, the throttle scales affinely from 50% to 100% between -1 and -0.5 (and 0.5 and 1, respectively). `gravity` dictates the gravitational constant, this is bounded to be within 0 and -12. If `enable_wind=True` is passed, there will be wind effects applied to the lander. The wind is generated using the function `tanh(sin(2 k (t+C)) + sin(pi k (t+C)))`. `k` is set to 0.01. `C` is sampled randomly between -9999 and 9999. `wind_power` dictates the maximum magnitude of linear wind applied to the craft. The recommended value for `wind_power` is between 0.0 and 20.0. `turbulence_power` dictates the maximum magnitude of rotational wind applied to the craft. The recommended value for `turbulence_power` is between 0.0 and 2.0. ### Version History - v2: Count energy spent and in v0.24, added turbulance with wind power and turbulence_power parameters - v1: Legs contact with ground added in state vector; contact with ground give +10 reward points, and -10 if then lose contact; reward renormalized to 200; harder initial random push. - v0: Initial version <!-- ### References --> ### Credits Created by Oleg Klimov """ metadata = { "render_modes": ["human", "rgb_array", "single_rgb_array"], "render_fps": FPS, } def __init__( self, render_mode: Optional[str] = None, continuous: bool = False, gravity: float = -10.0, enable_wind: bool = False, wind_power: float = 15.0, turbulence_power: float = 1.5, ): EzPickle.__init__(self) assert ( -12.0 < gravity and gravity < 0.0 ), f"gravity (current value: {gravity}) must be between -12 and 0" self.gravity = gravity if 0.0 > wind_power or wind_power > 20.0: warnings.warn( colorize( f"WARN: wind_power value is recommended to be between 0.0 and 20.0, (current value: {wind_power})", "yellow", ), ) self.wind_power = wind_power if 0.0 > turbulence_power or turbulence_power > 2.0: warnings.warn( colorize( f"WARN: turbulence_power value is recommended to be between 0.0 and 2.0, (current value: {turbulence_power})", "yellow", ), ) self.turbulence_power = turbulence_power self.enable_wind = enable_wind self.wind_idx = np.random.randint(-9999, 9999) self.torque_idx = np.random.randint(-9999, 9999) self.screen = None self.clock = None self.isopen = True self.world = Box2D.b2World(gravity=(0, gravity)) self.moon = None self.lander = None self.particles = [] self.prev_reward = None self.continuous = continuous low = np.array([ # these are bounds for position # realistically the environment should have ended # long before we reach more than 50% outside -1.5, -1.5, # velocity bounds is 5x rated speed -5.0, -5.0, -math.pi, -5.0, -0.0, -0.0, ]).astype(np.float32) high = np.array([ # these are bounds for position # realistically the environment should have ended # long before we reach more than 50% outside 1.5, 1.5, # velocity bounds is 5x rated speed 5.0, 5.0, math.pi, 5.0, 1.0, 1.0, ]).astype(np.float32) # useful range is -1 .. +1, but spikes can be higher self.observation_space = spaces.Box(low, high) if self.continuous: # Action is two floats [main engine, left-right engines]. # Main engine: -1..0 off, 0..+1 throttle from 50% to 100% power. Engine can't work with less than 50% power. 
# Left-right: -1.0..-0.5 fire left engine, +0.5..+1.0 fire right engine, -0.5..0.5 off self.action_space = spaces.Box(-1, +1, (2, ), dtype=np.float32) else: # Nop, fire left engine, main engine, right engine self.action_space = spaces.Discrete(4) assert render_mode is None or render_mode in self.metadata[ "render_modes"] self.render_mode = render_mode self.renderer = Renderer(self.render_mode, self._render) def _destroy(self): if not self.moon: return self.world.contactListener = None self._clean_particles(True) self.world.DestroyBody(self.moon) self.moon = None self.world.DestroyBody(self.lander) self.lander = None self.world.DestroyBody(self.legs[0]) self.world.DestroyBody(self.legs[1]) def reset( self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None, ): super().reset(seed=seed) self._destroy() self.world.contactListener_keepref = ContactDetector(self) self.world.contactListener = self.world.contactListener_keepref self.game_over = False self.prev_shaping = None W = VIEWPORT_W / SCALE H = VIEWPORT_H / SCALE # terrain CHUNKS = 11 height = self.np_random.uniform(0, H / 2, size=(CHUNKS + 1, )) chunk_x = [W / (CHUNKS - 1) * i for i in range(CHUNKS)] self.helipad_x1 = chunk_x[CHUNKS // 2 - 1] self.helipad_x2 = chunk_x[CHUNKS // 2 + 1] self.helipad_y = H / 4 height[CHUNKS // 2 - 2] = self.helipad_y height[CHUNKS // 2 - 1] = self.helipad_y height[CHUNKS // 2 + 0] = self.helipad_y height[CHUNKS // 2 + 1] = self.helipad_y height[CHUNKS // 2 + 2] = self.helipad_y smooth_y = [ 0.33 * (height[i - 1] + height[i + 0] + height[i + 1]) for i in range(CHUNKS) ] self.moon = self.world.CreateStaticBody(shapes=edgeShape( vertices=[(0, 0), (W, 0)])) self.sky_polys = [] for i in range(CHUNKS - 1): p1 = (chunk_x[i], smooth_y[i]) p2 = (chunk_x[i + 1], smooth_y[i + 1]) self.moon.CreateEdgeFixture(vertices=[p1, p2], density=0, friction=0.1) self.sky_polys.append([p1, p2, (p2[0], H), (p1[0], H)]) self.moon.color1 = (0.0, 0.0, 0.0) self.moon.color2 = (0.0, 0.0, 0.0) initial_y = VIEWPORT_H / SCALE self.lander = self.world.CreateDynamicBody( position=(VIEWPORT_W / SCALE / 2, initial_y), angle=0.0, fixtures=fixtureDef( shape=polygonShape(vertices=[(x / SCALE, y / SCALE) for x, y in LANDER_POLY]), density=5.0, friction=0.1, categoryBits=0x0010, maskBits=0x001, # collide only with ground restitution=0.0, ), # 0.99 bouncy ) self.lander.color1 = (128, 102, 230) self.lander.color2 = (77, 77, 128) self.lander.ApplyForceToCenter( ( self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM), self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM), ), True, ) self.legs = [] for i in [-1, +1]: leg = self.world.CreateDynamicBody( position=(VIEWPORT_W / SCALE / 2 - i * LEG_AWAY / SCALE, initial_y), angle=(i * 0.05), fixtures=fixtureDef( shape=polygonShape(box=(LEG_W / SCALE, LEG_H / SCALE)), density=1.0, restitution=0.0, categoryBits=0x0020, maskBits=0x001, ), ) leg.ground_contact = False leg.color1 = (128, 102, 230) leg.color2 = (77, 77, 128) rjd = revoluteJointDef( bodyA=self.lander, bodyB=leg, localAnchorA=(0, 0), localAnchorB=(i * LEG_AWAY / SCALE, LEG_DOWN / SCALE), enableMotor=True, enableLimit=True, maxMotorTorque=LEG_SPRING_TORQUE, motorSpeed=+0.3 * i, # low enough not to jump back into the sky ) if i == -1: rjd.lowerAngle = ( +0.9 - 0.5 ) # The most esoteric numbers here, angled legs have freedom to travel within rjd.upperAngle = +0.9 else: rjd.lowerAngle = -0.9 rjd.upperAngle = -0.9 + 0.5 leg.joint = self.world.CreateJoint(rjd) self.legs.append(leg) self.drawlist = [self.lander] + 
self.legs self.renderer.reset() if not return_info: return self.step(np.array([0, 0]) if self.continuous else 0)[0] else: return self.step(np.array([0, 0]) if self.continuous else 0)[0], {} def _create_particle(self, mass, x, y, ttl): p = self.world.CreateDynamicBody( position=(x, y), angle=0.0, fixtures=fixtureDef( shape=circleShape(radius=2 / SCALE, pos=(0, 0)), density=mass, friction=0.1, categoryBits=0x0100, maskBits=0x001, # collide only with ground restitution=0.3, ), ) p.ttl = ttl self.particles.append(p) self._clean_particles(False) return p def _clean_particles(self, all): while self.particles and (all or self.particles[0].ttl < 0): self.world.DestroyBody(self.particles.pop(0)) def step(self, action): # Update wind if self.enable_wind and not (self.legs[0].ground_contact or self.legs[1].ground_contact): # the function used for wind is tanh(sin(2 k x) + sin(pi k x)), # which is proven to never be periodic, k = 0.01 wind_mag = (math.tanh( math.sin(0.02 * self.wind_idx) + (math.sin(math.pi * 0.01 * self.wind_idx))) * self.wind_power) self.wind_idx += 1 self.lander.ApplyForceToCenter( (wind_mag, 0.0), True, ) # the function used for torque is tanh(sin(2 k x) + sin(pi k x)), # which is proven to never be periodic, k = 0.01 torque_mag = math.tanh( math.sin(0.02 * self.torque_idx) + (math.sin(math.pi * 0.01 * self.torque_idx))) * ( self.turbulence_power) self.torque_idx += 1 self.lander.ApplyTorque( (torque_mag), True, ) if self.continuous: action = np.clip(action, -1, +1).astype(np.float32) else: assert self.action_space.contains( action), f"{action!r} ({type(action)}) invalid " # Engines tip = (math.sin(self.lander.angle), math.cos(self.lander.angle)) side = (-tip[1], tip[0]) dispersion = [ self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2) ] m_power = 0.0 if (self.continuous and action[0] > 0.0) or (not self.continuous and action == 2): # Main engine if self.continuous: m_power = (np.clip(action[0], 0.0, 1.0) + 1.0) * 0.5 # 0.5..1.0 assert m_power >= 0.5 and m_power <= 1.0 else: m_power = 1.0 # 4 is move a bit downwards, +-2 for randomness ox = tip[0] * (4 / SCALE + 2 * dispersion[0]) + side[0] * dispersion[1] oy = -tip[1] * (4 / SCALE + 2 * dispersion[0]) - side[1] * dispersion[1] impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy) p = self._create_particle( 3.5, # 3.5 is here to make particle speed adequate impulse_pos[0], impulse_pos[1], m_power, ) # particles are just a decoration p.ApplyLinearImpulse( (ox * MAIN_ENGINE_POWER * m_power, oy * MAIN_ENGINE_POWER * m_power), impulse_pos, True, ) self.lander.ApplyLinearImpulse( (-ox * MAIN_ENGINE_POWER * m_power, -oy * MAIN_ENGINE_POWER * m_power), impulse_pos, True, ) s_power = 0.0 if (self.continuous and np.abs(action[1]) > 0.5) or (not self.continuous and action in [1, 3]): # Orientation engines if self.continuous: direction = np.sign(action[1]) s_power = np.clip(np.abs(action[1]), 0.5, 1.0) assert s_power >= 0.5 and s_power <= 1.0 else: direction = action - 2 s_power = 1.0 ox = tip[0] * dispersion[0] + side[0] * ( 3 * dispersion[1] + direction * SIDE_ENGINE_AWAY / SCALE) oy = -tip[1] * dispersion[0] - side[1] * ( 3 * dispersion[1] + direction * SIDE_ENGINE_AWAY / SCALE) impulse_pos = ( self.lander.position[0] + ox - tip[0] * 17 / SCALE, self.lander.position[1] + oy + tip[1] * SIDE_ENGINE_HEIGHT / SCALE, ) p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power) p.ApplyLinearImpulse( (ox * SIDE_ENGINE_POWER * s_power, oy * SIDE_ENGINE_POWER * s_power), impulse_pos, True, ) 
self.lander.ApplyLinearImpulse( (-ox * SIDE_ENGINE_POWER * s_power, -oy * SIDE_ENGINE_POWER * s_power), impulse_pos, True, ) self.world.Step(1.0 / FPS, 6 * 30, 2 * 30) pos = self.lander.position vel = self.lander.linearVelocity state = [ (pos.x - VIEWPORT_W / SCALE / 2) / (VIEWPORT_W / SCALE / 2), (pos.y - (self.helipad_y + LEG_DOWN / SCALE)) / (VIEWPORT_H / SCALE / 2), vel.x * (VIEWPORT_W / SCALE / 2) / FPS, vel.y * (VIEWPORT_H / SCALE / 2) / FPS, self.lander.angle, 20.0 * self.lander.angularVelocity / FPS, 1.0 if self.legs[0].ground_contact else 0.0, 1.0 if self.legs[1].ground_contact else 0.0, ] assert len(state) == 8 reward = 0 shaping = (-100 * np.sqrt(state[0] * state[0] + state[1] * state[1]) - 100 * np.sqrt(state[2] * state[2] + state[3] * state[3]) - 100 * abs(state[4]) + 10 * state[6] + 10 * state[7] ) # And ten points for legs contact, the idea is if you # lose contact again after landing, you get negative reward if self.prev_shaping is not None: reward = shaping - self.prev_shaping self.prev_shaping = shaping reward -= ( m_power * 0.30 ) # less fuel spent is better, about -30 for heuristic landing reward -= s_power * 0.03 done = False if self.game_over or abs(state[0]) >= 1.0: done = True reward = -100 if not self.lander.awake: done = True reward = +100 self.renderer.render_step() return np.array(state, dtype=np.float32), reward, done, {} def render(self, mode="human"): if self.render_mode is not None: return self.renderer.get_renders() else: return self._render(mode) def _render(self, mode="human"): assert mode in self.metadata["render_modes"] try: import pygame from pygame import gfxdraw except ImportError: raise DependencyNotInstalled( "pygame is not installed, run `pip install gym[box2d]`") if self.screen is None and mode == "human": pygame.init() pygame.display.init() self.screen = pygame.display.set_mode((VIEWPORT_W, VIEWPORT_H)) if self.clock is None: self.clock = pygame.time.Clock() self.surf = pygame.Surface((VIEWPORT_W, VIEWPORT_H)) pygame.transform.scale(self.surf, (SCALE, SCALE)) pygame.draw.rect(self.surf, (255, 255, 255), self.surf.get_rect()) for obj in self.particles: obj.ttl -= 0.15 obj.color1 = ( int(max(0.2, 0.15 + obj.ttl) * 255), int(max(0.2, 0.5 * obj.ttl) * 255), int(max(0.2, 0.5 * obj.ttl) * 255), ) obj.color2 = ( int(max(0.2, 0.15 + obj.ttl) * 255), int(max(0.2, 0.5 * obj.ttl) * 255), int(max(0.2, 0.5 * obj.ttl) * 255), ) self._clean_particles(False) for p in self.sky_polys: scaled_poly = [] for coord in p: scaled_poly.append((coord[0] * SCALE, coord[1] * SCALE)) pygame.draw.polygon(self.surf, (0, 0, 0), scaled_poly) gfxdraw.aapolygon(self.surf, scaled_poly, (0, 0, 0)) for obj in self.particles + self.drawlist: for f in obj.fixtures: trans = f.body.transform if type(f.shape) is circleShape: pygame.draw.circle( self.surf, color=obj.color1, center=trans * f.shape.pos * SCALE, radius=f.shape.radius * SCALE, ) pygame.draw.circle( self.surf, color=obj.color2, center=trans * f.shape.pos * SCALE, radius=f.shape.radius * SCALE, ) else: path = [trans * v * SCALE for v in f.shape.vertices] pygame.draw.polygon(self.surf, color=obj.color1, points=path) gfxdraw.aapolygon(self.surf, path, obj.color1) pygame.draw.aalines(self.surf, color=obj.color2, points=path, closed=True) for x in [self.helipad_x1, self.helipad_x2]: x = x * SCALE flagy1 = self.helipad_y * SCALE flagy2 = flagy1 + 50 pygame.draw.line( self.surf, color=(255, 255, 255), start_pos=(x, flagy1), end_pos=(x, flagy2), width=1, ) pygame.draw.polygon( self.surf, color=(204, 204, 0), points=[ (x, flagy2), (x, 
flagy2 - 10), (x + 25, flagy2 - 5), ], ) gfxdraw.aapolygon( self.surf, [(x, flagy2), (x, flagy2 - 10), (x + 25, flagy2 - 5)], (204, 204, 0), ) self.surf = pygame.transform.flip(self.surf, False, True) if mode == "human": self.screen.blit(self.surf, (0, 0)) pygame.event.pump() self.clock.tick(self.metadata["render_fps"]) pygame.display.flip() elif mode in {"rgb_array", "single_rgb_array"}: return np.transpose(np.array(pygame.surfarray.pixels3d(self.surf)), axes=(1, 0, 2)) def close(self): if self.screen is not None: import pygame pygame.display.quit() pygame.quit() self.isopen = False
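The continuous-action throttle rules described in the docstring (main engine off for `main < 0`, affine 50-100% throttle otherwise; side engines dead in the `-0.5..0.5` band) can be expressed on their own. The helper below is a sketch of that mapping, not part of the environment; the name `engine_powers` is illustrative.

```python
import numpy as np

def engine_powers(action):
    """Map a continuous LunarLander action to (main_power, side_direction, side_power).

    Powers are 0.0 when the corresponding engine is off and otherwise scale
    affinely into [0.5, 1.0], as described in the docstring above.
    """
    main, lateral = np.clip(np.asarray(action, dtype=np.float32), -1.0, 1.0)
    m_power = float((np.clip(main, 0.0, 1.0) + 1.0) * 0.5) if main > 0.0 else 0.0
    if abs(lateral) > 0.5:
        direction = float(np.sign(lateral))       # -1: left booster, +1: right booster
        s_power = float(np.clip(abs(lateral), 0.5, 1.0))
    else:
        direction, s_power = 0.0, 0.0
    return m_power, direction, s_power

print(engine_powers([0.2, -0.8]))  # main engine at 60% throttle, left booster at 80%
```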
class BlackjackEnv(gym.Env): """ Blackjack is a card game where the goal is to beat the dealer by obtaining cards that sum to closer to 21 (without going over 21) than the dealers cards. ### Description Card Values: - Face cards (Jack, Queen, King) have a point value of 10. - Aces can either count as 11 (called a 'usable ace') or 1. - Numerical cards (2-9) have a value equal to their number. This game is played with an infinite deck (or with replacement). The game starts with the dealer having one face up and one face down card, while the player has two face up cards. The player can request additional cards (hit, action=1) until they decide to stop (stick, action=0) or exceed 21 (bust, immediate loss). After the player sticks, the dealer reveals their facedown card, and draws until their sum is 17 or greater. If the dealer goes bust, the player wins. If neither the player nor the dealer busts, the outcome (win, lose, draw) is decided by whose sum is closer to 21. ### Action Space There are two actions: stick (0), and hit (1). ### Observation Space The observation consists of a 3-tuple containing: the player's current sum, the value of the dealer's one showing card (1-10 where 1 is ace), and whether the player holds a usable ace (0 or 1). This environment corresponds to the version of the blackjack problem described in Example 5.1 in Reinforcement Learning: An Introduction by Sutton and Barto (http://incompleteideas.net/book/the-book-2nd.html). ### Rewards - win game: +1 - lose game: -1 - draw game: 0 - win game with natural blackjack: +1.5 (if <a href="#nat">natural</a> is True) +1 (if <a href="#nat">natural</a> is False) ### Arguments ``` gym.make('Blackjack-v1', natural=False, sab=False) ``` <a id="nat">`natural=False`</a>: Whether to give an additional reward for starting with a natural blackjack, i.e. starting with an ace and ten (sum is 21). <a id="sab">`sab=False`</a>: Whether to follow the exact rules outlined in the book by Sutton and Barto. If `sab` is `True`, the keyword argument `natural` will be ignored. If the player achieves a natural blackjack and the dealer does not, the player will win (i.e. get a reward of +1). The reverse rule does not apply. If both the player and the dealer get a natural, it will be a draw (i.e. reward 0). ### Version History * v0: Initial versions release (1.0.0) """ metadata = { "render_modes": ["human", "rgb_array", "single_rgb_array"], "render_fps": 4, } def __init__(self, render_mode: Optional[str] = None, natural=False, sab=False): self.action_space = spaces.Discrete(2) self.observation_space = spaces.Tuple( (spaces.Discrete(32), spaces.Discrete(11), spaces.Discrete(2)) ) # Flag to payout 1.5 on a "natural" blackjack win, like casino rules # Ref: http://www.bicyclecards.com/how-to-play/blackjack/ self.natural = natural # Flag for full agreement with the (Sutton and Barto, 2018) definition. 
Overrides self.natural self.sab = sab assert render_mode is None or render_mode in self.metadata["render_modes"] self.render_mode = render_mode self.renderer = Renderer(self.render_mode, self._render) def step(self, action): assert self.action_space.contains(action) if action: # hit: add a card to players hand and return self.player.append(draw_card(self.np_random)) if is_bust(self.player): done = True reward = -1.0 else: done = False reward = 0.0 else: # stick: play out the dealers hand, and score done = True while sum_hand(self.dealer) < 17: self.dealer.append(draw_card(self.np_random)) reward = cmp(score(self.player), score(self.dealer)) if self.sab and is_natural(self.player) and not is_natural(self.dealer): # Player automatically wins. Rules consistent with S&B reward = 1.0 elif ( not self.sab and self.natural and is_natural(self.player) and reward == 1.0 ): # Natural gives extra points, but doesn't autowin. Legacy implementation reward = 1.5 self.renderer.render_step() return self._get_obs(), reward, done, {} def _get_obs(self): return (sum_hand(self.player), self.dealer[0], usable_ace(self.player)) def reset( self, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None, ): super().reset(seed=seed) self.dealer = draw_hand(self.np_random) self.player = draw_hand(self.np_random) self.renderer.reset() self.renderer.render_step() if not return_info: return self._get_obs() else: return self._get_obs(), {} def render(self, mode="human"): if self.render_mode is not None: return self.renderer.get_renders() else: return self._render(mode) def _render(self, mode): assert mode in self.metadata["render_modes"] try: import pygame except ImportError: raise DependencyNotInstalled( "pygame is not installed, run `pip install gym[toy_text]`" ) player_sum, dealer_card_value, usable_ace = self._get_obs() screen_width, screen_height = 600, 500 card_img_height = screen_height // 3 card_img_width = int(card_img_height * 142 / 197) spacing = screen_height // 20 bg_color = (7, 99, 36) white = (255, 255, 255) if not hasattr(self, "screen"): pygame.init() if mode == "human": pygame.display.init() self.screen = pygame.display.set_mode((screen_width, screen_height)) else: pygame.font.init() self.screen = pygame.Surface((screen_width, screen_height)) if not hasattr(self, "clock"): self.clock = pygame.time.Clock() self.screen.fill(bg_color) def get_image(path): cwd = os.path.dirname(__file__) image = pygame.image.load(os.path.join(cwd, path)) return image def get_font(path, size): cwd = os.path.dirname(__file__) font = pygame.font.Font(os.path.join(cwd, path), size) return font small_font = get_font( os.path.join("font", "Minecraft.ttf"), screen_height // 15 ) dealer_text = small_font.render( "Dealer: " + str(dealer_card_value), True, white ) dealer_text_rect = self.screen.blit(dealer_text, (spacing, spacing)) suits = ["C", "D", "H", "S"] dealer_card_suit = self.np_random.choice(suits) if dealer_card_value == 1: dealer_card_value_str = "A" elif dealer_card_value == 10: dealer_card_value_str = self.np_random.choice(["J", "Q", "K"]) else: dealer_card_value_str = str(dealer_card_value) def scale_card_img(card_img): return pygame.transform.scale(card_img, (card_img_width, card_img_height)) dealer_card_img = scale_card_img( get_image( os.path.join("img", dealer_card_suit + dealer_card_value_str + ".png") ) ) dealer_card_rect = self.screen.blit( dealer_card_img, ( screen_width // 2 - card_img_width - spacing // 2, dealer_text_rect.bottom + spacing, ), ) hidden_card_img = 
scale_card_img(get_image(os.path.join("img", "Card.png"))) self.screen.blit( hidden_card_img, ( screen_width // 2 + spacing // 2, dealer_text_rect.bottom + spacing, ), ) player_text = small_font.render("Player", True, white) player_text_rect = self.screen.blit( player_text, (spacing, dealer_card_rect.bottom + 1.5 * spacing) ) large_font = get_font(os.path.join("font", "Minecraft.ttf"), screen_height // 6) player_sum_text = large_font.render(str(player_sum), True, white) player_sum_text_rect = self.screen.blit( player_sum_text, ( screen_width // 2 - player_sum_text.get_width() // 2, player_text_rect.bottom + spacing, ), ) if usable_ace: usable_ace_text = small_font.render("usable ace", True, white) self.screen.blit( usable_ace_text, ( screen_width // 2 - usable_ace_text.get_width() // 2, player_sum_text_rect.bottom + spacing // 2, ), ) if mode == "human": pygame.event.pump() pygame.display.update() self.clock.tick(self.metadata["render_fps"]) else: return np.transpose( np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2) ) def close(self): if hasattr(self, "screen"): import pygame pygame.display.quit() pygame.quit()
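Putting the observation tuple and the reward scheme together, a complete hand can be played with a trivial threshold policy. This is a usage sketch only; it assumes the plain `reset`/4-tuple `step` API used in this file.

```python
import gym

env = gym.make("Blackjack-v1", natural=False, sab=False)
obs = env.reset(seed=0)  # (player_sum, dealer_showing, usable_ace)
done = False
while not done:
    player_sum, dealer_showing, usable_ace = obs
    action = 1 if player_sum < 20 else 0  # hit below 20, otherwise stick
    obs, reward, done, info = env.step(action)
print(obs, reward)  # reward is +1 / 0 / -1 (or +1.5 for a natural when natural=True)
env.close()
```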
class MujocoEnv(gym.Env): """Superclass for all MuJoCo environments.""" def __init__( self, model_path, frame_skip, render_mode: Optional[str] = None, mujoco_bindings="mujoco", ): if model_path.startswith("/"): fullpath = model_path else: fullpath = path.join(path.dirname(__file__), "assets", model_path) if not path.exists(fullpath): raise OSError(f"File {fullpath} does not exist") if mujoco_bindings == "mujoco_py": logger.warn( "This version of the mujoco environments depends " "on the mujoco-py bindings, which are no longer maintained " "and may stop working. Please upgrade to the v4 versions of " "the environments (which depend on the mujoco python bindings instead), unless " "you are trying to precisely replicate previous works).") try: import mujoco_py self._mujoco_bindings = mujoco_py except ImportError as e: raise error.DependencyNotInstalled( "{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)" .format(e)) self.model = self._mujoco_bindings.load_model_from_path(fullpath) self.sim = self._mujoco_bindings.MjSim(self.model) self.data = self.sim.data elif mujoco_bindings == "mujoco": try: import mujoco self._mujoco_bindings = mujoco except ImportError as e: raise error.DependencyNotInstalled( f"{e}. (HINT: you need to install mujoco)") self.model = self._mujoco_bindings.MjModel.from_xml_path(fullpath) self.data = self._mujoco_bindings.MjData(self.model) self.init_qpos = self.data.qpos.ravel().copy() self.init_qvel = self.data.qvel.ravel().copy() self._viewers = {} self.frame_skip = frame_skip self.viewer = None self.metadata = { "render_modes": [ "human", "rgb_array", "depth_array", "single_rgb_array", "single_depth_array", ], "render_fps": int(np.round(1.0 / self.dt)), } self._set_action_space() assert render_mode is None or render_mode in self.metadata[ "render_modes"] self.render_mode = render_mode self.renderer = Renderer(self.render_mode, self._render) action = self.action_space.sample() observation, _reward, done, _info = self.step(action) assert not done self._set_observation_space(observation) def _set_action_space(self): bounds = self.model.actuator_ctrlrange.copy().astype(np.float32) low, high = bounds.T self.action_space = spaces.Box(low=low, high=high, dtype=np.float32) return self.action_space def _set_observation_space(self, observation): self.observation_space = convert_observation_to_space(observation) return self.observation_space # methods to override: # ---------------------------- def reset_model(self): """ Reset the robot degrees of freedom (qpos and qvel). Implement this in each subclass. """ raise NotImplementedError def viewer_setup(self): """ This method is called when the viewer is initialized. Optionally implement this method, if you need to tinker with camera position and so forth. 
""" # ----------------------------- def reset( self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None, ): super().reset(seed=seed) if self._mujoco_bindings.__name__ == "mujoco_py": self.sim.reset() else: self._mujoco_bindings.mj_resetData(self.model, self.data) ob = self.reset_model() self.renderer.reset() self.renderer.render_step() if not return_info: return ob else: return ob, {} def set_state(self, qpos, qvel): assert qpos.shape == (self.model.nq, ) and qvel.shape == ( self.model.nv, ) if self._mujoco_bindings.__name__ == "mujoco_py": state = self.sim.get_state() state = self._mujoco_bindings.MjSimState(state.time, qpos, qvel, state.act, state.udd_state) self.sim.set_state(state) self.sim.forward() else: self.data.qpos[:] = np.copy(qpos) self.data.qvel[:] = np.copy(qvel) if self.model.na == 0: self.data.act[:] = None self._mujoco_bindings.mj_forward(self.model, self.data) @property def dt(self): return self.model.opt.timestep * self.frame_skip def do_simulation(self, ctrl, n_frames): if np.array(ctrl).shape != self.action_space.shape: raise ValueError("Action dimension mismatch") if self._mujoco_bindings.__name__ == "mujoco_py": self.sim.data.ctrl[:] = ctrl else: self.data.ctrl[:] = ctrl for _ in range(n_frames): if self._mujoco_bindings.__name__ == "mujoco_py": self.sim.step() else: self._mujoco_bindings.mj_step(self.model, self.data) # As of MuJoCo 2.0, force-related quantities like cacc are not computed # unless there's a force sensor in the model. # See https://github.com/openai/gym/issues/1541 if self._mujoco_bindings.__name__ != "mujoco_py": self._mujoco_bindings.mj_rnePostConstraint(self.model, self.data) def render( self, mode="human", width=DEFAULT_SIZE, height=DEFAULT_SIZE, camera_id=None, camera_name=None, ): if self.render_mode is not None: return self.renderer.get_renders() else: return self._render( mode=mode, width=width, height=height, camera_id=camera_id, camera_name=camera_name, ) def _render( self, mode="human", width=DEFAULT_SIZE, height=DEFAULT_SIZE, camera_id=None, camera_name=None, ): assert mode in self.metadata["render_modes"] if mode in { "rgb_array", "single_rgb_array", "depth_array", "single_depth_array", }: if camera_id is not None and camera_name is not None: raise ValueError("Both `camera_id` and `camera_name` cannot be" " specified at the same time.") no_camera_specified = camera_name is None and camera_id is None if no_camera_specified: camera_name = "track" if camera_id is None: if self._mujoco_bindings.__name__ == "mujoco_py": if camera_name in self.model._camera_name2id: camera_id = self.model.camera_name2id(camera_name) else: camera_id = self._mujoco_bindings.mj_name2id( self.model, self._mujoco_bindings.mjtObj.mjOBJ_CAMERA, camera_name, ) self._get_viewer(mode).render(width, height, camera_id=camera_id) if mode in {"rgb_array", "single_rgb_array"}: data = self._get_viewer(mode).read_pixels(width, height, depth=False) # original image is upside-down, so flip it return data[::-1, :, :] elif mode in {"depth_array", "single_depth_array"}: self._get_viewer(mode).render(width, height) # Extract depth part of the read_pixels() tuple data = self._get_viewer(mode).read_pixels(width, height, depth=True)[1] # original image is upside-down, so flip it return data[::-1, :] elif mode == "human": self._get_viewer(mode).render() def close(self): if self.viewer is not None: if self._mujoco_bindings.__name__ == "mujoco": self.viewer.close() self.viewer = None self._viewers = {} def _get_viewer(self, mode, width=DEFAULT_SIZE, 
height=DEFAULT_SIZE): self.viewer = self._viewers.get(mode) if self.viewer is None: if mode == "human": if self._mujoco_bindings.__name__ == "mujoco_py": self.viewer = self._mujoco_bindings.MjViewer(self.sim) else: from gym.envs.mujoco.mujoco_rendering import Viewer self.viewer = Viewer(self.model, self.data) elif mode in { "rgb_array", "depth_array", "single_rgb_array", "single_depth_array", }: if self._mujoco_bindings.__name__ == "mujoco_py": self.viewer = self._mujoco_bindings.MjRenderContextOffscreen( self.sim, -1) else: from gym.envs.mujoco.mujoco_rendering import RenderContextOffscreen self.viewer = RenderContextOffscreen( width, height, self.model, self.data) self.viewer_setup() self._viewers[mode] = self.viewer return self.viewer def get_body_com(self, body_name): if self._mujoco_bindings.__name__ == "mujoco_py": return self.data.get_body_xpos(body_name) else: return self.data.body(body_name).xpos def state_vector(self): return np.concatenate([self.data.qpos.flat, self.data.qvel.flat])
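Concrete MuJoCo tasks are built by subclassing this superclass and overriding `reset_model` (and usually `step`), while `do_simulation`, `set_state`, and `state_vector` are inherited. The sketch below is hypothetical: `point_mass.xml` is a placeholder model file that does not ship with gym, and the reward is made up for illustration.

```python
import numpy as np
from gym import utils
from gym.envs.mujoco import MujocoEnv

class PointMassEnv(MujocoEnv, utils.EzPickle):
    """Hypothetical task showing the methods a MujocoEnv subclass overrides."""

    def __init__(self, render_mode=None):
        # "point_mass.xml" is a placeholder asset path, not part of gym.
        MujocoEnv.__init__(self, "point_mass.xml", frame_skip=5, render_mode=render_mode)
        utils.EzPickle.__init__(self)

    def step(self, action):
        self.do_simulation(action, self.frame_skip)
        obs = self.state_vector()
        reward = -float(np.square(self.data.qpos).sum())  # stay close to the origin
        self.renderer.render_step()
        return obs, reward, False, {}

    def reset_model(self):
        # Reset qpos/qvel with small noise around the initial configuration.
        qpos = self.init_qpos + self.np_random.uniform(-0.1, 0.1, size=self.model.nq)
        qvel = self.init_qvel + self.np_random.uniform(-0.1, 0.1, size=self.model.nv)
        self.set_state(qpos, qvel)
        return self.state_vector()
```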
def __init__( self, render_mode: Optional[str] = None, desc=None, map_name="4x4", is_slippery=True, ): if desc is None and map_name is None: desc = generate_random_map() elif desc is None: desc = MAPS[map_name] self.desc = desc = np.asarray(desc, dtype="c") self.nrow, self.ncol = nrow, ncol = desc.shape self.reward_range = (0, 1) nA = 4 nS = nrow * ncol self.initial_state_distrib = np.array( desc == b"S").astype("float64").ravel() self.initial_state_distrib /= self.initial_state_distrib.sum() self.P = {s: {a: [] for a in range(nA)} for s in range(nS)} def to_s(row, col): return row * ncol + col def inc(row, col, a): if a == LEFT: col = max(col - 1, 0) elif a == DOWN: row = min(row + 1, nrow - 1) elif a == RIGHT: col = min(col + 1, ncol - 1) elif a == UP: row = max(row - 1, 0) return (row, col) def update_probability_matrix(row, col, action): newrow, newcol = inc(row, col, action) newstate = to_s(newrow, newcol) newletter = desc[newrow, newcol] done = bytes(newletter) in b"GH" reward = float(newletter == b"G") return newstate, reward, done for row in range(nrow): for col in range(ncol): s = to_s(row, col) for a in range(4): li = self.P[s][a] letter = desc[row, col] if letter in b"GH": li.append((1.0, s, 0, True)) else: if is_slippery: for b in [(a - 1) % 4, a, (a + 1) % 4]: li.append( (1.0 / 3.0, *update_probability_matrix(row, col, b))) else: li.append( (1.0, *update_probability_matrix(row, col, a))) self.observation_space = spaces.Discrete(nS) self.action_space = spaces.Discrete(nA) assert render_mode is None or render_mode in self.metadata[ "render_modes"] self.render_mode = render_mode self.renderer = Renderer(self.render_mode, self._render) # pygame utils self.window_size = (min(64 * ncol, 512), min(64 * nrow, 512)) self.window_surface = None self.clock = None self.hole_img = None self.cracked_hole_img = None self.ice_img = None self.elf_images = None self.goal_img = None self.start_img = None
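The `P[state][action]` table built above lists `(probability, next_state, reward, done)` tuples, which is enough for exact dynamic programming. Below is a sketch of value iteration over that table; it is illustrative and assumes `P` is reachable on the unwrapped environment, as constructed in this `__init__`.

```python
import numpy as np
import gym

env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True).unwrapped
n_states = env.observation_space.n
n_actions = env.action_space.n
gamma = 0.99

V = np.zeros(n_states)
for _ in range(1000):  # enough sweeps for the 4x4 map to converge
    for s in range(n_states):
        V[s] = max(
            sum(p * (r + gamma * (not done) * V[ns]) for p, ns, r, done in env.P[s][a])
            for a in range(n_actions)
        )
print(V.reshape(4, 4))
```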