class CartPole(Environment):
    """
    The Inverted Pendulum on a Cart environment as presented in:
    "Least-Squares Policy Iteration". Lagoudakis M. G. and Parr R.. 2003.

    The state is ``[angle, angular velocity]`` and the three discrete actions
    map to ``-max_u``, ``0`` and ``+max_u``.

    """
    def __init__(self, m=2., M=8., l=.5, g=9.8, mu=1e-2, max_u=50.,
                 noise_u=10., horizon=3000, gamma=.95):
        """
        Constructor.

        Args:
            m (float, 2.0): mass of the pendulum;
            M (float, 8.0): mass of the cart;
            l (float, .5): length of the pendulum;
            g (float, 9.8): gravity acceleration constant;
            mu (float, 1e-2): friction constant; it is stored but not used by
                the dynamics of this class;
            max_u (float, 50.): maximum allowed input torque;
            noise_u (float, 10.): maximum noise on the action;
            horizon (int, 3000): horizon of the problem;
            gamma (float, .95): discount factor.

        """
        # MDP parameters
        self._m = m
        self._M = M
        self._l = l
        self._g = g
        self._alpha = 1 / (self._m + self._M)
        self._mu = mu
        self._dt = .1
        self._max_u = max_u
        self._noise_u = noise_u
        high = np.array([np.inf, np.inf])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(3)
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(2.5 * l, 2.5 * l)
        self._last_u = None
        self._state = None

        super().__init__(mdp_info)

    def reset(self, state=None):
        """
        Reset the environment.

        Args:
            state (array-like, None): ``[angle, angular velocity]`` to start
                from; if None, the angle is sampled uniformly in
                [-pi/8, pi/8] and the angular velocity is zero.

        Returns:
            The initial state.

        """
        if state is None:
            angle = np.random.uniform(-np.pi / 8., np.pi / 8.)

            self._state = np.array([angle, 0.])
        else:
            # Work on a private float copy so that normalizing the angle
            # does not overwrite the caller's array.
            self._state = np.array(state, dtype=float)
            self._state[0] = normalize_angle(self._state[0])

        self._last_u = 0
        return self._state

    def step(self, action):
        """
        Apply the selected action for one time step of length ``dt``.

        Args:
            action: 0 applies ``-max_u``, 1 applies no input, any other value
                applies ``+max_u``.

        Returns:
            The tuple (state, reward, absorbing, info). The reward is -1 and
            the state is absorbing when the absolute angle exceeds pi/2,
            otherwise the reward is 0.

        """
        if action == 0:
            u = -self._max_u
        elif action == 1:
            u = 0.
        else:
            u = self._max_u

        # The noiseless input is what gets rendered.
        self._last_u = u

        u += np.random.uniform(-self._noise_u, self._noise_u)
        new_state = odeint(self._dynamics, self._state, [0, self._dt], (u,))

        self._state = np.array(new_state[-1])
        self._state[0] = normalize_angle(self._state[0])

        if np.abs(self._state[0]) > np.pi * .5:
            reward = -1.
            absorbing = True
        else:
            reward = 0.
            absorbing = False

        return self._state, reward, absorbing, {}

    def render(self, mode='human'):
        """
        Draw the pole, the cart and an arrow for the last noiseless input.

        """
        start = 1.25 * self._l * np.ones(2)
        end = 1.25 * self._l * np.ones(2)

        end[0] += self._l * np.sin(self._state[0])
        end[1] += self._l * np.cos(self._state[0])

        self._viewer.line(start, end)
        self._viewer.square(start, 0, self._l / 10)
        self._viewer.circle(end, self._l / 20)

        direction = -np.sign(self._last_u) * np.array([1, 0])
        value = np.abs(self._last_u)
        self._viewer.force_arrow(start, direction, value,
                                 self._max_u, self._l / 5)

        self._viewer.display(self._dt)

    def stop(self):
        """
        Close the viewer.

        """
        self._viewer.close()

    def _dynamics(self, state, t, u):
        """
        Time derivative of the state, in the signature expected by ``odeint``.

        Args:
            state: current ``[angle, angular velocity]``;
            t: integration time (unused);
            u: input applied during the step.

        Returns:
            The tuple (d_theta, d_omega).

        """
        theta = state[0]
        omega = state[1]

        d_theta = omega
        d_omega = (self._g * np.sin(theta) - self._alpha * self._m *
                   self._l * .5 * d_theta**2 * np.sin(2 * theta) * .5 -
                   self._alpha * np.cos(theta) * u) / (
            2 / 3 * self._l - self._alpha * self._m * self._l * .5 *
            np.cos(theta)**2)

        return d_theta, d_omega
class PuddleWorld(Environment):
    """
    Puddle world as presented in:
    "Off-Policy Actor-Critic". Degris T. et al.. 2012.

    """
    def __init__(self, start=None, goal=None, goal_threshold=.1,
                 noise_step=.025, noise_reward=0, reward_goal=0.,
                 thrust=.05, puddle_center=None, puddle_width=None,
                 gamma=.99, horizon=5000):
        """
        Constructor.

        Args:
            start (np.array, None): starting position of the agent;
            goal (np.array, None): goal position;
            goal_threshold (float, .1): distance threshold of the agent from
                the goal to consider it reached;
            noise_step (float, .025): noise in actions;
            noise_reward (float, 0): standard deviation of gaussian noise in
                reward;
            reward_goal (float, 0): reward obtained reaching goal state;
            thrust (float, .05): distance walked during each action;
            puddle_center (np.array, None): center of the puddle;
            puddle_width (np.array, None): width of the puddle;
            gamma (float, .99): discount factor;
            horizon (int, 5000): horizon of the problem.

        """
        # MDP parameters
        self._start = np.array([.2, .4]) if start is None else start
        self._goal = np.array([1., 1.]) if goal is None else goal
        self._goal_threshold = goal_threshold
        self._noise_step = noise_step
        self._noise_reward = noise_reward
        self._reward_goal = reward_goal
        self._thrust = thrust
        puddle_center = [[.3, .6], [.4, .5], [.8, .9]] \
            if puddle_center is None else puddle_center
        self._puddle_center = [np.array(center) for center in puddle_center]
        puddle_width = [[.1, .03], [.03, .1], [.03, .1]] \
            if puddle_width is None else puddle_width
        self._puddle_width = [np.array(width) for width in puddle_width]

        # Actions 0-3 move by +-thrust along one axis; action 4 stays still.
        self._actions = [np.zeros(2) for _ in range(5)]
        for i in range(4):
            self._actions[i][i // 2] = thrust * (i % 2 * 2 - 1)

        # MDP properties
        action_space = Discrete(5)
        observation_space = Box(0., 1., shape=(2,))
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._pixels = None
        self._viewer = Viewer(1.0, 1.0)

        super().__init__(mdp_info)

    def reset(self, state=None):
        """
        Reset the environment.

        Args:
            state (array-like, None): position to start from; if None, the
                configured start position is used.

        Returns:
            The initial state.

        """
        if state is None:
            self._state = self._start.copy()
        else:
            # Copy, as is done for the default start: step() updates the
            # state in place and must not modify the caller's array.
            self._state = np.array(state, dtype=float)

        return self._state

    def step(self, action):
        """
        Move the agent according to the selected action plus uniform noise.

        Args:
            action: indexable whose first element is the action index.

        Returns:
            The tuple (state, reward, absorbing, info). The state is
            absorbing when the L1 distance from the goal is below the goal
            threshold.

        """
        idx = action[0]
        self._state += self._actions[idx] + np.random.uniform(
            low=-self._noise_step, high=self._noise_step, size=(2,))
        self._state = np.clip(self._state, 0., 1.)

        absorbing = np.linalg.norm((self._state - self._goal),
                                   ord=1) < self._goal_threshold

        if not absorbing:
            reward = np.random.randn() * self._noise_reward + \
                self._get_reward(self._state)
        else:
            reward = self._reward_goal

        return self._state, reward, absorbing, {}

    def render(self):
        """
        Draw the reward landscape, the agent and the goal area.

        The background image is computed on first use and then cached.

        """
        if self._pixels is None:
            img_size = 100
            pixels = np.zeros((img_size, img_size, 3))
            for i in range(img_size):
                for j in range(img_size):
                    x = i / img_size
                    y = j / img_size
                    # Flip the second axis so that y grows upwards.
                    pixels[i, img_size - 1 - j] = self._get_reward(
                        np.array([x, y]))

            pixels -= pixels.min()
            peak = pixels.max()
            # A flat landscape (e.g. no puddles) has peak 0; skip the
            # rescaling instead of dividing by zero.
            if peak > 0:
                pixels *= 255. / peak
            self._pixels = np.floor(255 - pixels)

        self._viewer.background_image(self._pixels)
        self._viewer.circle(self._state, 0.01, color=(0, 255, 0))

        goal_area = [
            [-self._goal_threshold, 0],
            [0, self._goal_threshold],
            [self._goal_threshold, 0],
            [0, -self._goal_threshold]
        ]
        self._viewer.polygon(self._goal, 0, goal_area,
                             color=(255, 0, 0), width=1)

        self._viewer.display(0.1)

    def stop(self):
        """
        Close the viewer, if any.

        """
        if self._viewer is not None:
            self._viewer.close()

    def _get_reward(self, state):
        """
        Reward of a position: -1 minus a gaussian penalty for each puddle.

        Args:
            state: the position to evaluate.

        Returns:
            The reward in the given position.

        """
        reward = -1.
        for cen, wid in zip(self._puddle_center, self._puddle_width):
            reward -= 2. * norm.pdf(state[0], cen[0], wid[0]) * norm.pdf(
                state[1], cen[1], wid[1])

        return reward
class AbstractGridWorld(Environment):
    """
    Abstract class to build a grid world.

    The state exposed to the agent is a one-element array holding the cell
    index ``row * width + col``; subclasses implement ``_step`` on
    ``[row, col]`` grid coordinates.

    """
    def __init__(self, mdp_info, height, width, start, goal):
        """
        Constructor.

        Args:
            mdp_info: the information of the MDP, passed to the parent
                constructor;
            height (int): height of the grid;
            width (int): width of the grid;
            start (tuple): x-y coordinates of the start;
            goal (tuple): x-y coordinates of the goal.

        """
        assert not np.array_equal(start, goal)

        assert goal[0] < height and goal[1] < width,\
            'Goal position not suitable for the grid world dimension.'

        self._state = None
        self._height = height
        self._width = width
        self._start = start
        self._goal = goal

        # Visualization
        self._viewer = Viewer(self._width, self._height, 500,
                              self._height * 500 // self._width)

        super().__init__(mdp_info)

    def reset(self, state=None):
        """
        Reset the environment.

        Args:
            state (np.array, None): cell index to start from; if None, the
                start cell is used.

        Returns:
            The initial state.

        """
        if state is None:
            state = self.convert_to_int(self._start, self._width)

        self._state = state

        return self._state

    def step(self, action):
        """
        Convert the state to grid coordinates, delegate to ``_step`` and
        convert the result back to a cell index.

        Returns:
            The tuple (state, reward, absorbing, info).

        """
        state = self.convert_to_grid(self._state, self._width)

        new_state, reward, absorbing, info = self._step(state, action)
        self._state = self.convert_to_int(new_state, self._width)

        return self._state, reward, absorbing, info

    def render(self):
        """
        Draw the grid, the goal cell, the start cell and the agent.

        """
        # One pass per direction: each line is drawn exactly once, and the
        # lines are still drawn when the grid has a single row or column.
        for col in range(1, self._width):
            self._viewer.line(np.array([col, 0]),
                              np.array([col, self._height]))
        for row in range(1, self._height):
            self._viewer.line(np.array([0, row]),
                              np.array([self._width, row]))

        goal_center = np.array([.5 + self._goal[1],
                                self._height - (.5 + self._goal[0])])
        self._viewer.square(goal_center, 0, 1, (0, 255, 0))

        # The start is stored as grid coordinates, like the goal, so it is
        # used directly rather than decoded as a cell index.
        start_center = np.array([.5 + self._start[1],
                                 self._height - (.5 + self._start[0])])
        self._viewer.square(start_center, 0, 1, (255, 0, 0))

        state_grid = self.convert_to_grid(self._state, self._width)
        state_center = np.array([.5 + state_grid[1],
                                 self._height - (.5 + state_grid[0])])
        self._viewer.circle(state_center, .4, (0, 0, 255))

        self._viewer.display(.1)

    def _step(self, state, action):
        """
        Transition on grid coordinates; must be implemented by subclasses.

        """
        raise NotImplementedError('AbstractGridWorld is an abstract class.')

    def _grid_step(self, state, action):
        """
        Move ``state`` in place by one cell, staying inside the grid.

        Args:
            state: ``[row, col]`` grid coordinates, modified in place;
            action: 0 decreases the row, 1 increases the row, 2 decreases
                the column, 3 increases the column; other values are ignored.

        """
        if action == 0:
            if state[0] > 0:
                state[0] -= 1
        elif action == 1:
            if state[0] + 1 < self._height:
                state[0] += 1
        elif action == 2:
            if state[1] > 0:
                state[1] -= 1
        elif action == 3:
            if state[1] + 1 < self._width:
                state[1] += 1

    @staticmethod
    def convert_to_grid(state, width):
        """
        Convert a one-element cell-index state to ``[row, col]``.

        """
        return np.array([state[0] // width, state[0] % width])

    @staticmethod
    def convert_to_int(state, width):
        """
        Convert ``[row, col]`` to a one-element cell-index state.

        """
        return np.array([state[0] * width + state[1]])
class Segway(Environment):
    """
    The Segway environment (continuous version) as presented in:
    "Deep Learning for Actor-Critic Reinforcement Learning". Xueli Jia. 2015.

    """
    def __init__(self, random_start=False):
        """
        Constructor.

        Args:
            random_start (bool, False): whether to start from a random
                angle in [-pi/2, pi/2] or from the fixed angle -pi/8.

        """
        # MDP parameters
        gamma = 0.97

        self._Mr = 0.3 * 2
        self._Mp = 2.55
        self._Ip = 2.6e-2
        self._Ir = 4.54e-4 * 2
        self._l = 13.8e-2
        self._r = 5.5e-2
        self._dt = 1e-2
        self._g = 9.81
        self._max_u = 5

        self._random = random_start

        # The upper bound must be positive: with a negative first entry the
        # symmetric box below would have low > high on the angle.
        high = np.array([np.pi / 2, 15, 75])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([-self._max_u]),
                                  high=np.array([self._max_u]))
        horizon = 300
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(5 * self._l, 5 * self._l)
        self._last_x = 0

        super().__init__(mdp_info)

    def reset(self, state=None):
        """
        Reset the environment.

        Args:
            state (array-like, None): three-element state to start from,
                with the angle first; if None, the angle is chosen according
                to ``random_start`` and the other two components are zero.

        Returns:
            The initial state.

        """
        if state is None:
            if self._random:
                angle = np.random.uniform(-np.pi / 2, np.pi / 2)
            else:
                angle = -np.pi / 8

            self._state = np.array([angle, 0., 0.])
        else:
            # Work on a private float copy so that normalizing the angle
            # does not overwrite the caller's array.
            self._state = np.array(state, dtype=float)
            self._state[0] = normalize_angle(self._state[0])

        self._last_x = 0

        return self._state

    def step(self, action):
        """
        Apply the bounded input for one time step of length ``dt``.

        Args:
            action: indexable whose first element is the input.

        Returns:
            The tuple (state, reward, absorbing, info). When the absolute
            angle exceeds pi/2 the state is absorbing with reward -10000;
            otherwise the reward is the negative quadratic cost of the state.

        """
        u = self._bound(action[0], -self._max_u, self._max_u)
        new_state = odeint(self._dynamics, self._state, [0, self._dt],
                           (u,))

        self._state = np.array(new_state[-1])
        self._state[0] = normalize_angle(self._state[0])

        if abs(self._state[0]) > np.pi / 2:
            absorbing = True
            reward = -10000
        else:
            absorbing = False
            Q = np.diag([3.0, 0.1, 0.1])

            x = self._state

            J = x.dot(Q).dot(x)

            reward = -J

        return self._state, reward, absorbing, {}

    def _dynamics(self, state, t, u):
        """
        Time derivative of the state, in the signature expected by ``odeint``.

        Args:
            state: current state; only the first two components (angle and
                its rate) are read;
            t: integration time (unused);
            u: input applied during the step.

        Returns:
            The list with the derivatives of the three state components.

        """
        alpha = state[0]
        d_alpha = state[1]

        h1 = (self._Mr + self._Mp) * (self._r**2) + self._Ir
        h2 = self._Mp * self._r * self._l * np.cos(alpha)
        h3 = self._l**2 * self._Mp + self._Ip

        omegaP = d_alpha

        dOmegaP = -(h2 * self._l * self._Mp * self._r * np.sin(alpha)
                    * omegaP**2 - self._g * h1 * self._l * self._Mp
                    * np.sin(alpha) + (h2 + h1) * u) / (h1 * h3 - h2**2)
        dOmegaR = (h3 * self._l * self._Mp * self._r * np.sin(alpha)
                   * omegaP**2 - self._g * h2 * self._l * self._Mp
                   * np.sin(alpha) + (h3 + h2) * u) / (h1 * h3 - h2**2)

        dx = list()
        dx.append(omegaP)
        dx.append(dOmegaP)
        dx.append(dOmegaR)

        return dx

    def render(self, mode='human'):
        """
        Draw the body and the wheel, wrapping horizontally in the window.

        """
        start = 2.5 * self._l * np.ones(2)
        end = 2.5 * self._l * np.ones(2)

        dx = -self._state[2] * self._r * self._dt

        self._last_x += dx

        # Keep the horizontal offset within half a window from the center.
        if self._last_x > 2.5 * self._l or self._last_x < -2.5 * self._l:
            self._last_x = (2.5 * self._l + self._last_x) \
                % (5 * self._l) - 2.5 * self._l

        start[0] += self._last_x
        end[0] += -2 * self._l * np.sin(self._state[0]) + self._last_x
        end[1] += 2 * self._l * np.cos(self._state[0])

        if (start[0] > 5 * self._l and end[0] > 5 * self._l) \
                or (start[0] < 0 and end[0] < 0):
            # Wrap by the window width; the parentheses are needed because
            # % and * have the same precedence and associate left to right.
            start[0] = start[0] % (5 * self._l)
            end[0] = end[0] % (5 * self._l)

        self._viewer.line(start, end)
        self._viewer.circle(start, self._r)

        self._viewer.display(self._dt)
class InvertedPendulum(Environment):
    """
    The Inverted Pendulum environment (continuous version) as presented in:
    "Reinforcement Learning In Continuous Time and Space". Doya K.. 2000.
    "Off-Policy Actor-Critic". Degris T. et al.. 2012.
    "Deterministic Policy Gradient Algorithms". Silver D. et al. 2014.

    """
    def __init__(self, random_start=False, m=1., l=1., g=9.8, mu=1e-2,
                 max_u=5., horizon=5000, gamma=.99):
        """
        Constructor.

        Args:
            random_start (bool, False): whether to start from a random
                position or from the horizontal one;
            m (float, 1.0): mass of the pendulum;
            l (float, 1.0): length of the pendulum;
            g (float, 9.8): gravity acceleration constant;
            mu (float, 1e-2): friction constant of the pendulum;
            max_u (float, 5.0): maximum allowed input torque;
            horizon (int, 5000): horizon of the problem;
            gamma (float, .99): discount factor.

        """
        # MDP parameters
        self._m = m
        self._l = l
        self._g = g
        self._mu = mu
        self._random = random_start
        self._dt = .01
        self._max_u = max_u
        self._max_omega = 5 / 2 * np.pi
        high = np.array([np.pi, self._max_omega])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Box(low=np.array([-max_u]),
                                  high=np.array([max_u]))
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(2.5 * l, 2.5 * l)
        self._last_u = None

        super().__init__(mdp_info)

    def reset(self, state=None):
        """
        Reset the environment.

        Args:
            state (array-like, None): ``[angle, angular velocity]`` to start
                from; if None, the angle is sampled uniformly in [-pi, pi]
                when ``random_start`` is set and is pi/2 otherwise, with zero
                angular velocity.

        Returns:
            The initial state.

        """
        if state is None:
            if self._random:
                angle = np.random.uniform(-np.pi, np.pi)
            else:
                angle = np.pi / 2

            self._state = np.array([angle, 0.])
        else:
            # Work on a private float copy so that normalizing the angle and
            # clipping the velocity do not overwrite the caller's array.
            self._state = np.array(state, dtype=float)
            self._state[0] = normalize_angle(self._state[0])
            self._state[1] = self._bound(self._state[1], -self._max_omega,
                                         self._max_omega)

        self._last_u = 0.0
        return self._state

    def step(self, action):
        """
        Apply the bounded torque for one time step of length ``dt``.

        Args:
            action: indexable whose first element is the torque.

        Returns:
            The tuple (state, reward, absorbing, info). The reward is the
            cosine of the angle and the state is never absorbing.

        """
        u = self._bound(action[0], -self._max_u, self._max_u)
        new_state = odeint(self._dynamics, self._state, [0, self._dt], (u,))

        self._state = np.array(new_state[-1])
        self._state[0] = normalize_angle(self._state[0])
        self._state[1] = self._bound(self._state[1], -self._max_omega,
                                     self._max_omega)

        reward = np.cos(self._state[0])

        self._last_u = u.item()

        return self._state, reward, False, {}

    def render(self, mode='human'):
        """
        Draw the pendulum and an arrow for the last applied torque.

        """
        start = 1.25 * self._l * np.ones(2)
        end = 1.25 * self._l * np.ones(2)

        end[0] += self._l * np.sin(self._state[0])
        end[1] += self._l * np.cos(self._state[0])

        self._viewer.line(start, end)
        self._viewer.circle(start, self._l / 40)
        self._viewer.circle(end, self._l / 20)
        self._viewer.torque_arrow(start, -self._last_u, self._max_u,
                                  self._l / 5)

        self._viewer.display(self._dt)

    def stop(self):
        """
        Close the viewer.

        """
        self._viewer.close()

    def _dynamics(self, state, t, u):
        """
        Time derivative of the state, in the signature expected by ``odeint``.

        Args:
            state: current ``[angle, angular velocity]``;
            t: integration time (unused);
            u: torque applied during the step.

        Returns:
            The tuple (d_theta, d_omega).

        """
        theta = state[0]
        # The velocity is clipped here as well, so that the integrator never
        # evaluates the dynamics beyond the velocity limit.
        omega = self._bound(state[1], -self._max_omega, self._max_omega)

        d_theta = omega
        d_omega = (-self._mu * omega + self._m * self._g * self._l * np.sin(
            theta) + u) / (self._m * self._l**2)

        return d_theta, d_omega
class RoomToyEnv(Environment):
    """
    Toy environment: a point agent moves with noisy steps inside a square
    room of side ``size`` and must reach a circular goal area.

    """
    def __init__(self, size=5., goal=(2.5, 2.5), goal_radius=0.6):
        """
        Constructor.

        Args:
            size (float, 5.): side length of the room;
            goal (array-like, (2.5, 2.5)): center of the goal area;
            goal_radius (float, 0.6): radius of the goal area.

        """
        # Save important environment information
        self._size = size
        self._goal = np.array(goal)
        self._goal_radius = goal_radius

        # Create the action space.
        action_space = Discrete(4)  # 4 actions: N, S, W, E

        # Create the observation space. It's a 2D box of dimension
        # (size x size). You can also specify low and high array, if every
        # component has different limits
        shape = (2,)
        observation_space = Box(0, size, shape)

        # Create the MDPInfo structure, needed by the environment interface
        mdp_info = MDPInfo(observation_space, action_space, gamma=0.99,
                           horizon=100)

        super().__init__(mdp_info)

        # Create a state class variable to store the current state
        self._state = None

        # Create the viewer
        self._viewer = Viewer(size, size)

    def reset(self, state=None):
        """
        Reset the environment.

        Args:
            state (array-like, None): position to start from; it must be
                strictly inside the room and outside the goal area. If None,
                a random valid position is sampled.

        Returns:
            The initial state.

        """
        if state is None:
            # Generate randomly a state inside the state space, but not
            # inside the goal
            self._state = np.random.rand(2) * self._size

            # Check if it's inside the goal radius and repeat the sample if
            # necessary
            while np.linalg.norm(self._state - self._goal) < self._goal_radius:
                self._state = np.random.rand(2) * self._size
        else:
            # Copy the provided state: step() updates the state in place and
            # must not modify the caller's array.
            state = np.array(state, dtype=float)

            # If an initial state is provided, set it and return, after
            # checking it's valid.
            assert np.all(state < self._size) and np.all(state > 0)
            assert np.linalg.norm(state - self._goal) > self._goal_radius
            self._state = state

        # Return the current state
        return self._state

    def step(self, action):
        """
        Move the agent by 0.1 in the selected direction plus gaussian noise.

        Args:
            action: 0 moves north, 1 south, 2 west, 3 east.

        Returns:
            The tuple (state, reward, absorbing, info). The reward is the
            negative distance from the goal and the state is absorbing when
            that distance is below the goal radius.

        Raises:
            ValueError: if the action is not one of the four valid actions.

        """
        # convert the action in a N, S, W, E movement
        movement = np.zeros(2)
        if action == 0:
            movement[1] += 0.1
        elif action == 1:
            movement[1] -= 0.1
        elif action == 2:
            movement[0] -= 0.1
        elif action == 3:
            movement[0] += 0.1
        else:
            # Must be raised: asserting on an exception instance always
            # succeeds and would let invalid actions through.
            raise ValueError('The environment has only 4 actions')

        # Apply the movement with some noise:
        self._state += movement + np.random.randn(2)*0.05

        # Clip the state space inside the boundaries.
        low = self.info.observation_space.low
        high = self.info.observation_space.high

        self._state = Environment._bound(self._state, low, high)

        # Compute distance form goal
        goal_distance = np.linalg.norm(self._state - self._goal)

        # Compute the reward as distance penalty from goal
        reward = -goal_distance

        # Set the absorbing flag if goal is reached
        absorbing = goal_distance < self._goal_radius

        # Return all the information + empty dictionary (used to pass
        # additional information)
        return self._state, reward, absorbing, {}

    def render(self):
        """
        Draw the agent and the goal area.

        """
        # Draw a red circle for the agent
        self._viewer.circle(self._state, 0.1, color=(255, 0, 0))

        # Draw a green circle for the goal
        self._viewer.circle(self._goal, self._goal_radius, color=(0, 255, 0))

        # Display the image for 0.1 seconds
        self._viewer.display(0.1)