import numpy as np
from scipy.integrate import odeint
from scipy.stats import norm

# NOTE: the imports below assume the MushroomRL package layout; adjust the
# paths if the surrounding project organizes Environment, MDPInfo, spaces,
# Viewer and normalize_angle differently. Both `spaces.Box` and the bare
# `Box`/`Discrete` aliases are imported because both styles appear in the
# classes below.
from mushroom_rl.core import Environment, MDPInfo
from mushroom_rl.utils import spaces
from mushroom_rl.utils.spaces import Box, Discrete
from mushroom_rl.utils.angles import normalize_angle
from mushroom_rl.utils.viewer import Viewer


class CarOnHill(Environment):
    """
    The Car On Hill environment as presented in:
    "Tree-Based Batch Mode Reinforcement Learning", Ernst D. et al., 2005.

    """
    def __init__(self, horizon=100, gamma=.95):
        """
        Constructor.

        Args:
            horizon (int, 100): horizon of the problem;
            gamma (float, .95): discount factor.

        """
        # MDP parameters
        self.max_pos = 1.
        self.max_velocity = 3.
        high = np.array([self.max_pos, self.max_velocity])
        self._g = 9.81
        self._m = 1.
        self._dt = .1
        self._discrete_actions = [-4., 4.]

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(2)
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(1, 1)

        super().__init__(mdp_info)

    def reset(self, state=None):
        if state is None:
            self._state = np.array([-0.5, 0])
        else:
            self._state = state

        return self._state

    def step(self, action):
        action = self._discrete_actions[action[0]]
        sa = np.append(self._state, action)
        new_state = odeint(self._dpds, sa, [0, self._dt])

        self._state = new_state[-1, :-1]

        if self._state[0] < -self.max_pos or \
                np.abs(self._state[1]) > self.max_velocity:
            reward = -1
            absorbing = True
        elif self._state[0] > self.max_pos and \
                np.abs(self._state[1]) <= self.max_velocity:
            reward = 1
            absorbing = True
        else:
            reward = 0
            absorbing = False

        return self._state, reward, absorbing, {}

    def render(self):
        # Slope
        self._viewer.function(0, 1, self._height)

        # Car
        car_body = [
            [-3e-2, 0],
            [-3e-2, 2e-2],
            [-2e-2, 2e-2],
            [-1e-2, 3e-2],
            [1e-2, 3e-2],
            [2e-2, 2e-2],
            [3e-2, 2e-2],
            [3e-2, 0]
        ]

        x_car = (self._state[0] + 1) / 2
        y_car = self._height(x_car)
        c_car = [x_car, y_car]
        angle = self._angle(x_car)
        self._viewer.polygon(c_car, angle, car_body, color=(32, 193, 54))

        self._viewer.display(self._dt)

    @staticmethod
    def _angle(x):
        if x < 0.5:
            m = 4 * x - 1
        else:
            m = 1 / ((20 * x ** 2 - 20 * x + 6) ** 1.5)

        return np.arctan(m)

    @staticmethod
    def _height(x):
        y_neg = 4 * x ** 2 - 2 * x
        y_pos = (2 * x - 1) / np.sqrt(5 * (2 * x - 1) ** 2 + 1)
        y = np.zeros_like(x)

        mask = x < .5
        neg_mask = np.logical_not(mask)
        y[mask] = y_neg[mask]
        y[neg_mask] = y_pos[neg_mask]

        y_norm = (y + 1) / 2

        return y_norm

    def _dpds(self, state_action, t):
        position = state_action[0]
        velocity = state_action[1]
        u = state_action[-1]

        if position < 0.:
            diff_hill = 2 * position + 1
            diff_2_hill = 2
        else:
            diff_hill = 1 / ((1 + 5 * position ** 2) ** 1.5)
            diff_2_hill = (-15 * position) / ((1 + 5 * position ** 2) ** 2.5)

        dp = velocity
        ds = (u - self._g * self._m * diff_hill - velocity ** 2 * self._m *
              diff_hill * diff_2_hill) / (self._m * (1 + diff_hill ** 2))

        return dp, ds, 0.
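
# ----------------------------------------------------------------------------
# Usage sketch (not part of the original module): a random-policy rollout on
# CarOnHill. The helper name `_demo_car_on_hill` and its arguments are
# illustrative; the loop only relies on the reset/step interface defined
# above, where a discrete action is passed as a one-element integer array,
# and on the `gamma`, `horizon` and `action_space.n` attributes of the
# MDPInfo built in the constructor.
# ----------------------------------------------------------------------------
def _demo_car_on_hill(n_episodes=3, render=False):
    mdp = CarOnHill()
    returns = []
    for _ in range(n_episodes):
        mdp.reset()
        absorbing = False
        episode_return = 0.
        discount = 1.
        steps = 0
        while not absorbing and steps < mdp.info.horizon:
            # Sample one of the two discrete actions (indices into [-4., 4.]).
            action = np.array([np.random.randint(mdp.info.action_space.n)])
            _, reward, absorbing, _ = mdp.step(action)
            episode_return += discount * reward
            discount *= mdp.info.gamma
            if render:
                mdp.render()
            steps += 1
        returns.append(episode_return)

    return returns
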
class ShipSteering(Environment):
    """
    The Ship Steering environment as presented in:
    "Hierarchical Policy Gradient Algorithms", Ghavamzadeh M. and
    Mahadevan S., 2003.

    """
    def __init__(self, small=True, n_steps_action=3):
        """
        Constructor.

        Args:
            small (bool, True): whether to use a small state space or not;
            n_steps_action (int, 3): number of integration intervals for each
                step of the MDP.

        """
        # MDP parameters
        self.field_size = 150 if small else 1000
        low = np.array([0, 0, -np.pi, -np.pi / 12.])
        high = np.array([self.field_size, self.field_size, np.pi,
                         np.pi / 12.])
        self.omega_max = np.array([np.pi / 12.])
        self._v = 3.
        self._T = 5.
        self._dt = .2
        self._gate_s = np.empty(2)
        self._gate_e = np.empty(2)
        self._gate_s[0] = 100 if small else 350
        self._gate_s[1] = 120 if small else 400
        self._gate_e[0] = 120 if small else 450
        self._gate_e[1] = 100 if small else 400
        self._out_reward = -100
        self._success_reward = 0
        self._small = small
        self._state = None
        self.n_steps_action = n_steps_action

        # MDP properties
        observation_space = spaces.Box(low=low, high=high)
        action_space = spaces.Box(low=-self.omega_max, high=self.omega_max)
        horizon = 5000
        gamma = .99
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(self.field_size, self.field_size,
                              background=(66, 131, 237))

        super().__init__(mdp_info)

    def reset(self, state=None):
        if state is None:
            if self._small:
                self._state = np.zeros(4)
                self._state[2] = np.pi / 2
            else:
                low = self.info.observation_space.low
                high = self.info.observation_space.high
                self._state = (high - low) * np.random.rand(4) + low
        else:
            self._state = state

        return self._state

    def step(self, action):
        r = self._bound(action[0], -self.omega_max, self.omega_max)
        new_state = self._state

        for _ in range(self.n_steps_action):
            state = new_state
            new_state = np.empty(4)
            new_state[0] = state[0] + self._v * np.cos(state[2]) * self._dt
            new_state[1] = state[1] + self._v * np.sin(state[2]) * self._dt
            new_state[2] = normalize_angle(state[2] + state[3] * self._dt)
            new_state[3] = state[3] + (r - state[3]) * self._dt / self._T

            if new_state[0] > self.field_size \
                    or new_state[1] > self.field_size \
                    or new_state[0] < 0 or new_state[1] < 0:
                new_state[0] = self._bound(new_state[0], 0, self.field_size)
                new_state[1] = self._bound(new_state[1], 0, self.field_size)

                reward = self._out_reward
                absorbing = True
                break
            elif self._through_gate(state[:2], new_state[:2]):
                reward = self._success_reward
                absorbing = True
                break
            else:
                reward = -1
                absorbing = False

        self._state = new_state

        return self._state, reward, absorbing, {}

    def render(self, mode='human'):
        self._viewer.line(self._gate_s, self._gate_e, width=3)

        boat = [[-4, -4], [-4, 4], [4, 4], [8, 0.0], [4, -4]]
        self._viewer.polygon(self._state[:2], self._state[2], boat,
                             color=(32, 193, 54))

        self._viewer.display(self._dt)

    def stop(self):
        self._viewer.close()

    def _through_gate(self, start, end):
        r = self._gate_e - self._gate_s
        s = end - start
        den = self._cross_2d(vecr=r, vecs=s)

        if den == 0:
            return False

        t = self._cross_2d((start - self._gate_s), s) / den
        u = self._cross_2d((start - self._gate_s), r) / den

        return 1 >= u >= 0 and 1 >= t >= 0

    @staticmethod
    def _cross_2d(vecr, vecs):
        return vecr[0] * vecs[1] - vecr[1] * vecs[0]
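
# ----------------------------------------------------------------------------
# Usage sketch (not part of the original module): drive ShipSteering with
# uniformly sampled turning rates and collect the visited states. The helper
# name `_demo_ship_steering` is illustrative; it only assumes the Box action
# space built above exposes its `low`/`high` bounds.
# ----------------------------------------------------------------------------
def _demo_ship_steering(n_steps=200, render=False):
    mdp = ShipSteering(small=True)
    state = mdp.reset()
    trajectory = [state.copy()]
    for _ in range(n_steps):
        # Sample a desired turning rate in [-omega_max, omega_max].
        action = np.random.uniform(mdp.info.action_space.low,
                                   mdp.info.action_space.high)
        state, reward, absorbing, _ = mdp.step(action)
        trajectory.append(state.copy())
        if render:
            mdp.render()
        if absorbing:
            state = mdp.reset()
    if render:
        mdp.stop()

    return trajectory
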
class PuddleWorld(Environment):
    """
    Puddle world as presented in:
    "Off-Policy Actor-Critic", Degris T. et al., 2012.

    """
    def __init__(self, start=None, goal=None, goal_threshold=.1,
                 noise_step=.025, noise_reward=0, reward_goal=0., thrust=.05,
                 puddle_center=None, puddle_width=None, gamma=.99,
                 horizon=5000):
        """
        Constructor.

        Args:
            start (np.array, None): starting position of the agent;
            goal (np.array, None): goal position;
            goal_threshold (float, .1): distance threshold of the agent from
                the goal to consider it reached;
            noise_step (float, .025): noise in actions;
            noise_reward (float, 0): standard deviation of gaussian noise in
                reward;
            reward_goal (float, 0): reward obtained reaching goal state;
            thrust (float, .05): distance walked during each action;
            puddle_center (np.array, None): center of the puddle;
            puddle_width (np.array, None): width of the puddle;
            gamma (float, .99): discount factor;
            horizon (int, 5000): horizon of the problem.

        """
        # MDP parameters
        self._start = np.array([.2, .4]) if start is None else start
        self._goal = np.array([1., 1.]) if goal is None else goal
        self._goal_threshold = goal_threshold
        self._noise_step = noise_step
        self._noise_reward = noise_reward
        self._reward_goal = reward_goal
        self._thrust = thrust
        puddle_center = [[.3, .6], [.4, .5], [.8, .9]] \
            if puddle_center is None else puddle_center
        self._puddle_center = [np.array(center) for center in puddle_center]
        puddle_width = [[.1, .03], [.03, .1], [.03, .1]] \
            if puddle_width is None else puddle_width
        self._puddle_width = [np.array(width) for width in puddle_width]

        self._actions = [np.zeros(2) for _ in range(5)]
        for i in range(4):
            self._actions[i][i // 2] = thrust * (i % 2 * 2 - 1)

        # MDP properties
        action_space = Discrete(5)
        observation_space = Box(0., 1., shape=(2,))
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._pixels = None
        self._viewer = Viewer(1.0, 1.0)

        super().__init__(mdp_info)

    def reset(self, state=None):
        if state is None:
            self._state = self._start.copy()
        else:
            self._state = state

        return self._state

    def step(self, action):
        idx = action[0]
        self._state += self._actions[idx] + np.random.uniform(
            low=-self._noise_step, high=self._noise_step, size=(2,))
        self._state = np.clip(self._state, 0., 1.)

        absorbing = np.linalg.norm((self._state - self._goal),
                                   ord=1) < self._goal_threshold

        if not absorbing:
            reward = np.random.randn() * self._noise_reward + self._get_reward(
                self._state)
        else:
            reward = self._reward_goal

        return self._state, reward, absorbing, {}

    def render(self):
        if self._pixels is None:
            img_size = 100
            pixels = np.zeros((img_size, img_size, 3))
            for i in range(img_size):
                for j in range(img_size):
                    x = i / img_size
                    y = j / img_size
                    pixels[i, img_size - 1 - j] = self._get_reward(
                        np.array([x, y]))

            pixels -= pixels.min()
            pixels *= 255. / pixels.max()
            self._pixels = np.floor(255 - pixels)

        self._viewer.background_image(self._pixels)
        self._viewer.circle(self._state, 0.01, color=(0, 255, 0))

        goal_area = [
            [-self._goal_threshold, 0],
            [0, self._goal_threshold],
            [self._goal_threshold, 0],
            [0, -self._goal_threshold]
        ]
        self._viewer.polygon(self._goal, 0, goal_area, color=(255, 0, 0),
                             width=1)

        self._viewer.display(0.1)

    def stop(self):
        if self._viewer is not None:
            self._viewer.close()

    def _get_reward(self, state):
        reward = -1.
        for cen, wid in zip(self._puddle_center, self._puddle_width):
            reward -= 2. * norm.pdf(state[0], cen[0], wid[0]) * norm.pdf(
                state[1], cen[1], wid[1])

        return reward
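
# ----------------------------------------------------------------------------
# Usage sketch (not part of the original module): estimate the undiscounted
# return of a random policy on PuddleWorld. The helper name
# `_demo_puddle_world` and the `max_steps` episode cap are illustrative and
# not part of the environment.
# ----------------------------------------------------------------------------
def _demo_puddle_world(n_episodes=5, max_steps=1000):
    mdp = PuddleWorld()
    total_rewards = []
    for _ in range(n_episodes):
        mdp.reset()
        total_reward = 0.
        for _ in range(max_steps):
            # The five discrete actions move the agent by -x, +x, -y, +y, or
            # keep it in place.
            action = np.array([np.random.randint(mdp.info.action_space.n)])
            _, reward, absorbing, _ = mdp.step(action)
            total_reward += reward
            if absorbing:
                break
        total_rewards.append(total_reward)

    return total_rewards
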