def __init__(self, mdp_info, height, width, start, goal):
    """
    Constructor.

    Args:
        mdp_info (MDPInfo): an object containing the info of the environment;
        height (int): height of the grid;
        width (int): width of the grid;
        start (tuple): x-y coordinates of the start;
        goal (tuple): x-y coordinates of the goal.

    """
    assert not np.array_equal(start, goal)

    assert goal[0] < height and goal[1] < width,\
        'Goal position not suitable for the grid world dimension.'

    self._state = None
    self._height = height
    self._width = width
    self._start = start
    self._goal = goal

    # Visualization
    self._viewer = Viewer(self._width, self._height, 500,
                          self._height * 500 // self._width)

    super().__init__(mdp_info)
def __init__(self, n_steps_action=3, viz_speed=100, small=False): self.__name__ = 'ShipSteeringMultiGate' self.n_steps_action = n_steps_action self.viz_speed = viz_speed # MDP parameters self.no_of_gates = 4 self.small = small self.field_size = 500 if small else 1000 low = np.array([0, 0, -np.pi, -np.pi / 12., 0]) high = np.array([ self.field_size, self.field_size, np.pi, np.pi / 12., self.no_of_gates ]) self.omega_max = np.array([np.pi / 12.]) self._v = 3. self._T = 5. self._dt = .2 gate_1s = np.array([75, 175]) if small else np.array([150, 350]) gate_1e = np.array([125, 175]) if small else np.array([250, 350]) gate_1 = np.array([gate_1s, gate_1e]) gate_2s = np.array([150, 300]) if small else np.array([300, 600]) gate_2e = np.array([200, 300]) if small else np.array([400, 600]) gate_2 = np.array([gate_2s, gate_2e]) gate_3s = np.array([250, 350]) if small else np.array([500, 700]) gate_3e = np.array([300, 350]) if small else np.array([600, 700]) gate_3 = np.array([gate_3s, gate_3e]) gate_4s = np.array([150, 425]) if small else np.array([300, 850]) gate_4e = np.array([200, 425]) if small else np.array([400, 850]) gate_4 = np.array([gate_4s, gate_4e]) self._gate_list = gate_1, gate_2, gate_3, gate_4 # MDP properties observation_space = spaces.Box(low=low, high=high) action_space = spaces.Box(low=-self.omega_max, high=self.omega_max) horizon = 5000 gamma = .99 self._out_reward = -10000 self.correct_order = False mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization self._viewer = Viewer(self.field_size, self.field_size, background=(66, 131, 237)) super(ShipSteeringMultiGate, self).__init__(mdp_info)
def __init__(self, start=None, goal=None, goal_threshold=.1, noise_step=.025,
             noise_reward=0, reward_goal=0., thrust=.05, puddle_center=None,
             puddle_width=None, gamma=.99, horizon=5000):
    """
    Constructor.

    Args:
        start (np.array, None): starting position of the agent;
        goal (np.array, None): goal position;
        goal_threshold (float, .1): distance threshold of the agent from the
            goal to consider it reached;
        noise_step (float, .025): noise in actions;
        noise_reward (float, 0): standard deviation of gaussian noise in reward;
        reward_goal (float, 0): reward obtained reaching goal state;
        thrust (float, .05): distance walked during each action;
        puddle_center (np.array, None): centers of the puddles;
        puddle_width (np.array, None): widths of the puddles;
        gamma (float, .99): discount factor;
        horizon (int, 5000): horizon of the problem.

    """
    # MDP parameters
    self._start = np.array([.2, .4]) if start is None else start
    self._goal = np.array([1., 1.]) if goal is None else goal
    self._goal_threshold = goal_threshold
    self._noise_step = noise_step
    self._noise_reward = noise_reward
    self._reward_goal = reward_goal
    self._thrust = thrust
    puddle_center = [[.3, .6], [.4, .5], [.8, .9]] if puddle_center is None \
        else puddle_center
    self._puddle_center = [np.array(center) for center in puddle_center]
    puddle_width = [[.1, .03], [.03, .1], [.03, .1]] if puddle_width is None \
        else puddle_width
    self._puddle_width = [np.array(width) for width in puddle_width]

    self._actions = [np.zeros(2) for _ in range(5)]
    for i in range(4):
        self._actions[i][i // 2] = thrust * (i % 2 * 2 - 1)

    # MDP properties
    action_space = Discrete(5)
    observation_space = Box(0., 1., shape=(2,))
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    # Visualization
    self._pixels = None
    self._viewer = Viewer(1.0, 1.0)

    super().__init__(mdp_info)
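# Standalone NumPy sketch (not part of the environment above, only an
# illustration) of how the loop in the PuddleWorld constructor builds its five
# actions: four axis-aligned moves of size `thrust` plus a final no-op.
import numpy as np

thrust = .05
actions = [np.zeros(2) for _ in range(5)]
for i in range(4):
    # i // 2 selects the axis (0 -> x, 1 -> y); i % 2 * 2 - 1 maps to -1 or +1
    actions[i][i // 2] = thrust * (i % 2 * 2 - 1)

print(actions)  # left, right, down, up moves of 0.05, and the zero no-op action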
def __init__(self, m=2., M=8., l=.5, g=9.8, mu=1e-2, max_u=50., noise_u=10.,
             horizon=3000, gamma=.95):
    """
    Constructor.

    Args:
        m (float, 2.0): mass of the pendulum;
        M (float, 8.0): mass of the cart;
        l (float, .5): length of the pendulum;
        g (float, 9.8): gravity acceleration constant;
        mu (float, 1e-2): friction constant of the pendulum;
        max_u (float, 50.): maximum allowed input torque;
        noise_u (float, 10.): maximum noise on the action;
        horizon (int, 3000): horizon of the problem;
        gamma (float, .95): discount factor.

    """
    # MDP parameters
    self._m = m
    self._M = M
    self._l = l
    self._g = g
    self._alpha = 1 / (self._m + self._M)
    self._mu = mu
    self._dt = .1
    self._max_u = max_u
    self._noise_u = noise_u
    high = np.array([np.inf, np.inf])

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Discrete(3)
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    # Visualization
    self._viewer = Viewer(2.5 * l, 2.5 * l)
    self._last_u = None
    self._state = None

    super().__init__(mdp_info)
def __init__(self, random_start=False, m=1., l=1., g=9.8, mu=1e-2,
             max_u=5., horizon=5000, gamma=.99):
    """
    Constructor.

    Args:
        random_start (bool, False): whether to start from a random position
            or from the horizontal one;
        m (float, 1.0): mass of the pendulum;
        l (float, 1.0): length of the pendulum;
        g (float, 9.8): gravity acceleration constant;
        mu (float, 1e-2): friction constant of the pendulum;
        max_u (float, 5.0): maximum allowed input torque;
        horizon (int, 5000): horizon of the problem;
        gamma (float, .99): discount factor.

    """
    # MDP parameters
    self._m = m
    self._l = l
    self._g = g
    self._mu = mu
    self._random = random_start
    self._dt = .01
    self._max_u = max_u
    self._max_omega = 5 / 2 * np.pi
    high = np.array([np.pi, self._max_omega])

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Box(low=np.array([-max_u]),
                              high=np.array([max_u]))
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    # Visualization
    self._viewer = Viewer(2.5 * l, 2.5 * l)
    self._last_u = None

    super().__init__(mdp_info)
def __init__(self, random_start=False, goal_distance=1.0):
    """
    Constructor.

    Args:
        random_start (bool, False): whether to start from a random position
            or from the horizontal one;
        goal_distance (float, 1.0): distance of the goal from the initial
            position of the Segway.

    """
    # MDP parameters
    gamma = 0.99

    self.Mr = 0.3 * 2
    self.Mp = 2.55
    self.Ip = 2.6e-2
    self.Ir = 4.54e-4 * 2
    self.l = 13.8e-2
    self.r = 5.5e-2
    self.dt = 1e-2
    self.g = 9.81
    self.max_u = 5

    self._random = random_start
    self._goal_distance = goal_distance

    high = np.array([2 * self._goal_distance, np.pi, 15, 75])

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Box(low=np.array([-self.max_u]),
                              high=np.array([self.max_u]))
    horizon = 1500
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    # Visualization
    env_width = 4 * goal_distance
    env_height = 2.5 * 2 * self.l
    width = 800
    height = int(width * env_height / env_width)
    self._viewer = Viewer(env_width, env_height, width, height)

    super(SegwayLinearMotion, self).__init__(mdp_info)
def __init__(self, small=True, n_steps_action=3): """ Constructor. Args: small (bool, True): whether to use a small state space or not. n_steps_action (int, 3): number of integration intervals for each step of the mdp. """ # MDP parameters self.field_size = 150 if small else 1000 low = np.array([0, 0, -np.pi, -np.pi / 12.]) high = np.array([self.field_size, self.field_size, np.pi, np.pi / 12.]) self.omega_max = np.array([np.pi / 12.]) self._v = 3. self._T = 5. self._dt = .2 self._gate_s = np.empty(2) self._gate_e = np.empty(2) self._gate_s[0] = 100 if small else 350 self._gate_s[1] = 120 if small else 400 self._gate_e[0] = 120 if small else 450 self._gate_e[1] = 100 if small else 400 self._out_reward = -100 self._success_reward = 0 self._small = small self._state = None self.n_steps_action = n_steps_action # MDP properties observation_space = spaces.Box(low=low, high=high) action_space = spaces.Box(low=-self.omega_max, high=self.omega_max) horizon = 5000 gamma = .99 mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization self._viewer = Viewer(self.field_size, self.field_size, background=(66, 131, 237)) super(ShipSteering, self).__init__(mdp_info)
def __init__(self, random_start=False): """ Constructor. Args: random_start: whether to start from a random position or from the horizontal one """ # MDP parameters gamma = 0.97 self._Mr = 0.3 * 2 self._Mp = 2.55 self._Ip = 2.6e-2 self._Ir = 4.54e-4 * 2 self._l = 13.8e-2 self._r = 5.5e-2 self._dt = 1e-2 self._g = 9.81 self._max_u = 5 self._random = random_start high = np.array([-np.pi / 2, 15, 75]) # MDP properties observation_space = spaces.Box(low=-high, high=high) action_space = spaces.Box(low=np.array([-self._max_u]), high=np.array([self._max_u])) horizon = 300 mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization self._viewer = Viewer(5 * self._l, 5 * self._l) self._last_x = 0 super(Segway, self).__init__(mdp_info)
def __init__(self, horizon=100, gamma=.95):
    """
    Constructor.

    Args:
        horizon (int, 100): horizon of the problem;
        gamma (float, .95): discount factor.

    """
    # MDP parameters
    self.max_pos = 1.
    self.max_velocity = 3.
    high = np.array([self.max_pos, self.max_velocity])
    self._g = 9.81
    self._m = 1.
    self._dt = .1
    self._discrete_actions = [-4., 4.]

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Discrete(2)
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    # Visualization
    self._viewer = Viewer(1, 1)

    super().__init__(mdp_info)
def __init__(self, m=2., M=8., l=.5, g=9.8, mu=1e-2, max_u=50., noise_u=10., horizon=3000): """ Constructor. Args: m (float, 2.0): mass of the pendulum; M (float, 8.0): mass of the cart; l (float, .5): length of the pendulum; g (float, 9.8): gravity acceleration constant; mu (float, 1e-2): friction constant of the pendulum; max_u (float, 50.): maximum allowed input torque; noise_u (float, 10.): maximum noise on the action; horizon (int, 3000): horizon of the problem. """ # MDP parameters self._m = m self._M = M self._l = l self._g = g self._alpha = 1 / (self._m + self._M) self._mu = mu self._dt = .1 self._max_u = max_u self._noise_u = noise_u high = np.array([np.inf, np.inf]) # MDP properties observation_space = spaces.Box(low=-high, high=high) action_space = spaces.Discrete(3) gamma = .95 mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization self._viewer = Viewer(2.5 * l, 2.5 * l) self._last_u = None super().__init__(mdp_info)
def __init__(self, random_start=False, m=1.0, l=1.0, g=9.8, mu=1e-2, max_u=5.0, horizon=5000): """ Constructor. Args: random_start (bool, False): whether to start from a random position or from the horizontal one; m (float, 1.0): mass of the pendulum; l (float, 1.0): length of the pendulum; g (float, 9.8): gravity acceleration constant; mu (float, 1e-2): friction constant of the pendulum; max_u (float, 5.0): maximum allowed input torque; horizon (int, 5000): horizon of the problem. """ # MDP parameters self._m = m self._l = l self._g = g self._mu = mu self._random = random_start self._dt = 0.01 self._max_u = max_u self._max_omega = 5 / 2 * np.pi high = np.array([np.pi, self._max_omega]) # MDP properties observation_space = spaces.Box(low=-high, high=high) action_space = spaces.Box(low=np.array([-max_u]), high=np.array([max_u])) gamma = .99 mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization self._viewer = Viewer(2.5 * l, 2.5 * l) self._last_u = None super().__init__(mdp_info)
class ShipSteering(Environment): """ The Ship Steering environment as presented in: "Hierarchical Policy Gradient Algorithms". Ghavamzadeh M. and Mahadevan S.. 2013. """ def __init__(self, small=True, n_steps_action=3): """ Constructor. Args: small (bool, True): whether to use a small state space or not. n_steps_action (int, 3): number of integration intervals for each step of the mdp. """ # MDP parameters self.field_size = 150 if small else 1000 low = np.array([0, 0, -np.pi, -np.pi / 12.]) high = np.array([self.field_size, self.field_size, np.pi, np.pi / 12.]) self.omega_max = np.array([np.pi / 12.]) self._v = 3. self._T = 5. self._dt = .2 self._gate_s = np.empty(2) self._gate_e = np.empty(2) self._gate_s[0] = 100 if small else 350 self._gate_s[1] = 120 if small else 400 self._gate_e[0] = 120 if small else 450 self._gate_e[1] = 100 if small else 400 self._out_reward = -100 self._success_reward = 0 self._small = small self._state = None self.n_steps_action = n_steps_action # MDP properties observation_space = spaces.Box(low=low, high=high) action_space = spaces.Box(low=-self.omega_max, high=self.omega_max) horizon = 5000 gamma = .99 mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization self._viewer = Viewer(self.field_size, self.field_size, background=(66, 131, 237)) super(ShipSteering, self).__init__(mdp_info) def reset(self, state=None): if state is None: if self._small: self._state = np.zeros(4) self._state[2] = np.pi / 2 else: low = self.info.observation_space.low high = self.info.observation_space.high self._state = (high - low) * np.random.rand(4) + low else: self._state = state return self._state def step(self, action): r = self._bound(action[0], -self.omega_max, self.omega_max) new_state = self._state for _ in range(self.n_steps_action): state = new_state new_state = np.empty(4) new_state[0] = state[0] + self._v * np.cos(state[2]) * self._dt new_state[1] = state[1] + self._v * np.sin(state[2]) * self._dt new_state[2] = normalize_angle(state[2] + state[3] * self._dt) new_state[3] = state[3] + (r - state[3]) * self._dt / self._T if new_state[0] > self.field_size \ or new_state[1] > self.field_size \ or new_state[0] < 0 or new_state[1] < 0: new_state[0] = self._bound(new_state[0], 0, self.field_size) new_state[1] = self._bound(new_state[1], 0, self.field_size) reward = self._out_reward absorbing = True break elif self._through_gate(state[:2], new_state[:2]): reward = self._success_reward absorbing = True break else: reward = -1 absorbing = False self._state = new_state return self._state, reward, absorbing, {} def render(self, mode='human'): self._viewer.line(self._gate_s, self._gate_e, width=3) boat = [[-4, -4], [-4, 4], [4, 4], [8, 0.0], [4, -4]] self._viewer.polygon(self._state[:2], self._state[2], boat, color=(32, 193, 54)) self._viewer.display(self._dt) def stop(self): self._viewer.close() def _through_gate(self, start, end): r = self._gate_e - self._gate_s s = end - start den = self._cross_2d(vecr=r, vecs=s) if den == 0: return False t = self._cross_2d((start - self._gate_s), s) / den u = self._cross_2d((start - self._gate_s), r) / den return 1 >= u >= 0 and 1 >= t >= 0 @staticmethod def _cross_2d(vecr, vecs): return vecr[0] * vecs[1] - vecr[1] * vecs[0]
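# Standalone NumPy sketch of the segment-intersection test used by
# _through_gate above: the gate and the ship displacement are parameterized
# as q + t*r and p + u*s, and a crossing exists iff both parameters lie in
# [0, 1]. The example points are arbitrary and only illustrative.
import numpy as np

def cross_2d(a, b):
    return a[0] * b[1] - a[1] * b[0]

def segments_intersect(start, end, gate_s, gate_e):
    r = gate_e - gate_s              # gate direction
    s = end - start                  # ship displacement over the step
    den = cross_2d(r, s)
    if den == 0:                     # parallel or degenerate: no crossing
        return False
    t = cross_2d(start - gate_s, s) / den   # parameter along the gate
    u = cross_2d(start - gate_s, r) / den   # parameter along the ship path
    return 0 <= t <= 1 and 0 <= u <= 1

print(segments_intersect(np.array([110., 90.]), np.array([110., 130.]),
                         np.array([100., 120.]), np.array([120., 100.])))  # True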
class CarOnHill(Environment): """ The Car On Hill environment as presented in: "Tree-Based Batch Mode Reinforcement Learning". Ernst D. et al.. 2005. """ def __init__(self, horizon=100, gamma=.95): """ Constructor. """ # MDP parameters self.max_pos = 1. self.max_velocity = 3. high = np.array([self.max_pos, self.max_velocity]) self._g = 9.81 self._m = 1. self._dt = .1 self._discrete_actions = [-4., 4.] # MDP properties observation_space = spaces.Box(low=-high, high=high) action_space = spaces.Discrete(2) mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization self._viewer = Viewer(1, 1) super().__init__(mdp_info) def reset(self, state=None): if state is None: self._state = np.array([-0.5, 0]) else: self._state = state return self._state def step(self, action): action = self._discrete_actions[action[0]] sa = np.append(self._state, action) new_state = odeint(self._dpds, sa, [0, self._dt]) self._state = new_state[-1, :-1] if self._state[0] < -self.max_pos or \ np.abs(self._state[1]) > self.max_velocity: reward = -1 absorbing = True elif self._state[0] > self.max_pos and \ np.abs(self._state[1]) <= self.max_velocity: reward = 1 absorbing = True else: reward = 0 absorbing = False return self._state, reward, absorbing, {} def render(self): # Slope self._viewer.function(0, 1, self._height) # Car car_body = [[-3e-2, 0], [-3e-2, 2e-2], [-2e-2, 2e-2], [-1e-2, 3e-2], [1e-2, 3e-2], [2e-2, 2e-2], [3e-2, 2e-2], [3e-2, 0]] x_car = (self._state[0] + 1) / 2 y_car = self._height(x_car) c_car = [x_car, y_car] angle = self._angle(x_car) self._viewer.polygon(c_car, angle, car_body, color=(32, 193, 54)) self._viewer.display(self._dt) @staticmethod def _angle(x): if x < 0.5: m = 4 * x - 1 else: m = 1 / ((20 * x**2 - 20 * x + 6)**1.5) return np.arctan(m) @staticmethod def _height(x): y_neg = 4 * x**2 - 2 * x y_pos = (2 * x - 1) / np.sqrt(5 * (2 * x - 1)**2 + 1) y = np.zeros_like(x) mask = x < .5 neg_mask = np.logical_not(mask) y[mask] = y_neg[mask] y[neg_mask] = y_pos[neg_mask] y_norm = (y + 1) / 2 return y_norm def _dpds(self, state_action, t): position = state_action[0] velocity = state_action[1] u = state_action[-1] if position < 0.: diff_hill = 2 * position + 1 diff_2_hill = 2 else: diff_hill = 1 / ((1 + 5 * position**2)**1.5) diff_2_hill = (-15 * position) / ((1 + 5 * position**2)**2.5) dp = velocity ds = (u - self._g * self._m * diff_hill - velocity**2 * self._m * diff_hill * diff_2_hill) / (self._m * (1 + diff_hill**2)) return dp, ds, 0.
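# Self-contained sketch of the one-step integration pattern used by
# CarOnHill.step above: the constant action is appended to the state, the ODE
# is integrated over [0, dt] with SciPy's odeint, and the last row (minus the
# action) becomes the new state. The dynamics mirror _dpds, with g, m and dt
# taken from the constructor defaults.
import numpy as np
from scipy.integrate import odeint

g, m, dt = 9.81, 1., .1

def dpds(state_action, t):
    position, velocity, u = state_action
    if position < 0.:
        diff_hill = 2 * position + 1
        diff_2_hill = 2
    else:
        diff_hill = 1 / ((1 + 5 * position**2)**1.5)
        diff_2_hill = (-15 * position) / ((1 + 5 * position**2)**2.5)
    dp = velocity
    ds = (u - g * m * diff_hill
          - velocity**2 * m * diff_hill * diff_2_hill) / (m * (1 + diff_hill**2))
    return dp, ds, 0.  # the appended action stays constant during the step

state, action = np.array([-0.5, 0.]), 4.
sa = np.append(state, action)
new_state = odeint(dpds, sa, [0, dt])[-1, :-1]
print(new_state)  # position and velocity after one 0.1 s step, roughly [-0.48, 0.4]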
class Segway(Environment): """ The Segway environment (continuous version) as presented in: "Deep Learning for Actor-Critic Reinforcement Learning". Xueli Jia. 2015. """ def __init__(self, random_start=False): """ Constructor. Args: random_start: whether to start from a random position or from the horizontal one """ # MDP parameters gamma = 0.97 self._Mr = 0.3 * 2 self._Mp = 2.55 self._Ip = 2.6e-2 self._Ir = 4.54e-4 * 2 self._l = 13.8e-2 self._r = 5.5e-2 self._dt = 1e-2 self._g = 9.81 self._max_u = 5 self._random = random_start high = np.array([-np.pi / 2, 15, 75]) # MDP properties observation_space = spaces.Box(low=-high, high=high) action_space = spaces.Box(low=np.array([-self._max_u]), high=np.array([self._max_u])) horizon = 300 mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization self._viewer = Viewer(5 * self._l, 5 * self._l) self._last_x = 0 super(Segway, self).__init__(mdp_info) def reset(self, state=None): if state is None: if self._random: angle = np.random.uniform(-np.pi / 2, np.pi / 2) else: angle = -np.pi / 8 self._state = np.array([angle, 0., 0.]) else: self._state = state self._state[0] = normalize_angle(self._state[0]) self._last_x = 0 return self._state def step(self, action): u = self._bound(action[0], -self._max_u, self._max_u) new_state = odeint(self._dynamics, self._state, [0, self._dt], (u, )) self._state = np.array(new_state[-1]) self._state[0] = normalize_angle(self._state[0]) if abs(self._state[0]) > np.pi / 2: absorbing = True reward = -10000 else: absorbing = False Q = np.diag([3.0, 0.1, 0.1]) x = self._state J = x.dot(Q).dot(x) reward = -J return self._state, reward, absorbing, {} def _dynamics(self, state, t, u): alpha = state[0] d_alpha = state[1] h1 = (self._Mr + self._Mp) * (self._r**2) + self._Ir h2 = self._Mp * self._r * self._l * np.cos(alpha) h3 = self._l**2 * self._Mp + self._Ip omegaP = d_alpha dOmegaP = -(h2*self._l*self._Mp*self._r*np.sin(alpha)*omegaP**2 - self._g*h1*self._l*self._Mp*np.sin(alpha) + (h2 + h1)*u)\ / (h1*h3-h2**2) dOmegaR = (h3*self._l*self._Mp*self._r*np.sin(alpha)*omegaP**2 - self._g*h2*self._l*self._Mp*np.sin(alpha) + (h3 + h2)*u)\ / (h1*h3-h2**2) dx = list() dx.append(omegaP) dx.append(dOmegaP) dx.append(dOmegaR) return dx def render(self, mode='human'): start = 2.5 * self._l * np.ones(2) end = 2.5 * self._l * np.ones(2) dx = -self._state[2] * self._r * self._dt self._last_x += dx if self._last_x > 2.5 * self._l or self._last_x < -2.5 * self._l: self._last_x = (2.5*self._l + self._last_x)%(5*self._l) \ - 2.5*self._l start[0] += self._last_x end[0] += -2 * self._l * np.sin(self._state[0]) + self._last_x end[1] += 2 * self._l * np.cos(self._state[0]) if (start[0] > 5 * self._l and end[0] > 5 * self._l) \ or (start[0] < 0 and end[0] < 0): start[0] = start[0] % 5 * self._l end[0] = end[0] % 5 * self._l self._viewer.line(start, end) self._viewer.circle(start, self._r) self._viewer.display(self._dt)
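# Small standalone check of the quadratic state cost used as the Segway step
# reward above (reward = -x^T Q x, weighting the tilt angle most heavily).
# The example state is arbitrary.
import numpy as np

Q = np.diag([3.0, 0.1, 0.1])
x = np.array([-np.pi / 8, 0.2, -1.0])   # [alpha, d_alpha, d_beta]
reward = -x.dot(Q).dot(x)
print(reward)   # ~ -0.57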
class SegwayLinearMotion(Environment): """ The Segway environment (continuous version) as presented in: "Deep Learning for Actor-Critic Reinforcement Learning". Xueli Jia. 2015. """ def __init__(self, random_start=False, goal_distance=1.0): """ Constructor. Args: random_start: whether to start from a random position or from the horizontal one """ # MDP parameters gamma = 0.99 self.Mr = 0.3 * 2 self.Mp = 2.55 self.Ip = 2.6e-2 self.Ir = 4.54e-4 * 2 self.l = 13.8e-2 self.r = 5.5e-2 self.dt = 1e-2 self.g = 9.81 self.max_u = 5 self._random = random_start self._goal_distance = goal_distance high = np.array([2 * self._goal_distance, np.pi, 15, 75]) # MDP properties observation_space = spaces.Box(low=-high, high=high) action_space = spaces.Box(low=np.array([-self.max_u]), high=np.array([self.max_u])) horizon = 1500 mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization env_width = 4 * goal_distance env_height = 2.5 * 2 * self.l width = 800 height = int(width * env_height / env_width) self._viewer = Viewer(env_width, env_height, width, height) super(SegwayLinearMotion, self).__init__(mdp_info) def reset(self, state=None): if state is None: if self._random: angle = np.random.uniform(-np.pi / 4, np.pi / 4) else: angle = -np.pi / 8 self._state = np.array([-self._goal_distance, angle, 0., 0.]) else: self._state = state self._state[1] = normalize_angle(self._state[1]) return self._state def step(self, action): u = np.maximum(-self.max_u, np.minimum(self.max_u, action[0])) new_state = odeint(self._dynamics, self._state, [0, self.dt], (u, )) self._state = np.array(new_state[-1]) self._state[1] = normalize_angle(self._state[1]) if abs(self._state[1]) > np.pi / 2 \ or abs(self._state[0]) > 2*self._goal_distance: absorbing = True reward = -10000 else: absorbing = False Q = np.diag([10.0, 3.0, 0.1, 0.1]) x = self._state J = x.dot(Q).dot(x) reward = -J return self._state, reward, absorbing, {} def _dynamics(self, state, t, u): position = state[0] alpha = state[1] d_alpha = state[2] d_beta = state[3] h1 = (self.Mr + self.Mp) * (self.r**2) + self.Ir h2 = self.Mp * self.r * self.l * np.cos(alpha) h3 = self.l**2 * self.Mp + self.Ip omegaP = d_alpha omegaR = d_beta velocity = -omegaR * self.r dOmegaP = -(h2*self.l*self.Mp*self.r*np.sin(alpha)*omegaP**2 - self.g*h1*self.l*self.Mp*np.sin(alpha) + (h2+h1)*u)\ / (h1*h3-h2**2) dOmegaR = (h3*self.l*self.Mp*self.r*np.sin(alpha)*omegaP**2 - self.g*h2*self.l*self.Mp*np.sin(alpha) + (h3+h2)*u)\ / (h1*h3-h2**2) dx = list() dx.append(velocity) dx.append(omegaP) dx.append(dOmegaP) dx.append(dOmegaR) return dx def render(self, mode='human'): start = np.array([2 * self._goal_distance, 1.25 * 2 * self.l]) end = np.array(start) goal = start + np.array([0, -self.r]) position = self._state[0] start[0] += position end[0] += -2 * self.l * np.sin(self._state[1]) + position end[1] += 2 * self.l * np.cos(self._state[1]) self._viewer.line(start, end) self._viewer.circle(start, self.r) self._viewer.circle(goal, radius=0.01, color=(255, 0, 0)) self._viewer.display(self.dt)
class AbstractGridWorld(Environment):
    """
    Abstract class to build a grid world.

    """
    def __init__(self, mdp_info, height, width, start, goal):
        """
        Constructor.

        Args:
            mdp_info (MDPInfo): an object containing the info of the
                environment;
            height (int): height of the grid;
            width (int): width of the grid;
            start (tuple): x-y coordinates of the start;
            goal (tuple): x-y coordinates of the goal.

        """
        assert not np.array_equal(start, goal)

        assert goal[0] < height and goal[1] < width,\
            'Goal position not suitable for the grid world dimension.'

        self._state = None
        self._height = height
        self._width = width
        self._start = start
        self._goal = goal

        # Visualization
        self._viewer = Viewer(self._width, self._height, 500,
                              self._height * 500 // self._width)

        super().__init__(mdp_info)

    def reset(self, state=None):
        if state is None:
            state = self.convert_to_int(self._start, self._width)

        self._state = state

        return self._state

    def step(self, action):
        state = self.convert_to_grid(self._state, self._width)

        new_state, reward, absorbing, info = self._step(state, action)
        self._state = self.convert_to_int(new_state, self._width)

        return self._state, reward, absorbing, info

    def render(self):
        for row in range(1, self._height):
            for col in range(1, self._width):
                self._viewer.line(np.array([col, 0]),
                                  np.array([col, self._height]))
                self._viewer.line(np.array([0, row]),
                                  np.array([self._width, row]))

        goal_center = np.array([.5 + self._goal[1],
                                self._height - (.5 + self._goal[0])])
        self._viewer.square(goal_center, 0, 1, (0, 255, 0))

        start_grid = self.convert_to_grid(self._start, self._width)
        start_center = np.array([.5 + start_grid[1],
                                 self._height - (.5 + start_grid[0])])
        self._viewer.square(start_center, 0, 1, (255, 0, 0))

        state_grid = self.convert_to_grid(self._state, self._width)
        state_center = np.array([.5 + state_grid[1],
                                 self._height - (.5 + state_grid[0])])
        self._viewer.circle(state_center, .4, (0, 0, 255))

        self._viewer.display(.1)

    def _step(self, state, action):
        raise NotImplementedError('AbstractGridWorld is an abstract class.')

    @staticmethod
    def convert_to_grid(state, width):
        return np.array([state[0] // width, state[0] % width])

    @staticmethod
    def convert_to_int(state, width):
        return np.array([state[0] * width + state[1]])
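# Standalone NumPy check of the state encodings used by AbstractGridWorld
# above: convert_to_int flattens (row, col) into a single row-major index and
# convert_to_grid inverts it. The functions are copied from the static methods
# above; the width and cell are just example values.
import numpy as np

def convert_to_grid(state, width):
    return np.array([state[0] // width, state[0] % width])

def convert_to_int(state, width):
    return np.array([state[0] * width + state[1]])

width = 4
cell = np.array([2, 3])                      # row 2, column 3
index = convert_to_int(cell, width)          # -> [11]
print(index, convert_to_grid(index, width))  # [11] [2 3]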
def __init__(self): self._rotation_radius = 0.6 self._catch_radius = 0.4 self._v_prey = 0.13 self._v_predator = 0.1 self._dt = 0.1 self._omega_prey = self._v_prey / self._rotation_radius self._omega_predator = self._v_predator / self._rotation_radius self._max_x = 5.0 self._max_y = 5.0 self._obstacles = [ (np.array([self._max_x/5, self._max_y - 3.8*self._catch_radius]), np.array([self._max_x, self._max_y - 3.8*self._catch_radius])), (np.array([-3/5*self._max_x, self._max_y/4]), np.array([-3/5*self._max_x, -3/10*self._max_y])), (np.array([-3/5*self._max_x + 3.8*self._catch_radius, self._max_y / 4]), np.array([-3/5*self._max_x + 3.8*self._catch_radius, -3/10*self._max_y])), (np.array([-3/5*self._max_x, self._max_y/4]), np.array([-3/5*self._max_x + 3.8*self._catch_radius, self._max_y/4])) ] # Add bounds of the map self._obstacles += [(np.array([-self._max_x, -self._max_y]), np.array([-self._max_x, self._max_y])), (np.array([-self._max_x, -self._max_y]), np.array([self._max_x, -self._max_y])), (np.array([self._max_x, self._max_y]), np.array([-self._max_x, self._max_y])), (np.array([self._max_x, self._max_y]), np.array([self._max_x, -self._max_y])) ] high = np.array([self._max_x, self._max_y, np.pi, self._max_x, self._max_y, np.pi]) # MDP properties horizon = 500 gamma = 0.99 observation_space = spaces.Box(low=-high, high=high) action_space = spaces.Box(low=np.array([0, -self._omega_predator]), high=np.array([self._v_predator, self._omega_predator])) mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization width = 500 height = int(width * self._max_y / self._max_x) self._viewer = Viewer(2*self._max_x, 2*self._max_y, width, height) super(PreyPredator, self).__init__(mdp_info)
class PreyPredator(Environment): """ A prey-predator environment environment. A Predator must catch a faster prey in an environment with obstacles. """ def __init__(self): self._rotation_radius = 0.6 self._catch_radius = 0.4 self._v_prey = 0.13 self._v_predator = 0.1 self._dt = 0.1 self._omega_prey = self._v_prey / self._rotation_radius self._omega_predator = self._v_predator / self._rotation_radius self._max_x = 5.0 self._max_y = 5.0 self._obstacles = [ (np.array([self._max_x/5, self._max_y - 3.8*self._catch_radius]), np.array([self._max_x, self._max_y - 3.8*self._catch_radius])), (np.array([-3/5*self._max_x, self._max_y/4]), np.array([-3/5*self._max_x, -3/10*self._max_y])), (np.array([-3/5*self._max_x + 3.8*self._catch_radius, self._max_y / 4]), np.array([-3/5*self._max_x + 3.8*self._catch_radius, -3/10*self._max_y])), (np.array([-3/5*self._max_x, self._max_y/4]), np.array([-3/5*self._max_x + 3.8*self._catch_radius, self._max_y/4])) ] # Add bounds of the map self._obstacles += [(np.array([-self._max_x, -self._max_y]), np.array([-self._max_x, self._max_y])), (np.array([-self._max_x, -self._max_y]), np.array([self._max_x, -self._max_y])), (np.array([self._max_x, self._max_y]), np.array([-self._max_x, self._max_y])), (np.array([self._max_x, self._max_y]), np.array([self._max_x, -self._max_y])) ] high = np.array([self._max_x, self._max_y, np.pi, self._max_x, self._max_y, np.pi]) # MDP properties horizon = 500 gamma = 0.99 observation_space = spaces.Box(low=-high, high=high) action_space = spaces.Box(low=np.array([0, -self._omega_predator]), high=np.array([self._v_predator, self._omega_predator])) mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization width = 500 height = int(width * self._max_y / self._max_x) self._viewer = Viewer(2*self._max_x, 2*self._max_y, width, height) super(PreyPredator, self).__init__(mdp_info) def reset(self, state=None): if state is None: self._state = np.array([0., 0., 0., self._max_x/2, self._max_y/2, np.pi/2]) self._state = np.array([3., 1., np.pi/2, 3., 2., np.pi/2]) ok = False while not ok: self._state = np.random.uniform( low=self.info.observation_space.low, high=self.info.observation_space.high) delta_norm = np.linalg.norm(self._state[:2] - self._state[3:5]) ok = delta_norm > self._catch_radius else: self._state = state self._state[2] = normalize_angle(self._state[2]) self._state[5] = normalize_angle(self._state[5]) return self._state def step(self, action): # compute new predator state u = self._bound(action, self.info.action_space.low, self.info.action_space.high) state_predator = self._state[:3] state_predator = self._differential_drive_dynamics(state_predator, u) # Compute new prey state u_prey = self._prey_controller(self._state) state_prey = self._state[3:] state_prey = self._differential_drive_dynamics(state_prey, u_prey) # Update state self._state = np.concatenate([state_predator, state_prey], 0) delta_norm_new = np.linalg.norm(self._state[:2]-self._state[3:5]) if delta_norm_new < self._catch_radius: collision, _ = self._check_collision(self._state[:2], self._state[3:5]) if collision is None: absorbing = True else: absorbing = False else: absorbing = False reward = -delta_norm_new return self._state, reward, absorbing, {} def _prey_controller(self, state): delta_norm = np.linalg.norm(state[:2] - state[3:5]) if delta_norm > 3.0: velocity_prey = 0 elif delta_norm > 1.5: velocity_prey = self._v_prey / 2 else: velocity_prey = self._v_prey attack_angle = normalize_angle(np.arctan2(state[4] - state[1], state[3] - state[0])) 
angle_current = shortest_angular_distance(state[5], attack_angle) if velocity_prey > 0: # check attack angle collision cos_theta = np.cos(attack_angle) sin_theta = np.sin(attack_angle) increment = 2.5*self._rotation_radius*np.array([cos_theta, sin_theta]) collision, i = self._check_collision(state[3:5], state[3:5]+increment) if collision is not None: obstacle = self._obstacles[i] v_obst = self._segment_to_vector(*obstacle) v_attack = state[3:5] - state[0:2] angle = self._vector_angle(v_obst, v_attack) if 0 <= angle <= np.pi/2 or angle <= -np.pi/2: rotation_sign = +1 else: rotation_sign = -1 evasion_angle = attack_angle + rotation_sign * np.pi/2 angle_current = shortest_angular_distance(state[5], evasion_angle) alpha = normalize_angle(state[5] + rotation_sign*np.pi/2) cos_alpha = np.cos(alpha) sin_alpha = np.sin(alpha) increment = 1.5*self._rotation_radius * np.array( [cos_alpha, sin_alpha]) lateral_collision, _ = self._check_collision(state[3:5], state[3:5] + increment) if lateral_collision is not None: rotation_sign *= -1 angle_current = rotation_sign * np.pi omega_prey = angle_current / np.pi u_prey = np.empty(2) u_prey[0] = self._bound(velocity_prey, 0, self._v_prey) u_prey[1] = self._bound(omega_prey, -self._omega_prey, self._omega_prey) return u_prey @staticmethod def _cross_2d(vecr, vecs): return vecr[0] * vecs[1] - vecr[1] * vecs[0] @staticmethod def _segment_to_vector(*segment): a = segment[0] b = segment[1] if (b[0] > a[0]) or (b[0] == a[0] and b[1] > a[1]): return b - a else: return a - b @staticmethod def _vector_angle(x, y): angle = np.arctan2(x[1], x[0]) - np.arctan2(y[1], y[0]) return normalize_angle(angle) def _check_collision(self, start, end): collision = None index = None min_u = np.inf for i, obstacle in enumerate(self._obstacles): r = obstacle[1] - obstacle[0] s = end - start den = self._cross_2d(vecr=r, vecs=s) if den != 0: t = self._cross_2d((start - obstacle[0]), s) / den u = self._cross_2d((start - obstacle[0]), r) / den if 1 >= u >= 0 and 1 >= t >= 0: if u < min_u: collision = start + (u-1e-2)*s min_u = u index = i return collision, index def _differential_drive_dynamics(self, state, u): delta = np.empty(3) delta[0] = np.cos(state[2]) * u[0] delta[1] = np.sin(state[2]) * u[0] delta[2] = u[1] new_state = state + delta collision, _ = self._check_collision(state[:2], new_state[:2]) if collision is not None: new_state[:2] = collision new_state[0] = self._bound(new_state[0], -self._max_x, self._max_x) new_state[1] = self._bound(new_state[1], -self._max_y, self._max_y) new_state[2] = normalize_angle(new_state[2]) return new_state def render(self, mode='human'): center = np.array([self._max_x, self._max_y]) predator_pos = self._state[:2] predator_theta = self._state[2] prey_pos = self._state[3:5] prey_theta = self._state[5] # Predator self._viewer.circle(center + predator_pos, self._catch_radius, (255, 255, 255)) self._viewer.arrow_head(center + predator_pos, self._catch_radius, predator_theta, (255, 0, 0)) # Prey self._viewer.arrow_head(center + prey_pos, self._catch_radius, prey_theta, (0, 0, 255)) # Obstacles for obstacle in self._obstacles: start = obstacle[0] end = obstacle[1] self._viewer.line(center + start, center + end) self._viewer.display(self._dt)
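# Minimal NumPy sketch of the differential-drive update used by
# _differential_drive_dynamics above, without the collision handling and
# field bounds: the pose (x, y, theta) is advanced by a forward displacement
# and a heading increment. normalize_angle here is a simple stand-in for the
# helper used by the environment.
import numpy as np

def normalize_angle(angle):
    # wrap to (-pi, pi]
    return np.arctan2(np.sin(angle), np.cos(angle))

def differential_drive(state, u):
    x, y, theta = state
    v, omega = u
    return np.array([x + np.cos(theta) * v,
                     y + np.sin(theta) * v,
                     normalize_angle(theta + omega)])

pose = np.array([0., 0., np.pi / 2])          # facing "up"
print(differential_drive(pose, [0.1, 0.05]))  # ~ [0.0, 0.1, 1.62]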
class PuddleWorld(Environment): """ Puddle world as presented in: "Off-Policy Actor-Critic". Degris T. et al.. 2012. """ def __init__(self, start=None, goal=None, goal_threshold=.1, noise_step=.025, noise_reward=0, reward_goal=0., thrust=.05, puddle_center=None, puddle_width=None, gamma=.99, horizon=5000): """ Constructor. Args: start (np.array, None): starting position of the agent; goal (np.array, None): goal position; goal_threshold (float, .1): distance threshold of the agent from the goal to consider it reached; noise_step (float, .025): noise in actions; noise_reward (float, 0): standard deviation of gaussian noise in reward; reward_goal (float, 0): reward obtained reaching goal state; thrust (float, .05): distance walked during each action; puddle_center (np.array, None): center of the puddle; puddle_width (np.array, None): width of the puddle; """ # MDP parameters self._start = np.array([.2, .4]) if start is None else start self._goal = np.array([1., 1.]) if goal is None else goal self._goal_threshold = goal_threshold self._noise_step = noise_step self._noise_reward = noise_reward self._reward_goal = reward_goal self._thrust = thrust puddle_center = [[.3, .6], [.4, .5], [.8, .9] ] if puddle_center is None else puddle_center self._puddle_center = [np.array(center) for center in puddle_center] puddle_width = [[.1, .03], [.03, .1], [.03, .1] ] if puddle_width is None else puddle_width self._puddle_width = [np.array(width) for width in puddle_width] self._actions = [np.zeros(2) for _ in range(5)] for i in range(4): self._actions[i][i // 2] = thrust * (i % 2 * 2 - 1) # MDP properties action_space = Discrete(5) observation_space = Box(0., 1., shape=(2, )) mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization self._pixels = None self._viewer = Viewer(1.0, 1.0) super().__init__(mdp_info) def reset(self, state=None): if state is None: self._state = self._start.copy() else: self._state = state return self._state def step(self, action): idx = action[0] self._state += self._actions[idx] + np.random.uniform( low=-self._noise_step, high=self._noise_step, size=(2, )) self._state = np.clip(self._state, 0., 1.) absorbing = np.linalg.norm( (self._state - self._goal), ord=1) < self._goal_threshold if not absorbing: reward = np.random.randn() * self._noise_reward + self._get_reward( self._state) else: reward = self._reward_goal return self._state, reward, absorbing, {} def render(self): if self._pixels is None: img_size = 100 pixels = np.zeros((img_size, img_size, 3)) for i in range(img_size): for j in range(img_size): x = i / img_size y = j / img_size pixels[i, img_size - 1 - j] = self._get_reward( np.array([x, y])) pixels -= pixels.min() pixels *= 255. / pixels.max() self._pixels = np.floor(255 - pixels) self._viewer.background_image(self._pixels) self._viewer.circle(self._state, 0.01, color=(0, 255, 0)) goal_area = [[-self._goal_threshold, 0], [0, self._goal_threshold], [self._goal_threshold, 0], [0, -self._goal_threshold]] self._viewer.polygon(self._goal, 0, goal_area, color=(255, 0, 0), width=1) self._viewer.display(0.1) def stop(self): if self._viewer is not None: self._viewer.close() def _get_reward(self, state): reward = -1. for cen, wid in zip(self._puddle_center, self._puddle_width): reward -= 2. * norm.pdf(state[0], cen[0], wid[0]) * norm.pdf( state[1], cen[1], wid[1]) return reward
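# Self-contained sketch of the PuddleWorld reward shaping above: a base cost
# of -1 per step, made worse near each puddle by a product of Gaussian
# densities. Centers and widths are the defaults from the constructor above;
# the probed states are arbitrary.
import numpy as np
from scipy.stats import norm

puddle_center = [np.array(c) for c in [[.3, .6], [.4, .5], [.8, .9]]]
puddle_width = [np.array(w) for w in [[.1, .03], [.03, .1], [.03, .1]]]

def puddle_reward(state):
    reward = -1.
    for cen, wid in zip(puddle_center, puddle_width):
        reward -= 2. * norm.pdf(state[0], cen[0], wid[0]) * \
            norm.pdf(state[1], cen[1], wid[1])
    return reward

print(puddle_reward(np.array([.3, .6])))   # deep inside the first puddle, ~ -107
print(puddle_reward(np.array([.95, .2])))  # far from every puddle, ~ -1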
class InvertedPendulumDiscrete(Environment): """ The Inverted Pendulum environment as presented in: "Least-Squares Policy Iteration". Lagoudakis M. G. and Parr R.. 2003. """ def __init__(self, m=2., M=8., l=.5, g=9.8, mu=1e-2, max_u=50., noise_u=10., horizon=3000): """ Constructor. Args: m (float, 2.0): mass of the pendulum; M (float, 8.0): mass of the cart; l (float, .5): length of the pendulum; g (float, 9.8): gravity acceleration constant; mu (float, 1e-2): friction constant of the pendulum; max_u (float, 50.): maximum allowed input torque; noise_u (float, 10.): maximum noise on the action; horizon (int, 3000): horizon of the problem. """ # MDP parameters self._m = m self._M = M self._l = l self._g = g self._alpha = 1 / (self._m + self._M) self._mu = mu self._dt = .1 self._max_u = max_u self._noise_u = noise_u high = np.array([np.inf, np.inf]) # MDP properties observation_space = spaces.Box(low=-high, high=high) action_space = spaces.Discrete(3) gamma = .95 mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization self._viewer = Viewer(2.5 * l, 2.5 * l) self._last_u = None super().__init__(mdp_info) def reset(self, state=None): if state is None: angle = np.random.uniform(-np.pi / 8., np.pi / 8.) self._state = np.array([angle, 0.]) else: self._state = state self._state[0] = normalize_angle(self._state[0]) return self._state def step(self, action): if action == 0: u = -self._max_u elif action == 1: u = 0. else: u = self._max_u u += np.random.uniform(-self._noise_u, self._noise_u) new_state = odeint(self._dynamics, self._state, [0, self._dt], (u,)) self._state = np.array(new_state[-1]) self._state[0] = normalize_angle(self._state[0]) if np.abs(self._state[0]) > np.pi * .5: reward = -1. absorbing = True else: reward = 0. absorbing = False self._last_u = u return self._state, reward, absorbing, {} def render(self, mode='human'): start = 1.25 * self._l * np.ones(2) end = 1.25 * self._l * np.ones(2) end[0] += self._l * np.sin(self._state[0]) end[1] += self._l * np.cos(self._state[0]) self._viewer.line(start, end) self._viewer.circle(start, self._l / 40) self._viewer.circle(end, self._l / 20) self._viewer.torque_arrow(start, -self._last_u, self._max_u, self._l / 5) self._viewer.display(self._dt) def stop(self): self._viewer.close() def _dynamics(self, state, t, u): theta = state[0] omega = state[1] d_theta = omega d_omega = (self._g * np.sin(theta) - self._alpha * self._m * self._l * d_theta ** 2 * np.sin(2 * theta) * .5 - self._alpha * np.cos( theta) * u) / (4 / 3 * self._l - self._alpha * self._m * self._l * np.cos(theta) ** 2) return d_theta, d_omega
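# Small standalone sketch of how InvertedPendulumDiscrete.step above maps the
# three discrete actions to a torque and then perturbs it with uniform noise.
# max_u and noise_u are the constructor defaults.
import numpy as np

max_u, noise_u = 50., 10.

def action_to_torque(action):
    u = (-max_u, 0., max_u)[action]
    return u + np.random.uniform(-noise_u, noise_u)

print([action_to_torque(a) for a in (0, 1, 2)])  # e.g. [-47.3, 5.1, 41.8]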
class InvertedPendulum(Environment): """ The Inverted Pendulum environment (continuous version) as presented in: "Reinforcement Learning In Continuous Time and Space". Doya K.. 2000. "Off-Policy Actor-Critic". Degris T. et al.. 2012. "Deterministic Policy Gradient Algorithms". Silver D. et al. 2014. """ def __init__(self, random_start=False, m=1.0, l=1.0, g=9.8, mu=1e-2, max_u=5.0, horizon=5000): """ Constructor. Args: random_start (bool, False): whether to start from a random position or from the horizontal one; m (float, 1.0): mass of the pendulum; l (float, 1.0): length of the pendulum; g (float, 9.8): gravity acceleration constant; mu (float, 1e-2): friction constant of the pendulum; max_u (float, 5.0): maximum allowed input torque; horizon (int, 5000): horizon of the problem. """ # MDP parameters self._m = m self._l = l self._g = g self._mu = mu self._random = random_start self._dt = 0.01 self._max_u = max_u self._max_omega = 5 / 2 * np.pi high = np.array([np.pi, self._max_omega]) # MDP properties observation_space = spaces.Box(low=-high, high=high) action_space = spaces.Box(low=np.array([-max_u]), high=np.array([max_u])) gamma = .99 mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization self._viewer = Viewer(2.5 * l, 2.5 * l) self._last_u = None super().__init__(mdp_info) def reset(self, state=None): if state is None: if self._random: angle = np.random.uniform(-np.pi, np.pi) else: angle = np.pi / 2 self._state = np.array([angle, 0.]) else: self._state = state self._state[0] = normalize_angle(self._state[0]) self._state[1] = self._bound(self._state[1], -self._max_omega, self._max_omega) return self._state def step(self, action): u = self._bound(action[0], -self._max_u, self._max_u) new_state = odeint(self._dynamics, self._state, [0, self._dt], (u,)) self._state = np.array(new_state[-1]) self._state[0] = normalize_angle(self._state[0]) self._state[1] = self._bound(self._state[1], -self._max_omega, self._max_omega) reward = np.cos(self._state[0]) self._last_u = u return self._state, reward, False, {} def render(self, mode='human'): start = 1.25 * self._l * np.ones(2) end = 1.25 * self._l * np.ones(2) end[0] += self._l * np.sin(self._state[0]) end[1] += self._l * np.cos(self._state[0]) self._viewer.line(start, end) self._viewer.circle(start, self._l / 40) self._viewer.circle(end, self._l / 20) self._viewer.torque_arrow(start, -self._last_u, self._max_u, self._l / 5) self._viewer.display(self._dt) def stop(self): self._viewer.close() def _dynamics(self, state, t, u): theta = state[0] omega = self._bound(state[1], -self._max_omega, self._max_omega) d_theta = omega d_omega = (-self._mu * omega + self._m * self._g * self._l * np.sin( theta) + u) / (self._m * self._l**2) return d_theta, d_omega
class ShipSteeringMultiGate(Environment): """ The Ship Steering environment as presented in: "Hierarchical Policy Gradient Algorithms". Ghavamzadeh M. and Mahadevan S.. 2013 with multiple gates. """ def __init__(self, n_steps_action=3, viz_speed=100, small=False): self.__name__ = 'ShipSteeringMultiGate' self.n_steps_action = n_steps_action self.viz_speed = viz_speed # MDP parameters self.no_of_gates = 4 self.small = small self.field_size = 500 if small else 1000 low = np.array([0, 0, -np.pi, -np.pi / 12., 0]) high = np.array([ self.field_size, self.field_size, np.pi, np.pi / 12., self.no_of_gates ]) self.omega_max = np.array([np.pi / 12.]) self._v = 3. self._T = 5. self._dt = .2 gate_1s = np.array([75, 175]) if small else np.array([150, 350]) gate_1e = np.array([125, 175]) if small else np.array([250, 350]) gate_1 = np.array([gate_1s, gate_1e]) gate_2s = np.array([150, 300]) if small else np.array([300, 600]) gate_2e = np.array([200, 300]) if small else np.array([400, 600]) gate_2 = np.array([gate_2s, gate_2e]) gate_3s = np.array([250, 350]) if small else np.array([500, 700]) gate_3e = np.array([300, 350]) if small else np.array([600, 700]) gate_3 = np.array([gate_3s, gate_3e]) gate_4s = np.array([150, 425]) if small else np.array([300, 850]) gate_4e = np.array([200, 425]) if small else np.array([400, 850]) gate_4 = np.array([gate_4s, gate_4e]) self._gate_list = gate_1, gate_2, gate_3, gate_4 # MDP properties observation_space = spaces.Box(low=low, high=high) action_space = spaces.Box(low=-self.omega_max, high=self.omega_max) horizon = 5000 gamma = .99 self._out_reward = -10000 self.correct_order = False mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) # Visualization self._viewer = Viewer(self.field_size, self.field_size, background=(66, 131, 237)) super(ShipSteeringMultiGate, self).__init__(mdp_info) def reset(self, state=None): if state is None: self._state = np.zeros(8) #self._state[0] = 500 #self._state[1] = 500 else: self._state = state return self._state def step(self, action): r = np.maximum(-self.omega_max, np.minimum(self.omega_max, action[0])) new_state = self._state for _ in range(self.n_steps_action): state = new_state new_state = np.empty(8) new_state[0] = state[0] + self._v * np.cos(state[2]) * self._dt new_state[1] = state[1] + self._v * np.sin(state[2]) * self._dt new_state[2] = normalize_angle(state[2] + state[3] * self._dt) new_state[3] = state[3] + (r - state[3]) * self._dt / self._T new_state[4:] = state[4:] absorbing = False reward = 0 if new_state[0] > self.field_size \ or new_state[1] > self.field_size \ or new_state[0] < 0 or new_state[1] < 0: reward = self._out_reward absorbing = True break else: for i, gate in enumerate(self._gate_list): if self._through_gate(state[:2], new_state[:2], gate): new_state[4 + i] += 1 if new_state[4 + i] == 1: reward = 10 if np.all(new_state[5:] > 0): absorbing = True break self._state = new_state return self._state, reward, absorbing, {} def _through_gate(self, start, end, gate): gate_e = gate[1] gate_s = gate[0] r = gate_e - gate_s s = end - start den = self._cross_2d(vecr=r, vecs=s) if den == 0: return False t = self._cross_2d((start - gate_s), s) / den u = self._cross_2d((start - gate_s), r) / den return 1 >= u >= 0 and 1 >= t >= 0 @staticmethod def _cross_2d(vecr, vecs): return vecr[0] * vecs[1] - vecr[1] * vecs[0] def render(self, mode='human'): for gate in self._gate_list: gate_s = gate[0] gate_e = gate[1] self._viewer.line(gate_s, gate_e, width=3) boat = [[-4, -4], [-4, 4], [4, 4], [8, 0.0], [4, -4]] 
self._viewer.polygon(self._state[:2], self._state[2], boat, color=(250, 55, 54)) self._viewer.display(self._dt / self.viz_speed)
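# Hypothetical usage sketch: rolling out a few random actions in the
# ShipSteeringMultiGate environment defined above. The import path below is an
# assumption (adapt it to wherever this class lives in your package); the
# reset/step interface follows the code above.
import numpy as np
from my_envs.ship_steering_multigate import ShipSteeringMultiGate  # hypothetical path

mdp = ShipSteeringMultiGate(n_steps_action=3, small=True)
state = mdp.reset()
for _ in range(10):
    action = np.random.uniform(-mdp.omega_max, mdp.omega_max)
    state, reward, absorbing, _ = mdp.step(action)
    if absorbing:
        break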