Beispiel #1
0
class CarOnHill(Environment):
    """
    The Car On Hill environment as presented in:
    "Tree-Based Batch Mode Reinforcement Learning". Ernst D. et al.. 2005.

    The state is the pair (position, velocity) of the car; the two discrete
    actions select a control input of -4 or 4.

    """
    def __init__(self, horizon=100, gamma=.95):
        """
        Constructor.

        Args:
            horizon (int, 100): horizon of the problem;
            gamma (float, .95): discount factor.

        """
        # MDP parameters
        self.max_pos = 1.
        self.max_velocity = 3.
        high = np.array([self.max_pos, self.max_velocity])
        self._g = 9.81
        self._m = 1.
        self._dt = .1
        self._discrete_actions = [-4., 4.]

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(2)
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(1, 1)

        super().__init__(mdp_info)

    def reset(self, state=None):
        """
        Reset the environment.

        Args:
            state (np.ndarray, None): state to start from; if None, the car
                starts at position -0.5 with zero velocity.

        Returns:
            The current state.

        """
        if state is None:
            self._state = np.array([-0.5, 0])
        else:
            self._state = state

        return self._state

    def step(self, action):
        """
        Integrate the dynamics for one time step of length ``self._dt``.

        Args:
            action (np.ndarray): array whose first element is the index of
                the discrete action to apply.

        Returns:
            The tuple (state, reward, absorbing, info). The reward is -1 when
            the position falls below ``-max_pos`` or the absolute velocity
            exceeds ``max_velocity``, 1 when the position exceeds ``max_pos``
            with an admissible velocity, 0 otherwise; the first two cases are
            absorbing.

        """
        action = self._discrete_actions[action[0]]

        # The control is appended to the state so that it can be read inside
        # the ODE function; its derivative is zero, hence it stays constant.
        sa = np.append(self._state, action)
        new_state = odeint(self._dpds, sa, [0, self._dt])

        # Keep the solution at the final time and drop the control component.
        self._state = new_state[-1, :-1]

        if self._state[0] < -self.max_pos or \
                np.abs(self._state[1]) > self.max_velocity:
            reward = -1
            absorbing = True
        elif self._state[0] > self.max_pos and \
                np.abs(self._state[1]) <= self.max_velocity:
            reward = 1
            absorbing = True
        else:
            reward = 0
            absorbing = False

        return self._state, reward, absorbing, {}

    def render(self):
        """
        Draw the hill profile and the car at its current position.

        """
        # Slope
        self._viewer.function(0, 1, self._height)

        # Car
        car_body = [
            [-3e-2, 0],
            [-3e-2, 2e-2],
            [-2e-2, 2e-2],
            [-1e-2, 3e-2],
            [1e-2, 3e-2],
            [2e-2, 2e-2],
            [3e-2, 2e-2],
            [3e-2, 0]
        ]

        # Map the position from [-1, 1] to the [0, 1] range used for drawing.
        x_car = (self._state[0] + 1) / 2
        y_car = self._height(x_car)
        c_car = [x_car, y_car]
        angle = self._angle(x_car)
        self._viewer.polygon(c_car, angle, car_body, color=(32, 193, 54))

        self._viewer.display(self._dt)

    def stop(self):
        """
        Close the viewer.

        """
        self._viewer.close()

    @staticmethod
    def _angle(x):
        """
        Compute the inclination of the drawn hill profile.

        Args:
            x (float): horizontal drawing coordinate.

        Returns:
            The arctangent of the slope of ``_height`` at ``x``.

        """
        if x < 0.5:
            m = 4 * x - 1
        else:
            m = 1 / ((20 * x**2 - 20 * x + 6) ** 1.5)

        return np.arctan(m)

    @staticmethod
    def _height(x):
        """
        Compute the height of the drawn hill profile.

        Args:
            x (float or array-like): horizontal drawing coordinate(s).

        Returns:
            The height at ``x``, rescaled with ``(y + 1) / 2``; a scalar for
            scalar input, an array with the shape of ``x`` otherwise.

        """
        # Work on a float array so that Python scalars, lists and integer
        # arrays are all handled without indexing errors or truncation.
        x = np.asarray(x, dtype=float)

        y_neg = 4 * x**2 - 2 * x
        y_pos = (2 * x - 1) / np.sqrt(5 * (2 * x - 1)**2 + 1)

        # Left branch of the profile below x = .5, right branch from there on.
        y = np.where(x < .5, y_neg, y_pos)

        y_norm = (y + 1) / 2

        return y_norm

    def _dpds(self, state_action, t):
        """
        Time derivative of the (position, velocity, control) vector, in the
        form expected by ``odeint``.

        Args:
            state_action (np.ndarray): position, velocity and control input;
            t (float): integration time (unused).

        Returns:
            The derivatives of position, velocity and control; the latter is
            always zero.

        """
        position = state_action[0]
        velocity = state_action[1]
        u = state_action[-1]

        # First and second derivative of the hill shape at the position.
        if position < 0.:
            diff_hill = 2 * position + 1
            diff_2_hill = 2
        else:
            diff_hill = 1 / ((1 + 5 * position ** 2) ** 1.5)
            diff_2_hill = (-15 * position) / ((1 + 5 * position ** 2) ** 2.5)

        dp = velocity
        ds = (u - self._g * self._m * diff_hill - velocity ** 2 * self._m *
              diff_hill * diff_2_hill) / (self._m * (1 + diff_hill ** 2))

        return dp, ds, 0.
Beispiel #2
0
class ShipSteering(Environment):
    """
    The Ship Steering environment as presented in:
    "Hierarchical Policy Gradient Algorithms". Ghavamzadeh M. and Mahadevan S..
    2013.

    The state is composed of the x and y position of the ship, its heading
    and its turning rate. The action is the desired turning rate. An episode
    ends when the ship leaves the field or crosses the gate.

    """
    def __init__(self, small=True, n_steps_action=3):
        """
        Constructor.

        Args:
             small (bool, True): whether to use a small state space or not.
             n_steps_action (int, 3): number of integration intervals for each
                                      step of the mdp; must be at least 1.

        Raises:
            ValueError: if ``n_steps_action`` is lower than 1.

        """
        # With no integration interval, step() would have neither a reward
        # nor an absorbing flag to return.
        if n_steps_action < 1:
            raise ValueError('n_steps_action must be at least 1.')

        # MDP parameters
        self.field_size = 150 if small else 1000
        low = np.array([0, 0, -np.pi, -np.pi / 12.])
        high = np.array([self.field_size, self.field_size, np.pi, np.pi / 12.])
        self.omega_max = np.array([np.pi / 12.])
        self._v = 3.
        self._T = 5.
        self._dt = .2

        # End points of the gate segment.
        self._gate_s = np.empty(2)
        self._gate_e = np.empty(2)
        self._gate_s[0] = 100 if small else 350
        self._gate_s[1] = 120 if small else 400
        self._gate_e[0] = 120 if small else 450
        self._gate_e[1] = 100 if small else 400
        self._out_reward = -100
        self._success_reward = 0
        self._small = small
        self._state = None
        self.n_steps_action = n_steps_action

        # MDP properties
        observation_space = spaces.Box(low=low, high=high)
        action_space = spaces.Box(low=-self.omega_max, high=self.omega_max)
        horizon = 5000
        gamma = .99
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(self.field_size,
                              self.field_size,
                              background=(66, 131, 237))

        super().__init__(mdp_info)

    def reset(self, state=None):
        """
        Reset the environment.

        Args:
            state (np.ndarray, None): state to start from. If None, the small
                environment starts in the origin with heading pi / 2 and zero
                turning rate, while the large one starts from a state sampled
                uniformly within the observation space bounds.

        Returns:
            The current state.

        """
        if state is None:
            if self._small:
                self._state = np.zeros(4)
                self._state[2] = np.pi / 2
            else:
                low = self.info.observation_space.low
                high = self.info.observation_space.high
                self._state = (high - low) * np.random.rand(4) + low
        else:
            self._state = state

        return self._state

    def step(self, action):
        """
        Apply the action for ``n_steps_action`` integration intervals of
        length ``self._dt``, stopping early if an absorbing state is reached.

        Args:
            action (np.ndarray): array whose first element is the desired
                turning rate.

        Returns:
            The tuple (state, reward, absorbing, info). The reward is
            ``_out_reward`` when the ship leaves the field,
            ``_success_reward`` when it crosses the gate, and -1 otherwise.

        """
        # NOTE(review): `_bound` is not defined in this class; presumably it
        # is inherited from Environment and clamps the value to the range.
        r = self._bound(action[0], -self.omega_max, self.omega_max)

        new_state = self._state

        for _ in range(self.n_steps_action):
            state = new_state

            # A fresh array is allocated at every interval, so previously
            # returned states are never modified in place.
            new_state = np.empty(4)
            new_state[0] = state[0] + self._v * np.cos(state[2]) * self._dt
            new_state[1] = state[1] + self._v * np.sin(state[2]) * self._dt
            new_state[2] = normalize_angle(state[2] + state[3] * self._dt)

            # The turning rate moves towards the commanded one at a rate
            # set by the constant `_T`.
            new_state[3] = state[3] + (r - state[3]) * self._dt / self._T

            if new_state[0] > self.field_size \
               or new_state[1] > self.field_size \
               or new_state[0] < 0 or new_state[1] < 0:

                # Report the position on the border of the field.
                new_state[0] = self._bound(new_state[0], 0, self.field_size)
                new_state[1] = self._bound(new_state[1], 0, self.field_size)

                reward = self._out_reward
                absorbing = True
                break

            elif self._through_gate(state[:2], new_state[:2]):
                reward = self._success_reward
                absorbing = True
                break
            else:
                reward = -1
                absorbing = False

        self._state = new_state

        return self._state, reward, absorbing, {}

    def render(self, mode='human'):
        """
        Draw the gate and the ship at its current position and heading.

        Args:
            mode (str, 'human'): unused.

        """
        self._viewer.line(self._gate_s, self._gate_e, width=3)

        boat = [[-4, -4], [-4, 4], [4, 4], [8, 0.0], [4, -4]]
        self._viewer.polygon(self._state[:2],
                             self._state[2],
                             boat,
                             color=(32, 193, 54))

        self._viewer.display(self._dt)

    def stop(self):
        """
        Close the viewer.

        """
        self._viewer.close()

    def _through_gate(self, start, end):
        """
        Check whether the segment from ``start`` to ``end`` intersects the
        gate segment.

        Args:
            start (np.ndarray): position before the move;
            end (np.ndarray): position after the move.

        Returns:
            True if the two segments intersect, False otherwise; parallel
            segments are never considered intersecting.

        """
        r = self._gate_e - self._gate_s
        s = end - start
        den = self._cross_2d(vecr=r, vecs=s)

        # Zero cross product: the move is parallel to the gate.
        if den == 0:
            return False

        # Intersection parameters along the gate (t) and along the move (u);
        # both must lie in [0, 1] for the intersection to be on the segments.
        t = self._cross_2d((start - self._gate_s), s) / den
        u = self._cross_2d((start - self._gate_s), r) / den

        return 1 >= u >= 0 and 1 >= t >= 0

    @staticmethod
    def _cross_2d(vecr, vecs):
        """
        Compute the scalar cross product of two 2D vectors.

        """
        return vecr[0] * vecs[1] - vecr[1] * vecs[0]
Beispiel #3
0
class PuddleWorld(Environment):
    """
    Puddle world as presented in:
    "Off-Policy Actor-Critic". Degris T. et al.. 2012.

    The state is the 2D position of the agent in the unit square. Four of
    the five discrete actions move the agent by ``thrust`` along one axis,
    the fifth leaves it in place (noise is still applied).

    """
    def __init__(self, start=None, goal=None, goal_threshold=.1, noise_step=.025,
                 noise_reward=0, reward_goal=0., thrust=.05, puddle_center=None,
                 puddle_width=None, gamma=.99, horizon=5000):
        """
        Constructor.

        Args:
            start (np.array, None): starting position of the agent;
            goal (np.array, None): goal position;
            goal_threshold (float, .1): distance threshold of the agent from the
                goal to consider it reached;
            noise_step (float, .025): noise in actions;
            noise_reward (float, 0): standard deviation of gaussian noise in reward;
            reward_goal (float, 0): reward obtained reaching goal state;
            thrust (float, .05): distance walked during each action;
            puddle_center (np.array, None): center of the puddle;
            puddle_width (np.array, None): width of the puddle;
            gamma (float, .99): discount factor;
            horizon (int, 5000): horizon of the problem.

        """
        # MDP parameters
        # Positions are stored as private float arrays, so that lists and
        # integer arrays are accepted and later changes to the caller's
        # objects do not affect the environment.
        if start is None:
            self._start = np.array([.2, .4])
        else:
            self._start = np.array(start, dtype=float)
        if goal is None:
            self._goal = np.array([1., 1.])
        else:
            self._goal = np.array(goal, dtype=float)
        self._goal_threshold = goal_threshold
        self._noise_step = noise_step
        self._noise_reward = noise_reward
        self._reward_goal = reward_goal
        self._thrust = thrust

        if puddle_center is None:
            puddle_center = [[.3, .6], [.4, .5], [.8, .9]]
        self._puddle_center = [np.array(center) for center in puddle_center]

        if puddle_width is None:
            puddle_width = [[.1, .03], [.03, .1], [.03, .1]]
        self._puddle_width = [np.array(width) for width in puddle_width]

        # Actions 0 and 1 move by -thrust and +thrust along the first axis,
        # actions 2 and 3 do the same along the second one; action 4 is left
        # as a null displacement.
        self._actions = [np.zeros(2) for _ in range(5)]
        for i in range(4):
            self._actions[i][i // 2] = thrust * (i % 2 * 2 - 1)

        # MDP properties
        action_space = Discrete(5)
        observation_space = Box(0., 1., shape=(2,))
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._pixels = None
        self._viewer = Viewer(1.0, 1.0)

        super().__init__(mdp_info)

    def reset(self, state=None):
        """
        Reset the environment.

        Args:
            state (np.ndarray, None): state to start from; if None, a copy
                of the starting position is used.

        Returns:
            The current state.

        """
        if state is None:
            self._state = self._start.copy()
        else:
            self._state = state

        return self._state

    def step(self, action):
        """
        Move the agent according to the selected action plus uniform noise,
        keeping it inside the unit square.

        Args:
            action (np.ndarray): array whose first element is the index of
                the discrete action to apply.

        Returns:
            The tuple (state, reward, absorbing, info). The state is
            absorbing when its L1 distance from the goal is below
            ``goal_threshold``; in that case the reward is ``reward_goal``,
            otherwise it is the puddle reward plus gaussian noise scaled by
            ``noise_reward``.

        """
        idx = action[0]
        displacement = self._actions[idx] + np.random.uniform(
            low=-self._noise_step, high=self._noise_step, size=(2,))

        # Build a new array instead of updating the current one in place:
        # the current array may still be referenced by the caller (it is
        # what reset() and the previous step() returned).
        self._state = np.clip(self._state + displacement, 0., 1.)

        absorbing = np.linalg.norm((self._state - self._goal),
                                   ord=1) < self._goal_threshold

        if not absorbing:
            reward = np.random.randn() * self._noise_reward + self._get_reward(
                self._state)
        else:
            reward = self._reward_goal

        return self._state, reward, absorbing, {}

    def render(self):
        """
        Draw the reward map as background, the agent and the goal area.

        """
        if self._pixels is None:
            # The background is computed once and cached.
            img_size = 100
            pixels = np.zeros((img_size, img_size, 3))
            for i in range(img_size):
                for j in range(img_size):
                    x = i / img_size
                    y = j / img_size
                    pixels[i, img_size - 1 - j] = self._get_reward(
                        np.array([x, y]))

            # Rescale to [0, 255]; a constant reward map (e.g. no puddles)
            # has a null range and is left at zero to avoid dividing by zero.
            pixels -= pixels.min()
            peak = pixels.max()
            if peak > 0:
                pixels *= 255. / peak
            self._pixels = np.floor(255 - pixels)

        self._viewer.background_image(self._pixels)
        self._viewer.circle(self._state, 0.01,
                            color=(0, 255, 0))

        # Diamond with half-diagonal goal_threshold, i.e. the set of points
        # whose L1 distance from the goal is within the threshold.
        goal_area = [
            [-self._goal_threshold, 0],
            [0, self._goal_threshold],
            [self._goal_threshold, 0],
            [0, -self._goal_threshold]
        ]
        self._viewer.polygon(self._goal, 0, goal_area,
                             color=(255, 0, 0), width=1)

        self._viewer.display(0.1)

    def stop(self):
        """
        Close the viewer, if any.

        """
        if self._viewer is not None:
            self._viewer.close()

    def _get_reward(self, state):
        """
        Compute the noiseless reward of a non-goal state.

        Args:
            state (np.ndarray): 2D position.

        Returns:
            -1 minus, for each puddle, twice the product of the gaussian
            densities of the two coordinates, centered in the puddle center
            and with the puddle widths as standard deviations.

        """
        reward = -1.
        for cen, wid in zip(self._puddle_center, self._puddle_width):
            reward -= 2. * norm.pdf(state[0], cen[0], wid[0]) * norm.pdf(
                state[1], cen[1], wid[1])

        return reward