Example #1
    def __init__(self):
        super(PredictActionsCartpoleEnv, self).__init__()
        self.cartpole = CartPoleEnv()

        self.observation_space = self.cartpole.observation_space
        self.action_space = spaces.Tuple(
            (self.cartpole.action_space, ) * (NUM_PREDICTED_ACTIONS + 1))
Example #2
    def __init__(self):
        super(PredictObsCartpoleEnv, self).__init__()
        self.cartpole = CartPoleEnv()

        self.observation_space = self.cartpole.observation_space
        self.action_space = spaces.Tuple((self.cartpole.action_space, ) +
                                         (self.cartpole.observation_space, ) *
                                         (NUM_PREDICTED_OBSERVATIONS))
Example #3
class PredictObsCartpoleEnv(Env):
    def __init__(self):
        super(PredictObsCartpoleEnv, self).__init__()
        self.cartpole = CartPoleEnv()

        self.observation_space = self.cartpole.observation_space
        self.action_space = spaces.Tuple((self.cartpole.action_space, ) +
                                         (self.cartpole.observation_space, ) *
                                         (NUM_PREDICTED_OBSERVATIONS))

    def _seed(self, *n, **kw):
        return self.cartpole._seed(*n, **kw)

    def _render(self, *n, **kw):
        return self.cartpole._render(*n, **kw)

    def _configure(self, *n, **kw):
        return self.cartpole._configure(*n, **kw)

    def _step(self, action):
        # the first element of action is the actual current action
        current_action = action[0]

        observation, reward, done, info = self.cartpole._step(current_action)

        if not done:
            # We add the newly predicted observations to the list before checking predictions
            # in order to give the agent a chance to predict the observations that they
            # are going to get _this_ round.
            self.predicted_observations.append(action[1:])

            if self.iteration > TIME_BEFORE_BONUS_ALLOWED:
                for i in range(
                        min(NUM_PREDICTED_OBSERVATIONS,
                            len(self.predicted_observations))):
                    l2dist = np.sqrt(
                        np.sum(
                            np.square(
                                np.subtract(
                                    self.predicted_observations[-(i + 1)][i],
                                    observation))))

                    bonus = CORRECT_PREDICTION_BONUS * (1 - math.erf(l2dist))

                    reward += bonus

            self.iteration += 1

        return observation, reward, done, info

    def _reset(self):
        observation = self.cartpole._reset()
        self.predicted_observations = []
        self.iteration = 0
        return observation
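For context, a minimal interaction sketch for the class above (this usage code is an assumption, not part of the listed project): the action is a tuple whose first element is the underlying CartPole action and whose remaining NUM_PREDICTED_OBSERVATIONS elements are the observations the agent expects to see over the coming steps. It calls the underscore-prefixed methods of the old gym API, exactly as the snippet defines them.

env = PredictObsCartpoleEnv()
obs = env._reset()
# naive "nothing changes" predictions for the next NUM_PREDICTED_OBSERVATIONS steps
predictions = tuple(obs.copy() for _ in range(NUM_PREDICTED_OBSERVATIONS))
obs, reward, done, info = env._step((0,) + predictions)  # 0 = push the cart to the left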
Example #4
class PredictObsCartpoleEnv(Env):
    def __init__(self):
        super(PredictObsCartpoleEnv, self).__init__()
        self.cartpole = CartPoleEnv()

        self.observation_space = self.cartpole.observation_space
        self.action_space = spaces.Tuple((self.cartpole.action_space,) + (self.cartpole.observation_space,) * (NUM_PREDICTED_OBSERVATIONS))

    def _seed(self, *n, **kw):
        return self.cartpole._seed(*n, **kw)

    def _render(self, *n, **kw):
        return self.cartpole._render(*n, **kw)

    def _configure(self, *n, **kw):
        return self.cartpole._configure(*n, **kw)

    def _step(self, action):
        # the first element of action is the actual current action
        current_action = action[0]

        observation, reward, done, info = self.cartpole._step(current_action)

        if not done:
            # We add the newly predicted observations to the list before checking predictions
            # in order to give the agent a chance to predict the observations that they
            # are going to get _this_ round.
            self.predicted_observations.append(action[1:])

            if self.iteration > TIME_BEFORE_BONUS_ALLOWED:
                for i in xrange(min(NUM_PREDICTED_OBSERVATIONS, len(self.predicted_observations))):
                    l2dist = np.sqrt(np.sum(np.square(np.subtract(
                        self.predicted_observations[-(i + 1)][i],
                        observation
                    ))))

                    bonus = CORRECT_PREDICTION_BONUS * (1 - math.erf(l2dist))

                    reward += bonus

            self.iteration += 1

        return observation, reward, done, info

    def _reset(self):
        observation = self.cartpole._reset()
        self.predicted_observations = []
        self.iteration = 0
        return observation
Example #5
 def __init__(self, max_episode_length=500, random_stable_position=False):
     CartPoleEnv.__init__(self)
     self.action_high = np.asarray([self.force_mag])
     self.action_space = spaces.Box(-self.action_high, self.action_high)
     self._max_episode_length = max_episode_length
     self._time_step = 0
     self._stable_x = None
     if random_stable_position:
         self._rand_pos_max = self.x_threshold - 0.4
         self._stable_x = np.random.uniform(-self._rand_pos_max,
                                            self._rand_pos_max)
         # log.info("obs high : {}".format(self.observation_space.high))
         oh = np.hstack((self.observation_space.high,
                         np.asarray([self._rand_pos_max])))
         self.observation_space = spaces.Box(-oh, oh)
     log.debug("Action Space {}".format(self.action_space))
     log.debug("Observations Space {}".format(self.observation_space))
Example #6
class PredictActionsCartpoleEnv(Env):
    def __init__(self):
        super(PredictActionsCartpoleEnv, self).__init__()
        self.cartpole = CartPoleEnv()

        self.observation_space = self.cartpole.observation_space
        self.action_space = spaces.Tuple(
            (self.cartpole.action_space, ) * (NUM_PREDICTED_ACTIONS + 1))

    def _seed(self, *n, **kw):
        return self.cartpole._seed(*n, **kw)

    def _render(self, *n, **kw):
        return self.cartpole._render(*n, **kw)

    def _configure(self, *n, **kw):
        return self.cartpole._configure(*n, **kw)

    def _step(self, action):
        # the first element of action is the actual current action
        current_action = action[0]

        observation, reward, done, info = self.cartpole._step(current_action)

        if not done:
            if self.iteration > TIME_BEFORE_BONUS_ALLOWED:
                for i in xrange(
                        min(NUM_PREDICTED_ACTIONS,
                            len(self.predicted_actions))):
                    if self.predicted_actions[-(i + 1)][i] == current_action:
                        reward += CORRECT_PREDICTION_BONUS

            self.predicted_actions.append(action[1:])

            self.iteration += 1

        return observation, reward, done, info

    def _reset(self):
        observation = self.cartpole._reset()
        self.predicted_actions = []
        self.iteration = 0
        return observation
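Similarly, a minimal interaction sketch for the action-prediction variant above (again an assumption, not taken from the listed project): the agent supplies its current action plus the NUM_PREDICTED_ACTIONS actions it expects to take next, and later earns CORRECT_PREDICTION_BONUS whenever one of those predictions matches the action it actually takes.

env = PredictActionsCartpoleEnv()
obs = env._reset()
action = (0,) + (0,) * NUM_PREDICTED_ACTIONS  # push left now and predict "left" for the coming steps
obs, reward, done, info = env._step(action)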
Example #7
class PredictActionsCartpoleEnv(Env):
    def __init__(self):
        super(PredictActionsCartpoleEnv, self).__init__()
        self.cartpole = CartPoleEnv()

        self.observation_space = self.cartpole.observation_space
        self.action_space = spaces.Tuple((self.cartpole.action_space,) * (NUM_PREDICTED_ACTIONS+1))

    def _seed(self, *n, **kw):
        return self.cartpole._seed(*n, **kw)

    def _render(self, *n, **kw):
        return self.cartpole._render(*n, **kw)

    def _configure(self, *n, **kw):
        return self.cartpole._configure(*n, **kw)

    def _step(self, action):
        # the first element of action is the actual current action
        current_action = action[0]

        observation, reward, done, info = self.cartpole._step(current_action)

        if not done:
            if self.iteration > TIME_BEFORE_BONUS_ALLOWED:
                for i in xrange(min(NUM_PREDICTED_ACTIONS, len(self.predicted_actions))):
                    if self.predicted_actions[-(i + 1)][i] == current_action:
                        reward += CORRECT_PREDICTION_BONUS

            self.predicted_actions.append(action[1:])

            self.iteration += 1

        return observation, reward, done, info

    def _reset(self):
        observation = self.cartpole._reset()
        self.predicted_actions = []
        self.iteration = 0
        return observation
Example #8
 def reset(self):
     obs = CartPoleEnv.reset(self)
     self.steps_beyond_done = 0
     self.success_steps = 0
     return obs
Example #9
 def __init__(self):
     CartPoleEnv.__init__(self)
     self.steps_beyond_done = 0
     self.success_steps = 0
Example #10
    def __init__(self):
        super(PredictActionsCartpoleEnv, self).__init__()
        self.cartpole = CartPoleEnv()

        self.observation_space = self.cartpole.observation_space
        self.action_space = spaces.Tuple((self.cartpole.action_space,) * (NUM_PREDICTED_ACTIONS+1))
Example #11
import gym
import numpy as np
import math
from gym.envs.classic_control.cartpole import CartPoleEnv

from envs.task import Task

_env = CartPoleEnv()
_X_THRESHOLD = _env.x_threshold
_THETA_THRESHOLD = _env.theta_threshold_radians
del _env


class CartPoleBalanceTask(Task):
    def __call__(self, states, actions, next_states):
        next_dones = GYMMB_ContinuousCartPole.is_done(next_states)
        dones = GYMMB_ContinuousCartPole.is_done(states)
        rewards = 1.0 - next_dones.float() * dones.float()  # basically, you always get 1.0 unless you exceed the is_done termination criteria
        return rewards


class CartPoleSpeedyBalanceTask(Task):
    def __call__(self, states, actions, next_states):
        rewards = states[:, 1].abs()  # more rewards for abs velocity. No free points
        return rewards


class GYMMB_ContinuousCartPole(CartPoleEnv):
    """
    A continuous version.
    Observation space:
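The snippet above is cut off inside the class docstring, so the is_done check used by the tasks is not shown. Purely as a hypothetical sketch, not the project's actual implementation, a batched termination test consistent with the thresholds extracted above (and with the torch-style tensors implied by .float()/.abs()) could look like:

    @staticmethod
    def is_done(states):
        # states: batch of [x, x_dot, theta, theta_dot] rows
        x, theta = states[:, 0], states[:, 2]
        return (x.abs() > _X_THRESHOLD) | (theta.abs() > _THETA_THRESHOLD)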
Example #12
def cartpole_env(env_id=1, **kwargs):
    return GymEnvWrapper(CartPoleEnv(**kwargs), act_null_value=0)
Example #13
        logging.debug("actions: %s\n%s", action_tensor.shape, action_tensor)
        logging.debug("log_prob: %s\n%s", log_prob_tensor.shape,
                      log_prob_tensor)
        logging.debug("log_prob test: %s\n%s", prob_tensor.shape,
                      torch.log(prob_tensor))
        logging.debug("rewards: %s\n%s", rewards_tensor.shape, rewards_tensor)
        logging.debug("loss: %s\n%s", loss.shape, loss)


if __name__ == "__main__":

    # Create the environment. Note that gym.make wraps the underlying environment;
    # the wrapper will end an episode based on a time limit.
    # env = gym.make("CartPole-v0")  # state = [x, x_dot, theta, theta_dot], actions: 0 = left, 1 = right

    env = CartPoleEnv()  # instantiate the env directly, without the wrapper

    # get the state and action sizes from the environment
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    # instantiate the agent
    agent = PolicyGradientAgent(state_size, action_size)

    done = True
    last_render_time = time.time()
    episode_count = -1
    #forever
    while True:

        #if episode is done
Example #14
    def __init__(self):
        super(PredictObsCartpoleEnv, self).__init__()
        self.cartpole = CartPoleEnv()

        self.observation_space = self.cartpole.observation_space
        self.action_space = spaces.Tuple((self.cartpole.action_space,) + (self.cartpole.observation_space,) * (NUM_PREDICTED_OBSERVATIONS))