def __init__(self):
    """Wrap a fresh CartPoleEnv; the action tuple carries the real action plus predicted future actions."""
    super(PredictActionsCartpoleEnv, self).__init__()
    self.cartpole = CartPoleEnv()
    # Observations pass straight through from the wrapped cartpole.
    self.observation_space = self.cartpole.observation_space
    # One slot for the current action plus one slot per predicted future action.
    slots = NUM_PREDICTED_ACTIONS + 1
    self.action_space = spaces.Tuple((self.cartpole.action_space, ) * slots)
def __init__(self):
    """Wrap a fresh CartPoleEnv; the action tuple carries the real action plus predicted observations."""
    super(PredictObsCartpoleEnv, self).__init__()
    self.cartpole = CartPoleEnv()
    # Observations pass straight through from the wrapped cartpole.
    self.observation_space = self.cartpole.observation_space
    # First tuple slot is the real action; the remaining slots are the
    # agent's guesses of upcoming observations.
    predictions = (self.cartpole.observation_space, ) * NUM_PREDICTED_OBSERVATIONS
    self.action_space = spaces.Tuple((self.cartpole.action_space, ) + predictions)
class PredictObsCartpoleEnv(Env):
    """CartPole wrapper that pays a bonus for predicting future observations.

    Each action is a tuple: element 0 is the real cartpole action, elements
    1..NUM_PREDICTED_OBSERVATIONS are the agent's guesses of the next
    observations.  Once ``self.iteration`` exceeds TIME_BEFORE_BONUS_ALLOWED,
    every past prediction of the *current* observation earns a bonus that
    shrinks with its L2 error (via ``math.erf``).
    """

    def __init__(self):
        super(PredictObsCartpoleEnv, self).__init__()
        self.cartpole = CartPoleEnv()
        self.observation_space = self.cartpole.observation_space
        # One slot for the real action plus one per predicted observation.
        self.action_space = spaces.Tuple(
            (self.cartpole.action_space, ) +
            (self.cartpole.observation_space, ) * (NUM_PREDICTED_OBSERVATIONS))

    def _seed(self, *n, **kw):
        # Delegate to the wrapped cartpole.
        return self.cartpole._seed(*n, **kw)

    def _render(self, *n, **kw):
        # Delegate to the wrapped cartpole.
        return self.cartpole._render(*n, **kw)

    def _configure(self, *n, **kw):
        # Delegate to the wrapped cartpole.
        return self.cartpole._configure(*n, **kw)

    def _step(self, action):
        # the first element of action is the actual current action
        current_action = action[0]
        observation, reward, done, info = self.cartpole._step(current_action)
        if not done:
            # We add the newly predicted observations to the list before checking
            # predictions in order to give the agent a chance to predict the
            # observations that they are going to get _this_ round.
            self.predicted_observations.append(action[1:])
            if self.iteration > TIME_BEFORE_BONUS_ALLOWED:
                # Score each still-pending past prediction of this step's
                # observation.  NOTE: was `xrange` (Python-2-only, NameError on
                # Python 3); `range` is equivalent for this small bounded loop.
                for i in range(
                        min(NUM_PREDICTED_OBSERVATIONS,
                            len(self.predicted_observations))):
                    # L2 distance between the prediction made (i+1) steps ago
                    # (its i-th lookahead slot) and what actually arrived.
                    l2dist = np.sqrt(
                        np.sum(
                            np.square(
                                np.subtract(
                                    self.predicted_observations[-(i + 1)][i],
                                    observation))))
                    # erf maps distance into (0, 1): closer guesses pay more.
                    bonus = CORRECT_PREDICTION_BONUS * (1 - math.erf(l2dist))
                    reward += bonus
            self.iteration += 1
        return observation, reward, done, info

    def _reset(self):
        observation = self.cartpole._reset()
        # Clear prediction history and the bonus-gating step counter.
        self.predicted_observations = []
        self.iteration = 0
        return observation
class PredictActionsCartpoleEnv(Env):
    """CartPole wrapper that pays a bonus for predicting the agent's own future actions.

    Each action is a tuple: element 0 is the real cartpole action, elements
    1..NUM_PREDICTED_ACTIONS are the agent's guesses of its next actions.
    Once ``self.iteration`` exceeds TIME_BEFORE_BONUS_ALLOWED, every past
    prediction that exactly matches the current action earns
    CORRECT_PREDICTION_BONUS.
    """

    def __init__(self):
        super(PredictActionsCartpoleEnv, self).__init__()
        self.cartpole = CartPoleEnv()
        self.observation_space = self.cartpole.observation_space
        # One slot for the real action plus one per predicted future action.
        self.action_space = spaces.Tuple(
            (self.cartpole.action_space, ) * (NUM_PREDICTED_ACTIONS + 1))

    def _seed(self, *n, **kw):
        # Delegate to the wrapped cartpole.
        return self.cartpole._seed(*n, **kw)

    def _render(self, *n, **kw):
        # Delegate to the wrapped cartpole.
        return self.cartpole._render(*n, **kw)

    def _configure(self, *n, **kw):
        # Delegate to the wrapped cartpole.
        return self.cartpole._configure(*n, **kw)

    def _step(self, action):
        # the first element of action is the actual current action
        current_action = action[0]
        observation, reward, done, info = self.cartpole._step(current_action)
        if not done:
            if self.iteration > TIME_BEFORE_BONUS_ALLOWED:
                # Check past predictions of *this* step's action.  NOTE: was
                # `xrange` (Python-2-only, NameError on Python 3); `range` is
                # equivalent for this small bounded loop.
                for i in range(
                        min(NUM_PREDICTED_ACTIONS, len(self.predicted_actions))):
                    # The prediction made (i+1) steps ago reserved slot i for
                    # this step; an exact match pays the bonus.
                    if self.predicted_actions[-(i + 1)][i] == current_action:
                        reward += CORRECT_PREDICTION_BONUS
            # Unlike the observation variant, predictions are appended AFTER
            # scoring: an action cannot be "predicted" on the step it is taken.
            self.predicted_actions.append(action[1:])
            self.iteration += 1
        return observation, reward, done, info

    def _reset(self):
        observation = self.cartpole._reset()
        # Clear prediction history and the bonus-gating step counter.
        self.predicted_actions = []
        self.iteration = 0
        return observation