Example #1
    def _q_learning_training(self,
                             sensation,
                             action,
                             reward,
                             next_sensation,
                             next_action=None):
        """
        Do a single Q-lambda training step given (s,a,r,s').  Can be
        called from outside the q_learning_step method for off-policy
        training, experience replay, etc.
        """
        rho = self.rho(reward)

        last_Q = self.Q(sensation)
        last_value = last_Q[action]

        if is_terminal(next_sensation):
            value = 0
        else:
            value = max(self.Q(next_sensation))

        delta = rho + (self.gamma * value - last_value)

        self.verbose(
            "r = %.5f, Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f, delta = %.5f, terminal? = %d"
            % (rho, last_value, value, value - last_value, delta,
               is_terminal(next_sensation)))

        self.update_Q(sensation,
                      action,
                      delta,
                      on_policy=(last_Q[action] == max(last_Q)))

        if delta:
            assert (self.Q(sensation, action) - last_value) / delta < 1.0
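The docstring above notes that _q_learning_training can be called outside the normal step loop, e.g. for experience replay. A minimal sketch of such a caller, assuming a list of stored (s, a, r, s') tuples and an agent object exposing the method from Example #1 (`agent` and `replay_buffer` are hypothetical names, not part of the library):

    import random

    def replay_sweep(agent, replay_buffer, batch_size=32):
        # Replay a random batch of stored transitions through the agent's
        # Q-learning update.  `agent` and `replay_buffer` are assumptions:
        # the former exposes _q_learning_training(s, a, r, s') as above,
        # the latter is a list of (s, a, r, s') tuples.
        batch = random.sample(replay_buffer, min(batch_size, len(replay_buffer)))
        for sensation, action, reward, next_sensation in batch:
            # next_action is omitted: the Q-learning update bootstraps
            # from max(Q(s')) rather than the action actually taken.
            agent._q_learning_training(sensation, action, reward, next_sensation)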
Example #2
 def __call__(self, sensation, reward=None):
     if not is_terminal(sensation):
         sensation = tile_uniform_rfs(
             array(sensation) / self.tile_width, self.num_rfs,
             self.rf_width, self.num_tilings,
             self.num_features / self.num_rfs)
     return super(UniformTiledAgent, self).__call__(sensation, reward)
Example #3
 def __call__(self, sensation, reward=None):
     if is_terminal(sensation):
         new_sensation = sensation
     else:
         new_sensation = zeros(self.num_features, 'f')
         for f in sensation:
             new_sensation[f] = 1
     return super(LinearListAgent, self).__call__(new_sensation, reward)
Example #4
 def __call__(self, sensation, reward=None):
     if not is_terminal(sensation):
         assert (type(sensation) == int)
         s = zeros(self.num_features)
         s[sensation] = 1.0
     else:
         s = sensation
     return super(LinearTabularTDAgent, self).__call__(s, reward)
Example #5
 def __call__(self, sensation, reward=None):
     if reward is None:
         self.setup_gen()
     if not rl.is_terminal(sensation):
         try:
             return self.gen.next()
         except StopIteration:
             return OPTION_TERMINATED
Example #6
    def _sarsa_training(self, sensation, action, reward, next_sensation,
                        next_action):
        """
        Perform a single SARSA training step given (s,a,r,s',a'). 
        """
        rho = self.rho(reward)

        if is_terminal(next_sensation):
            value = 0
        else:
            value = self.Q(next_sensation, next_action)

        last_value = self.Q(sensation, action)
        delta = rho + (self.gamma * value - last_value)

        self.verbose("controller step = %d, rho = %.2f" %
                     (self.total_steps, rho))
        self.verbose(("Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f," +
                      "delta = %.5f, terminal? = %d") %
                     (last_value, value, value - last_value, delta,
                      is_terminal(next_sensation)))

        self.update_Q(sensation, action, delta)
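For reference, the TD error computed in the SARSA step above can be written as a small standalone function. `Q` here is a plain dict keyed by (state, action) pairs, a hypothetical stand-in for self.Q:

    def sarsa_delta(Q, gamma, s, a, r, s_next, a_next, terminal):
        # delta = r + gamma * Q(s', a') - Q(s, a); the bootstrap term is
        # dropped when s' is terminal, matching the branch above.
        bootstrap = 0.0 if terminal else gamma * Q.get((s_next, a_next), 0.0)
        return r + bootstrap - Q.get((s, a), 0.0)

    # With gamma = 0.9, Q(s0, a0) = 1.0, Q(s1, a1) = 2.0 and a reward of 0.5:
    # delta = 0.5 + 0.9 * 2.0 - 1.0 = 1.3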
Example #7
    def policy(self, sensation):
        """
        Given a sensation, return an action.  Uses
        self.action_selection to get a distribution over the agent's
        actions.  Uses self.applicable_actions to prevent selecting
        inapplicable actions.

        Returns 0 if is_terminal(sensation).
        """
        if not is_terminal(sensation):
            actions = self.applicable_actions(sensation)
            return actions[weighted_sample(self.policy_fn(sensation, actions))]
        else:
            # In the terminal state, the action is irrelevant
            return 0
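The weighted_sample call above draws an index from the action distribution returned by self.policy_fn. A minimal sketch of such a sampler, assuming the distribution is a sequence of non-negative weights (an illustration only, not necessarily the library's own implementation):

    import random

    def weighted_sample(weights):
        # Return index i with probability weights[i] / sum(weights).
        total = sum(weights)
        r = random.uniform(0, total)
        cumulative = 0.0
        for i, w in enumerate(weights):
            cumulative += w
            if r <= cumulative:
                return i
        return len(weights) - 1  # guard against floating-point round-off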
Example #8
    def __call__(self, sensation, reward=None):
        """
        Do a step.  Calls the function selected in self.step_method
        and returns the action.
        """
        training_fn = getattr(self, '_' + self.step_method + '_training')

        action_index = self.learning_step(training_fn, sensation, reward)
        if self.history_log:
            if reward is None:
                self._history_file.write('start\n')
            self._history_file.write(repr(sensation) + '\n')
            self._history_file.write(repr(reward) + '\n')
            if not is_terminal(sensation):
                self._history_file.write(repr(action_index) + '\n')
        return self.actions[action_index]
Example #9
    def __call__(self, sensation, reward=None):
        if reward is None:
            self._memory = [sensation]
        else:
            self._memory.append(sensation)

        if is_terminal(sensation):
            return super(TabularMemoryTDAgent,
                         self).__call__(sensation, reward)
        else:
            action = super(TabularMemoryTDAgent,
                           self).__call__(tuple(self._memory), reward)
            assert self.actions[self.last_action] == action
            self._memory.append(self.last_action)

            if len(self._memory) > (2 * self.memory_steps + 1):
                del self._memory[0:2]

            return action
Example #10
    def __call__(self, sensation, reward=None):
        if reward is None:
            self.stack = Stack([])
            self.push_option(self.root_option)

        self.last_sensation = sensation
        self.last_reward = reward

        if rl.is_terminal(sensation):
            # unwind the stack giving everyone the current reward
            # TODO: when options get their own separate rewards, this may change
            while not self.stack.empty():
                option, reward_list = self.stack.pop()
                option(sensation, reward_list + [option.reward(sensation, reward)])
            return None
        else:
            for option, rewards in self.stack[:-1]:
                rewards.append(option.reward(sensation, reward))
            option, rewards = self.stack.top()
            return self.haction(option(sensation, option.reward(sensation, reward)))
Example #11
 def __call__(self, sensation, reward=None):
     result = self.agent(sensation, reward)
     if rl.is_terminal(sensation):
         result = OPTION_TERMINATED
     return result