def _q_learning_training(self, sensation, action, reward, next_sensation):
        """
        Do a single Q-lambda training step given (s,a,r,s').  Can be
        called from outside the q_learning_step method for off-policy
        training, experience replay, etc.
        """
        rho = self.rho(reward)

        last_Q = self.Q(sensation)
        last_value = last_Q[action]

        if is_terminal(next_sensation):
            value = 0
        else:
            value = max(self.Q(next_sensation))

        delta = rho + (self.gamma * value - last_value)

        self.verbose(
            "r = %.5f, Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f, delta = %.5f, terminal? = %d"
            % (rho, last_value, value, value - last_value, delta,
               is_terminal(next_sensation)))

        self.update_Q(sensation,
                      action,
                      delta,
                      on_policy=(last_Q[action] == max(last_Q)))

        if delta:
            # Sanity check: the applied update should move Q(s,a) by less
            # than the full TD error, i.e. the effective step size is < 1.
            assert (self.Q(sensation, action) - last_value) / delta < 1.0
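
For reference, the TD error computed above is the standard Q-learning target, delta = r + gamma * max_a Q(s', a) - Q(s, a), with the bootstrap term zeroed at terminal states. The following is a minimal standalone sketch of the same backup on a plain dictionary table; the names (tabular_q_update, alpha) and the tiny usage example are assumptions made for illustration, not part of the library above.

def tabular_q_update(Q, s, a, r, s_next, terminal, gamma=0.9, alpha=0.1):
    """One Q-learning backup on a dict-of-dicts table Q[s][a]."""
    # The successor value is zero at episode end, otherwise the greedy
    # (max over actions) estimate.
    if terminal:
        next_value = 0.0
    else:
        next_value = max(Q[s_next].values())
    delta = r + gamma * next_value - Q[s][a]
    Q[s][a] += alpha * delta
    return delta

# Tiny usage example on a 2-state, 2-action table.
Q = {s: {a: 0.0 for a in range(2)} for s in range(2)}
print(tabular_q_update(Q, s=0, a=1, r=1.0, s_next=1, terminal=False))
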
 def __call__(self, sensation, reward=None):
     if not is_terminal(sensation):
         # Convert the continuous sensation into sparse tile-coded features
         # spread over num_rfs receptive fields.
         sensation = tile_uniform_rfs(
             array(sensation) / self.tile_width, self.num_rfs,
             self.rf_width, self.num_tilings,
             self.num_features / self.num_rfs)
     return super(UniformTiledAgent, self).__call__(sensation, reward)
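
UniformTiledAgent delegates the actual encoding to the library's tile_uniform_rfs. As a rough sketch of the underlying idea only (a single tiling with invented names, not a reproduction of tile_uniform_rfs), each continuous coordinate can be binned into a tile and the per-dimension cells flattened into one feature index:

import numpy as np

def single_tiling_feature(sensation, tile_width, tiles_per_dim):
    """Illustrative single-tiling discretization; real tile coding stacks
    several offset tilings and concatenates their active tile indices."""
    coords = np.asarray(sensation, dtype=float) / tile_width
    cells = np.clip(np.floor(coords).astype(int), 0, tiles_per_dim - 1)
    # Flatten the per-dimension cell coordinates into a single index.
    index = 0
    for c in cells:
        index = index * tiles_per_dim + int(c)
    return index

# Example: a 2-D sensation in [0, 1)^2 with 10x10 tiles.
print(single_tiling_feature([0.23, 0.71], tile_width=0.1, tiles_per_dim=10))
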
 def __call__(self, sensation, reward=None):
     # A reward of None marks the start of an episode, so build a fresh
     # action generator for this option.
     if reward is None:
         self.setup_gen()
     if not rl.is_terminal(sensation):
         try:
             return self.gen.next()
         except StopIteration:
             # The generator is exhausted: the option has finished.
             return OPTION_TERMINATED
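
The option above drives its behaviour from a Python generator and signals OPTION_TERMINATED once the generator is exhausted. A minimal standalone sketch of the same pattern, with invented names (FixedSequenceOption, and OPTION_TERMINATED as a placeholder sentinel), could look like this:

OPTION_TERMINATED = 'option_terminated'   # placeholder sentinel for the sketch

class FixedSequenceOption(object):
    """Option whose policy is simply a fixed sequence of actions."""

    def __init__(self, actions):
        self.actions = actions

    def setup_gen(self):
        # Build a fresh generator at the start of each episode.
        self.gen = (a for a in self.actions)

    def __call__(self, sensation, reward=None):
        if reward is None:
            self.setup_gen()
        try:
            return next(self.gen)
        except StopIteration:
            return OPTION_TERMINATED

opt = FixedSequenceOption(['up', 'up', 'right'])
print(opt('s0'))            # reward None starts the episode -> 'up'
print(opt('s1', reward=0))  # 'up'
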
 def __call__(self, sensation, reward=None):
     if is_terminal(sensation):
         new_sensation = sensation
     else:
         # The sensation is a list of active feature indices; expand it into
         # a multi-hot vector for the linear function approximator.
         new_sensation = zeros(self.num_features, 'f')
         for f in sensation:
             new_sensation[f] = 1
     return super(LinearListAgent, self).__call__(new_sensation, reward)
 def __call__(self, sensation, reward=None):
     if not is_terminal(sensation):
         # One-hot encode the discrete state index so that the linear TD
         # agent behaves like a tabular learner.
         assert (type(sensation) == int)
         s = zeros(self.num_features)
         s[sensation] = 1.0
     else:
         s = sensation
     return super(LinearTabularTDAgent, self).__call__(s, reward)
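
Both wrappers above perform the same trick: the linear TD learner only sees fixed-length feature vectors, so discrete sensations are expanded on the way in. A quick standalone illustration of the two encodings (a single state index versus a list of active feature indices), written directly with numpy:

import numpy as np

def one_hot(state_index, num_features):
    """Encoding used when the sensation is a single discrete state index."""
    s = np.zeros(num_features)
    s[state_index] = 1.0
    return s

def multi_hot(active_features, num_features):
    """Encoding used when the sensation is a list of active feature indices."""
    s = np.zeros(num_features, dtype='f')
    s[list(active_features)] = 1.0
    return s

print(one_hot(2, 5))         # [0. 0. 1. 0. 0.]
print(multi_hot([0, 3], 5))  # [1. 0. 0. 1. 0.]
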
    def sarsa_step(self, sensation, reward=None):
        """
        Do a step using the SARSA update method.  Selects an action,
        computes the TD update and calls self.update_Q.  Returns the
        agent's next action.
        """
        if reward is None:
            return self._start_episode(sensation)

        rho = self.rho(reward)
        next_action = self.policy(sensation)

        if is_terminal(sensation):
            value = 0
        else:
            value = self.Q(sensation, next_action)

        last_value = self.Q(self.last_sensation, self.last_action)
        delta = rho + (self.gamma * value - last_value)

        self.verbose("controller step = %d, rho = %.2f" %
                     (self.total_steps, rho))
        self.verbose(("Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f," +
                      "delta = %.5f, terminal? = %d") %
                     (last_value, value, value - last_value, delta,
                      is_terminal(sensation)))

        if self.allow_learning:
            self.update_Q(self.last_sensation, self.last_action, delta)

        self.last_sensation = sensation
        self.last_action = next_action
        if isinstance(reward, list):
            self.total_steps += len(reward)
        else:
            self.total_steps += 1

        return next_action
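
For comparison with the Q-learning step earlier, the update here bootstraps from the action the policy actually selects, delta = r + gamma * Q(s', a') - Q(s, a). The following is a compact standalone sketch of the same on-policy rule on a dictionary table; the toy environment, its reset()/step() interface, and the epsilon/alpha values are assumptions made for the example.

import random

def sarsa_episode(env, Q, epsilon=0.1, gamma=0.9, alpha=0.1):
    """Run one episode of tabular SARSA on a dict-of-dicts table Q[s][a]."""
    def epsilon_greedy(s):
        if random.random() < epsilon:
            return random.choice(list(Q[s]))
        return max(Q[s], key=Q[s].get)

    s = env.reset()
    a = epsilon_greedy(s)
    done = False
    while not done:
        s_next, r, done = env.step(a)
        # On-policy: bootstrap from the action that will actually be taken next.
        a_next = None if done else epsilon_greedy(s_next)
        target = 0.0 if done else Q[s_next][a_next]
        delta = r + gamma * target - Q[s][a]
        Q[s][a] += alpha * delta
        s, a = s_next, a_next

class TwoStateChain(object):
    """Toy environment: choosing 'right' in state 0 reaches the terminal state."""
    def reset(self):
        return 0
    def step(self, action):
        if action == 'right':
            return 1, 1.0, True      # next_state, reward, done
        return 0, 0.0, False

Q = {0: {'left': 0.0, 'right': 0.0}}
env = TwoStateChain()
for _ in range(50):
    sarsa_episode(env, Q)
print(Q[0])   # Q[0]['right'] approaches 1.0
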
    def policy(self, sensation):
        """
        Given a sensation, return an action.  Uses
        self.action_selection to get a distribution over the agent's
        actions.  Uses self.applicable_actions to prevent selecting
        inapplicable actions.

        Returns 0 if is_terminal(sensation).
        """
        if not is_terminal(sensation):
            actions = self.applicable_actions(sensation)
            return actions[weighted_sample(self.policy_fn(sensation, actions))]
        else:
            # In the terminal state, the action is irrelevant
            return 0
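
policy() relies on a weighted_sample helper that draws an index from the probability distribution returned by policy_fn. The library's own implementation is not shown here; a plausible stand-in, written as a hedged sketch, is a cumulative-sum draw:

import random

def weighted_sample_sketch(weights):
    """Return index i with probability weights[i] / sum(weights)."""
    total = float(sum(weights))
    threshold = random.random() * total
    cumulative = 0.0
    for i, w in enumerate(weights):
        cumulative += w
        if cumulative >= threshold:
            return i
    return len(weights) - 1   # guard against floating-point round-off

# Example: an epsilon-greedy-style distribution over three actions.
counts = [0, 0, 0]
for _ in range(1000):
    counts[weighted_sample_sketch([0.9, 0.05, 0.05])] += 1
print(counts)   # roughly [900, 50, 50]
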
    def __call__(self, sensation, reward=None):
        """
        Do a step.  Calls the function selected in self.step_method
        and returns the action.
        """
        step_fn = getattr(self, self.step_method + '_step')

        action_index = step_fn(sensation, reward)
        if self.history_log:
            if reward is None:
                self._history_file.write('start\n')
            self._history_file.write(repr(sensation) + '\n')
            self._history_file.write(repr(reward) + '\n')
            if not is_terminal(sensation):
                self._history_file.write(repr(action_index) + '\n')
        return self.actions[action_index]
    def __call__(self, sensation, reward=None):
        if reward is None:
            self._memory = [sensation]
        else:
            self._memory.append(sensation)

        if is_terminal(sensation):
            return super(TabularMemoryTDAgent,
                         self).__call__(sensation, reward)
        else:
            action = super(TabularMemoryTDAgent,
                           self).__call__(tuple(self._memory), reward)
            assert self.actions[self.last_action] == action
            self._memory.append(self.last_action)

            # Keep only the most recent memory_steps (sensation, action)
            # pairs plus the current sensation.
            if len(self._memory) > (2 * self.memory_steps + 1):
                del self._memory[0:2]

            return action
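
The memory agent above turns a k-step window of past sensations and actions into a single tuple-valued state for the tabular learner. A standalone sketch of the same windowing logic, using collections.deque and a hypothetical memory_steps parameter:

from collections import deque

class HistoryWindow(object):
    """Expose the last memory_steps (sensation, action) pairs plus the
    current sensation as one hashable tuple."""

    def __init__(self, memory_steps):
        # 2 * memory_steps past items (alternating sensation, action)
        # plus the newest sensation.
        self.buf = deque(maxlen=2 * memory_steps + 1)

    def start_episode(self, sensation):
        self.buf.clear()
        self.buf.append(sensation)
        return tuple(self.buf)

    def observe(self, sensation):
        self.buf.append(sensation)
        return tuple(self.buf)

    def record_action(self, action):
        self.buf.append(action)

w = HistoryWindow(memory_steps=1)
print(w.start_episode('s0'))   # ('s0',)
w.record_action('a0')
print(w.observe('s1'))         # ('s0', 'a0', 's1')
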
    def __call__(self, sensation, reward=None):
        if reward is None:
            self.stack = Stack([])
            self.push_option(self.root_option)

        self.last_sensation = sensation
        self.last_reward = reward

        if rl.is_terminal(sensation):
            # unwind the stack giving everyone the current reward
            # TODO: when options get their own separate rewards, this may change
            while not self.stack.empty():
                option, reward_list = self.stack.pop()
                option(sensation, reward_list + [option.reward(sensation, reward)])
            return None
        else:
            for option, rewards in self.stack[:-1]:
                rewards.append(option.reward(sensation, reward))
            option, rewards = self.stack.top()
            return self.haction(option(sensation, option.reward(sensation, reward)))
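
The hierarchical controller above keeps a stack of (option, reward_list) pairs that it pushes, pops, peeks, and iterates over. The Stack class itself is not shown in this snippet; a list-backed sketch matching the interface used here (push, pop, top, empty, plus ordinary iteration and slicing) might be:

class Stack(list):
    """List-backed stack with the small interface the controller uses."""

    def push(self, item):
        self.append(item)

    def top(self):
        return self[-1]

    def empty(self):
        return len(self) == 0

stack = Stack([])
stack.push(('root_option', []))
stack.push(('sub_option', []))
print(stack.top())      # ('sub_option', [])
print(stack[:-1])       # [('root_option', [])]
print(stack.pop())      # ('sub_option', []) -- list.pop() removes the top
print(stack.empty())    # False
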
 def observe(self):
     if self.actionQueue.empty() and not self.observe_wait:
         print 'perform:obs', 'Observe', time.localtime()
         self.observe_wait = 'Active Observe'
         self.actionQueue.put(('Observe', time.localtime()))
     observed, reward = self.observationQueue.get()
     # Catch State change report
     if reward is None:  # End of episode
         self.observed = self.reward = None
         print 'Queue.observe() => observed', observed, 'reward', reward, self.observe_wait
         if self.observe_wait not in ('Active Observe', ):
             self.observe_wait = False
     if rl.is_terminal(observed):
         print 'Queue.observe() observed is terminal:', observed, self.observe_wait
         self.observed, self.reward = [], reward
         observed = None
         if self.observe_wait not in ('Active Observe', 'State'):
             self.observe_wait = 'Terminal'
             if not self.actionQueue.full():
                 time.sleep(0.1)
                 print 'perform:obs', 'Observe', time.localtime()
                 self.actionQueue.put(('Observe', time.localtime()))
     if type(observed) == str:
         observed = self.str2meaning(observed)
     if self.is_state(observed):
         self.pomdp_state = observed
         print 'Queue.observe() observed is a state:', observed, self.observe_wait
         observed = None
         if self.observe_wait in ('State', 'Terminal'):
             self.observe_wait = False
     while not observed:
         observed = self.observe()
     self.observe_wait = False
     self.observed, self.reward = observed, reward
     if not self.reward: self.reward = 0
     return self.observed
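
Stripped of the state-change and terminal bookkeeping, observe() is a request/response handshake over two queues: put an 'Observe' request on actionQueue when idle, then block on observationQueue for the (observation, reward) pair. The sketch below only illustrates that core pattern; the environment thread and its reply are invented for the example.

import threading
import time
try:
    import Queue as queue    # Python 2
except ImportError:
    import queue             # Python 3

actionQueue = queue.Queue(maxsize=1)
observationQueue = queue.Queue()

def environment_thread():
    # Answer each 'Observe' request with an (observation, reward) pair.
    while True:
        request, _timestamp = actionQueue.get()
        if request == 'Observe':
            observationQueue.put(('some-observation', 0.0))

t = threading.Thread(target=environment_thread)
t.daemon = True
t.start()

# Agent side: request an observation, then block until it arrives.
actionQueue.put(('Observe', time.localtime()))
observed, reward = observationQueue.get()
print((observed, reward))
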
 def __call__(self, sensation, reward=None):
     # Delegate to the wrapped agent; once the episode ends, report that
     # the option has terminated.
     result = self.agent(sensation, reward)
     if rl.is_terminal(sensation):
         result = OPTION_TERMINATED
     return result