def _q_learning_training(self, sensation, action, reward, next_sensation, next_action=None):
    """
    Do a single Q-lambda training step given (s,a,r,s').  Can be called
    from outside the q_learning_step method for off-policy training,
    experience replay, etc.
    """
    rho = self.rho(reward)
    last_Q = self.Q(sensation)
    last_value = last_Q[action]

    if is_terminal(next_sensation):
        value = 0
    else:
        value = max(self.Q(next_sensation))

    delta = rho + (self.gamma * value - last_value)

    self.verbose("r = %.5f, Q(t-1) = %.5f, Q(t) = %.5f, "
                 "diff = %.5f, delta = %.5f, terminal? = %d"
                 % (rho, last_value, value, value - last_value,
                    delta, is_terminal(next_sensation)))

    self.update_Q(sensation, action, delta,
                  on_policy=(last_Q[action] == max(last_Q)))

    if delta:
        assert (self.Q(sensation, action) - last_value) / delta < 1.0
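# The sketch below is not part of the original class; it illustrates the
# off-policy / experience-replay use mentioned in the docstring above.  The
# function name and the structure of 'transitions' are assumptions.
def replay_q_updates(agent, transitions):
    """
    Replay stored (sensation, action, reward, next_sensation) tuples
    through the agent's Q-learning update.
    """
    for s, a, r, s_next in transitions:
        agent._q_learning_training(s, a, r, s_next)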
def __call__(self, sensation, reward=None):
    if not is_terminal(sensation):
        # Convert the raw sensation into tile-coded features over uniform
        # receptive fields before handing it to the underlying TD agent.
        sensation = tile_uniform_rfs(array(sensation) / self.tile_width,
                                     self.num_rfs,
                                     self.rf_width,
                                     self.num_tilings,
                                     self.num_features / self.num_rfs)
    return super(UniformTiledAgent, self).__call__(sensation, reward)
def __call__(self, sensation, reward=None):
    if is_terminal(sensation):
        new_sensation = sensation
    else:
        # Expand the list of active feature indices into a binary feature vector.
        new_sensation = zeros(self.num_features, 'f')
        for f in sensation:
            new_sensation[f] = 1
    return super(LinearListAgent, self).__call__(new_sensation, reward)
def __call__(self, sensation, reward=None):
    if not is_terminal(sensation):
        assert type(sensation) == int
        # One-hot encode the integer state for the linear function approximator.
        s = zeros(self.num_features)
        s[sensation] = 1.0
    else:
        s = sensation
    return super(LinearTabularTDAgent, self).__call__(s, reward)
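# Example of the encoding above (illustrative, assuming num_features = 4):
# an integer sensation is expanded into a one-hot vector before being
# passed on to the linear TD machinery.
#
#   sensation = 2  -->  s = [0., 0., 1., 0.]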
def __call__(self,sensation,reward=None):
    if reward is None:
        # A new episode (reward is None): restart the option's action generator.
        self.setup_gen()
    if not rl.is_terminal(sensation):
        try:
            return self.gen.next()
        except StopIteration:
            return OPTION_TERMINATED
def _sarsa_training(self, sensation, action, reward, next_sensation, next_action):
    """
    Perform a single SARSA training step given (s,a,r,s',a').
    """
    rho = self.rho(reward)

    if is_terminal(next_sensation):
        value = 0
    else:
        value = self.Q(next_sensation, next_action)

    last_value = self.Q(sensation, action)
    delta = rho + (self.gamma * value - last_value)

    self.verbose("controller step = %d, rho = %.2f" % (self.total_steps, rho))
    self.verbose(("Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f, " +
                  "delta = %.5f, terminal? = %d")
                 % (last_value, value, value - last_value, delta,
                    is_terminal(next_sensation)))

    self.update_Q(sensation, action, delta)
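# Worked example of the SARSA error above (illustrative numbers only):
# with gamma = 0.9, rho = 1.0, Q(s', a') = 2.0 and Q(s, a) = 2.5,
#
#   delta = 1.0 + (0.9 * 2.0 - 2.5) = 0.3
#
# update_Q then adjusts Q(s, a) in proportion to delta (scaled by the
# learning rate and any eligibility traces the agent maintains).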
def policy(self, sensation):
    """
    Given a sensation, return an action.  Uses self.action_selection to
    get a distribution over the agent's actions.  Uses
    self.applicable_actions to prevent selecting inapplicable actions.
    Returns 0 if is_terminal(sensation).
    """
    if not is_terminal(sensation):
        actions = self.applicable_actions(sensation)
        return actions[weighted_sample(self.policy_fn(sensation, actions))]
    else:
        # In the terminal state, the action is irrelevant
        return 0
def __call__(self, sensation, reward=None):
    """
    Do a step.  Calls the function selected in self.step_method and
    returns the action.
    """
    training_fn = getattr(self, '_' + self.step_method + '_training')
    action_index = self.learning_step(training_fn, sensation, reward)

    if self.history_log:
        if reward is None:
            self._history_file.write('start\n')
        self._history_file.write(repr(sensation) + '\n')
        self._history_file.write(repr(reward) + '\n')
        if not is_terminal(sensation):
            self._history_file.write(repr(action_index) + '\n')

    return self.actions[action_index]
def __call__(self, sensation, reward=None):
    if reward is None:
        # Start of a new episode: reset the memory to just the current sensation.
        self._memory = [sensation]
    else:
        self._memory.append(sensation)

    if is_terminal(sensation):
        return super(TabularMemoryTDAgent, self).__call__(sensation, reward)
    else:
        # The tabular state is the recent window of (sensation, action) history.
        action = super(TabularMemoryTDAgent, self).__call__(tuple(self._memory), reward)
        assert self.actions[self.last_action] == action
        self._memory.append(self.last_action)
        if len(self._memory) > (2 * self.memory_steps + 1):
            # Drop the oldest (sensation, action) pair to keep the window bounded.
            del self._memory[0:2]
        return action
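# Illustrative trace of the memory window above, assuming memory_steps = 1,
# so at most 2*1 + 1 = 3 recent items form the tabular state key:
#
#   __call__(s0, None) : key = (s0,)          _memory afterwards = [s0, a0]
#   __call__(s1, r1)   : key = (s0, a0, s1)   _memory trimmed to  [s1, a1]
#   __call__(s2, r2)   : key = (s1, a1, s2)   _memory trimmed to  [s2, a2]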
def __call__(self,sensation,reward=None):
    if reward is None:
        # Start of a new episode: reset the option stack.
        self.stack = Stack([])
        self.push_option(self.root_option)

    self.last_sensation = sensation
    self.last_reward = reward

    if rl.is_terminal(sensation):
        # Unwind the stack, giving everyone the current reward.
        # TODO: when options get their own separate rewards, this may change
        while not self.stack.empty():
            option, reward_list = self.stack.pop()
            option(sensation, reward_list + [option.reward(sensation, reward)])
        return None
    else:
        # Accumulate rewards for the suspended options below the top of the stack.
        for option, rewards in self.stack[:-1]:
            rewards.append(option.reward(sensation, reward))
        option, rewards = self.stack.top()
        return self.haction(option(sensation, option.reward(sensation, reward)))
def __call__(self,sensation,reward=None):
    result = self.agent(sensation, reward)
    if rl.is_terminal(sensation):
        # On a terminal sensation, signal that the wrapped option has ended.
        result = OPTION_TERMINATED
    return result