def _q_learning_training(self, sensation, action, reward, next_sensation):
    """
    Do a single Q-lambda training step given (s,a,r,s').  Can be called
    from outside the q_learning_step method for off-policy training,
    experience replay, etc.
    """
    rho = self.rho(reward)
    last_Q = self.Q(sensation)
    last_value = last_Q[action]

    if is_terminal(next_sensation):
        value = 0
    else:
        value = max(self.Q(next_sensation))

    delta = rho + (self.gamma * value - last_value)

    self.verbose("r = %.5f, Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f, "
                 "delta = %.5f, terminal? = %d"
                 % (rho, last_value, value, value - last_value, delta,
                    is_terminal(next_sensation)))

    self.update_Q(sensation, action, delta,
                  on_policy=(last_Q[action] == max(last_Q)))

    if delta:
        assert (self.Q(sensation, action) - last_value) / delta < 1.0
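# The docstring above notes that _q_learning_training can be driven from
# outside q_learning_step, e.g. for experience replay.  The following is a
# minimal sketch of that idea, assuming a hypothetical agent object exposing
# the method above and a hypothetical replay_buffer of (s, a, r, s') tuples;
# none of these names come from the library itself.
import random

def replay(agent, replay_buffer, batch_size=32):
    """Re-train the agent on a random batch of previously seen transitions."""
    batch = random.sample(replay_buffer, min(batch_size, len(replay_buffer)))
    for (s, a, r, s_next) in batch:
        agent._q_learning_training(s, a, r, s_next)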
def __call__(self, sensation, reward=None):
    if not is_terminal(sensation):
        sensation = tile_uniform_rfs(array(sensation) / self.tile_width,
                                     self.num_rfs,
                                     self.rf_width,
                                     self.num_tilings,
                                     self.num_features / self.num_rfs)
    return super(UniformTiledAgent, self).__call__(sensation, reward)
def __call__(self, sensation, reward=None):
    if reward is None:
        self.setup_gen()
    if not rl.is_terminal(sensation):
        try:
            return self.gen.next()
        except StopIteration:
            return OPTION_TERMINATED
def __call__(self, sensation, reward=None):
    if not is_terminal(sensation):
        assert type(sensation) == int
        s = zeros(self.num_features)
        s[sensation] = 1.0
    else:
        s = sensation
    return super(LinearTabularTDAgent, self).__call__(s, reward)
def __call__(self, sensation, reward=None):
    if is_terminal(sensation):
        new_sensation = sensation
    else:
        new_sensation = zeros(self.num_features, 'f')
        for f in sensation:
            new_sensation[f] = 1
    return super(LinearListAgent, self).__call__(new_sensation, reward)
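# Small illustration of the feature-list encoding used by LinearListAgent
# above: a sensation such as [0, 3] becomes a binary feature vector with
# those positions set.  The helper name and standalone form are assumptions
# for illustration only.
from numpy import zeros

def encode_feature_list(sensation, num_features):
    v = zeros(num_features, 'f')
    for f in sensation:
        v[f] = 1
    return v

# encode_feature_list([0, 3], 5)  ->  array([1., 0., 0., 1., 0.], dtype=float32)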
def sarsa_step(self, sensation, reward=None):
    """
    Do a step using the SARSA update method.  Selects an action,
    computes the TD update and calls self.update_Q.  Returns the
    agent's next action.
    """
    if reward is None:
        return self._start_episode(sensation)

    rho = self.rho(reward)
    next_action = self.policy(sensation)

    if is_terminal(sensation):
        value = 0
    else:
        value = self.Q(sensation, next_action)

    last_value = self.Q(self.last_sensation, self.last_action)
    delta = rho + (self.gamma * value - last_value)

    self.verbose("controller step = %d, rho = %.2f" % (self.total_steps, rho))
    self.verbose(("Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f, " +
                  "delta = %.5f, terminal? = %d")
                 % (last_value, value, value - last_value,
                    delta, is_terminal(sensation)))

    if self.allow_learning:
        self.update_Q(self.last_sensation, self.last_action, delta)

    self.last_sensation = sensation
    self.last_action = next_action

    if isinstance(reward, list):
        self.total_steps += len(reward)
    else:
        self.total_steps += 1

    return next_action
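# Worked numeric check of the SARSA TD error computed above,
# delta = rho + gamma * Q(s', a') - Q(s, a), using made-up values.
gamma = 0.9
rho = 1.0          # scaled reward for this step
last_value = 0.5   # Q(s, a) for the previous sensation/action
value = 0.8        # Q(s', a') for the action just selected
delta = rho + (gamma * value - last_value)
assert abs(delta - 1.22) < 1e-12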
def policy(self, sensation):
    """
    Given a sensation, return an action.  Uses self.action_selection to
    get a distribution over the agent's actions.  Uses
    self.applicable_actions to prevent selecting inapplicable actions.
    Returns 0 if is_terminal(sensation).
    """
    if not is_terminal(sensation):
        actions = self.applicable_actions(sensation)
        return actions[weighted_sample(self.policy_fn(sensation, actions))]
    else:
        # In the terminal state, the action is irrelevant
        return 0
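# weighted_sample above expects a sequence of probabilities, one per
# applicable action.  Below is one illustrative, epsilon-greedy way to build
# such a distribution from Q-values; the function name and signature are
# assumptions and are not the library's own action_selection/policy_fn.
def epsilon_greedy_distribution(q_values, epsilon=0.1):
    """Return a probability distribution over actions given their Q-values."""
    n = len(q_values)
    probs = [epsilon / n] * n
    best = max(range(n), key=lambda i: q_values[i])
    probs[best] += 1.0 - epsilon
    return probs

# epsilon_greedy_distribution([0.1, 0.7, 0.2])
#   -> [0.0333..., 0.9333..., 0.0333...]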
def __call__(self, sensation, reward=None): """ Do a step. Calls the function selected in self.step_method and returns the action. """ step_fn = getattr(self, self.step_method + '_step') action_index = step_fn(sensation, reward) if self.history_log: if reward is None: self._history_file.write('start\n') self._history_file.write( ` sensation ` + '\n') self._history_file.write( ` reward ` + '\n') if not is_terminal(sensation): self._history_file.write( ` action_index ` + '\n') return self.actions[action_index]
def __call__(self,sensation,reward=None): """ Do a step. Calls the function selected in self.step_method and returns the action. """ step_fn = getattr(self,self.step_method+'_step') action_index = step_fn(sensation,reward) if self.history_log: if reward is None: self._history_file.write('start\n') self._history_file.write(`sensation`+'\n') self._history_file.write(`reward`+'\n') if not is_terminal(sensation): self._history_file.write(`action_index`+'\n') return self.actions[action_index]
def __call__(self, sensation, reward=None):
    if reward is None:
        self._memory = [sensation]
    else:
        self._memory.append(sensation)

    if is_terminal(sensation):
        return super(TabularMemoryTDAgent, self).__call__(sensation, reward)
    else:
        action = super(TabularMemoryTDAgent, self).__call__(tuple(self._memory), reward)
        assert self.actions[self.last_action] == action
        self._memory.append(self.last_action)
        if len(self._memory) > (2 * self.memory_steps + 1):
            del self._memory[0:2]
        return action
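# Standalone trace of the memory-window bookkeeping above (memory_steps = 1),
# using placeholder sensations 's0','s1',... and actions 'a0','a1',....
# This re-implements only the list handling, for illustration: the tuple the
# agent acts on holds at most 2*memory_steps + 1 entries.
def memory_trace(steps, memory_steps=1):
    memory, seen = [], []
    for t in range(steps):
        sensation = 's%d' % t
        memory = [sensation] if t == 0 else memory + [sensation]
        seen.append(tuple(memory))          # what the agent would act on
        memory.append('a%d' % t)
        if len(memory) > 2 * memory_steps + 1:
            del memory[0:2]
    return seen

# memory_trace(3) -> [('s0',), ('s0', 'a0', 's1'), ('s1', 'a1', 's2')]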
def __call__(self, sensation, reward=None):
    if reward is None:
        self.stack = Stack([])
        self.push_option(self.root_option)

    self.last_sensation = sensation
    self.last_reward = reward

    if rl.is_terminal(sensation):
        # Unwind the stack, giving everyone the current reward.
        # TODO: when options get their own separate rewards, this may change
        while not self.stack.empty():
            option, reward_list = self.stack.pop()
            option(sensation, reward_list + [option.reward(sensation, reward)])
        return None
    else:
        for option, rewards in self.stack[:-1]:
            rewards.append(option.reward(sensation, reward))
        option, rewards = self.stack.top()
        return self.haction(option(sensation, option.reward(sensation, reward)))
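# Minimal sketch of the Stack interface the method above relies on:
# empty()/pop()/top() plus list-style slicing over (option, rewards) pairs.
# This stand-in is an assumption for illustration, not necessarily the
# library's own Stack class.
class Stack(list):
    """list with stack-style helpers; push/pop map onto append/list.pop."""
    def empty(self):
        return len(self) == 0
    def top(self):
        return self[-1]
    def push(self, item):
        self.append(item)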
def observe(self):
    if self.actionQueue.empty() and not self.observe_wait:
        print 'perform:obs', 'Observe', time.localtime()
        self.observe_wait = 'Active Observe'
        self.actionQueue.put(('Observe', time.localtime()))

    observed, reward = self.observationQueue.get()

    # Catch State change report
    if reward is None:
        # End of episode
        self.observed = self.reward = None

    print 'Queue.observe() => observed', observed, 'reward', reward, self.observe_wait

    if self.observe_wait not in ('Active Observe',):
        self.observe_wait = False

    if rl.is_terminal(observed):
        print 'Queue.observe() observed is terminal:', observed, self.observe_wait
        self.observed, self.reward = [], reward
        observed = None
        if self.observe_wait not in ('Active Observe', 'State'):
            self.observe_wait = 'Terminal'
        if not self.actionQueue.full():
            time.sleep(0.1)
            print 'perform:obs', 'Observe', time.localtime()
            self.actionQueue.put(('Observe', time.localtime()))

    if type(observed) == str:
        observed = self.str2meaning(observed)

    if self.is_state(observed):
        self.pomdp_state = observed
        print 'Queue.observe() observed is a state:', observed, self.observe_wait
        observed = None
        if self.observe_wait in ('State', 'Terminal'):
            self.observe_wait = False

    while not observed:
        observed = self.observe()
    self.observe_wait = False

    self.observed, self.reward = observed, reward
    if not self.reward:
        self.reward = 0
    return self.observed
def __call__(self, sensation, reward=None):
    result = self.agent(sensation, reward)
    if rl.is_terminal(sensation):
        result = OPTION_TERMINATED
    return result