def sample_episode(self, Q: Tabular, task: Task, policy: Policy, rewards):
    # to store the episode
    episode = [None] * self.episode_length
    # initialize state
    state = task.initial_state()
    # repeat for each step of episode
    for t in range(self.episode_length):
        # choose action from state using policy derived from Q
        action = policy.act(Q, task, state)
        # take action and observe reward and new state
        new_state, reward, done = task.transition(state, action)
        rewards[t] = reward
        episode[t] = (state, action, reward)
        # update state
        state = new_state
        # until state is terminal
        if done:
            break
    return t, episode[0:t]
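The Task interface used throughout these listings is not shown. A minimal sketch consistent with the calls made above (initial_state, valid_actions, and a transition returning (new_state, reward, done)) might look like the following; the five-state random walk itself is only an illustrative assumption.

class RandomWalkTask:
    # a hypothetical five-state random walk consistent with the Task interface above
    def __init__(self, n_states: int = 5):
        self.n_states = n_states

    def initial_state(self):
        # start in the middle of the chain
        return self.n_states // 2

    def valid_actions(self):
        # two actions: 0 = left, 1 = right
        return 2

    def transition(self, state, action):
        # step left or right; only the right terminal state pays a reward of 1
        new_state = state + (1 if action == 1 else -1)
        if new_state >= self.n_states:
            return new_state, 1.0, True
        if new_state < 0:
            return new_state, 0.0, True
        return new_state, 0.0, False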
def run_episode(self, Q: Tabular, task: Task, policy: Policy):
    # to compute backup
    rewards = np.zeros(self.episode_length, dtype=float)
    # initialize state
    state = task.initial_state()
    # repeat for each step of episode
    for t in range(self.episode_length):
        # choose action from state using policy derived from Q
        action = policy.act(Q, task, state)
        # take action and observe reward and new state
        new_state, reward, done = task.transition(state, action)
        rewards[t] = reward
        # update Q
        delta = reward + self.gamma * Q.max_value(new_state) - Q.values(state)[action]
        Q.update(state, action, delta)
        # update state
        state = new_state
        # until state is terminal
        if done:
            break
    return t, rewards[0:t]
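Tabular is likewise assumed rather than shown here. A minimal dictionary-backed sketch that matches the calls used in these listings (values, max_value, max_action, update, update_all), with a hypothetical learning rate alpha, could be:

from collections import defaultdict
import numpy as np

class Tabular:
    # a hypothetical Q-table; alpha is an assumed learning rate, not from the original code
    def __init__(self, valid_actions: int, alpha: float = 0.1):
        self.alpha = alpha
        self.table = defaultdict(lambda: np.zeros(valid_actions, dtype=float))

    def values(self, state):
        return self.table[state]

    def max_value(self, state):
        return np.max(self.table[state])

    def max_action(self, state):
        return int(np.argmax(self.table[state]))

    def update(self, state, action, delta):
        # move Q(s, a) a fraction alpha along the error delta
        self.table[state][action] += self.alpha * delta

    def update_all(self, state, errors):
        # vectorized form used by the eligibility-trace learner
        self.table[state] += self.alpha * errors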
def run_episode(self, Q: Tabular, task: Task, policy: Policy):
    # to compute backups
    rewards = np.zeros(self.episode_length, dtype=float)
    states = [None] * self.episode_length
    actions = [None] * self.episode_length
    # initialize state
    state = task.initial_state()
    # repeat for each step of episode
    for t in range(self.episode_length):
        # choose action from state using policy derived from Q
        action = policy.act(Q, task, state)
        states[t], actions[t] = state, action
        # take action and observe reward and new state
        new_state, reward, done = task.transition(state, action)
        rewards[t] = reward
        # update state
        state = new_state
        # until state is terminal
        if done:
            break
    # initialize lambda-average of returns
    lambda_return = reward
    # repeat for each step of episode in reverse order
    T = t
    for t in range(T, -1, -1):
        # compute lambda-average of returns
        if t < T:
            lambda_return = rewards[t] + self.gamma * (
                (1.0 - self.decay) * Q.values(states[t + 1])[actions[t + 1]] +
                self.decay * lambda_return)
        # update Q
        delta = lambda_return - Q.values(states[t])[actions[t]]
        Q.update(states[t], actions[t], delta)
    return T, rewards[0:T]
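As a sanity check on the backward recursion above: with decay = 1 the λ-return reduces to the plain discounted Monte Carlo return, and with decay = 0 it reduces to the one-step SARSA target. The standalone snippet below verifies the first case on a made-up reward sequence.

import numpy as np

gamma, decay = 0.9, 1.0
rewards = np.array([0.0, 0.0, 1.0])   # hypothetical episode rewards r_0 .. r_T
q_next = np.array([0.5, 0.2, 0.0])    # hypothetical bootstraps Q(s_{t+1}, a_{t+1})

# backward recursion, mirroring the reverse loop in run_episode
lambda_return = rewards[-1]
for t in range(len(rewards) - 2, -1, -1):
    lambda_return = rewards[t] + gamma * (
        (1.0 - decay) * q_next[t] + decay * lambda_return)

# with decay = 1 the result equals the discounted return
mc_return = sum(gamma ** t * r for t, r in enumerate(rewards))
assert np.isclose(lambda_return, mc_return)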
def run_episode(self, Q: Tabular, task: Task, policy: Policy):
    # to compute backups
    rewards = np.zeros(self.episode_length, dtype=float)
    # initialize the e(s, a) matrix of eligibility traces
    # note: there is an error in Sutton and Barto since e is reset each episode
    e = defaultdict(lambda: np.zeros(task.valid_actions(), dtype=float))
    # initialize state and action
    state = task.initial_state()
    action = policy.act(Q, task, state)
    # repeat for each step of episode
    for t in range(self.episode_length):
        # take action and observe reward and new state
        new_state, reward, done = task.transition(state, action)
        rewards[t] = reward
        # choose next action from new state using policy derived from Q
        new_action = policy.act(Q, task, new_state)
        # update the trace for the visited state-action pair
        e[state][action] += 1.0
        # compute the TD error
        delta = reward + self.gamma * Q.values(new_state)[new_action] - Q.values(state)[action]
        # propagate the error to every traced pair and decay the traces
        for s in e.keys():
            errors = e[s] * delta
            Q.update_all(s, errors)
            e[s] *= self.gamma * self.decay
        # update state and action
        state, action = new_state, new_action
        # until state is terminal
        if done:
            break
    return t, rewards[0:t]
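The Policy objects passed to these learners only need an act(Q, task, state) method. A minimal ε-greedy sketch consistent with that interface is given below; the epsilon parameter is an assumption, not part of the original code.

import numpy as np

class EpsilonGreedy:
    # a hypothetical epsilon-greedy policy matching the act(Q, task, state) interface
    def __init__(self, epsilon: float = 0.1):
        self.epsilon = epsilon

    def act(self, Q, task, state):
        # explore with probability epsilon, otherwise act greedily with respect to Q
        if np.random.random() < self.epsilon:
            return np.random.randint(task.valid_actions())
        return Q.max_action(state)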
def run_episode(self, Q: Neural, task: Task, policy: Policy):
    # to compute backup
    rewards = np.zeros(self.episode_length, dtype=float)
    # initialize state
    state = task.initial_state()
    phi_state = self.phi(state)
    # repeat for each step of episode
    for t in range(self.episode_length):
        # choose action from state using policy derived from Q
        action = policy.act(Q, task, phi_state)
        # take action and observe reward and new state
        new_state, reward, done = task.transition(state, action)
        phi_new_state = self.phi(new_state)
        rewards[t] = reward
        # store the transition in memory
        self.memory.remember(phi_state, action, reward, phi_new_state, done)
        # sample a mini-batch of transitions from memory,
        # compute the targets y_j and train the network
        mini_batch = self.memory.sample_batch()
        if mini_batch is not None:
            Q.train(mini_batch, self.gamma)
        # update state
        state, phi_state = new_state, phi_new_state
        # until state is terminal
        if done:
            break
    return t, rewards[0:t]
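The replay memory is also assumed. A minimal sketch matching the two calls above (remember and sample_batch, with sample_batch returning None until a full mini-batch is available) might be:

import random
from collections import deque

class ReplayMemory:
    # a hypothetical fixed-capacity experience replay buffer
    def __init__(self, capacity: int = 10000, batch_size: int = 32):
        self.buffer = deque(maxlen=capacity)
        self.batch_size = batch_size

    def remember(self, phi_state, action, reward, phi_new_state, done):
        # store one transition, discarding the oldest once capacity is reached
        self.buffer.append((phi_state, action, reward, phi_new_state, done))

    def sample_batch(self):
        # return None until there are enough transitions for a full mini-batch
        if len(self.buffer) < self.batch_size:
            return None
        return random.sample(self.buffer, self.batch_size)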
def act(self, Q: Agent, task: Task, state):
    # set the number of actions of the current task, if not set
    if self.valid_actions == 0:
        self.valid_actions = task.valid_actions()
    # get the distribution over actions for the current state
    pref = self.preferences[state]
    # sample an action from the preference distribution
    action = np.random.choice(self.valid_actions, 1, p=pref)
    # get the greedy action according to Q
    greedy = Q.max_action(state)
    # update the preference distribution
    pref *= (1.0 - self.beta)
    pref[greedy] /= (1.0 - self.beta)
    pref[greedy] += self.beta * (1.0 - pref[greedy])
    return action
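A quick check that the pursuit update above keeps the preferences a valid probability distribution: every entry is scaled by (1 − β), the scaling is undone for the greedy action, and the greedy entry is then moved toward 1 by β, so the total stays at 1. The numbers below are arbitrary.

import numpy as np

beta = 0.05
pref = np.array([0.25, 0.25, 0.25, 0.25])   # arbitrary starting preferences
greedy = 2                                   # suppose action 2 is currently greedy

# mirror the update performed in act()
pref *= (1.0 - beta)
pref[greedy] /= (1.0 - beta)
pref[greedy] += beta * (1.0 - pref[greedy])

assert np.isclose(pref.sum(), 1.0)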
def act(self, Q: Agent, task: Task, state):
    # get the distribution over actions for the current state
    values = self.distribution(Q, task, state)
    # sample an action from that distribution
    return np.random.choice(task.valid_actions(), 1, p=values)
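self.distribution is not shown in this listing. One common choice consistent with this act method is a Boltzmann (softmax) distribution over the Q-values; the sketch below, including its temperature parameter, is an assumption rather than the original implementation.

import numpy as np

def softmax_distribution(Q, task, state, temperature: float = 1.0):
    # hypothetical softmax over the action values of the current state
    values = Q.values(state) / temperature
    values = values - np.max(values)    # shift by the max for numerical stability
    exp_values = np.exp(values)
    return exp_values / np.sum(exp_values)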