def episode(self):
    """Run one Dyna-Q episode: act in the environment, record each real
    transition in the model, then do n planning updates from the model."""
    s = self.env.get_initial_state()
    a = self.agent.start(s)
    ep = []
    while True:
        r, s_prime, terminal = self.env.step(s, a)
        self.model_append(s, a, r, s_prime)
        ep.append((s, a, r))
        if len(ep) == self.max_len:
            return ep
        if terminal:
            self.agent.end(r)
            break
        a = self.agent.step(r, s_prime)
        s = s_prime
        # Planning: replay n simulated transitions sampled from the model.
        for i in range(self.n):
            pl_s, pl_a, pl_r, pl_s_prime = self.model_sample()
            if pl_s_prime:
                # Non-terminal transition: one-step Q-learning target.
                a1 = misc.argmax_unique(self.agent.q[pl_s_prime])
                self.agent.q[pl_s][pl_a] += self.agent.alpha * (
                    pl_r + self.agent.gamma * self.agent.q[pl_s_prime][a1]
                    - self.agent.q[pl_s][pl_a])
            else:
                # Terminal transition: the target is just the reward.
                self.agent.q[pl_s][pl_a] += self.agent.alpha * (
                    pl_r - self.agent.q[pl_s][pl_a])
            a0 = misc.argmax(self.agent.q[pl_s])
            self.agent.pi.update(pl_s, a0)
    return ep
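# A minimal sketch (an assumption, not the repo's actual implementation) of the
# tabular model that episode() relies on: model_append() records observed
# transitions and model_sample() draws one uniformly at random for planning.
import random

class _ModelMixin:
    def __init__(self):
        self.model = []  # list of (s, a, r, s_prime) tuples

    def model_append(self, s, a, r, s_prime):
        # Store the real transition so it can be replayed during planning.
        self.model.append((s, a, r, s_prime))

    def model_sample(self):
        # Return a previously observed transition chosen uniformly at random.
        return random.choice(self.model)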
def end(self, r):
    """Terminal update for double Q-learning: flip a coin to choose which
    table to update; the target for the final transition is just r."""
    if np.random.rand() < 0.5:
        self.q1[self.last_state][self.last_action] += self.alpha * (
            r - self.q1[self.last_state][self.last_action])
    else:
        self.q2[self.last_state][self.last_action] += self.alpha * (
            r - self.q2[self.last_state][self.last_action])
    # The target policy is greedy with respect to the sum of the two estimates.
    tmp_dict = {a: self.q1[self.last_state][a] + self.q2[self.last_state][a]
                for a in self.q1[self.last_state]}
    best_actions = misc.argmax(tmp_dict)
    self.pi.update(self.last_state, best_actions)
def step(self, r, s):
    """Expected SARSA update: bootstrap with the expected action value of the
    next state under the current policy."""
    a = self.pi.get(s)
    exp_val = sum(self.pi.prob(a1, s) * val for (a1, val) in self.q[s].items())
    self.q[self.last_state][self.last_action] += self.alpha * (
        r + self.gamma * exp_val - self.q[self.last_state][self.last_action])
    best_actions = misc.argmax(self.q[self.last_state])
    self.pi.update(self.last_state, best_actions)
    self.last_state = s
    self.last_action = a
    return self.last_action
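# A minimal sketch (assumed from how pi is called above, not the repo's actual
# class) of an epsilon-greedy policy object exposing get(), prob() and update():
# get() samples an action, prob() returns an action's probability in a state,
# and update() replaces the set of greedy actions for a state.
import random

class EpsilonGreedyPolicy:
    def __init__(self, actions, eps=0.1):
        self.actions = list(actions)   # assumed identical action set per state
        self.eps = eps
        self.greedy = {}               # state -> set of greedy actions

    def update(self, s, best_actions):
        self.greedy[s] = set(best_actions)

    def prob(self, a, s):
        # Uniform eps mass over all actions plus the greedy mass split among
        # the current greedy actions.
        greedy = self.greedy.get(s, self.actions)
        p = self.eps / len(self.actions)
        if a in greedy:
            p += (1 - self.eps) / len(greedy)
        return p

    def get(self, s):
        # Explore with probability eps, otherwise pick among the greedy actions.
        if random.random() < self.eps:
            return random.choice(self.actions)
        return random.choice(list(self.greedy.get(s, self.actions)))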
def step(self, r, s):
    """Q-learning update: bootstrap with the value of the greedy action in the
    next state, independently of the action actually taken."""
    a = self.pi.get(s)
    a1 = misc.argmax_unique(self.q[s])
    self.q[self.last_state][self.last_action] += self.alpha * (
        r + self.gamma * self.q[s][a1] - self.q[self.last_state][self.last_action])
    best_actions = misc.argmax(self.q[self.last_state])
    self.pi.update(self.last_state, best_actions)
    self.last_state = s
    self.last_action = a
    return self.last_action
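# A minimal sketch (an assumption inferred from the call sites, not the real
# misc module) of the two argmax helpers used throughout: argmax() returns
# every key that attains the maximum value, argmax_unique() breaks ties by
# returning a single maximizing key at random.
import random

def argmax(d):
    # All keys whose value equals the maximum of the dict.
    m = max(d.values())
    return [k for k, v in d.items() if v == m]

def argmax_unique(d):
    # A single maximizing key, with random tie-breaking.
    return random.choice(argmax(d))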
def train(self):
    """First-visit Monte Carlo control: after each episode, update Q(s, a)
    towards the return G for the first occurrence of every (s, a) pair."""
    for i in tqdm.tqdm(range(self.n_episodes), disable=self.disable):
        ep = self.episode()
        G = 0
        # (s, a) pairs in reverse order, used for the first-visit check below.
        set_sa = [(s, a) for (s, a, r) in ep[::-1]]
        for idx, (s, a, r) in enumerate(ep[::-1]):
            G = self.gamma * G + r
            # Update only if (s, a) does not occur earlier in the episode,
            # i.e. this is its first visit.
            if (s, a) not in set_sa[idx + 1:]:
                self.n_visits[s][a] += 1
                self.Q[s][a] += 1. / self.n_visits[s][a] * (G - self.Q[s][a])
                best_actions = misc.argmax(self.Q[s])
                self.agent.pi.update(s, best_actions)
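# A small sketch (an assumption about how the tables could be set up, not the
# repo's actual constructor) of Q and n_visits as nested dicts keyed by state
# and action, initialised to zero so the incremental averages above work.
from collections import defaultdict

Q = defaultdict(lambda: defaultdict(float))   # Q[s][a] -> value estimate
n_visits = defaultdict(lambda: defaultdict(int))  # n_visits[s][a] -> first-visit count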
def step(self, r, s):
    """Double Q-learning update: with probability 0.5 update q1, selecting the
    next action with q1 but evaluating it with q2 (and vice versa), which
    reduces maximisation bias."""
    a = self.pi.get(s)
    if np.random.rand() < 0.5:
        # Select with q1, evaluate with q2, update q1.
        a1 = misc.argmax_unique(self.q1[s])
        self.q1[self.last_state][self.last_action] += self.alpha * (
            r + self.gamma * self.q2[s][a1]
            - self.q1[self.last_state][self.last_action])
    else:
        # Select with q2, evaluate with q1, update q2.
        a1 = misc.argmax_unique(self.q2[s])
        self.q2[self.last_state][self.last_action] += self.alpha * (
            r + self.gamma * self.q1[s][a1]
            - self.q2[self.last_state][self.last_action])
    # The behaviour policy is greedy with respect to q1 + q2.
    tmp_dict = {b: self.q1[self.last_state][b] + self.q2[self.last_state][b]
                for b in self.q1[self.last_state]}
    best_actions = misc.argmax(tmp_dict)
    self.pi.update(self.last_state, best_actions)
    self.last_state = s
    self.last_action = a
    return self.last_action
def step(self, r, s):
    """Bandit-style action-value update with an optional UCB exploration bonus.
    Uses a constant step size if alpha is set, otherwise the sample average."""
    self.t += 1
    self.nv[self.last_state] += 1
    self.nq[self.last_state][self.last_action] += 1
    if self.alpha:
        stepsize = self.alpha
    else:
        stepsize = 1 / self.nq[self.last_state][self.last_action]
    self.q[self.last_state][self.last_action] += stepsize * (
        r - self.q[self.last_state][self.last_action])
    if self.ucb_c:
        # Upper-confidence-bound scores: value estimate plus an exploration
        # bonus that shrinks as an action is selected more often.
        tmp = {a: v + self.ucb_c * (math.log(self.t)
                                    / (1e-5 + self.nq[self.last_state][a])) ** .5
               for (a, v) in self.q[self.last_state].items()}
    else:
        tmp = self.q[self.last_state]
    self.pi.update(self.last_state, misc.argmax(tmp))
    self.last_action = self.pi.get(s)
    return self.last_action
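# A tiny self-contained illustration (toy numbers, not from the repo) of the
# UCB score used above: the less an action has been tried, the larger its bonus.
import math

def ucb_scores(q, counts, t, c):
    # q: action -> value estimate, counts: action -> pull count, t: total steps.
    return {a: v + c * (math.log(t) / (1e-5 + counts[a])) ** 0.5
            for a, v in q.items()}

print(ucb_scores({'left': 0.4, 'right': 0.5}, {'left': 1, 'right': 50}, t=51, c=2.0))
# 'left' gets a much larger bonus than 'right' despite its lower value estimate.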
def end(self, r):
    """Terminal update: the return from the last state-action pair is just r."""
    self.q[self.last_state][self.last_action] += self.alpha * (
        r - self.q[self.last_state][self.last_action])
    best_actions = misc.argmax(self.q[self.last_state])
    self.pi.update(self.last_state, best_actions)