Example #1
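 # Dyna-Q-style trainer: each real environment step is followed by self.n
 # simulated planning updates on transitions sampled from a learned model.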
 def episode(self):
     s = self.env.get_initial_state()
     a = self.agent.start(s)
     ep = []
     while True:
         r, s_prime, terminal = self.env.step(s, a)
         # Record the real transition; store None as the next state on
         # terminal transitions so the planning updates below can detect them.
         self.model_append(s, a, r, None if terminal else s_prime)
         ep.append((s, a, r))
         # Truncate episodes that reach the maximum length.
         if len(ep) == self.max_len:
             return ep
         if terminal:
             self.agent.end(r)
             break
         a = self.agent.step(r, s_prime)
         s = s_prime
         # Planning: n simulated Q-learning updates on transitions sampled
         # from the learned model.
         for i in range(self.n):
             (pl_s, pl_a, pl_r, pl_s_prime) = self.model_sample()
             if pl_s_prime is not None:
                 # Non-terminal: bootstrap from the greedy next action.
                 a1 = misc.argmax_unique(self.agent.q[pl_s_prime])
                 self.agent.q[pl_s][pl_a] += self.agent.alpha * (
                     pl_r + self.agent.gamma * self.agent.q[pl_s_prime][a1]
                     - self.agent.q[pl_s][pl_a])
             else:
                 # Terminal: the target is the reward alone.
                 self.agent.q[pl_s][pl_a] += self.agent.alpha * (
                     pl_r - self.agent.q[pl_s][pl_a])
             # Keep the policy greedy w.r.t. the updated action values.
             a0 = misc.argmax(self.agent.q[pl_s])
             self.agent.pi.update(pl_s, a0)
     return ep
Example #2
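  # Double Q-learning, terminal step: with no next state the target is just
  # the final reward r; one of the two tables, chosen at random, is updated.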
  def end(self, r):
    if np.random.rand() < 0.5:
      self.q1[self.last_state][self.last_action] += self.alpha * (r - self.q1[self.last_state][self.last_action])
    else:
      self.q2[self.last_state][self.last_action] += self.alpha * (r - self.q2[self.last_state][self.last_action])

    # Act greedily with respect to the combined estimate q1 + q2.
    tmp_dict = {a: self.q1[self.last_state][a] + self.q2[self.last_state][a] for a in self.q1[self.last_state].keys()}
    best_actions = misc.argmax(tmp_dict)
    self.pi.update(self.last_state, best_actions)
Example #3
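  # Expected SARSA step: the TD target averages the next-state action values
  # under the current policy instead of sampling a single next action.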
  def step(self, r, s):
    a = self.pi.get(s)
    # TD target: expected action value in s under the current policy.
    exp_val = sum(self.pi.prob(a1, s) * val for (a1, val) in self.q[s].items())
    self.q[self.last_state][self.last_action] += self.alpha * (r + self.gamma * exp_val - self.q[self.last_state][self.last_action])

    best_actions = misc.argmax(self.q[self.last_state])
    self.pi.update(self.last_state, best_actions)

    self.last_state = s
    self.last_action = a
    return self.last_action
Example #4
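  # Q-learning step: off-policy TD update that bootstraps from the greedy
  # action in the next state, regardless of the action actually taken.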
  def step(self, r, s):
    a = self.pi.get(s)
    # Off-policy target: bootstrap from the greedy action in the next state.
    a1 = misc.argmax_unique(self.q[s])
    self.q[self.last_state][self.last_action] += self.alpha * (r + self.gamma * self.q[s][a1] - self.q[self.last_state][self.last_action])

    best_actions = misc.argmax(self.q[self.last_state])
    self.pi.update(self.last_state, best_actions)

    self.last_state = s
    self.last_action = a
    return self.last_action
Example #5
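 # First-visit Monte Carlo control: returns are accumulated backwards over
 # each episode and Q is updated only at the first occurrence of each
 # (state, action) pair.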
 def train(self):
     for i in tqdm.tqdm(range(self.n_episodes), disable=self.disable):
         ep = self.episode()
         G = 0
         # (s, a) pairs in reverse order: sa_reversed[idx + 1:] are the
         # pairs visited *earlier* in the episode.
         sa_reversed = [(s, a) for (s, a, r) in ep[::-1]]
         for idx, (s, a, r) in enumerate(ep[::-1]):
             G = self.gamma * G + r
             # First-visit check: skip unless this is the earliest
             # occurrence of (s, a) in the episode.
             if (s, a) not in sa_reversed[idx + 1:]:
                 self.n_visits[s][a] += 1
                 # Incremental average of observed returns.
                 self.Q[s][a] += 1. / self.n_visits[s][a] * (G - self.Q[s][a])
                 best_actions = misc.argmax(self.Q[s])
                 self.agent.pi.update(s, best_actions)
Example #6
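  # Double Q-learning step: one table (picked at random) selects the greedy
  # next action while the other evaluates it, reducing maximization bias.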
  def step(self, r, s):
    a = self.pi.get(s)
    if np.random.rand() < 0.5:
      # Update q1: q1 selects the greedy next action, q2 evaluates it.
      a1 = misc.argmax_unique(self.q1[s])
      self.q1[self.last_state][self.last_action] += self.alpha * (r + self.gamma * self.q2[s][a1] - self.q1[self.last_state][self.last_action])
    else:
      # Update q2: q2 selects the greedy next action, q1 evaluates it.
      a1 = misc.argmax_unique(self.q2[s])
      self.q2[self.last_state][self.last_action] += self.alpha * (r + self.gamma * self.q1[s][a1] - self.q2[self.last_state][self.last_action])

    # Act greedily with respect to the combined estimate q1 + q2.
    tmp_dict = {a: self.q1[self.last_state][a] + self.q2[self.last_state][a] for a in self.q1[self.last_state].keys()}
    best_actions = misc.argmax(tmp_dict)
    self.pi.update(self.last_state, best_actions)

    self.last_state = s
    self.last_action = a
    return self.last_action
Example #7
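  # UCB action selection: values are updated with a constant step size (or a
  # sample average), and the policy picks the action maximizing value plus an
  # upper-confidence exploration bonus.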
  def step(self, r, s):
    self.t += 1
    self.nv[self.last_state] += 1
    self.nq[self.last_state][self.last_action] += 1

    # Constant step size if alpha is set, otherwise a sample average.
    if self.alpha:
      stepsize = self.alpha
    else:
      stepsize = 1 / self.nq[self.last_state][self.last_action]
    self.q[self.last_state][self.last_action] += stepsize * (r - self.q[self.last_state][self.last_action])

    if self.ucb_c:
      # UCB scores: value plus exploration bonus; the 1e-5 guards against
      # division by zero for actions never tried in this state.
      tmp = {a: v + self.ucb_c * (math.log(self.t) / (1e-5 + self.nq[self.last_state][a]))**.5 for (a, v) in self.q[self.last_state].items()}
    else:
      tmp = self.q[self.last_state]

    self.pi.update(self.last_state, misc.argmax(tmp))
    self.last_state = s
    self.last_action = self.pi.get(s)
    return self.last_action
Example #8
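  # Terminal-step update: with no next state to bootstrap from, the TD
  # target reduces to the final reward r.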
  def end(self, r):
    self.q[self.last_state][self.last_action] += self.alpha * (r - self.q[self.last_state][self.last_action])

    best_actions = misc.argmax(self.q[self.last_state])
    self.pi.update(self.last_state, best_actions)