def run(self):
    """ Run SARSA(lambda) policy evaluation for self.T episodes. """
    t = 0
    while t < self.T:
        # Eligibility traces are reset at the start of each episode
        self.e_trace.clear()
        s = generate_initial_state()
        a = self._choose_action(s)
        while True:
            # Decay all traces, then bump the features active in (s, a)
            for i in range(len(_feature_space)):
                self.e_trace[i] *= self.lambda_
            f_a = _phi(s, a)
            for i in f_a:
                self.e_trace[i] += 1
            s1, r = step(s, a)
            delta = r - self.q(s, a)
            if is_episode_terminated(r, a):
                # Terminal step: no bootstrap term in the TD error
                self._update_theta(delta)
                break
            s = s1
            a = self._choose_action(s)
            delta += self.q(s, a)  # delta = r + Q(s1, a1) - Q(s, a)
            self._update_theta(delta)
        self.learning_curve.append((t, self.extract_q()))
        t += 1
    return self
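For reference, the inner loop above implements the standard SARSA(λ) update with a linear function approximator and an (implicit) discount factor γ = 1: the eligibility trace decays as e ← λ·e before the components active in φ(s, a) are each incremented by 1, the TD error is δ = r + Q(s1, a1) − Q(s, a) (with the bootstrap term Q(s1, a1) dropped on terminal steps), and `_update_theta` is assumed to apply the gradient step θ ← θ + α·δ·e.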
def generate_episode(policy):
    """
    Returns a generator which can be iterated to reveal successive game steps.

    Each call to next() on this generator returns a 5-tuple (s, a, r, s1, a1) where:

        s  - the initial state
        a  - the action taken in s
        r  - the reward received
        s1 - the new state after a is performed
        a1 - a new action drawn from the policy in state s1

    :param policy: the policy pi(s, a)
    :return: a game episode generator
    """
    s = generate_initial_state()
    a = draw_action(s, policy)
    while True:
        s1, r = step(s, a)
        a1 = draw_action(s1, policy)
        yield (s, a, r, s1, a1)
        # Episode ends after we stick or lose, whichever comes first
        if is_episode_terminated(r, a):
            break
        s = s1
        a = a1
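As a minimal usage sketch, a generator like this is consumed with an ordinary for loop; here `my_policy` is a hypothetical stand-in for any policy accepted by `draw_action`, and the loop accumulates the (undiscounted) return of a single episode:

# Hypothetical usage: `my_policy` is a placeholder for any pi(s, a)
# that draw_action understands. The generator terminates itself once
# is_episode_terminated fires, so the loop simply runs to exhaustion.
episode_return = 0
for s, a, r, s1, a1 in generate_episode(my_policy):
    episode_return += r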