from collections import defaultdict


def on_mc():
    """On-policy every-visit Monte Carlo control with an epsilon-greedy policy."""
    grid_size = 4
    env = Env(grid_size)
    policy = EspionGreedyPolicy(env.actions(), range(grid_size**2))
    Q = defaultdict(float)
    R = defaultdict(list)
    for i in range(5000):
        G = 0
        states = get_episode(env, policy)
        # Walk the episode backwards, accumulating the discounted return G
        # (gamma = 0.9) and averaging it into Q for every (state, action)
        # pair visited.
        for (s0, a, s1, r) in reversed(states):
            G = 0.9 * G + r
            R[(s0, a)].append(G)
            Q[(s0, a)] = sum(R[(s0, a)]) / len(R[(s0, a)])
        # Policy improvement: make the policy greedy w.r.t. Q in every
        # state visited during the episode.
        for (s0, a, s1, r) in reversed(states):
            mm = [(x, Q[(s0, x)]) for x in env.actions()]
            action = max(mm, key=lambda x: x[1])[0]
            policy.set_max(s0, action)
    Pi = {}
    for i in range(grid_size**2):
        Pi[i] = policy.get_m(i)
    for t in env.get_t():
        Pi[t] = 'ter'
    env.render(Pi)
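# Both Monte Carlo functions rely on a get_episode helper that is not shown
# in this listing. The sketch below is a hypothetical reconstruction, not the
# original: its contract is inferred from how on_mc() and off_mc() consume
# its return value (a full episode as (state, action, next_state, reward)
# tuples) and from the Env methods init()/step()/is_t() used by sarsa().
def get_episode(env, policy):
    episode = []
    s0 = env.init()
    while not env.is_t(s0):
        a = policy.get_a(s0)
        s1, r = env.step(a)  # assumes Env tracks the current state internally
        episode.append((s0, a, s1, r))
        s0 = s1
    return episode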
def off_mc():
    """Off-policy Monte Carlo control with weighted importance sampling."""
    env = Env(6)
    policy = RandomPolicy(env.actions())  # behaviour policy
    C = defaultdict(float)  # cumulative importance-sampling weights
    Q = defaultdict(float)
    Pi = {}  # greedy target policy
    for i in range(10000):
        G = 0
        W = 1.0
        states = get_episode(env, policy)
        for (s0, a, s1, r) in reversed(states):
            G = 0.9 * G + r
            C[(s0, a)] += W
            # Incremental weighted-importance-sampling update of Q.
            Q[(s0, a)] += W / C[(s0, a)] * (G - Q[(s0, a)])
            Pi[s0] = max([(x, Q[(s0, x)]) for x in env.actions()],
                         key=lambda x: x[1])[0]
            # Once the behaviour action deviates from the greedy target
            # policy, the weight for the rest of the episode prefix is 0,
            # so stop here.
            if a != Pi[s0]:
                break
            # The target policy is greedy (probability 1), so only the
            # behaviour probability appears in the importance ratio.
            W = W / policy.get_p(s0, a)
    for t in env.get_t():
        Pi[t] = 'ter'
    env.render(Pi)
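# Hypothetical sketch of the behaviour policy assumed by off_mc(); the real
# RandomPolicy is defined elsewhere in the project. The key requirement,
# visible in the code above, is that get_p(s, a) returns the probability the
# behaviour policy assigns to action a in state s (uniform here), since it
# is the denominator of the importance-sampling ratio.
import random


class RandomPolicySketch(object):
    def __init__(self, actions):
        self.actions = list(actions)

    def get_a(self, s):
        # Sample uniformly at random, independent of the state.
        return random.choice(self.actions)

    def get_p(self, s, a):
        return 1.0 / len(self.actions)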
def sarsa():
    """Tabular SARSA with step size 0.9 and discount 0.9."""
    grid_size = 4
    env = Env(grid_size)
    policy = EspionGreedyPolicy(env.actions(), range(grid_size**2))
    Q = defaultdict(float)
    for i in range(5000):
        s0 = env.init()
        if env.is_t(s0):
            continue
        a0 = policy.get_a(s0)
        while not env.is_t(s0):
            s, r = env.step(a0)
            a = policy.get_a(s)
            # SARSA update: bootstrap on the action actually taken next.
            Q[(s0, a0)] += 0.9 * (r + 0.9 * Q[(s, a)] - Q[(s0, a0)])
            # Greedify the policy in the state that was just updated,
            # before moving on to the next state-action pair.
            mm = [(x, Q[(s0, x)]) for x in env.actions()]
            action = max(mm, key=lambda x: x[1])[0]
            policy.set_max(s0, action)
            s0 = s
            a0 = a
    Pi = {}
    for i in range(grid_size**2):
        Pi[i] = policy.get_m(i)
    for t in env.get_t():
        Pi[t] = 'ter'
    env.render(Pi)
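# Hypothetical sketch of the epsilon-greedy policy used throughout this
# listing; the real EspionGreedyPolicy is defined elsewhere. From its usage
# above it keeps one greedy action per state (set_max / get_m) and explores
# with some probability eps in get_a(). The eps value and initialisation are
# assumptions for illustration.
import random


class EpsilonGreedyPolicySketch(object):
    def __init__(self, actions, states, eps=0.1):
        self.actions = list(actions)
        self.eps = eps
        self.greedy = dict((s, random.choice(self.actions)) for s in states)

    def get_a(self, s):
        # Explore with probability eps, otherwise act greedily.
        if random.random() < self.eps:
            return random.choice(self.actions)
        return self.greedy[s]

    def set_max(self, s, a):
        self.greedy[s] = a

    def get_m(self, s):
        return self.greedy[s]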
from sklearn.linear_model import SGDRegressor


class Sarsa(object):
    """SARSA with linear function approximation over one-hot state/action features."""

    def __init__(self, size=4):
        self.grid_size = size
        self.env = Env(self.grid_size)
        self.a_id = dict([(a, i) for i, a in enumerate(self.env.actions())])
        self.policy = EspionGreedyPolicy(self.env.actions(),
                                         range(self.grid_size**2))

    def get_f(self, s, a):
        # One-hot feature vector: grid_size**2 state features followed by
        # 4 action features, so the action bits must be offset past the
        # state bits.
        f = [0] * (self.grid_size**2 + 4)
        f[s] = 1
        f[self.grid_size**2 + self.a_id[a]] = 1
        return f

    def sarsa(self):
        policy = self.policy
        Q = SGDRegressor()
        # Prime the regressor with one sample so predict() can be called
        # before any real training has happened.
        f = self.get_f(1, 'left')
        Q.fit([f], [1])
        for i in range(500):
            s0 = self.env.init()
            if self.env.is_t(s0):
                continue
            a0 = policy.get_a(s0)
            while not self.env.is_t(s0):
                s, r = self.env.step(a0)
                a = policy.get_a(s)
                f0 = self.get_f(s0, a0)
                f = self.get_f(s, a)
                # Move the prediction for (s0, a0) toward the SARSA target,
                # then fit on f0 -- the features of the pair being updated.
                target = Q.predict([f0])[0] + 0.9 * (
                    r + 0.9 * Q.predict([f])[0] - Q.predict([f0])[0])
                Q.partial_fit([f0], [target])
                # Greedify the policy in the state that was just updated.
                mm = [(x, Q.predict([self.get_f(s0, x)])[0])
                      for x in self.env.actions()]
                action = max(mm, key=lambda x: x[1])[0]
                policy.set_max(s0, action)
                s0 = s
                a0 = a
        Pi = {}
        for i in range(self.grid_size**2):
            Pi[i] = policy.get_m(i)
        for t in self.env.get_t():
            Pi[t] = 'ter'
        self.env.render(Pi)
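# Usage sketch (an assumption, not part of the original listing): each call
# trains on the gridworld and renders the learned greedy policy, with 'ter'
# marking terminal cells.
if __name__ == '__main__':
    on_mc()
    off_mc()
    sarsa()
    Sarsa(4).sarsa()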