def play_game(grid, policy):
  # returns a list of states and corresponding rewards (not returns!)
  # start at the designated start state
  s = (2, 0)
  grid.set_state(s)
  states_and_rewards = [(s, 0)]
  while not grid.game_over():
    a = policy[s]
    a = random_action(a)
    r = grid.move(a)
    s = grid.current_state()
    states_and_rewards.append((s, r))
  return states_and_rewards
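
# A minimal sketch of the random_action helper assumed above: an
# epsilon-soft behavior policy that returns the given action with
# probability 1 - eps and otherwise picks uniformly at random. The
# ALL_POSSIBLE_ACTIONS tuple is an assumption (up/down/left/right).
import numpy as np

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

def random_action(a, eps=0.1):
  # keep the intended action most of the time...
  p = np.random.random()
  if p < (1 - eps):
    return a
  # ...otherwise explore with a uniformly random action
  return np.random.choice(ALL_POSSIBLE_ACTIONS)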
alpha = ALPHA / t2  # learning rate decays via a separate, slower counter t2

# instead of 'generating' an episode, we will PLAY an episode within this loop
s = (2, 0)
grid.set_state(s)

# get Q(s) to choose the first action
Qs = getQs(model, s)

# the first (s, r) tuple is the state we start in and 0
# (since we don't get a reward) for simply starting the game
# the last (s, r) tuple is the terminal state and the final reward
# the value for the terminal state is by definition 0, so we don't
# care about updating it.
a = max_dict(Qs)[0]
a = random_action(a, eps=0.5/t)
biggest_change = 0
while not grid.game_over():
  r = grid.move(a)
  s2 = grid.current_state()

  # we need the next action since Q(s,a) depends on Q(s',a')
  old_theta = model.theta.copy()
  if grid.is_terminal(s2):
    model.theta += alpha * (r - model.predict(s, a)) * model.grad(s, a)
  else:
    # not terminal: semi-gradient SARSA update using the next
    # epsilon-greedy action a2
    Qs2 = getQs(model, s2)
    a2 = max_dict(Qs2)[0]
    a2 = random_action(a2, eps=0.5/t)  # epsilon-greedy
    model.theta += alpha * (r + GAMMA * model.predict(s2, a2) - model.predict(s, a)) * model.grad(s, a)

    # next state and action become current
    s = s2
    a = a2
  biggest_change = max(biggest_change, np.abs(model.theta - old_theta).sum())
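
# The loop above assumes a linear model over (state, action) features and
# a getQs helper that evaluates Q(s, a) for every action. A minimal sketch
# under those assumptions: the class name LinearQModel and the one-hot
# encoding are hypothetical (one-hot features make this equivalent to a
# table), but the theta/predict/grad interface matches what the loop uses.
class LinearQModel:
  def __init__(self, states):
    # one weight per (state, action) pair
    self.sa2idx = {}
    for s in states:
      for a in ALL_POSSIBLE_ACTIONS:
        self.sa2idx[(s, a)] = len(self.sa2idx)
    self.theta = np.zeros(len(self.sa2idx))

  def x(self, s, a):
    # one-hot feature vector for (s, a); any fixed-length encoding would do
    v = np.zeros(len(self.theta))
    v[self.sa2idx[(s, a)]] = 1.0
    return v

  def predict(self, s, a):
    # Q(s, a) = theta . x(s, a)
    return self.theta.dot(self.x(s, a))

  def grad(self, s, a):
    # gradient of a linear model w.r.t. theta is just the feature vector
    return self.x(s, a)

def getQs(model, s):
  # evaluate Q(s, a) for every action so we can take an argmax
  return {a: model.predict(s, a) for a in ALL_POSSIBLE_ACTIONS}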
print("it:", it) # instead of 'generating' an epsiode, we will PLAY # an episode within this loop s = (2, 0) # start state grid.set_state(s) # the first (s, r) tuple is the state we start in and 0 # (since we don't get a reward) for simply starting the game # the last (s, r) tuple is the terminal state and the final reward # the value for the terminal state is by definition 0, so we don't # care about updating it. a, _ = max_dict(Q[s]) biggest_change = 0 while not grid.game_over(): a = random_action(a, eps=0.5 / t) # epsilon-greedy # random action also works, but slower since you can bump into walls # a = np.random.choice(ALL_POSSIBLE_ACTIONS) r = grid.move(a) s2 = grid.current_state() # we will update Q(s,a) AS we experience the episode old_qsa = Q[s][a] # the difference between SARSA and Q-Learning is with Q-Learning # we will use this max[a']{ Q(s',a')} in our update # even if we do not end up taking this action in the next step a2, max_q_s2a2 = max_dict(Q[s2]) Q[s][a] = Q[s][a] + ALPHA * (r + GAMMA * max_q_s2a2 - Q[s][a]) biggest_change = max(biggest_change, np.abs(old_qsa - Q[s][a])) # we would like to know how often Q(s) has been updated too
for i in range(10000):
  if i % 100 == 0:
    t += 10e-3
  if i % 2000 == 0:
    print('i:', i)

  # start state
  s = (2, 0)
  grid.set_state(s)

  # the first (s, r) tuple is the state we start in and 0 for r
  # the last (s, r) tuple is terminal, so r is 0 and we don't care to update it
  a = max_dict(Q[s])[0]
  a = random_action(a, eps=0.5/t)
  biggest_change = 0
  while not grid.game_over():
    r = grid.move(a)
    s2 = grid.current_state()

    # we need the next action as well since Q(s,a) depends on Q(s',a')
    # if s2 is not in the policy, it's a terminal state and all Q values are 0
    a2 = max_dict(Q[s2])[0]
    a2 = random_action(a2, eps=0.5/t)

    # we update Q(s,a) as we experience the episode
    alpha = ALPHA / update_counts_sa[s][a]
    update_counts_sa[s][a] += 0.005
    old_qsa = Q[s][a]
    Q[s][a] = Q[s][a] + alpha * (r + GAMMA * Q[s2][a2] - Q[s][a])
    biggest_change = max(biggest_change, np.abs(Q[s][a] - old_qsa))

    # next state and action become current
    s = s2
    a = a2
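
# All of these loops assume a gridworld object exposing set_state /
# current_state / move / game_over / is_terminal. A minimal sketch of that
# interface; the rewards and actions dicts are assumptions (e.g. built by
# a standard_grid()-style helper for a 3x4 world with terminal states at
# (0, 3) and (1, 3)):
class Grid:
  def __init__(self, rewards, actions, start=(2, 0)):
    self.rewards = rewards    # {(row, col): reward}
    self.actions = actions    # {(row, col): legal actions}; terminal states absent
    self.i, self.j = start

  def set_state(self, s):
    self.i, self.j = s

  def current_state(self):
    return (self.i, self.j)

  def is_terminal(self, s):
    return s not in self.actions

  def game_over(self):
    return (self.i, self.j) not in self.actions

  def move(self, a):
    # move only if the action is legal in this state (otherwise bump into a wall)
    if a in self.actions.get((self.i, self.j), ()):
      if a == 'U':
        self.i -= 1
      elif a == 'D':
        self.i += 1
      elif a == 'L':
        self.j -= 1
      elif a == 'R':
        self.j += 1
    return self.rewards.get((self.i, self.j), 0)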
for a in ALL_POSSIBLE_ACTIONS:
  update_counts_sa[s][a] = 1.0

t = 1.0
deltas = []
for it in range(10000):
  if it % 100 == 0:
    t += 10e-3
  if it % 2000 == 0:
    print('it:', it)

  s = (2, 0)
  grid.set_state(s)
  a = max_dict(Q[s])[0]
  a = random_action(a, eps=0.5/t)
  biggest_change = 0
  while not grid.game_over():
    r = grid.move(a)
    s1 = grid.current_state()
    a1 = max_dict(Q[s1])[0]
    a1 = random_action(a1, eps=0.5/t)

    # adaptive, count-based learning rate; use it (not the raw ALPHA)
    # in the update below
    alpha = ALPHA / update_counts_sa[s][a]
    update_counts_sa[s][a] += 0.005
    old_qsa = Q[s][a]
    Q[s][a] = Q[s][a] + alpha * (r + GAMMA * Q[s1][a1] - Q[s][a])
    biggest_change = max(biggest_change, np.abs(Q[s][a] - old_qsa))

    # next state and action become current
    s = s1
    a = a1
  deltas.append(biggest_change)
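
# Plotting the per-episode deltas collected above is a quick convergence
# check; a minimal sketch assuming matplotlib is available:
import matplotlib.pyplot as plt

plt.plot(deltas)
plt.title('max Q change per episode')
plt.show()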
t = 1.0
deltas = []
for it in range(10000):
  if it % 100 == 0:
    t += 10e-3
  if it % 2000 == 0:
    print('it:', it)

  s = (2, 0)
  grid.set_state(s)
  a = max_dict(Q[s])[0]
  biggest_change = 0
  while not grid.game_over():
    a = random_action(a, eps=0.5/t)
    # a = np.random.choice(ALL_POSSIBLE_ACTIONS)
    r = grid.move(a)
    s2 = grid.current_state()

    # update Q(s,a) as we experience the episode
    alpha = ALPHA / update_counts_sa[s][a]
    update_counts_sa[s][a] += 0.005
    old_qsa = Q[s][a]
    # the difference between SARSA and Q-Learning is that with
    # Q-Learning we use max[a']{Q(s', a')} in our update,
    # even if we do not end up taking this action in the next step
    a2, max_q_s2a2 = max_dict(Q[s2])
    Q[s][a] = Q[s][a] + alpha * (r + GAMMA * max_q_s2a2 - Q[s][a])
    biggest_change = max(biggest_change, np.abs(old_qsa - Q[s][a]))

    # next state becomes current state
    s = s2
    a = a2
  deltas.append(biggest_change)
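
# Once Q has stopped changing, the greedy policy and state values can be
# read off directly; a short sketch (grid.actions holding the non-terminal
# states is an assumption matching the Grid sketch above):
policy = {}
V = {}
for s in grid.actions.keys():
  policy[s], V[s] = max_dict(Q[s])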
print "it:", it # instead of 'generating' an epsiode, we will PLAY # an episode within this loop s = (2, 0) # start state grid.set_state(s) # the first (s, r) tuple is the state we start in and 0 # (since we don't get a reward) for simply starting the game # the last (s, r) tuple is the terminal state and the final reward # the value for the terminal state is by definition 0, so we don't # care about updating it. a, _ = max_dict(Q[s]) biggest_change = 0 while not grid.game_over(): a = random_action(a, eps=0.5/t) # epsilon-greedy # random action also works, but slower since you can bump into walls # a = np.random.choice(ALL_POSSIBLE_ACTIONS) r = grid.move(a) s2 = grid.current_state() # adaptive learning rate alpha = ALPHA / update_counts_sa[s][a] update_counts_sa[s][a] += 0.005 # we will update Q(s,a) AS we experience the episode old_qsa = Q[s][a] # the difference between SARSA and Q-Learning is with Q-Learning # we will use this max[a']{ Q(s',a')} in our update # even if we do not end up taking this action in the next step a2, max_q_s2a2 = max_dict(Q[s2])