    # throw out the last element because it is a terminal state
    return list(reversed(state_returns))[:-1]


if __name__ == '__main__':
    # demo: evaluate a fixed policy by playing the game (Monte Carlo)
    grid = standard_grid()
    V = {}

    # policy to check
    POLICY = {
        (2, 0): 'U',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
        (1, 0): 'U',
        (1, 2): 'U',  # avoids the losing terminal at (1, 3)
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',  # leads to the winning terminal at (0, 3)
    }
    print_policy(POLICY, grid)

    # one list of sampled returns per state
    state_to_return_list = {}
    for s in grid.location_to_action.keys():
        state_to_return_list[s] = []

    start_state_list = list(grid.location_to_action.keys())
    for i in range(1000):
        # run a game from a random start state (exploring starts)
        test_state_index = np.random.choice(len(start_state_list))
        state_returns = play_game_and_get_returns(
            start_state_list[test_state_index], grid, POLICY)
        eval_states = set()
        for s, G in state_returns:
            # first-visit Monte Carlo: only record the first occurrence of s
            if s not in eval_states:
                state_to_return_list[s].append(G)
                eval_states.add(s)

    # V(s) is the sample mean of the returns observed from s
    for s, return_list in state_to_return_list.items():
        if return_list:
            V[s] = np.mean(return_list)

    print("values:")
    print_values(V, grid)
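

# For context, a minimal sketch of how play_game_and_get_returns (whose tail
# appears above) typically builds its (state, G) pairs: sweep the episode's
# (state, reward) trace backwards, accumulating G = r + gamma * G. The input
# format and the name `states_and_rewards` are assumptions for illustration,
# not taken from this file.
def sketch_returns_from_episode(states_and_rewards, gamma):
    G = 0
    state_returns = []
    # iterate from the terminal step back to the start; when we visit (s, r),
    # G already holds the discounted sum of all rewards received after s
    for s, r in reversed(states_and_rewards):
        state_returns.append((s, G))
        G = r + gamma * G
    # restore chronological order and drop the terminal state's entry,
    # matching the return statement above
    return list(reversed(state_returns))[:-1]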
            else:
                Qs2 = getQs(model, s2)
                a2, MaxQs2a2 = max_dict(Qs2)
                a2 = random_action(a2, eps=0.5 / t)  # epsilon-greedy, decaying with t
                # semi-gradient SARSA update on the linear model's weights
                model.theta += alpha * (r + gamma * MaxQs2a2
                                        - model.predict(s, a)) * model.grad(s, a)
                s = s2
                a = a2
            delta = max(delta, np.abs(old_theta - model.theta).sum())
        deltas.append(delta)

    plt.plot(deltas)
    plt.show()

    # extract the greedy policy and V function from the learned Q
    Policy = {}
    V = {}
    Q = {}
    for s in g.actions.keys():
        Q[s] = getQs(model, s)
        a, max_q = max_dict(Q[s])
        Policy[s] = a
        V[s] = max_q

    print("Values")
    print_values(V, g)
    print("Policy")
    print_policy(Policy, g)
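

# The helpers getQs and max_dict used above are not shown in this section; a
# minimal sketch under the assumption that the actions are the usual four
# moves (the name ALL_POSSIBLE_ACTIONS is hypothetical here):
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

def max_dict(d):
    # return the (key, value) pair with the largest value
    max_key, max_val = None, float('-inf')
    for k, v in d.items():
        if v > max_val:
            max_key, max_val = k, v
    return max_key, max_val

def getQs(model, s):
    # one Q estimate per action, all from the shared linear model
    return {a: model.predict(s, a) for a in ALL_POSSIBLE_ACTIONS}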
            break


if __name__ == '__main__':
    grid = negative_grid(-0.3)
    print("rewards:")
    print_values(grid.location_to_rewards, grid)

    # initialize a random policy, then iteratively improve it
    policy = {}
    for s in grid.location_to_action.keys():
        policy[s] = np.random.choice(grid.location_to_action[s])
    print("initial policy:")
    print_policy(policy, grid)

    V = initalize_V(grid)
    while True:
        # policy evaluation: find V for the current policy
        evalulate_v_for_policy(policy, grid, V)
        # policy improvement: now that we know V for this policy, make the
        # policy greedy with respect to V; stop once it no longer changes
        if check_if_policy_converges(grid, V, policy):
            break

    print("finished")
    print_values(V, grid)
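

# A sketch of what the improvement step inside check_if_policy_converges
# plausibly does, assuming a deterministic transition helper
# grid.get_next_state(s, a) (a hypothetical name; this file's grid class may
# expose it differently): make the policy greedy via one-step lookahead and
# report whether any action changed.
def sketch_improve_policy(grid, V, policy, gamma=0.9):
    converged = True
    for s in grid.location_to_action.keys():
        old_action = policy[s]
        best_action, best_value = None, float('-inf')
        for a in grid.location_to_action[s]:
            s2 = grid.get_next_state(s, a)           # assumed API
            r = grid.location_to_rewards.get(s2, 0)  # reward on arriving at s2
            if r + gamma * V[s2] > best_value:
                best_value = r + gamma * V[s2]
                best_action = a
        policy[s] = best_action
        if best_action != old_action:
            converged = False  # the policy is still changing
    return converged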
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
    }

    V = initalize_V(grid)
    for i in range(1000):
        # play one episode and collect its (state, reward) sequence
        state_to_returns = play_game_return_state_rewards(grid, POLICY_TO_EVAL)

        # TD(0) update: V(s) <- V(s) + ALPHA * (r + GAMMA * V(s') - V(s))
        for j in range(len(state_to_returns) - 1):
            s, r0 = state_to_returns[j]  # r0 (reward on arriving at s) is unused
            s2, r = state_to_returns[j + 1]
            V[s] = V[s] + ALPHA * (r + GAMMA * V[s2] - V[s])

    print("rewards")
    print_values(grid.location_to_rewards, grid)
    print("values:")
    print_values(V, grid)
    print("policy:")
    print_policy(POLICY_TO_EVAL, grid)
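

# For reference, a sketch of the episode generator consumed by the TD(0) loop
# above. The grid methods set_state / move / game_over / current_state are
# assumptions (mirroring the classic gridworld class), as is the fixed start
# state; only the returned (state, reward) format is dictated by the loop above.
def sketch_play_game_return_state_rewards(grid, policy, start_state=(2, 0)):
    grid.set_state(start_state)          # assumed API
    s = start_state
    states_and_rewards = [(s, 0)]        # no reward for arriving at the start state
    while not grid.game_over():          # assumed API
        a = policy[s]
        r = grid.move(a)                 # assumed: move returns the reward
        s = grid.current_state()         # assumed API
        states_and_rewards.append((s, r))
    return states_and_rewards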