import pygame
import numpy as np
from time import time, sleep

# Project-local helpers (create_grid_world, create_line_world, create_tic_tac,
# value_iteration, iterative_policy_evaluation, the tabular_*_control trainers,
# step / reset_* / is_terminal*, the display_* functions, event_loop and
# settings_tic_tac) are assumed to be imported from elsewhere in this repo.


def test_grid_value_iteration():
    pygame.init()
    w = 6
    h = 5
    rewards = ((24, 1), (2, -1), (11, -1), (27, -1))
    terminal = [2, 11, 24]
    # Maps a neighbour direction to its action index.
    new_pos = {"top": 2, "bot": 3, "left": 0, "right": 1}
    S, A, T, P = create_grid_world(w, h, rewards, terminal)

    start_time = time()
    V, Pi = value_iteration(S, A, P, T)
    print("--- %s seconds ---" % (time() - start_time))

    win = pygame.display.set_mode((w * 100, h * 100))

    # Print the value function, one grid row per line.
    for i in range(w * h):
        if i % w == 0 and i != 0:
            print("")
        print(round(V[i], 7), end=" ")
    print("")
    # Print the policy in the same layout.
    for i in range(w * h):
        if i % w == 0 and i != 0:
            print("")
        print(Pi[i], end=" ")

    # Follow the computed policy from the start state until a terminal cell.
    st = reset_grid(w, h)
    while not is_terminal(st, T):
        display_grid(win, w, h)
        event_loop()
        display_reward_grid(win, rewards, w, h)
        display_mouse_grid(win, st, w, h)
        sleep(1)
        positions = {"top": st - w, "bot": st + w, "left": st - 1, "right": st + 1}
        # Keep only in-bounds neighbours to which the policy assigns positive
        # probability, then move towards the most probable one.
        positions_bis = {key: Pi[st][new_pos[key]]
                         for key, value in positions.items()
                         if 0 <= value < w * h and Pi[st][new_pos[key]] > 0}
        action = max(positions_bis, key=positions_bis.get)
        a = new_pos[action]
        st, r, term = step(st, a, T, S, P)
    display_grid(win, w, h)
    display_reward_grid(win, rewards, w, h)
    display_mouse_grid(win, st, w, h)
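
# A minimal sketch of the value-iteration backup driving the test above, for
# reference only. It assumes dense arrays p[s, a, s'] (transition
# probabilities) and r[s, a, s'] (rewards) rather than this repo's actual P
# structure, whose layout is defined elsewhere; the name, signature and
# hyperparameters are illustrative. Note the repo's value_iteration appears to
# return per-action policy probabilities, while this sketch returns the greedy
# action index per state.
def value_iteration_sketch(p, r, terminal, gamma=0.99, theta=1e-8):
    num_states, num_actions, _ = p.shape
    V = np.zeros(num_states)
    while True:
        delta = 0.0
        for s in range(num_states):
            if s in terminal:
                continue
            # Bellman optimality backup: best expected one-step return.
            q = (p[s] * (r[s] + gamma * V)).sum(axis=1)
            best = q.max()
            delta = max(delta, abs(best - V[s]))
            V[s] = best
        if delta < theta:
            break
    # Greedy policy with respect to the converged V.
    Pi = np.array([(p[s] * (r[s] + gamma * V)).sum(axis=1).argmax()
                   for s in range(num_states)])
    return V, Pi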

def test_grid_q_learning():
    pygame.init()
    w = 6
    h = 5
    rewards = ((24, 1), (2, -1), (11, -1), (27, -1))
    terminal = [2, 11, 24]
    S, A, T, P = create_grid_world(w, h, rewards, terminal)

    start_time = time()
    Q, Pi = tabular_q_learning_control(T, S, P, len(S), len(A),
                                       reset_line, is_terminal, step,
                                       episodes_count=10000,
                                       max_steps_per_episode=100)
    print("--- %s seconds ---" % (time() - start_time))

    win = pygame.display.set_mode((w * 100, h * 100))

    # Print the learned policy, one grid row per line.
    for i in range(w * h):
        if i % w == 0 and i != 0:
            print("")
        print(Pi[i], end=" ")

    # Act greedily with respect to the learned Q-table until a terminal cell.
    st = reset_grid(w, h)
    while not is_terminal(st, T):
        display_grid(win, w, h)
        event_loop()
        display_reward_grid(win, rewards, w, h)
        display_mouse_grid(win, st, w, h)
        sleep(1)
        a = np.argmax(Q[st])
        st, r, term = step(st, a, T, S, P)
    display_grid(win, w, h)
    display_reward_grid(win, rewards, w, h)
    display_mouse_grid(win, st, w, h)
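
# A minimal sketch of the tabular Q-learning update behind
# tabular_q_learning_control above, under an assumed interface: env_reset()
# returns a start state and env_step(s, a) returns (s', r, done). All names
# and hyperparameters are illustrative, not the repo's defaults.
def q_learning_sketch(env_reset, env_step, num_states, num_actions,
                      episodes=10000, max_steps=100,
                      alpha=0.1, gamma=0.99, epsilon=0.1):
    Q = np.zeros((num_states, num_actions))
    for _ in range(episodes):
        s = env_reset()
        for _ in range(max_steps):
            # Epsilon-greedy behaviour policy.
            if np.random.rand() < epsilon:
                a = np.random.randint(num_actions)
            else:
                a = int(np.argmax(Q[s]))
            s2, r, done = env_step(s, a)
            # Off-policy target: bootstrap on the greedy successor value.
            target = r + (0.0 if done else gamma * np.max(Q[s2]))
            Q[s, a] += alpha * (target - Q[s, a])
            s = s2
            if done:
                break
    return Q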

def test_line_iterative_policy_evaluation():
    pygame.init()
    num_states = 15
    rewards = ((0, -1), (14, 1))
    terminal = [0, 14]
    S, A, T, P = create_line_world(num_states, rewards, terminal)
    Pi = tabular_uniform_random_policy(S.shape[0], A.shape[0])

    start_time = time()
    V = iterative_policy_evaluation(S, A, P, T, Pi)
    print("--- %s seconds ---" % (time() - start_time))
    print(V)

    win = pygame.display.set_mode((num_states * 100, 100))

    # Walk greedily on the evaluated state values until a terminal cell.
    st = reset_line(num_states)
    while not is_terminal(st, T):
        display_line(win, num_states)
        event_loop()
        display_reward_line(win, rewards, num_states)
        display_mouse_line(win, st, num_states)
        sleep(1)
        # Move towards the neighbour with the higher value; a value of exactly
        # zero marks a terminal neighbour. Fall back to the left on an exact
        # tie so that `a` is always bound.
        if V[st + 1] > V[st - 1] or V[st + 1] == 0:
            a = 1
        else:
            a = 0
        st, r, term = step(st, a, T, S, P)
    display_line(win, num_states)
    display_reward_line(win, rewards, num_states)
    display_mouse_line(win, st, num_states)
    sleep(1)
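
# A minimal sketch of iterative policy evaluation, for reference only. Like
# the value-iteration sketch above it assumes dense p[s, a, s'] / r[s, a, s']
# arrays plus a policy matrix Pi[s, a] of action probabilities (the shape
# tabular_uniform_random_policy suggests); the repo's real P layout lives
# elsewhere.
def iterative_policy_evaluation_sketch(p, r, Pi, terminal,
                                       gamma=0.99, theta=1e-8):
    num_states = p.shape[0]
    V = np.zeros(num_states)
    while True:
        delta = 0.0
        for s in range(num_states):
            if s in terminal:
                continue
            # Bellman expectation backup: expected one-step return under Pi.
            v = (Pi[s][:, None] * p[s] * (r[s] + gamma * V)).sum()
            delta = max(delta, abs(v - V[s]))
            V[s] = v
        if delta < theta:
            break
    return V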

def test_line_sarsa():
    pygame.init()
    num_states = 15
    rewards = ((0, -1), (14, 1))
    terminal = [0, 14]
    S, A, T, P = create_line_world(num_states, rewards, terminal)

    start_time = time()
    Q, Pi = tabular_sarsa_control(T, S, P, len(S), len(A),
                                  reset_line, is_terminal, step,
                                  episodes_count=10000,
                                  max_steps_per_episode=100)
    print("--- %s seconds ---" % (time() - start_time))

    # Print the learned action values for each state.
    for i in range(num_states):
        print(Q[i], end=" ")

    win = pygame.display.set_mode((num_states * 100, 100))

    # Act greedily with respect to the learned Q-table until a terminal cell.
    st = reset_line(num_states)
    while not is_terminal(st, T):
        display_line(win, num_states)
        event_loop()
        display_reward_line(win, rewards, num_states)
        display_mouse_line(win, st, num_states)
        sleep(1)
        a = np.argmax(Q[st])
        st, r, term = step(st, a, T, S, P)
    display_line(win, num_states)
    display_reward_line(win, rewards, num_states)
    display_mouse_line(win, st, num_states)
    sleep(1)
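
# A minimal sketch of the on-policy SARSA update behind tabular_sarsa_control,
# under the same assumed env_reset/env_step interface as the Q-learning sketch
# above. The only difference from Q-learning is the target: SARSA bootstraps
# on the action actually selected next, not on the greedy maximum.
def sarsa_sketch(env_reset, env_step, num_states, num_actions,
                 episodes=10000, max_steps=100,
                 alpha=0.1, gamma=0.99, epsilon=0.1):
    def pick(s, Q):
        # Epsilon-greedy behaviour (and target) policy.
        if np.random.rand() < epsilon:
            return np.random.randint(num_actions)
        return int(np.argmax(Q[s]))

    Q = np.zeros((num_states, num_actions))
    for _ in range(episodes):
        s = env_reset()
        a = pick(s, Q)
        for _ in range(max_steps):
            s2, r, done = env_step(s, a)
            a2 = pick(s2, Q)
            # On-policy target: Q of the successor (state, action) pair.
            target = r + (0.0 if done else gamma * Q[s2, a2])
            Q[s, a] += alpha * (target - Q[s, a])
            s, a = s2, a2
            if done:
                break
    return Q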

def test_tic_tac_q_learning(display=1):
    w = 3
    h = 3
    if display == 1:
        pygame.init()
        win = pygame.display.set_mode((w * 100, h * 100))
        myfont = pygame.font.SysFont('Comic Sans MS', 30)
    s_terminal, s_sp, S, A = create_tic_tac(w, h)
    # Train player 0 with tabular Q-learning.
    Q0, Pi0 = tabular_q_learning_control_2(s_terminal, s_sp, 0, len(S), len(A),
                                           reset_tic_tac, is_terminate_tic_tac,
                                           step_tic_tac,
                                           episodes_count=10000,
                                           max_steps_per_episode=100)
    # Q1, Pi1 = monte_carlo_with_exploring_starts_control_2(
    #     s_terminal, s_sp, 1, len(S), len(A),
    #     is_terminate_tic_tac, step_tic_tac,
    #     episodes_count=10000, max_steps_per_episode=100)

    game_map = s_sp[0][0]
    state = s_sp[1][str(game_map)]
    is_terminal = False
    while not is_terminal:
        if display == 1:
            display_grid_tic_tac(win, w, h)
            pygame.display.flip()
            event_loop()

        # Player 0 (the trained agent) plays greedily from its Q-table.
        a = np.argmax(Q0[state])
        state, r0, is_terminal, a = step_tic_tac(state, a, s_terminal, s_sp, 0)
        if display == 1:
            display_players(win, state, w, h, s_sp)
            sleep(0.3)
        if r0 == settings_tic_tac.reward_win:
            if display == 1:
                display_win(win, 0, state, s_sp)
            return 0
        elif r0 == settings_tic_tac.reward_lose:
            if display == 1:
                display_win(win, 1, state, s_sp)
            return 1
        elif r0 == settings_tic_tac.reward_draw:
            if display == 1:
                display_win(win, -1, state, s_sp)
            return -1

        # Player 1 answers with a uniformly random move.
        a = np.random.choice(np.arange(9))
        state, r1, is_terminal, a = step_tic_tac(state, a, s_terminal, s_sp, 1)
        # a = np.argmax(Q1[state])
        # state, r, is_terminal = step_tic_tac(state, a, s_terminal, s_sp, 1)
        if display == 1:
            display_players(win, state, w, h, s_sp)
            sleep(0.3)
        if r1 == settings_tic_tac.reward_win:
            if display == 1:
                display_win(win, 1, state, s_sp)
            return 1
        elif r1 == settings_tic_tac.reward_lose:
            if display == 1:
                display_win(win, 0, state, s_sp)
            return 0
        elif r1 == settings_tic_tac.reward_draw:
            if display == 1:
                display_win(win, -1, state, s_sp)
            return -1
    sleep(10)
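
# Usage sketch: with display=0 the test above runs headless and returns the
# winner (0 for the trained agent, 1 for the random opponent, -1 for a draw),
# so the greedy policy can be scored over many games. Note that
# test_tic_tac_q_learning retrains on every call, which makes this slow; it is
# only meant to illustrate the return values.
def evaluate_tic_tac_sketch(games=100):
    from collections import Counter
    results = Counter(test_tic_tac_q_learning(display=0) for _ in range(games))
    print("agent wins: %d, random wins: %d, draws: %d"
          % (results[0], results[1], results[-1]))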