def run_simulation(game_map: Map, Q: dict, epsilon_greedy=max_first,
                   epsilon: float = 0.3, strategy: str = "softmax"):
    """Run one visualized episode following the policy stored in Q.

    Args:
        game_map: the environment; reset, stepped via apply_action, and
            queried with is_final/win.
        Q: mapping (state, action) -> (q_value, visit_count) learned earlier.
        epsilon_greedy: action-selection function with signature
            (Q, state, available_actions, epsilon, strategy).
        epsilon: exploration parameter forwarded to epsilon_greedy
            (previously hard-coded to 0.3).
        strategy: selection strategy name forwarded to epsilon_greedy
            (previously hard-coded to "softmax").
    """
    display = Display(game_map)
    game_map.reset()
    state = deepcopy(game_map.get_init_state())
    run = True
    score = 0
    display.render()
    while run:
        action = epsilon_greedy(
            Q, state, game_map.get_available_actions(state), epsilon, strategy)
        # BUG FIX: the reward was previously discarded (`state, _ = ...`),
        # so `score` stayed 0 and is_final(score) never saw the accumulated
        # score — unlike the training loop in sarsa(), which accumulates it.
        state, reward = game_map.apply_action(action)
        score += reward
        run = not Display.is_close() and not game_map.is_final(score)
        display.render()
    Display.close_display()
    if game_map.win():
        print("Game Win!!!")
    else:
        print("Game Lose!!!")
def sarsa(game_map, epsilon_greedy, Q=None, learning_rate=0.1,
          discovering_factor=0.9, epsilon=0.3, training_episode=1000,
          evaluation_episode=10, strategy="default", verbose=False):
    """Train a Q-table with the SARSA (on-policy TD control) algorithm.

    Args:
        game_map: environment exposing reset/apply_action/is_final/win and
            the available-action queries used below.
        epsilon_greedy: action-selection function with signature
            (Q, state, available_actions, epsilon, strategy).
        Q: optional pre-trained table mapping (state, action) ->
            (q_value, visit_count). A fresh dict is created when omitted.
        learning_rate: TD step size (alpha).
        discovering_factor: discount factor (gamma).
        epsilon: exploration rate handed to epsilon_greedy.
        training_episode: number of training episodes to run.
        evaluation_episode: window length for the averaged evaluation scores.
        strategy: selection strategy name handed to epsilon_greedy.
        verbose: when True, print the Q-table, render each step, and block
            on stdin until Enter is pressed.

    Returns:
        Tuple (Q, train_scores, eval_scores, win_ratio, q_values,
        number_cell_visited).
    """
    # BUG FIX: the original signature used `Q={}` — a mutable default shared
    # across calls, so a second training run silently reused the first run's
    # table. Callers that pass Q explicitly are unaffected.
    if Q is None:
        Q = {}
    train_scores = []
    eval_scores = []
    initial_state = game_map.get_init_state()
    win_game = 0
    q_values = []
    number_cell_visited = []
    if verbose:
        # Imports hoisted here: they were previously re-executed inside the
        # per-step training loop on every verbose iteration.
        from src.map import ACTIONS
        import sys
        display = Display(game_map)

        def _debug_step():
            # Show the Q-table and the board, then block until Enter;
            # exit cleanly if the window was closed.
            print_q(Q, game_map.height, game_map.width, ACTIONS)
            display.render()
            sys.stdin.readline()
            if display.is_close():
                sys.exit(0)

    for train_ep in range(1, training_episode + 1):
        game_map.reset()
        score = 0
        state = deepcopy(initial_state)
        # NOTE(review): the first action uses get_available_actions_mouse()
        # while in-loop selection uses get_available_actions(next_state) —
        # presumably intentional (initial mouse position); confirm.
        actions = game_map.get_available_actions_mouse()
        action = epsilon_greedy(Q, state, actions, epsilon, strategy)
        if verbose:
            _debug_step()
        while not game_map.is_final(score):
            # Apply the action; observe the next state and the reward.
            next_state, reward = game_map.apply_action(action)
            score += reward
            state_q_value, number_visited = Q.get((state, action), (0, 0))
            # On-policy: the successor action is chosen by the same policy
            # and is the one actually taken next iteration.
            action_next_state = epsilon_greedy(
                Q, next_state, game_map.get_available_actions(next_state),
                epsilon, strategy)
            next_state_q_value, _ = Q.get(
                (next_state, action_next_state), (0, 0))
            # SARSA update: Q(s,a) += alpha * (r + gamma*Q(s',a') - Q(s,a)).
            q_value = state_q_value + learning_rate * (
                reward + discovering_factor * next_state_q_value
                - state_q_value)
            Q[(state, action)] = (q_value, number_visited + 1)
            state = next_state
            action = action_next_state
            if verbose:
                _debug_step()
        # Episode bookkeeping: total Q mass, win count, score, table size.
        q_values.append(sum(x[Q_VALUE] for x in Q.values()))
        win_game += int(game_map.win())
        train_scores.append(score)
        number_cell_visited.append(len(Q.keys()))
        print("Episode {}/{}, score : {} win : {}".format(
            train_ep, training_episode, score, game_map.win()))
        if train_ep % evaluation_episode == 0:
            avg_score = mean(
                train_scores[train_ep - evaluation_episode:train_ep])
            eval_scores.append(avg_score)
    # Convert the win counter into a win ratio over all training episodes.
    win_game /= training_episode
    return Q, train_scores, eval_scores, win_game, q_values, number_cell_visited