import random

import numpy as np

def execute_episode():
    history = []
    for state in range(N * M):
        # Choose an action epsilon-greedily among the moves valid in this cell.
        action = e_greedy(state, get_valid_actions(maze, state, ACTIONS), q_table, epsilon=0.1)
        history.append((state, action))
        R = reward(maze, state, action)
        next_state = get_next_state(maze, state, action)
        # One-step Q-learning update from the real transition.
        next_best = max(q_table[next_state])
        td_target = R + DISCOUNT * next_best
        td_diff = td_target - q_table[state][action]
        q_table[state][action] += LEARNING_RATE * td_diff
        q_table[state][action] = round(q_table[state][action], 3)
        # Record the transition so the planning loop can replay it (Dyna-Q).
        model[state][action] = (next_state, R)
        # Planning: replay PLANNING previously seen transitions from the model.
        for _ in range(PLANNING):
            s, a = random.choice(history)
            ns, r = model[s][a]
            next_best = max(q_table[ns])
            td_target = r + DISCOUNT * next_best
            td_diff = td_target - q_table[s][a]
            q_table[s][a] += LEARNING_RATE * td_diff
            q_table[s][a] = round(q_table[s][a], 3)
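# e_greedy is called above but not defined in this section. Below is a minimal
# epsilon-greedy sketch inferred from the call site; the article's actual
# implementation may differ.
def e_greedy(state, valid_actions, q_table, epsilon=0.1):
    # Explore with probability epsilon: pick a random valid action.
    if random.random() < epsilon:
        return random.choice(valid_actions)
    # Otherwise exploit: pick the valid action with the highest Q-value.
    return max(valid_actions, key=lambda a: q_table[state][a])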
def get_path():
    # Follow the greedy (argmax-Q) action from the start state until the
    # goal state N * M - 1 is reached.
    i = 0
    path = [i]
    while i != N * M - 1:
        i = get_next_state(maze, i, ACTIONS[np.argmax(q_table[i])])
        path.append(i)
    return path
def reward(maze, state, action):
    next_state = get_next_state(maze, state, action)
    x, y = state_to_cell(maze, state)
    if maze.grid[x][y].neighbors[ACTION_MAPPER[action]] == INF:
        # Heavy penalty for trying to move through a wall.
        return -10000
    elif next_state == (maze.num_rows * maze.num_columns - 1):
        # Large positive reward for reaching the goal cell.
        return 1000
    else:
        # Every other move is neutral.
        return 0
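# Hypothetical setup for the globals used above; the names are inferred from
# how they are indexed in this section, and the values shown are placeholders
# rather than the actual hyperparameters.
N, M = maze.num_rows, maze.num_columns                 # maze dimensions
q_table = np.zeros((N * M, len(ACTIONS)))              # one Q-value per (state, action)
model = [[None] * len(ACTIONS) for _ in range(N * M)]  # cached (next_state, reward)
LEARNING_RATE = 0.1   # TD step size (assumed value)
DISCOUNT = 0.95       # discount factor gamma (assumed value)
PLANNING = 50         # Dyna-Q planning updates per real step (assumed value)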
def execute_episode():
    # Alternative, plain one-step Q-learning version of execute_episode:
    # same TD update as above, but with no model and no planning loop.
    for state in range(N * M):
        action = e_greedy(state, get_valid_actions(maze, state, ACTIONS), q_table, epsilon=0.1)
        R = reward(maze, state, action)
        next_state = get_next_state(maze, state, action)
        next_best = max(q_table[next_state])
        td_target = R + DISCOUNT * next_best
        td_diff = td_target - q_table[state][action]
        q_table[state][action] += LEARNING_RATE * td_diff
        q_table[state][action] = round(q_table[state][action], 3)
def get_path():
    # Alternative path extraction: start from state N * M - 2 and greedily
    # follow the highest-valued unvisited neighbor until state 0 is reached.
    i = N * M - 2
    path = [i]
    visited = set()
    while i != 0:
        visited.add(i)
        # Rank this state's actions from highest to lowest Q-value.
        ranked = sorted(enumerate(q_table[i]), key=lambda pair: pair[1], reverse=True)
        for action, value in ranked:
            try:
                next_state = get_next_state(maze, i, ACTIONS[action])
            except Exception:
                # get_next_state may reject invalid moves; skip them.
                continue
            if next_state not in visited:
                i = next_state
                path.append(i)
                break
        else:
            # Every reachable neighbor has been visited; stop rather than loop forever.
            break
    return path
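# End-to-end usage sketch: train the Q-table over repeated episodes, then read
# off the learned path. NUM_EPISODES is a hypothetical knob, not taken from the
# original code.
NUM_EPISODES = 100
for _ in range(NUM_EPISODES):
    execute_episode()
print(get_path())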