Ejemplo n.º 1
0
def _td_update(state, action, observed_reward, next_state):
    """Apply one temporal-difference update to ``q_table[state][action]``.

    The target is ``observed_reward + DISCOUNT * max(q_table[next_state])``;
    the entry moves toward it by ``LEARNING_RATE`` and is then rounded to
    three decimal places.
    """
    td_target = observed_reward + DISCOUNT * max(q_table[next_state])
    q_table[state][action] += LEARNING_RATE * (td_target - q_table[state][action])
    q_table[state][action] = round(q_table[state][action], 3)


def execute_episode():
    """Sweep every state index once, doing a direct update plus replay steps.

    For each state in ``range(N * M)`` the function:

    1. picks an action with ``e_greedy`` (epsilon fixed at 0.1) among the
       actions returned by ``get_valid_actions``;
    2. records the ``(state, action)`` pair in a history local to this call;
    3. updates ``q_table`` from the observed reward and next state;
    4. stores ``(next_state, reward)`` in ``model[state][action]``;
    5. performs ``PLANNING`` extra updates on pairs drawn uniformly from the
       history, using the transitions stored in ``model``.

    Nothing is returned; ``q_table`` and ``model`` are updated in place.
    """
    visited_pairs = []

    for state in range(N * M):
        # NOTE(review): the policy is given `Q`, while every update below
        # goes to `q_table` -- confirm both names refer to the same table.
        action = e_greedy(state,
                          get_valid_actions(maze, state, ACTIONS),
                          Q,
                          epsilon=0.1)
        visited_pairs.append((state, action))

        # NOTE(review): `reward` is called with two arguments here -- confirm
        # this matches the signature of the `reward` in scope.
        observed_reward = reward(maze, state)
        successor = get_next_state(maze, state, action)

        # Direct update from the transition just taken.
        _td_update(state, action, observed_reward, successor)
        model[state][action] = (successor, observed_reward)

        # Replay transitions already seen in this call; each one is looked
        # up in the model rather than taken again.
        for _ in range(PLANNING):
            past_state, past_action = random.choice(visited_pairs)
            simulated_next, simulated_reward = model[past_state][past_action]
            _td_update(past_state, past_action, simulated_reward, simulated_next)
def get_path():
    """Follow the greedy policy in ``q_table`` from state 0 to state ``N * M - 1``.

    At every step the action is ``ACTIONS[np.argmax(q_table[i])]`` and the
    successor comes from ``get_next_state``.

    Returns:
        list: the state indices visited, starting with 0 and ending with
        ``N * M - 1``.

    Raises:
        RuntimeError: if the walk reaches a state it has already visited
            before arriving at the goal. The Q-table is not modified here,
            so (assuming ``get_next_state`` is deterministic -- TODO confirm)
            a repeated state means the walk would otherwise loop forever.
    """
    goal = N * M - 1
    i = 0
    path = [i]
    seen = {i}

    while i != goal:
        i = get_next_state(maze, i, ACTIONS[np.argmax(q_table[i])])
        if i in seen:
            # Same state + same table => same action every time: a cycle.
            raise RuntimeError(
                "greedy policy revisits state %r before reaching the goal; "
                "partial path: %r" % (i, path)
            )
        seen.add(i)
        path.append(i)

    return path
Ejemplo n.º 3
0
def reward(maze, state, action):
    """Return the immediate reward for taking ``action`` in ``state``.

    Returns:
        -10000 if the current cell's ``neighbors`` entry for the direction
        ``ACTION_MAPPER[action]`` equals ``INF`` (presumably a blocked
        move -- confirm against the maze model);
        1000 if the move leads to the last state index
        (``maze.num_rows * maze.num_columns - 1``);
        0 otherwise.
    """
    successor = get_next_state(maze, state, action)
    cell_i, cell_j = state_to_cell(maze, state)
    cell = maze.grid[cell_i][cell_j]

    # The INF check takes priority over the goal check.
    if cell.neighbors[ACTION_MAPPER[action]] == INF:
        return -10000

    last_state = maze.num_rows * maze.num_columns - 1
    if successor == last_state:
        return 1000

    return 0
Ejemplo n.º 4
0
def execute_episode():
    """Apply one Q-learning update to every state index in ``range(N * M)``.

    For each state an action is chosen with ``e_greedy`` (epsilon fixed at
    0.1) among the actions returned by ``get_valid_actions``; the entry
    ``q_table[state][action]`` is then moved toward
    ``reward + DISCOUNT * max(q_table[next_state])`` by ``LEARNING_RATE``
    and rounded to three decimal places.

    Nothing is returned; ``q_table`` is updated in place.
    """
    for state in range(N * M):
        candidate_actions = get_valid_actions(maze, state, ACTIONS)
        # NOTE(review): the policy is given `Q`, while the update below goes
        # to `q_table` -- confirm both names refer to the same table.
        action = e_greedy(state, candidate_actions, Q, epsilon=0.1)

        # NOTE(review): `reward` is called with two arguments here -- confirm
        # this matches the signature of the `reward` in scope.
        observed_reward = reward(maze, state)
        successor = get_next_state(maze, state, action)

        # Temporal-difference error against the best value of the successor.
        td_error = (observed_reward
                    + DISCOUNT * max(q_table[successor])
                    - q_table[state][action])

        q_table[state][action] += LEARNING_RATE * td_error
        q_table[state][action] = round(q_table[state][action], 3)
Ejemplo n.º 5
0
def get_path():
    """Walk from state ``N * M - 2`` toward state 0 using ``q_table``.

    At each state the actions are tried in ascending order of
    ``(q_value, action_index)``; the first action whose successor has not
    been visited yet is taken. Actions for which ``get_next_state`` raises
    are skipped.

    NOTE(review): ascending order means the lowest-valued action is tried
    first -- confirm this is intended rather than a descending sort.

    Returns:
        list: the state indices visited, starting with ``N * M - 2``. The
        list ends with 0 when state 0 was reached; if the walk hits a state
        with no usable, unvisited successor it stops there and the partial
        path is returned instead of looping forever.
    """
    i = N * M - 2
    path = [i]
    # Every state placed on the path is recorded so it is never re-entered.
    visited = {i}

    while i:
        candidates = sorted(
            (value, action) for action, value in enumerate(q_table[i])
        )
        for _, action in candidates:
            try:
                new_state = get_next_state(maze, i, action)
            except Exception:
                # Best-effort: an action that cannot be applied is skipped.
                continue

            if new_state not in visited:
                visited.add(new_state)
                i = new_state
                path.append(i)
                break
        else:
            # Dead end: no action leads to an unvisited state.
            break

    return path