Esempio n. 1
0
    while not grid.game_over():
        a = policy[s]
        a = random_action(a)
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))

    return states_and_rewards


if __name__ == '__main__':
    # Build the grid with standard_grid() and display its reward table.
    grid = standard_grid()

    print("Rewards: ")
    print_values(grid.rewards, grid)

    states = grid.all_states()

    # Hand-written policy: one action letter per state tuple.
    # NOTE(review): keys are presumably (row, col) positions and 'U'/'R'
    # presumably mean "up"/"right" -- confirm against the grid module.
    policy = dict([
        ((2, 0), 'U'),
        ((1, 0), 'U'),
        ((0, 0), 'R'),
        ((0, 1), 'R'),
        ((0, 2), 'R'),
        ((1, 2), 'R'),
        ((2, 1), 'R'),
        ((2, 2), 'R'),
        ((2, 3), 'U'),
    ])

def max_dict(d):
    """Return the ``(key, value)`` pair of *d* that has the largest value.

    Entries are scanned in the dict's iteration order and replaced only by
    a strictly greater value, so the first key wins on ties. If no value
    compares greater than negative infinity (for example when *d* is
    empty), ``(None, float('-inf'))`` is returned.
    """
    # Track the best pair seen so far as a single tuple.
    best = (None, float('-inf'))
    for entry in d.items():
        if entry[1] > best[1]:
            best = entry
    return best


if __name__ == '__main__':
    grid = negative_grid(step_cost=-0.1)
    print('Rewards: ')
    print_values(grid.rewards, grid)

    states = grid.all_states()

    #initialize Q
    Q = {}
    for s in states:
        Q[s] = {}
        for a in ALL_POSSIBLE_ACTIONS:
            Q[s][a] = 0

    #initialize Count
    update_counts = {}
    update_counts_sa = {}
    for s in states:
        update_counts_sa[s] = {}