Example #1
def test_state_action_pair_cond():
    # visit counts for every (state, action) pair: n_sa still contains
    # under-explored pairs (note the zero entries), while every count in
    # n_sa2 is comfortably large
    n_sa = [[[30, 30, 30, 30],
             [24, 21, 14, 16],
             [15, 16, 10, 11],
             [12, 8, 10, 10]],

            [[23, 30, 22, 30],
             [0,  0,  0,  0],
             [5,  3,  3,  7],
             [3,  3,  0,  3]],

            [[17, 11, 17, 11],
             [8,  9, 11, 12],
             [5,  1,  0,  5],
             [2,  0,  0,  0]]]

    n_sa2 = [[[330, 330, 330, 330],
              [324, 321, 314, 316],
              [315, 316, 310, 311],
              [312, 338, 130, 310]],

             [[330, 330, 330, 330],
              [324, 321, 314, 316],
              [315, 316, 310, 311],
              [312, 338, 130, 310]],

             [[330, 330, 330, 330],
              [324, 321, 314, 316],
              [315, 316, 310, 311],
              [312, 338, 130, 310]]]
             
    print(state_action_pair_cond(rw.read_grid("lecture"), n_sa))
    print(state_action_pair_cond(rw.read_grid("lecture"), n_sa2))
Example #2
import numpy as np  # n_sa below is a NumPy array; rw is assumed imported elsewhere

def find_optimal_policy(world):
    # initialize variables
    curr_state = (0, 0)
    utils = [[0.0] * num_cols for _ in range(num_rows)]  # indexed by state
    n_sa = np.zeros((num_rows, num_cols, num_moves), dtype=int)  # indexed by (s, a)
    n_sas = [[[[[0] * num_cols for _ in range(num_rows)]
               for _ in range(num_moves)]
              for _ in range(num_cols)]
             for _ in range(num_rows)]  # indexed by (s, a, s')
    grid = rw.read_grid(world)

    iterations = 0
    util_updated = False

    # exit only when every state-action pair has been tried at least
    # N_e times and the utility values have stabilized
    while state_action_pair_cond(grid, n_sa) or util_updated:
        iterations += 1
        # 2. decide which action looks best from the current state
        best_dir = get_best_dir(grid, utils, curr_state, n_sa, n_sas)

        # now make that move
        next_state = rw.make_move(grid, curr_state, best_dir, world)
        # print(f"choose to make move from {curr_state} in direction {best_dir}, next state is {next_state}")

        # 3. update the visit counts n_sa and n_sas
        r, c = curr_state
        nr, nc = next_state
        n_sa[r][c][best_dir] += 1
        n_sas[r][c][best_dir][nr][nc] += 1
        assert n_sa[r][c][best_dir] >= n_sas[r][c][best_dir][nr][nc], \
            f"after updating, n_sa[{r}][{c}][{best_dir}] = {n_sa[r][c][best_dir]} " \
            f"is smaller than the matching n_sas count {n_sas[r][c][best_dir][nr][nc]}"

        # 4. update utils based on new n_sa and n_sas
        util_updated = update_utils(world, grid, utils, n_sa, n_sas, rw.get_gamma(world))

        # reset this trial if we reach goal state
        if rw.is_goal(grid, next_state):
            curr_state = (0, 0)
        else:
            curr_state = next_state
        
        # print(state_action_pair_cond(grid, n_sa), util_updated)

    # debug print statements
    # print("n_sa:\n", n_sa)
    # print("num iterations:", iterations)

    print("Final utility values for", world, ":")
    print("[", end='')
    for i, row in enumerate(utils):
        print("[", end='')
        for j, val in enumerate(row):
            print("{:.3f}".format(val), end=' ')
        print("]") if i != 2 else print("]]")
     
    print("Optimal policy for", world, ":")
    utils_to_policy(grid, utils, n_sa, n_sas)
    print("")
Example #3
def test_get_reward_by_state(world):
    grid = rw.read_grid(world)
    print(grid)
    print(get_reward_by_state(world, grid, (0, 0)))
    print(get_reward_by_state(world, grid, (0, 1)))
    print(get_reward_by_state(world, grid, (1, 1)))
    print(get_reward_by_state(world, grid, (1, 3)))
    print(get_reward_by_state(world, grid, (2, 3)))
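get_reward_by_state itself is not among these examples. One plausible sketch, assuming goal cells carry their terminal reward directly in the grid (an assumption; the grid encoding is never shown) and every other state shares the constant step reward from rw.get_reward:

def get_reward_by_state(world, grid, state):
    # goal states carry their own terminal reward; all other states get
    # the world's constant per-step reward
    if rw.is_goal(grid, state):
        return grid[state[0]][state[1]]  # assumed: reward stored in the cell
    return rw.get_reward(world)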
Example #4
def provided_helpers(world: str):
    grid = rw.read_grid(world)
    print("next states:", rw.get_next_states(grid, (0, 0)))
    print("is_goal:", rw.is_goal(grid, (0, 0)))
    print("next states:", rw.get_next_states(grid, (1, 1)))
Example #5
def dump_world_info(world: str):
    print(f"discount factor of {world}:", rw.get_gamma(world))
    print(f"immediate reward of non-goal states of {world}:", rw.get_reward(world))
    print(f"grid for {world}:\n", rw.read_grid(world))