import numpy as np

# "rw" below is the course-provided world helper module (read_grid, make_move,
# get_gamma, is_goal, get_reward, get_next_states); its import line is not
# shown in this section. num_rows, num_cols, num_moves, and the exploration
# threshold N_e are assumed to be module-level constants defined elsewhere.


def test_state_action_pair_cond():
    # n_sa counts how often each (state, action) pair has been tried,
    # indexed [row][col][direction]
    n_sa = [[[30, 30, 30, 30], [24, 21, 14, 16], [15, 16, 10, 11], [12, 8, 10, 10]],
            [[23, 30, 22, 30], [0, 0, 0, 0], [5, 3, 3, 7], [3, 3, 0, 3]],
            [[17, 11, 17, 11], [8, 9, 11, 12], [5, 1, 0, 5], [2, 0, 0, 0]]]
    n_sa2 = [[[330, 330, 330, 330], [324, 321, 314, 316], [315, 316, 310, 311], [312, 338, 130, 310]],
             [[330, 330, 330, 330], [324, 321, 314, 316], [315, 316, 310, 311], [312, 338, 130, 310]],
             [[330, 330, 330, 330], [324, 321, 314, 316], [315, 316, 310, 311], [312, 338, 130, 310]]]
    print(state_action_pair_cond(rw.read_grid("lecture"), n_sa))
    print(state_action_pair_cond(rw.read_grid("lecture"), n_sa2))
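# state_action_pair_cond is defined elsewhere in this project. A minimal
# sketch of the behavior the test above implies, assuming it returns True
# while some non-goal (state, action) pair has been tried fewer than N_e
# times. Skipping blocked squares is omitted here because their
# representation in the grid is not shown in this section.
def state_action_pair_cond(grid, n_sa):
    for r in range(len(grid)):
        for c in range(len(grid[0])):
            if rw.is_goal(grid, (r, c)):
                continue  # no action is taken from a terminal state
            for a in range(num_moves):
                if n_sa[r][c][a] < N_e:
                    return True  # this pair still needs exploration
    return False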
def find_optimal_policy(world):
    # 1. initialize variables
    curr_state = (0, 0)
    utils = [[0.0] * num_cols for i in range(num_rows)]  # s
    n_sa = np.array([[[0] * num_moves for i in range(num_cols)]
                     for i in range(num_rows)])  # s by a
    n_sas = [[[[[0] * num_cols for i in range(num_rows)]
               for i in range(num_moves)]
              for i in range(num_cols)]
             for i in range(num_rows)]  # s by a by s'
    grid = rw.read_grid(world)
    iterations = 0
    util_updated = False

    # exit when each state-action pair has been tried at least N_e times
    # and the utilities have stabilized
    while state_action_pair_cond(grid, n_sa) or util_updated:
        iterations += 1

        # 2. decide on what the best action would be
        best_dir = get_best_dir(grid, utils, curr_state, n_sa, n_sas)

        # now make that move
        next_state = rw.make_move(grid, curr_state, best_dir, world)
        # print(f"choose to make move from {curr_state} in direction "
        #       f"{best_dir}, next state is {next_state}")

        # 3. update n_sa and n_sas
        n_sa[curr_state[0]][curr_state[1]][best_dir] += 1
        n_sas[curr_state[0]][curr_state[1]][best_dir][next_state[0]][next_state[1]] += 1
        assert n_sa[curr_state[0]][curr_state[1]][best_dir] >= \
            n_sas[curr_state[0]][curr_state[1]][best_dir][next_state[0]][next_state[1]], \
            f"After updating, n_sa[{curr_state[0]}][{curr_state[1]}][{best_dir}] was " \
            f"{n_sa[curr_state[0]][curr_state[1]][best_dir]}, " \
            f"{n_sas[curr_state[0]][curr_state[1]][best_dir][next_state[0]][next_state[1]]}"

        # 4. update utils based on new n_sa and n_sas
        util_updated = update_utils(world, grid, utils, n_sa, n_sas,
                                    rw.get_gamma(world))

        # reset this trial if we reach the goal state
        if rw.is_goal(grid, next_state):
            curr_state = (0, 0)
        else:
            curr_state = next_state
        # print(state_action_pair_cond(grid, n_sa), util_updated)

    # debug print statements
    # print("n_sa:\n", n_sa)
    # print("num iterations:", iterations)

    print("Final utility values for", world, ":")
    print("[", end='')
    for i, row in enumerate(utils):
        print("[", end='')
        for val in row:
            print("{:.3f}".format(val), end=' ')
        # close the outer bracket on the last row
        print("]]" if i == len(utils) - 1 else "]")

    print("Optimal policy for", world, ":")
    utils_to_policy(grid, utils, n_sa, n_sas)
    print("")
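# get_best_dir and update_utils are defined elsewhere in this project. A
# minimal sketch of the action-selection rule find_optimal_policy relies on,
# assuming the classic optimistic-exploration scheme: any action tried fewer
# than N_e times gets an optimistic value R_plus; otherwise the action is
# scored by its one-step expected utility under the maximum-likelihood
# transition model n_sas / n_sa. R_plus and N_e are assumed constants, not
# part of the provided API.
def get_best_dir(grid, utils, curr_state, n_sa, n_sas):
    r, c = curr_state
    best_dir, best_val = 0, float('-inf')
    for a in range(num_moves):
        if n_sa[r][c][a] < N_e:
            # optimistic value draws the agent toward under-tried actions
            val = R_plus
        else:
            # expected utility: sum over s' of P(s' | s, a) * U(s'), with
            # P estimated as n_sas[s][a][s'] / n_sa[s][a]
            val = sum(n_sas[r][c][a][i][j] / n_sa[r][c][a] * utils[i][j]
                      for i in range(len(grid))
                      for j in range(len(grid[0])))
        if val > best_val:
            best_dir, best_val = a, val
    return best_dir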
def test_get_reward_by_state(world):
    grid = rw.read_grid(world)
    print(grid)
    print(get_reward_by_state(world, grid, (0, 0)))
    print(get_reward_by_state(world, grid, (0, 1)))
    print(get_reward_by_state(world, grid, (1, 1)))
    print(get_reward_by_state(world, grid, (1, 3)))
    print(get_reward_by_state(world, grid, (2, 3)))
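# get_reward_by_state is defined elsewhere; a minimal sketch, assuming goal
# squares store their own numeric reward in the grid and every other square
# yields the world's fixed step reward. How rewards are encoded in the grid
# is an assumption about the provided file format.
def get_reward_by_state(world, grid, state):
    if rw.is_goal(grid, state):
        # assumed: the goal cell's entry is (convertible to) its reward
        return float(grid[state[0]][state[1]])
    return rw.get_reward(world)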
def provided_helpers(world: str):
    grid = rw.read_grid(world)
    print("next states:", rw.get_next_states(grid, (0, 0)))
    print("is_goal:", rw.is_goal(grid, (0, 0)))
    print(rw.get_next_states(grid, (1, 1)))
def dump_world_info(world: str):
    print(f"discount factor of {world}:", rw.get_gamma(world))
    print(f"immediate reward of non-goal state of {world}:", rw.get_reward(world))
    print(f"grid for {world}:\n", rw.read_grid(world))
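# A small driver exercising the functions above; "lecture" is the world name
# used in test_state_action_pair_cond, and running everything from __main__
# is an assumption about how this script is invoked.
if __name__ == "__main__":
    dump_world_info("lecture")
    provided_helpers("lecture")
    test_get_reward_by_state("lecture")
    find_optimal_policy("lecture")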