def test_each_boundary_within_default_env(self):
    """
    Check that each boundary of the default environment blocks movement.

    The agent follows a fixed sequence of actions that pushes it against the
    top-left, top-right and bottom-right corners while avoiding the terminal
    state. A step whose observation equals the previous observation is
    treated as a boundary hit and its step number is recorded. The recorded
    step numbers must match the expected step numbers exactly.
    """
    env = GridUniverseEnv()
    # Action indices as listed by the env: action_descriptors = ['up', 'right', 'down', 'left']
    actions_to_take = [3, 0, 1, 1, 1, 1, 2, 2, 3, 2, 2, 3, 3, 3]
    boundary_test_step_numbers = [0, 1, 5, 10, 13]
    collected_boundary_step_numbers = []

    prev_observation = env.reset()
    for step_no, action in enumerate(actions_to_take):
        env.render()
        print('go ' + env.action_descriptors[action])
        observation, _reward, _done, _info = env.step(action)
        if observation == prev_observation:
            collected_boundary_step_numbers.append(step_no)
        prev_observation = observation

    print('collected_boundary_steps:', collected_boundary_step_numbers)
    print('boundary_test_steps', boundary_test_step_numbers)
    # Compare the complete lists: pairing them with zip() would stop at the
    # shorter list and let missing or extra boundary hits pass unnoticed.
    self.assertEqual(collected_boundary_step_numbers, boundary_test_step_numbers)
def test_lava(self):
    """
    Run the agent into lava and check the episode ends with a negative reward.
    """
    env = GridUniverseEnv(lava_states=[1])
    env.render()
    action = env.action_descriptor_to_int['RIGHT']
    _observation, reward, done, _info = env.step(action)
    # Separate assertions so a failure reports which condition was violated
    self.assertEqual(reward, -10)
    self.assertTrue(done)
def test_griduniverse_wall_not_trespassed(self):
    """
    Step towards a wall and verify the reported state is still state 0.
    """
    walled_env = GridUniverseEnv(walls=[1])
    walled_env.render()

    move_right = 1  # index 1 is used as "go right" here
    state_after_move, _reward, _done, _info = walled_env.step(move_right)
    print('go ' + walled_env.action_descriptors[move_right])
    walled_env.render()

    # The agent must not have moved
    agent_stayed_put = state_after_move == 0
    self.assertTrue(agent_stayed_put)
def run_monte_carlo_evaluation():
    """
    Evaluate a uniform random policy with Monte Carlo evaluation, then act
    greedily with respect to the resulting value function.
    """
    stars = '*' * 20
    print('\n' + stars + 'Starting Monte Carlo evaluation and greedy policy' + stars + '\n')

    world_shape = (8, 8)
    env = GridUniverseEnv(world_shape, random_maze=True)

    # Uniform random policy: every action equally likely in every state
    uniform_weights = np.ones([env.world.size, env.action_space.n])
    policy0 = uniform_weights / env.action_space.n

    print('Running an episode with a random agent (with initial policy)')
    _states, _rewards, _finished = run_episode(policy0, env)

    print('Starting Monte-Carlo evaluation of random policy')
    value0 = monte_carlo_evaluation(policy0, env, every_visit=True, num_episodes=30)
    print(value0)

    # Derive the greedy policy from the evaluated value function
    policy1 = utils.greedy_policy_from_value_function(policy0, env, value0)
    print(policy1)
    greedy_policy_map = utils.get_policy_map(policy1, world_shape)
    print('Policy: (up, right, down, left)\n', greedy_policy_map)
    np.set_printoptions(linewidth=75, precision=8)

    print('Starting greedy policy episode')
    curr_state = env.reset()
    env.render_policy_arrows(policy1)

    for step_idx in range(500):
        env.render(mode='graphic')
        action = np.argmax(policy1[curr_state])
        print('go ' + env.action_descriptors[action])
        curr_state, _reward, done, _info = env.step(action)
        if not done:
            continue
        print('Terminal state found in {} steps'.format(step_idx + 1))
        env.render(mode='graphic')  # show the agent in its final state
        time.sleep(5)
        break
def test_lava_works_from_text_file(self):
    """
    Test whether the episode ends with the lava reward when the agent travels
    into lava in an environment created from a text file.
    """
    env = GridUniverseEnv(custom_world_fp='../core/envs/maze_text_files/test_env.txt')
    actions_to_take = [env.action_descriptor_to_int[action_desc]
                       for action_desc in ['DOWN', 'DOWN', 'DOWN', 'RIGHT', 'RIGHT']]

    reward, done = None, False
    for step_no, action in enumerate(actions_to_take):
        env.render()
        print('go ' + env.action_descriptors[action])
        _observation, reward, done, _info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(step_no + 1))
            # Do not keep stepping an environment whose episode has ended
            break

    # Checked after the loop so the test fails if the episode never ends;
    # separate assertions report which condition was violated.
    self.assertTrue(done)
    self.assertEqual(reward, -10)
def test_custom_griduniverse_from_text_file(self):
    """
    Test whether the GridUniverse created from the text file is completed
    exactly on the last action of the scripted action sequence.
    """
    env = GridUniverseEnv(custom_world_fp='../core/envs/maze_text_files/test_env.txt')
    actions_to_take = [2, 2, 2, 2, 2, 2, 2, 1]

    steps_taken = 0
    done = False
    for steps_taken, action in enumerate(actions_to_take, start=1):
        env.render()
        print('go ' + env.action_descriptors[action])
        _observation, _reward, done, _info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(steps_taken))
            # Stop here so the step count below reflects when the episode
            # actually ended rather than always equalling the list length.
            break

    self.assertTrue(done)
    self.assertEqual(steps_taken, len(actions_to_take))
def test_large_griduniverse_completion_in_53_steps(self):
    """
    Test whether the agent completes a large rectangular GridUniverse in the
    expected 53 steps.
    """
    env = GridUniverseEnv(grid_shape=(25, 30))
    actions_to_take = [1] * 24 + [2] * 29  # 24 steps right + 29 steps down
    num_actions = len(actions_to_take)
    print('Num actions to take to get to terminal state: {}'.format(num_actions))

    steps_taken = 0
    done = False
    # Iterate over the scripted actions themselves: indexing them from a
    # longer range() would raise IndexError instead of failing the assertion
    # when the episode does not end in time.
    for steps_taken, action in enumerate(actions_to_take, start=1):
        _observation, _reward, done, _info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(steps_taken))
            break

    self.assertTrue(done)
    self.assertEqual(steps_taken, num_actions)
def run_griduniverse_with_lava():
    """
    Let a random agent play five episodes on a 10x10 environment containing lava states.
    """
    stars = '*' * 20
    print('\n' + stars + 'Starting to run random agent on default GridUniverse' + stars + '\n')

    lava_cells = [4, 14, 24, 34, 44, 54, 64, 74]
    env = GridUniverseEnv(grid_shape=(10, 10), lava_states=lava_cells)

    for _episode in range(5):
        env.reset()
        for step_idx in range(100):
            env.render(mode='graphic')  # rendered in graphic mode
            random_action = env.action_space.sample()
            _observation, reward, done, _info = env.step(random_action)
            if not done:
                continue
            print("Episode finished after {} timesteps".format(step_idx + 1))
            print('Final states reward: ', reward)
            break
def test_default_griduniverse_completion_in_six_steps(self):
    """
    Test whether the agent reaches a terminal state of the default square
    GridUniverse in exactly six steps by going right 3 times and then going
    down 3 times.
    """
    env = GridUniverseEnv()
    actions_to_take = [1, 1, 1, 2, 2, 2]  # 3 rights and 3 downs

    steps_taken = 0
    done = False
    # Iterate over the scripted actions themselves: indexing them from a
    # longer range() would raise IndexError instead of failing the assertion
    # when the episode does not end in time.
    for steps_taken, action in enumerate(actions_to_take, start=1):
        env.render()
        print('go ' + env.action_descriptors[action])
        _observation, _reward, done, _info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(steps_taken))
            break

    # Asserted after the loop so the test cannot pass without reaching a terminal state
    self.assertTrue(done)
    self.assertEqual(steps_taken, 6)
def run_default_griduniverse():
    """
    Drive the default griduniverse with a random agent, rendering with the default render mode.

    Serves as a minimal example of the environment's main interface:
    reset, render, sample an action, step.
    """
    stars = '*' * 20
    print('\n' + stars + 'Starting to run random agent on default GridUniverse' + stars + '\n')

    env = GridUniverseEnv()
    for _episode in range(1):
        env.reset()
        for step_idx in range(100):
            env.render()  # pass mode='graphic' for the pyglet renderer
            random_action = env.action_space.sample()
            print('go ' + env.action_descriptors[random_action])
            _observation, _reward, done, _info = env.step(random_action)
            if not done:
                continue
            print("Episode finished after {} timesteps".format(step_idx + 1))
            break
def run_griduniverse_from_text_file():
    """
    Drive a random agent through an environment loaded from an ascii text file.

    See core/envs/maze_text_files for example files, or the
    _create_custom_world_from_text() function within the environment.
    """
    stars = '*' * 20
    print('\n' + stars + 'Creating a pre-made GridUniverse from text file and running random agent on it' + stars + '\n')

    # Other files referenced alongside this one: maze_21x21.txt, maze_101x101.txt
    world_file = '../core/envs/maze_text_files/test_env.txt'
    env = GridUniverseEnv(custom_world_fp=world_file)

    for _episode in range(1):
        env.reset()
        for step_idx in range(1000):
            env.render(mode='graphic')
            random_action = env.action_space.sample()
            # Add a time.sleep(0.1) here to watch the agent more slowly
            _observation, _reward, done, _info = env.step(random_action)
            if not done:
                continue
            print("Episode finished after {} timesteps".format(step_idx + 1))
            break
def run_random_maze():
    """
    Drive a random agent through a randomly generated maze.

    With random_maze=True, a maze generation algorithm places walls to form
    a maze of the requested shape.
    """
    stars = '*' * 20
    print('\n' + stars + 'Creating a random GridUniverse and running random agent on it' + stars + '\n')

    # Other shapes tried previously: (101, 101), (49, 51), (51, 49)
    env = GridUniverseEnv(grid_shape=(11, 11), random_maze=True)

    for _episode in range(1):
        env.reset()
        for step_idx in range(1000):
            env.render(mode='graphic')
            env.step_num = step_idx  # record the current step index on the environment
            random_action = env.action_space.sample()
            _observation, _reward, done, _info = env.step(random_action)
            if not done:
                continue
            print("Episode finished after {} timesteps".format(step_idx + 1))
            break
root_vertex = env.initial_state # todo not same as x # depth_first_search_recursive(nodes_and_edges, root_vertex) # node_path_to_terminal = depth_first_search_iterative(nodes_and_edges, root_vertex) print('Initial states edges:', nodes_and_edges[root_vertex]) action_list_to_terminal = breadth_first_search(nodes_and_edges, root_vertex) print('Initial state:', root_vertex) print('Path to terminal:', action_list_to_terminal) # for i_episode in range(2): for i_episode in range(1): observation = env.reset() for step_num, action in enumerate(action_list_to_terminal): env.render(mode='graphic') if first_time: time.sleep(5) first_time = False # env.render() # time.sleep(0.2) # print('go ' + env.action_descriptors[action]) observation, reward, done, info = env.step(action) if done or step_num == len(action_list_to_terminal) - 1: env.render(mode='graphic') env.render() time.sleep(1) print( "Episode finished after {} timesteps. Done: {}".format( step_num + 1, done)) break
def run_policy_and_value_iteration():
    """
    Demonstrate policy evaluation, policy iteration and value iteration on a random maze.

    Majority of code is within utils.py and dynamic_programming.py for this function.

    This function does 5 things:
    1. Evaluate the value function of a random policy a number of times
    2. Create a greedy policy from this value function
    3. Run Policy Iteration
    4. Run Value Iteration
    5. Run agent on environment on policy found from Value Iteration
    """
    print('\n' + '*' * 20 + 'Starting value and policy iteration' + '*' * 20 + '\n')

    # Alternative setup: GridUniverseEnv(grid_shape=(4, 4), goal_states=[3, 12])
    # is the Sutton and Barto / David Silver example.
    world_shape = (11, 11)
    env = GridUniverseEnv(grid_shape=world_shape, random_maze=True)

    num_states = env.world.size
    num_actions = len(env.action_state_to_next_state)

    def uniform_policy():
        # A fresh array per algorithm, as before; presumably so that no
        # algorithm starts from an array a previous one may have modified.
        return np.ones([num_states, num_actions]) / num_actions

    def print_policy(policy):
        # Print the policy map twice: once with default print options and once
        # with a wider, lower-precision layout, then restore the defaults.
        print('Policy: (0=up, 1=right, 2=down, 3=left)\n', utils.get_policy_map(policy, world_shape))
        np.set_printoptions(linewidth=75 * 2, precision=4)
        print('Policy: (up, right, down, left)\n', utils.get_policy_map(policy, world_shape))
        np.set_printoptions(linewidth=75, precision=8)

    # 1. Evaluate the value function of a random policy a number of times
    policy0 = uniform_policy()
    v0 = np.zeros(num_states)
    val_fun = v0
    for _ in range(500):
        val_fun = utils.single_step_policy_evaluation(policy0, env, value_function=val_fun)
    print(utils.reshape_as_griduniverse(val_fun, world_shape))

    # 2. Create a greedy policy from this value function
    policy1 = utils.greedy_policy_from_value_function(policy0, env, val_fun)
    print_policy(policy1)

    # 3. Run Policy Iteration
    print('Policy iteration:')
    policy0 = uniform_policy()
    optimal_value, optimal_policy = dp.policy_iteration(policy0, env, v0, threshold=0.001, max_steps=1000)
    print('Value:\n', utils.reshape_as_griduniverse(optimal_value, world_shape))
    print_policy(optimal_policy)

    # 4. Run Value Iteration
    print('Value iteration:')
    policy0 = uniform_policy()
    optimal_value, optimal_policy = dp.value_iteration(policy0, env, v0, threshold=0.001, max_steps=100)
    print('Value:\n', utils.reshape_as_griduniverse(optimal_value, world_shape))
    print_policy(optimal_policy)

    # 5. Run agent on environment on policy found from Value Iteration
    print('Starting to run agent on environment with optimal policy')
    curr_state = env.reset()
    env.render_policy_arrows(optimal_policy)

    # Dynamic programming doesn't necessarily have the concept of an agent.
    # But you can create an agent to run on the environment using the found policy
    for t in range(100):
        env.render(mode='graphic')
        action = np.argmax(optimal_policy[curr_state])
        print('go ' + env.action_descriptors[action])
        curr_state, _reward, done, _info = env.step(action)
        if done:
            print('Terminal state reached in {} steps'.format(t + 1))
            env.render(mode='graphic')  # must render here to see agent in final state
            time.sleep(6)
            env.render(close=True)
            break