def test_each_boundary_within_default_env(self):
    """
    The agent follows a sequence of steps to check that each boundary acts as
    expected (the current observation should equal the previous one if the agent
    moves into a boundary). The agent tries the top-left, top-right and
    bottom-right corners while avoiding the terminal state. The step numbers at
    which the agent ends up in the same state as before are collected and then
    compared against the expected values; the test passes only if they match exactly.
    """
    env = GridUniverseEnv()
    # env.action_descriptors = ['up', 'right', 'down', 'left']
    actions_to_take = [3, 0, 1, 1, 1, 1, 2, 2, 3, 2, 2, 3, 3, 3]
    boundary_test_step_numbers = [0, 1, 5, 10, 13]
    collected_boundary_step_numbers = []
    prev_observation = env.reset()

    for step_no, action in enumerate(actions_to_take):
        env.render()
        print('go ' + env.action_descriptors[action])
        observation, reward, done, info = env.step(action)

        if observation == prev_observation:
            collected_boundary_step_numbers.append(step_no)

        prev_observation = observation

    print('collected_boundary_steps:', collected_boundary_step_numbers)
    print('boundary_test_steps:', boundary_test_step_numbers)
    # Compare the full lists directly rather than zipping them elementwise:
    # zip() silently truncates to the shorter list, so a length mismatch
    # (e.g. an extra collected step) would otherwise pass the test.
    self.assertEqual(collected_boundary_step_numbers, boundary_test_step_numbers)
def run_monte_carlo_evaluation():
    """
    Run Monte Carlo evaluation on a random policy and then act greedily
    with respect to the value function once the evaluation is complete.
    """
    print('\n' + '*' * 20 + 'Starting Monte Carlo evaluation and greedy policy' + '*' * 20 + '\n')
    world_shape = (8, 8)
    # env = GridUniverseEnv(grid_shape=world_shape)  # Default GridUniverse
    env = GridUniverseEnv(world_shape, random_maze=True)
    policy0 = np.ones([env.world.size, env.action_space.n]) / env.action_space.n

    print('Running an episode with a random agent (with initial policy)')
    st_history, rw_history, done = run_episode(policy0, env)

    print('Starting Monte-Carlo evaluation of random policy')
    value0 = monte_carlo_evaluation(policy0, env, every_visit=True, num_episodes=30)
    print(value0)

    # Create a greedy policy from the value function and run it on the environment
    policy1 = utils.greedy_policy_from_value_function(policy0, env, value0)
    print(policy1)
    print('Policy: (up, right, down, left)\n', utils.get_policy_map(policy1, world_shape))
    np.set_printoptions(linewidth=75, precision=8)

    print('Starting greedy policy episode')
    curr_state = env.reset()
    env.render_policy_arrows(policy1)

    for t in range(500):
        env.render(mode='graphic')
        action = np.argmax(policy1[curr_state])
        print('go ' + env.action_descriptors[action])
        curr_state, reward, done, info = env.step(action)

        if done:
            print('Terminal state found in {} steps'.format(t + 1))
            env.render(mode='graphic')
            time.sleep(5)
            break
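
# For reference, a minimal sketch of what an every-visit Monte Carlo evaluation
# like monte_carlo_evaluation() might do. This is illustrative only: the real
# implementation lives elsewhere in the repo, and the gamma parameter and the
# episode-generation loop below are assumptions.
def mc_evaluation_sketch(policy, env, num_episodes=30, gamma=1.0):
    """Every-visit Monte Carlo policy evaluation (illustrative sketch)."""
    value = np.zeros(env.world.size)
    visit_counts = np.zeros(env.world.size)
    for _ in range(num_episodes):
        # Generate one episode by sampling actions from the given policy
        episode = []
        state = env.reset()
        done = False
        while not done:
            action = np.random.choice(len(policy[state]), p=policy[state])
            next_state, reward, done, _ = env.step(action)
            episode.append((state, reward))
            state = next_state
        # Walk the episode backwards, accumulating the discounted return G,
        # and update each visited state's running-average value estimate
        G = 0.0
        for state, reward in reversed(episode):
            G = reward + gamma * G
            visit_counts[state] += 1
            value[state] += (G - value[state]) / visit_counts[state]
    return value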
def run_griduniverse_with_lava():
    """
    Run a random agent on an environment with lava
    """
    print('\n' + '*' * 20 + 'Starting to run random agent on GridUniverse with lava' + '*' * 20 + '\n')
    # Lava states are flat (row-major) indices, here forming a vertical line down column 4
    env = GridUniverseEnv(grid_shape=(10, 10), lava_states=[4, 14, 24, 34, 44, 54, 64, 74])

    for i_episode in range(5):
        observation = env.reset()
        for t in range(100):
            env.render(mode='graphic')  # set mode='graphic' for pyglet render
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)

            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                print('Final state reward: ', reward)
                break
def run_default_griduniverse():
    """
    Run a random agent on the default GridUniverse.
    This piece of code shows the main interface to the environment.
    This runs in ASCII format.
    """
    print('\n' + '*' * 20 + 'Starting to run random agent on default GridUniverse' + '*' * 20 + '\n')
    env = GridUniverseEnv()
    for i_episode in range(1):
        observation = env.reset()
        for t in range(100):
            env.render()  # set mode='graphic' for pyglet render
            action = env.action_space.sample()
            print('go ' + env.action_descriptors[action])
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break
def run_griduniverse_from_text_file():
    """
    Run a random agent on an environment that was saved as an ASCII text file.
    Check core/envs/maze_text_files for examples, or the
    _create_custom_world_from_text() function within the environment.
    """
    print('\n' + '*' * 20 + 'Creating a pre-made GridUniverse from text file and running random agent on it' + '*' * 20 + '\n')
    env = GridUniverseEnv(custom_world_fp='../core/envs/maze_text_files/test_env.txt')
    # env = GridUniverseEnv(custom_world_fp='../core/envs/maze_text_files/maze_21x21.txt')
    # env = GridUniverseEnv(custom_world_fp='../core/envs/maze_text_files/maze_101x101.txt')
    for i_episode in range(1):
        observation = env.reset()
        for t in range(1000):
            env.render(mode='graphic')
            action = env.action_space.sample()
            # print('go ' + env.action_descriptors[action])
            # time.sleep(0.1)  # uncomment to watch slower
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break
def run_random_maze():
    """
    Run a random agent on a randomly generated maze. If the random_maze parameter
    is set to True, a maze generation algorithm will place walls to form a maze
    of the requested shape.
    """
    print('\n' + '*' * 20 + 'Creating a random GridUniverse and running random agent on it' + '*' * 20 + '\n')
    env = GridUniverseEnv(grid_shape=(11, 11), random_maze=True)
    # env = GridUniverseEnv(grid_shape=(101, 101), random_maze=True)
    # env = GridUniverseEnv(grid_shape=(49, 51), random_maze=True)
    # env = GridUniverseEnv(grid_shape=(51, 49), random_maze=True)
    for i_episode in range(1):
        observation = env.reset()
        for t in range(1000):
            env.render(mode='graphic')
            env.step_num = t
            action = env.action_space.sample()
            # print('go ' + env.action_descriptors[action])
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break
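
# For intuition, a minimal sketch of a depth-first (recursive-backtracker) maze
# generator of the kind a random_maze flag might invoke. Illustrative only and
# not necessarily the algorithm GridUniverseEnv uses; assumes odd width/height
# so walls and passages alternate cleanly.
def generate_maze_sketch(width, height):
    """Carve a maze into a grid of walls with iterative depth-first search."""
    import random
    grid = [['#'] * width for _ in range(height)]  # '#' wall, ' ' passage
    stack = [(1, 1)]
    grid[1][1] = ' '
    while stack:
        row, col = stack[-1]
        # Look two cells ahead so a wall always separates parallel corridors
        unvisited = [(row + dr, col + dc)
                     for dr, dc in ((-2, 0), (2, 0), (0, -2), (0, 2))
                     if 0 < row + dr < height - 1 and 0 < col + dc < width - 1
                     and grid[row + dr][col + dc] == '#']
        if unvisited:
            nr, nc = random.choice(unvisited)
            grid[(row + nr) // 2][(col + nc) // 2] = ' '  # knock down the wall between
            grid[nr][nc] = ' '
            stack.append((nr, nc))
        else:
            stack.pop()  # dead end: backtrack
    return grid

# Example usage: print an 11x11 maze
# for line in generate_maze_sketch(11, 11):
#     print(''.join(line))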
from queue import Queue

from core.envs.griduniverse_env import GridUniverseEnv

# todo: could rename file to path_finding.py

if __name__ == '__main__':
    print('\n' + '*' * 20 + 'Creating a random GridUniverse and running random agent on it' + '*' * 20 + '\n')

    first_time = True
    for i in range(10):
        env = GridUniverseEnv(grid_shape=(15, 15), random_maze=True)
        curr_state = initial_state = env.reset()
        actions = range(4)

        all_valid_states = []
        visited = {}
        nodes_and_edges = {}

        # https://en.wikipedia.org/wiki/A*_search_algorithm
        # def find_all_neighbouring_states(curr_state):
        #     nodes_and_edges[curr_state] = []
        #     for action in actions:
        #         # next_states = []
        #         next_state, reward, done = env.look_step_ahead(curr_state, action)
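
# The commented-out scaffold above hints at building a graph over states with
# env.look_step_ahead(state, action) -> (next_state, reward, done). Below is a
# minimal breadth-first-search sketch under that assumed interface; the name
# bfs_to_terminal and the path-reconstruction details are hypothetical.
def bfs_to_terminal(env, initial_state, num_actions=4):
    """Breadth-first search from initial_state to the first terminal state found.

    Returns the list of states along the path, or None if unreachable.
    Assumes bumping a wall or boundary returns the same state.
    """
    frontier = Queue()
    frontier.put(initial_state)
    came_from = {initial_state: None}  # parent pointers for path reconstruction
    while not frontier.empty():
        state = frontier.get()
        for action in range(num_actions):
            next_state, reward, done = env.look_step_ahead(state, action)
            if next_state in came_from or next_state == state:
                continue  # already discovered, or the move was blocked
            came_from[next_state] = state
            if done:
                # Walk parent pointers backwards to recover the path
                path = [next_state]
                while came_from[path[-1]] is not None:
                    path.append(came_from[path[-1]])
                return list(reversed(path))
            frontier.put(next_state)
    return None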
def run_policy_and_value_iteration():
    """
    Most of the code for this function is within utils.py and dynamic_programming.py.
    This function does 5 things:
    1. Evaluate the value function of a random policy a number of times
    2. Create a greedy policy from this value function
    3. Run Policy Iteration
    4. Run Value Iteration
    5. Run an agent on the environment with the policy found by Value Iteration
    """
    print('\n' + '*' * 20 + 'Starting value and policy iteration' + '*' * 20 + '\n')

    # 1. Evaluate the value function of a random policy a number of times
    world_shape = (4, 4)  # shape for the commented-out examples below (overridden further down)
    # env = GridUniverseEnv(grid_shape=world_shape, goal_states=[3, 12])  # Sutton and Barto / David Silver example
    # Specific case with lava and a path through it:
    # env = GridUniverseEnv(grid_shape=world_shape, lava_states=[i for i in range(15) if i not in [0, 4, 8, 12, 13, 14, 15]])
    world_shape = (11, 11)
    env = GridUniverseEnv(grid_shape=world_shape, random_maze=True)
    policy0 = np.ones([env.world.size, len(env.action_state_to_next_state)]) / len(env.action_state_to_next_state)
    v0 = np.zeros(env.world.size)
    val_fun = v0
    for k in range(500):
        val_fun = utils.single_step_policy_evaluation(policy0, env, value_function=val_fun)
    print(utils.reshape_as_griduniverse(val_fun, world_shape))

    # 2. Create a greedy policy from this value function
    policy1 = utils.greedy_policy_from_value_function(policy0, env, val_fun)
    policy_map1 = utils.get_policy_map(policy1, world_shape)
    print('Policy: (0=up, 1=right, 2=down, 3=left)\n', policy_map1)
    np.set_printoptions(linewidth=75 * 2, precision=4)
    print('Policy: (up, right, down, left)\n', utils.get_policy_map(policy1, world_shape))
    np.set_printoptions(linewidth=75, precision=8)

    # 3. Run Policy Iteration
    print('Policy iteration:')
    policy0 = np.ones([env.world.size, len(env.action_state_to_next_state)]) / len(env.action_state_to_next_state)
    optimal_value, optimal_policy = dp.policy_iteration(policy0, env, v0, threshold=0.001, max_steps=1000)
    print('Value:\n', utils.reshape_as_griduniverse(optimal_value, world_shape))
    print('Policy: (0=up, 1=right, 2=down, 3=left)\n', utils.get_policy_map(optimal_policy, world_shape))
    np.set_printoptions(linewidth=75 * 2, precision=4)
    print('Policy: (up, right, down, left)\n', utils.get_policy_map(optimal_policy, world_shape))
    np.set_printoptions(linewidth=75, precision=8)

    # 4. Run Value Iteration
    print('Value iteration:')
    policy0 = np.ones([env.world.size, len(env.action_state_to_next_state)]) / len(env.action_state_to_next_state)
    optimal_value, optimal_policy = dp.value_iteration(policy0, env, v0, threshold=0.001, max_steps=100)
    print('Value:\n', utils.reshape_as_griduniverse(optimal_value, world_shape))
    print('Policy: (0=up, 1=right, 2=down, 3=left)\n', utils.get_policy_map(optimal_policy, world_shape))
    np.set_printoptions(linewidth=75 * 2, precision=4)
    print('Policy: (up, right, down, left)\n', utils.get_policy_map(optimal_policy, world_shape))
    np.set_printoptions(linewidth=75, precision=8)

    # 5. Run agent on environment with the policy found by Value Iteration
    print('Starting to run agent on environment with optimal policy')
    curr_state = env.reset()
    env.render_policy_arrows(optimal_policy)

    # Dynamic programming doesn't necessarily have the concept of an agent,
    # but you can create an agent to run on the environment using the found policy.
    for t in range(100):
        env.render(mode='graphic')
        action = np.argmax(optimal_policy[curr_state])
        print('go ' + env.action_descriptors[action])
        curr_state, reward, done, info = env.step(action)

        if done:
            print('Terminal state reached in {} steps'.format(t + 1))
            env.render(mode='graphic')  # must render here to see agent in final state
            time.sleep(6)
            env.render(close=True)
            break
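
# For reference, a minimal sketch of the Bellman optimality backup that drives
# a value-iteration routine like dp.value_iteration(). Illustrative only: the
# repo's actual signature differs, and the deterministic transition interface
# env.look_step_ahead(state, action) -> (next_state, reward, done) and the
# gamma parameter are assumptions here.
def value_iteration_sketch(env, num_states, num_actions=4, gamma=0.99,
                           threshold=0.001, max_steps=1000):
    """Value iteration via repeated Bellman optimality backups (sketch)."""
    value = np.zeros(num_states)
    for _ in range(max_steps):
        delta = 0.0
        for state in range(num_states):
            # Back up the best one-step lookahead over all actions
            q_values = []
            for action in range(num_actions):
                next_state, reward, done = env.look_step_ahead(state, action)
                q_values.append(reward + (0.0 if done else gamma * value[next_state]))
            best = max(q_values)
            delta = max(delta, abs(best - value[state]))
            value[state] = best
        if delta < threshold:
            break  # converged: the largest update fell below the threshold
    # Extract the deterministic greedy policy from the converged values
    policy = np.zeros((num_states, num_actions))
    for state in range(num_states):
        q_values = []
        for action in range(num_actions):
            next_state, reward, done = env.look_step_ahead(state, action)
            q_values.append(reward + (0.0 if done else gamma * value[next_state]))
        policy[state, int(np.argmax(q_values))] = 1.0
    return value, policy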