def main(problem_id, map_name_base): 
    # random agent derived from lochlomond_demo.py provided by tutor Prof. Bjorn Jensen for the AI course (2019-20), University of Glasgow
    if problem_id < 0 or problem_id > 7:
        print("Problem ID should be between 0 and 7")

    if map_name_base not in ("8x8-base", "4x4-base"):
        print("Map base can be 8x8-base or 4x4-base")
    
    reward_hole = 0.0     
    is_stochastic = True  
    EpisodeStats = namedtuple("Stats",["episode_lengths", "episode_rewards"])
    max_episodes = 10000  
    max_iter_per_episode = 1000 
    
    #generate the specific problem
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic, map_name_base=map_name_base, reward_hole=reward_hole)

    env.action_space.sample() 


    print(env.desc)

    state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)

   
    np.random.seed(12)
    stats = EpisodeStats(episode_lengths=np.zeros(max_episodes),episode_rewards=np.zeros(max_episodes))

    for e in range(max_episodes): 
        observation = env.reset()      

        for iter in range(max_iter_per_episode):      
          action = env.action_space.sample() #The agent goes here
          observation, reward, done, info = env.step(action) 

          stats.episode_rewards[e] += reward #collect useful stats for comparison and plotting
          stats.episode_lengths[e] = iter
          
          if(done and reward==reward_hole): 
              print("We have reached a hole :-( [we can't move so stop trying; just give up... and perhaps restart]")
              break

          if (done and reward == +1.0):
              #env.render()     
              print("We have reached the goal :-) [stop trying to move; we can't]. That's ok we have achived the goal... perhaps try again?]")
              break

    return (stats)
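# A minimal usage sketch (an assumption, not part of the original snippet): run the
# random agent defined above and summarise the returned stats. Assumes this module's
# imports (numpy as np, namedtuple, LochLomondEnv, env2statespace) are in scope.
if __name__ == "__main__":
    stats = main(problem_id=0, map_name_base="8x8-base")
    print("Mean reward per episode:", np.mean(stats.episode_rewards))
    print("Mean episode length:", np.mean(stats.episode_lengths))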
Example #2
def main(problemID, mapID):
    problem = int(problemID)
    rewardHole = -0.02
    stochastic = True
    trainingEpisodes = 35000
    episodes = 1000
    iterPerEpisode = 2000
    mapBase = mapID
    np.random.seed(12)
    successes = 0  # records the number of successes
    totalReward = 0
    stats = {"episodes": {}}

    # set up the environment
    env = LochLomondEnv(problem_id=problem,
                        is_stochastic=stochastic,
                        map_name_base=mapBase,
                        reward_hole=rewardHole)

    qTable = generate_q(env, trainingEpisodes, iterPerEpisode)

    print("___________________________________")
    print("Training Finished")
    print("Attempting to find solution...")

    for episode in range(episodes):
        # initial params
        state = env.reset()
        step = 0
        done = False
        reward = 0
        for step in range(iterPerEpisode):
            action = np.argmax(qTable[state, :])  # take the best action
            nextState, reward, done, info = env.step(action)
            if done:
                stats["episodes"][episode] = {"steps": step, "reward": reward}
                if (reward == 1.0):
                    successes += 1
                totalReward += reward
                break
            state = nextState

    successRate = ((successes / episodes) * 100)
    print("___________________________________")
    print("Finished")
    print("Success Rate: " + str(successRate) + "%")
    print("Total Reward: " + str(totalReward))
    # log stats
    stats["successrate"] = successRate
    stats["totalreward"] = totalReward
    stats["qtable"] = qTable
    return stats, qTable
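# generate_q() is not shown in this snippet. Below is a minimal sketch of what such a
# helper might look like, assuming a standard tabular Q-learning training loop; the
# hyper-parameters (alpha, gamma, epsilon) are illustrative assumptions, not the
# author's values.
def generate_q(env, trainingEpisodes, iterPerEpisode, alpha=0.1, gamma=0.95, epsilon=0.1):
    qTable = np.zeros((env.observation_space.n, env.action_space.n))
    for _ in range(trainingEpisodes):
        state = env.reset()
        for _ in range(iterPerEpisode):
            # epsilon-greedy action selection
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(qTable[state, :])
            nextState, reward, done, info = env.step(action)
            # one-step Q-learning update towards the bootstrapped target
            qTable[state, action] += alpha * (
                reward + gamma * np.max(qTable[nextState, :]) - qTable[state, action])
            state = nextState
            if done:
                break
    return qTable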
Example #3
def run(problem_id=0,
        max_episodes=10000,
        max_iters_per=2000,
        reward_hole=-1.0):
    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=True,
                        reward_hole=reward_hole)

    epsilon = 0.9
    lr_rate = 0.81
    gamma = 0.96
    epsilon_reduce = 1 / max_episodes

    Q = np.zeros((env.observation_space.n, env.action_space.n))

    np.random.seed(12)

    results = []

    for episode in range(max_episodes):
        state = env.reset()
        print('-' * 50)
        print_headers()

        for iter in range(max_iters_per):
            action = choose_action(state, epsilon, Q, env)
            state2, reward, done, info = env.step(action)
            print(",".join([
                str(episode),
                str(iter),
                str(reward),
                str(done),
                str(info),
                str(action)
            ]))

            learn(state, state2, reward, action, Q, gamma, lr_rate)
            state = state2
            if done and reward == reward_hole:
                print('Found a hole in ' + str(iter) + ' iterations')
                results.append({'iters': iter, 'success': False})
                break
            if done:
                print('Found frisbee in ' + str(iter) + ' iterations')
                results.append({'iters': iter, 'success': True})
                break

        epsilon -= epsilon_reduce

    return results
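# choose_action(), learn() and print_headers() are helpers that are not shown in this
# snippet. Minimal sketches, assuming the usual epsilon-greedy selection and one-step
# Q-learning update; the signatures are taken from the call sites above.
def print_headers():
    # column headers for the CSV-style trace printed on every step
    print("episode,iteration,reward,done,info,action")


def choose_action(state, epsilon, Q, env):
    # explore with probability epsilon, otherwise exploit the current Q-table
    if np.random.uniform(0, 1) < epsilon:
        return env.action_space.sample()
    return np.argmax(Q[state, :])


def learn(state, state2, reward, action, Q, gamma, lr_rate):
    # move Q(state, action) towards the bootstrapped target
    target = reward + gamma * np.max(Q[state2, :])
    Q[state, action] = Q[state, action] + lr_rate * (target - Q[state, action])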
Example #4
def generate_grids(cols):
    grids = []
    for i in range(cols):
        map_name_base = '{}x{}-base'.format(cols, cols)
        env = LochLomondEnv(problem_id=i,
                            is_stochastic=True,
                            reward_hole=-0.02,
                            map_name_base=map_name_base)

        env.render()
        grid = EnvMDP.to_decoded(env).reshape(env.nrow * env.ncol)
        grids.append(np.hstack(([i], grid)))

    return grids
Example #5
def main(problemID, mapID):
    problem = int(problemID)
    reward_hole = -1.0
    stochastic = False
    episodes = 100
    mapBase = mapID
    stats = {}

    # start from a known seed
    np.random.seed(12)

    # set up the environment
    env = LochLomondEnv(problem_id=problem,
                        is_stochastic=stochastic,
                        map_name_base=mapBase,
                        reward_hole=reward_hole)

    state_space_locations, state_space_actions, state_initial_id, \
        state_goal_id = env2statespace(env)

    # Insert the solution here to find and output the solution using A-star
    # define the states and actions in a table
    maze_map = search.UndirectedGraph(state_space_actions)
    maze_map.locations = state_space_locations

    maze_problem = search.GraphProblem(state_initial_id, state_goal_id,
                                       maze_map)

    for episode in range(episodes):  # iterate over episodes
        env.reset()  # reset the state of the env to the starting state
        iterations, node = my_astar_search_graph(problem=maze_problem, h=None)
        # -- Trace the solution --#
        solution_path = [node]
        cnode = node.parent
        solution_path.append(cnode)
        while cnode.state != state_initial_id:
            cnode = cnode.parent
            solution_path.append(cnode)

        print("----------------------------------------")
        print("Identified goal state:" + str(solution_path[0]))
        print("Solution trace:" + str(solution_path))
        print("Iterations:" + str(iterations))
        print("----------------------------------------")
        # log stats
    stats["solutiontrace"] = str(solution_path)
    stats["numberofiterations"] = str(iterations)

    return stats
Example #6
    def test_env_2_transitions(self):
        env = LochLomondEnv(problem_id=1, is_stochastic=True, 
                            reward_hole=-0.2, map_name_base="4x4-base")

        mdp = EnvMDP(env)
        transitions = EnvMDP.to_transitions(env)
        # transitions[current_pos][action] = [(prob, newstate)]

        # moving to the left should...
        ## move to the bottom with 0.333 prob
        self.assertAlmostEqual(transitions[(0, 0)][0][2][0], 0.333, places=3)
        self.assertEqual(transitions[(0, 0)][0][2][1], (0, 1))

        ## stay 0.333 prob
        self.assertAlmostEqual(transitions[(0, 0)][0][1][0], 0.333, places=3)
        self.assertEqual(transitions[(0, 0)][0][1][1], (0, 0))

        ## stay 0.333 prob
        self.assertAlmostEqual(transitions[(0, 0)][0][0][0], 0.333, places=3)
        self.assertEqual(transitions[(0, 0)][0][0][1], (0, 0))

        # moving to the down should...
        ## stay 0.333 prob
        self.assertAlmostEqual(transitions[(0, 0)][1][0][0], 0.333, places=3)
        self.assertEqual(transitions[(0, 0)][1][0][1], (0, 0))

        ## move to the bottom with 0.333 prob
        self.assertAlmostEqual(transitions[(0, 0)][1][1][0], 0.333, places=3)
        self.assertEqual(transitions[(0, 0)][1][1][1], (0, 1))

        ## move to the right with 0.333 prob
        self.assertAlmostEqual(transitions[(0, 0)][1][2][0], 0.333, places=3)
        self.assertEqual(transitions[(0, 0)][1][2][1], (1, 0))
Example #7
    def test_env_2_init(self):
        env = LochLomondEnv(problem_id=1, is_stochastic=True, 
                            reward_hole=-0.2, map_name_base="4x4-base")

        mdp = EnvMDP(env)
        initial = EnvMDP.to_position(env, letter=b'S')
        self.assertEqual((1, 0), initial[0])
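# EnvMDP.to_position() belongs to the author's helper code and is not shown in these
# snippets. A sketch of the behaviour implied by the assertion above (and by the
# terminals test in Example #15): scan env.desc and return (col, row) tuples for every
# cell whose letter appears in `letter`. The (col, row) ordering is an assumption.
def to_position(env, letter=b'S'):
    positions = []
    for row in range(env.nrow):
        for col in range(env.ncol):
            if env.desc[row, col] in letter:
                positions.append((col, row))
    return positions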
Example #8
def environment():
    # Setup the parameters for the specific problem (you can change all of these if you want to)
    problem_id = int(sys.argv[1])  # problem_id \in [0:7] generates 8 different problems on which you can train/fine-tune your agent
    reward_hole = 0.0  # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
    is_stochastic = True  # should be False for A-star (deterministic search) and True for the RL agent

    max_episodes = 2000  # you can decide to rerun the problem many times, thus generating many episodes... you can learn from them all!
    max_iter_per_episode = 500  # you decide how many iterations/actions can be executed per episode

    observation_list = list()
    reward_list = list()

    # Generate the specific problem
    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=is_stochastic,
                        reward_hole=reward_hole)

    # Let's visualize the problem/env
    print('env', env.desc)

    # Reset the random generator to a known state (for reproducibility)
    np.random.seed(12)
    return max_episodes, env, max_iter_per_episode, observation_list, reward_list
Example #9
def main(problemID, mapID):
    problem = int(problemID)
    reward_hole = -1.0
    stochastic = True
    episodes = 1000
    iterPerEpisode = 2000
    mapBase = mapID
    successes = 0  # records the number of successes
    stats = {"episodes": {}}
    totalReward = 0  # reward per episode

    # set up the environment
    env = LochLomondEnv(problem_id=problem,
                        is_stochastic=stochastic,
                        map_name_base=mapBase,
                        reward_hole=reward_hole)

    np.random.seed(12)

    for episode in range(episodes):  # iterate over episodes
        print("___________________________________")
        print("EPISODE: " + str(episode))
        observation = env.reset()  # reset the state of the env to the starting state

        reward = 0

        for step in range(iterPerEpisode):
            action = env.action_space.sample()  # your agent goes here (the current agent takes random actions)
            observation, reward, done, info = env.step(action)  # observe what happens when you take the action
            # Check if we are done and monitor rewards etc...

            if done:
                stats["episodes"][episode] = {"steps": step, "reward": reward}
                if reward == 1.0:
                    successes += 1
                totalReward += reward
                break

    successRate = ((successes / episodes) * 100)
    print("Finished")
    print("Success Rate: " + str(successRate) + "%")
    print("Total Reward: " + str(totalReward))
    # log stats
    stats["successrate"] = successRate
    stats["totalreward"] = totalReward
    return stats
Example #10
    def __init__(self, problem_id, map_name_base="8x8-base"):
        # map_name_base="4x4-base"
        if not (0 <= problem_id <= 7):
            raise ValueError("Problem ID must be 0 <= problem_id <= 7")

        self.map_name_base = map_name_base
        self.env = LochLomondEnv(problem_id=problem_id,
                                 is_stochastic=self.is_stochastic(),
                                 reward_hole=self.reward_hole(),
                                 map_name_base=map_name_base)

        self.problem_id = problem_id
        self.reset()
        self.out = 'out/'
        self.policy = {}
        self._train = []
        self.graphs = {}
Example #11
    def test_env(self):
        env = LochLomondEnv(problem_id=0, is_stochastic=True, 
                            reward_hole=-0.02, map_name_base="4x4-base")

        self.assertEqual(b'S', env.desc[0,0])
        self.assertEqual(b'F', env.desc[0,1])
        self.assertEqual(b'H', env.desc[1,1])
        self.assertEqual(b'G', env.desc[3,0])
Example #12
    def test_env_mdp(self):
        env = LochLomondEnv(problem_id=1, is_stochastic=True, 
                            reward_hole=-0.2, map_name_base="4x4-base")

        mdp = EnvMDP(env)
        self.assertEqual(4, mdp.rows)
        self.assertEqual(4, mdp.cols)
        self.assertAlmostEqual(-0.2, mdp.grid[3][0])
        self.assertTrue((0, 1) in mdp.states)
        self.assertEqual((1, 1), mdp.terminals[0])
Example #13
    def test_env_2_grid(self):
        env = LochLomondEnv(problem_id=0, is_stochastic=True, 
                            reward_hole=-0.2, map_name_base="4x4-base")

        mdp = EnvMDP(env)
        grid = EnvMDP.to_grid_matrix(env)
        self.assertEqual(0, grid[0,0])
        self.assertEqual(0, grid[0,1])
        self.assertEqual(-0.2, grid[1,1])
        self.assertEqual(env.reward, grid[3,0])
Example #14
def train_for_one_problem(problem_id, map_name):
    # problem_id \in [0:7] generates 8 different problems on which you can train/fine-tune your agent
    reward_hole = 0.0  # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
    is_stochastic = False  # should be False for A-star (deterministic search) and True for the RL agent

    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=is_stochastic,
                        reward_hole=reward_hole,
                        map_name_base=map_name)
    env.reset()

    done = False
    total_test_num = 60000
    restart_times = 0
    succeed_times = 0
    shortest_path = 100
    one_map_succeed_percentage = []

    for i in range(total_test_num):
        restart_times += 1
        done = False
        n_actions_for_episode = 0
        while not done:
            n_actions_for_episode += 1
            action = env.action_space.sample()  # take a random action from the available actions
            observation, reward, done, info = env.step(action)

            if done:
                print("\rProblem:%s Episodes #%s / 60000" %
                      (problem_id, restart_times),
                      end='')
                if reward == 1.0:
                    if shortest_path > n_actions_for_episode:
                        shortest_path = n_actions_for_episode
                    succeed_times += 1
                else:
                    env.reset()

    print("\nSucceed Times:", succeed_times)
    print("Total Times:", total_test_num)
    print("Shortest path:", shortest_path)

    env.close()
    one_map_succeed_percentage = succeed_times / total_test_num
    return one_map_succeed_percentage
Example #15
    def test_env_2_terminals(self):
        env = LochLomondEnv(problem_id=1, is_stochastic=True, 
                            reward_hole=-0.2, map_name_base="4x4-base")

        mdp = EnvMDP(env)
        terminals = EnvMDP.to_position(env, letter=b'GH')

        self.assertEqual((1, 1), terminals[0])
        self.assertEqual((3, 1), terminals[1])
        self.assertEqual((3, 2), terminals[2])
        self.assertEqual((0, 3), terminals[3])
        self.assertEqual((1, 3), terminals[4])
Example #16
def run(problem_id=0, max_episodes=10000, max_iters_per=2000, reward_hole=0.0):
    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=False,
                        reward_hole=reward_hole)

    statespace_locs, statespace_actions, statespace_init, statespace_goal = env2statespace(
        env)
    maze_problem = GraphProblem(statespace_init, statespace_goal,
                                UndirectedGraph(statespace_actions))

    np.random.seed(12)

    results = []

    for episode in range(max_episodes):
        print('-' * 50)
        env.reset()
        func = memoize(maze_problem.h, 'func')
        frontier = PriorityQueue('min', func)
        node = Node(maze_problem.initial)
        frontier.append(node)
        seen = set()
        for iter in range(max_iters_per):
            node = frontier.pop()
            print(",".join([str(episode), str(iter), node.state]))
            if maze_problem.goal_test(node.state):
                print('done')
                results.append({'iters': iter, 'success': True})
                break
            seen.add(node.state)
            for possible in node.expand(maze_problem):
                if possible.state not in seen and possible not in frontier:
                    frontier.append(possible)
                elif possible in frontier:
                    if func(possible) < frontier[possible]:
                        del frontier[possible]
                        frontier.append(possible)

    return results
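# A small usage sketch (an assumption, not part of the original snippet): aggregate the
# per-episode results returned by run() into a success rate.
def summarise_run(results):
    successes = sum(1 for r in results if r['success'])
    return 100.0 * successes / max(len(results), 1)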
def search_for_one_solution(problem_id, map_name, plot_or_not):
    problem_id = problem_id
    reward_hole = 0.0
    is_stochastic = False
    if  map_name == '4x4-base':
        n_dim = 4
    else:
        n_dim = 8

    env = LochLomondEnv(problem_id = problem_id, is_stochastic = is_stochastic, reward_hole = reward_hole, map_name_base = map_name)
    env.reset()
    # Create a dict representation of the state space
    state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)

    #--------------SOLUTION--------------#
    maze_map = UndirectedGraph(state_space_actions)
    maze_map.locations = state_space_locations
    maze_problem = GraphProblem(state_initial_id, state_goal_id, maze_map)

    iterations, _, node = my_astar_search_graph(problem=maze_problem, h=None)
    #-------------Trace the solution-----------------#
    solution_path = [node]
    cnode = node.parent
    solution_path.append(cnode)
    i = 0
    while cnode.state != state_initial_id:
        i += 1
        cnode = cnode.parent
        solution_path.append(cnode)

    solution = []
    solution_x = []
    solution_y = []
    for s in str(solution_path).split('_',-1):
        for s_s in str(s).split('>',-1):
            if s_s.isdigit():
                solution.append(s_s)
    for i in range(int(len(solution)/2)):
        solution_y.append(int(solution[i*2]))
        solution_x.append(int(solution[i*2+1]))

    print("Steps:",i)
    print("Goal state:"+str(solution_path[0]))
    print("Final Solution:",solution_path[::-1])
    print("----------------------------------------")
    env.close()

    plt.cla()
    plt.plot(solution_x[::-1], solution_y[::-1])
    plt.scatter(solution_x[::-1], solution_y[::-1],s=120)
    plt.xlim(0,n_dim-1)
    plt.ylim(n_dim-1,0)
    plt.grid(True)
    plt.title("Simple Agent Solution for Problem%s" % problem_id)
    plt.savefig('./Images/%sx%s maps: Simple Agent Solution for Problem%s.jpg' % (n_dim,n_dim,problem_id))
    print("Figure Saved in Folder 'Images'")
    if plot_or_not == True:
        plt.show()
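# my_astar_search_graph() is the author's helper and is not shown in these snippets. A
# plausible sketch, assuming it wraps the AIMA best-first graph search (Node,
# PriorityQueue and memoize, as used in Example #16) and additionally returns an
# iteration count and a node-colour trace, matching the
# (iterations, all_node_colors, node) call sites here and in Example #21; Example #5
# unpacks only two values, so the exact return arity is an assumption.
def my_astar_search_graph(problem, h=None):
    h = memoize(h or problem.h, 'h')                # heuristic (falls back to the problem's own h)
    f = memoize(lambda n: n.path_cost + h(n), 'f')  # A* evaluation function f = g + h
    node = Node(problem.initial)
    frontier = PriorityQueue('min', f)
    frontier.append(node)
    explored = set()
    iterations = 0
    all_node_colors = []  # placeholder trace; the real helper records colours for plotting
    while frontier:
        node = frontier.pop()
        iterations += 1
        if problem.goal_test(node.state):
            return iterations, all_node_colors, node
        explored.add(node.state)
        for child in node.expand(problem):
            if child.state not in explored and child not in frontier:
                frontier.append(child)
            elif child in frontier and f(child) < frontier[child]:
                del frontier[child]
                frontier.append(child)
    return iterations, all_node_colors, None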
Example #18
def environment():
    # Set up the Environment
    problem_id = int(sys.argv[1])
    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=False, reward_hole=-0.01)

    total_episodes = 10000
    max_steps = 1000
    lr_rate = 0.80
    gamma = 0.96
    epsilon = 0.9

    Q = np.zeros((env.observation_space.n, env.action_space.n))
    return env, problem_id, epsilon, total_episodes, max_steps, lr_rate, gamma, Q
Example #19
def run(problem_id=0, max_episodes=10000, max_iters_per=2000, reward_hole=0.0):
    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=True,
                        reward_hole=reward_hole)

    np.random.seed(12)

    results = []

    for episode in range(max_episodes):
        env.reset()
        print('-' * 50)
        print_headers()

        for iteration in range(max_iters_per):
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            print(",".join([
                str(episode),
                str(iteration),
                str(reward),
                str(done),
                str(info),
                str(action)
            ]))

            if done and reward == reward_hole:
                env.render()
                print("Hole Found in " + str(iteration) + " iterations")
                results.append({'iters': iteration, 'success': False})
                break

            if done and reward == 1.0:
                env.render()
                print("Frisbee acquired in " + str(iteration) + " iterations")
                results.append({'iters': iteration, 'success': True})
                break

    return results
Example #20
"""
Runs the LochLomondEnv problem using the random agent. Takes as input
a command line argument which specifies the problem ID.
"""
import sys
from uofgsocsai import LochLomondEnv
from random_agent import RandomAgent, process_data_random
from constants import (REWARD_HOLE_RANDOM, MAX_EPISODES, MAX_ITERS_PER_EPISODE,
                       IS_STOCHASTIC_RANDOM)

if len(sys.argv) == 2:
    PROBLEM_ID = int(sys.argv[1])
else:
    PROBLEM_ID = 0

env = LochLomondEnv(problem_id=PROBLEM_ID,
                    is_stochastic=IS_STOCHASTIC_RANDOM,
                    reward_hole=REWARD_HOLE_RANDOM)
rand_agent = RandomAgent(env)
process_data_random(env, rand_agent, MAX_EPISODES, MAX_ITERS_PER_EPISODE,
                    REWARD_HOLE_RANDOM, PROBLEM_ID)
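# RandomAgent and process_data_random are imported from the author's random_agent
# module, which is not shown here. Minimal sketches of what that module might contain,
# inferred from the call site above; the bodies are assumptions (reward_hole is
# accepted only for interface compatibility).
class RandomAgent:
    """An agent that ignores the observation and samples a random action."""

    def __init__(self, env):
        self.env = env

    def act(self):
        return self.env.action_space.sample()


def process_data_random(env, agent, max_episodes, max_iters, reward_hole, problem_id):
    rewards = []
    for episode in range(max_episodes):
        env.reset()
        total = 0.0
        for _ in range(max_iters):
            observation, reward, done, info = env.step(agent.act())
            total += reward
            if done:
                break
        rewards.append(total)
    print("Problem {}: mean reward over {} episodes = {:.3f}".format(
        problem_id, max_episodes, sum(rewards) / len(rewards)))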
Example #21
def main(problem_id, map_name_base): 
    # simple agent referenced and adapted from the Lab 4 notebook by tutor Prof. Bjorn Jensen for the AI course (2019-20)
    if problem_id < 0 or problem_id > 7:
        print("Problem ID should be between 0 and 7")

    if map_name_base not in ("8x8-base", "4x4-base"):
        print("Map base can be 8x8-base or 4x4-base")
    
    reward_hole = -1.0     
    is_stochastic = False  

    max_episodes = 10000 

    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic, map_name_base=map_name_base, reward_hole=reward_hole)

    env.action_space.sample() 

    print(env.desc)
    EpisodeStats = namedtuple("Stats",["episode_lengths", "episode_rewards"])
    state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)

    frozen_lake_map = UndirectedGraph(state_space_actions)
    frozen_lake_map.locations = state_space_locations
    frozen_lake_problem = GraphProblem(state_initial_id, state_goal_id, frozen_lake_map)

    all_node_colors=[]
    iterations, all_node_colors, node = my_astar_search_graph(problem=frozen_lake_problem, h=None)

    solution_path = [node]
    cnode = node.parent
    solution_path.append(cnode)
    while cnode.state != "S_00_00":    
        cnode = cnode.parent
        if cnode is None:
            break
        solution_path.append(cnode)


    steps = solution_path[::-1]
    # Reset the random generator to a known state (for reproducibility)
    np.random.seed(12)
    
    observation = env.reset() # reset the state of the env to the starting state     

    stats = EpisodeStats(episode_lengths=np.zeros(max_episodes),episode_rewards=np.ones(max_episodes))
    for e in range(max_episodes): # iterate over episodes

        observation = env.reset() # reset the state of the env to the starting state     

        for i in range(len(steps)-1):
            action = get_action_from_states(steps[i], steps[i+1])  # follow the A*-derived path instead of sampling random actions

            observation, reward, done, info = env.step(action)  # observe what happens when you take the action
            # update stats
            stats.episode_rewards[e] = reward
            stats.episode_lengths[e] = i
    
        # Check if we are done and monitor rewards etc...
        if done:
            print("We have reached the goal :-) [stop trying to move; we can't]. That's ok, we have achieved the goal... perhaps try again?")
            break

    return (stats)
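# get_action_from_states() is not defined in this snippet. A sketch of what it might do,
# assuming the A* nodes carry state ids of the form "S_<row>_<col>" (e.g. "S_00_01";
# the field order is an assumption based on how Example #17 parses the solution path)
# and that actions use the FrozenLake encoding 0=LEFT, 1=DOWN, 2=RIGHT, 3=UP.
def get_action_from_states(node_a, node_b):
    _, row_a, col_a = node_a.state.split('_')
    _, row_b, col_b = node_b.state.split('_')
    d_row = int(row_b) - int(row_a)
    d_col = int(col_b) - int(col_a)
    if d_col == -1:
        return 0  # LEFT
    if d_row == 1:
        return 1  # DOWN
    if d_col == 1:
        return 2  # RIGHT
    return 3      # UP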
Example #22
def main(problem_id, map_name_base):
    # RL agent referenced from the Lab 8 and 9 notebooks provided by tutor Prof. Bjorn Jensen for the AI course (2019-20), University of Glasgow
    if problem_id < 0 or problem_id > 7:
        print("Problem ID should be between 0 and 7")

    if map_name_base not in ("8x8-base", "4x4-base"):
        print("Map base can be 8x8-base or 4x4-base")

    reward_hole = -0.05  #Hole penalty is set based on analysis to ensure that the reward is maximized
    is_stochastic = True
    EpisodeStats = namedtuple("Stats", ["episode_lengths", "episode_rewards"])

    np.random.seed(12)
    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=is_stochastic,
                        reward_hole=reward_hole,
                        map_name_base=map_name_base)

    states = env.observation_space.n
    actions = env.action_space.n
    Q = np.zeros((states, actions))

    max_episodes = 10000
    max_iter_per_episode = 1000

    alpha = 0.1  #learning rate
    gamma = 0.999  #discount rate
    epsilon = 1
    stats = EpisodeStats(episode_lengths=np.zeros(max_episodes),
                         episode_rewards=np.zeros(max_episodes))

    for episode in range(max_episodes):
        state = env.reset()

        for step in range(max_iter_per_episode):
            # take best action according to Q-table if random value is greater than epsilon, otherwise take a random action
            random_value = random.uniform(0, 1)
            if random_value > epsilon:
                action = np.argmax(Q[state, :])  #Agent goes here
            else:
                action = env.action_space.sample()

            new_state, reward, done, info = env.step(action)
            Q[state, action] = Q[state, action] + alpha * (
                reward + gamma * np.max(Q[new_state, :]) - Q[state, action])
            stats.episode_rewards[episode] += reward
            stats.episode_lengths[episode] = step
            state = new_state

            if done:
                break

        epsilon = 0.01  #epsilon is set to a low value to make sure of the exploitation

    print(Q)

    return (stats)
Example #23
def train_for_one_model(problem_id, map_name, train_or_not):
    # problem_id \in [0:7] generates 8 different problems on which you can train/fine-tune your agent
    reward_hole = -0.01      # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
    is_stochastic = True  # should be False for A-star (deterministic search) and True for the RL agent

    if  map_name == '4x4-base':
        n_dim = 4
        num_episodes = 100000
    else:
        num_episodes = 300000
        n_dim = 8

    env = LochLomondEnv(problem_id = problem_id, is_stochastic = is_stochastic, reward_hole = reward_hole, map_name_base = map_name)

    restart_times = 0
    n_actions_for_episode = 0
    rewards_all_episodes = []
    rewards_all_episodes_per_2000 = []
    x_axis_rewardsvsepisodes = []
    episode_steps = []
    max_steps_per_episode = 10000
    epsilon = 0.3  # initial exploration rate (decayed at the end of each episode)
    q_table = np.zeros([env.observation_space.n,env.action_space.n])
    learning_rate = 0.3
    discount = 0.5
    if problem_id == 0 and n_dim == 8:
        learning_rate = 0.2
        discount = 0.8
    if problem_id == 0 and n_dim == 4:
        learning_rate = 0.4
        discount = 0.7
    epsilon_min = 0.005
    epsilon_decay_rate = 0.99995
    shortest_path = 10000
    longest_path = 0
    avg_path = []
    Train_or_not = train_or_not

    if Train_or_not == True:
        #--------------Training Process-----------------#
        for episode in range(num_episodes):
            restart_times += 1
            state = env.reset()
            done = False
            rewards_current_episode = 0
            path = [state]
            if restart_times % 5000 == 0:
                print("\ntraining in progress: #", restart_times)
            for step in range(max_steps_per_episode):
                n_actions_for_episode += 1
                # Exploration - exploitation trade-off
                exploration_exploitation_rate = random.uniform(0, 1)

                if exploration_exploitation_rate < epsilon or q_table[state, :].all() == 0:
                    action = env.action_space.sample()  # explore: take a random action from the available actions
                else:
                    action = np.argmax(q_table[state, :] + np.random.randn(1, 4))  # exploit: select the action with the highest Q-value (plus a small random tie-breaker)

                new_state, reward, done, info = env.step(action)
                path.append(new_state)

                # Update Q-table
                q_table[state, action] = q_table[state, action] + learning_rate * (reward + discount * np.max(q_table[new_state, :]) - q_table[state, action])

                state = new_state
                # rewards_current_episode += reward

                if done == True and reward == 1:
                    print("\rEpisode #%s: Finish it within %d steps" % (restart_times, len(path)),end = '')
                    break
                if done == True and reward == -0.01:
                    break

            # epsilon decay
            if epsilon >= epsilon_min:
                epsilon *= epsilon_decay_rate

            # rewards_all_episodes.append(rewards_current_episode)

            # if restart_times % 2000 == 0:
            #     avg_reward_2000 = np.sum(rewards_all_episodes) / (2000 * (restart_times / 2000))
            #     rewards_all_episodes_per_2000.append(avg_reward_2000)
            #     x_axis_rewardsvsepisodes.append(2000 * (restart_times / 2000))
        #---------------SAVE THE MODEL--------------------#
        np.save('%sx%s q_tableP%s.npy' % (n_dim, n_dim, problem_id), q_table)

    #--------FINAL TEST-----------#
    if(train_or_not == True):
        print("\nRunning Test for 50000 times. Please wait...")
    q_table = np.load('%sx%s q_tableP%s.npy' % (n_dim, n_dim, problem_id)) # Load the trained model q table
    env.reset()
    state = env.reset()
    test_total_num = 50000
    test_fail_num = 0
    test_succeed_num = 0
    Avg_rewards_per_1000_episodes = []
    Avg_reward_per_step = []
    Avg_reward_per_episode = []


    for k in range(test_total_num):
        s = env.reset()
        j=0
        rewards_temp = 0

        while j < 1000:

            j += 1
            action = np.argmax(q_table[s,:])
            new_state,r,done,b = env.step(action)
            rewards_temp += r
            s = new_state
            if done and r == -0.01:
                test_fail_num += 1
                break
            if done and r == 1.0:
                avg_path.append(j)
                if shortest_path > j:
                    shortest_path = j
                if longest_path < j:
                    longest_path = j
                test_succeed_num += 1.0
                break
            if j == 1000:
                test_fail_num += 1
        Avg_reward_per_episode.append(rewards_temp)
        Avg_reward_per_step.append(rewards_temp / j)

        if k % 1000 == 0 and k > 0:
            Avg_rewards_per_1000_episodes.append(np.sum(Avg_reward_per_episode) / k)
            x_axis_rewardsvsepisodes.append(k)
    #--------------OUTPUT FINAL RESULT-----------------#
    if (train_or_not == True):
        print("\n-------------------------------------------")
        print("Average rewards per 1000 episodes:",Avg_rewards_per_1000_episodes[-1])
        print("Average rewards per steps:", Avg_reward_per_step[-1])
        print("Success times:",test_succeed_num)
        print("Failure times:",test_fail_num)
        print("Success rate:",float(test_succeed_num / test_total_num))
        print("Success vs Failure rate:",float(test_succeed_num / test_fail_num))
        print("Steps number (Best case):",shortest_path)
        print("Steps number (Worst case):",longest_path)
        print("Steps number (On average):",np.mean(avg_path))
        print("Learning rate:",learning_rate)

    plt.cla()
    plt.plot(x_axis_rewardsvsepisodes[:], Avg_rewards_per_1000_episodes[:])
    plt.savefig('./Images/%sx%s maps: Average Rewards of Problem%s.jpg' % (n_dim,n_dim,problem_id))
    if (train_or_not == True):
        print("Figure Saved in Folder 'Images'")
        plt.show()
    return test_succeed_num, test_fail_num, shortest_path, longest_path,np.mean(avg_path), learning_rate, Avg_rewards_per_1000_episodes[-1], Avg_reward_per_step[-1]
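# A usage sketch (an assumption, not part of the original snippet): load the saved
# Q-tables and evaluate every problem on the 8x8 map without retraining.
if __name__ == "__main__":
    summary = {}
    for pid in range(8):
        summary[pid] = train_for_one_model(pid, '8x8-base', False)
    print(summary)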
Example #24
import numpy as np
import time
from uofgsocsai import LochLomondEnv
import os, sys
from helpers import *
import networkx as nx
from search import *


# Setup the parameters
problem_id = int(sys.argv[1])
reward_hole = 0.0
is_stochastic = True

# Generate the environment
env = LochLomondEnv(problem_id=problem_id, is_stochastic=False, reward_hole=reward_hole)


print(env.desc)

state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)

maze_map = UndirectedGraph(state_space_actions)

# initialise a graph
G = nx.Graph()

node_labels = dict()
node_colors = dict()
for n, p in state_space_locations.items():
    G.add_node(n)            # add nodes from locations
Example #25
def main(p_id):
    # Setup the parameters for the specific problem (you can change all of these if you want to)
    problem_id = int(p_id)    # problem_id \in [0:7] generates 8 different problems on which you can train/fine-tune your agent
    reward_hole = 0.0     # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
    is_stochastic = False  # should be False for A-star (deterministic search) and True for the RL agent

    max_episodes = 2000
    max_iter_per_episode = 500 # you decide how many iterations/actions can be executed per episode

    # Generate the specific problem
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=False,   reward_hole=reward_hole)

    # Let's visualize the problem/env
    # print("grid= \n")
    # print(env.desc)
    # env.render
    g = Grid(env.desc)

    # Create a representation of the state space for use with AIMA A-star
    # state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)

    # print(state_goal_id)
    # Reset the random generator to a known state (for reproducibility)
    np.random.seed(12)
    #setup vars for logfile
    f= open("out_AStar_{}.txt".format(problem_id) ,"w+")
    successes = 0
    failures = 0
    ####
    for e in range(max_episodes): # iterate over episodes
        observation = env.reset() # reset the state of the env to the starting state
        steps = aStar(g)
        for iter in range(max_iter_per_episode):
            # env.render() # for debugging/development you may want to visualize the individual steps by uncommenting this line

            action = steps[iter]
            # print(action)
            observation, reward, done, info = env.step(action) # observe what happends when you take the action

            # TODO: You'll need to add code here to collect the rewards for plotting/reporting in a suitable manner

            # Check if we are done and monitor rewards etc...
            if(done and reward==reward_hole):
            # env.render()
                # print("Failure")
                failures += 1
                f.write("e,iter,reward,done = " + str(e) + " " + str(iter)+ " " + str(reward)+ " Fail\n")
                # f.write("We have reached a hole :-( [we can't move so stop trying; just give up]\n")
                break

            if (done and reward == +1.0):
                # env.render()
                successes += 1
                # print("Success")
                f.write("e,iter,reward,done = " + str(e) + " " + str(iter)+ " " + str(reward)+ " Success\n")
                # f.write("We have reached the goal :-) [stop trying to move; we can't]. That's ok we have achived the goal]\n")
                break


    f.write("Successes: " + str(successes))
    f.write("\n")
    f.write("Failures: " + str(failures))
    successRate = successes / max_episodes * 100
    dict = {"Success": successes,
            "Failures": failures,
            "Episodes": max_episodes,
            "SuccessRate": successRate}
    # print(dict)
    return dict
import os, sys
from helpers import *
print("Working dir:" + os.getcwd())
print("Python version:" + sys.version)

# Setup the parameters for the specific problem (you can change all of these if you want to)
problem_id = 0  # problem_id \in [0:7] generates 8 different problems on which you can train/fine-tune your agent
reward_hole = 0.0  # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
is_stochastic = True  # should be False for A-star (deterministic search) and True for the RL agent

max_episodes = 2000  # you can decide to rerun the problem many times, thus generating many episodes... you can learn from them all!
max_iter_per_episode = 500  # you decide how many iterations/actions can be executed per episode

# Generate the specific problem
env = LochLomondEnv(problem_id=problem_id,
                    is_stochastic=False,
                    reward_hole=reward_hole)

# Let's visualize the problem/env
print(env.desc)

# Create a representation of the state space for use with AIMA A-star
state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(
    env)

# Reset the random generator to a known state (for reproducibility)
np.random.seed(12)

####
for e in range(max_episodes):  # iterate over episodes
    observation = env.reset()  # reset the state of the env to the starting state
Example #27
from uofgsocsai import LochLomondEnv
import os
import sys
from helpers import *

problem_id = 0
reward_hole = 0.0
is_stochastic = True
max_episodes = 2000
max_iter_per_episode = 2000

map_name_base = "8x8-base"

# Generate the specific problem
env = LochLomondEnv(problem_id=problem_id,
                    is_stochastic=is_stochastic,
                    map_name_base=map_name_base,
                    reward_hole=reward_hole)

# Create a representation of the state space for use with AIMA A-star
state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(
    env)
print(state_space_locations)

# Reset the random generator to a known state (for reproducibility)
np.random.seed(12)

# Run a random/senseless agent
for e in range(max_episodes):  # iterate over episodes
    observation = env.reset()  # reset the state of the env to the starting state
Example #28
from uofgsocsai import LochLomondEnv # load the class defining the custom Open AI Gym problem

# Setup the parameters for the specific problem (you can change all of these if you want to) 
problem_id = 0        # problem_id \in [0:7] generates 8 different problems on which you can train/fine-tune your agent
reward_hole = 0.0     # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
is_stochastic = True  # should be False for A-star (deterministic search) and True for the RL agent

max_episodes = 2000   # you can decide to rerun the problem many times, thus generating many episodes... you can learn from them all!
max_iter_per_episode = 500 # you decide how many iterations/actions can be executed per episode


observation_list= list()
reward_list= list()

# Generate the specific problem 
env = LochLomondEnv(problem_id=problem_id, is_stochastic=True, reward_hole=reward_hole)

# Let's visualize the problem/env
print('env',env.desc)

# Reset the random generator to a known state (for reproducibility)
np.random.seed(12)

for e in range(max_episodes): # iterate over episodes
    observation = env.reset() # reset the state of the env to the starting state     
    
    for iter in range(max_iter_per_episode):
      env.render() # for debugging/development you may want to visualize the individual steps by uncommenting this line
      #action = env.action_space.sample() # your agent goes here (the current agent takes random actions)
      random = runRandom()
      action = random.action()
Example #29
import sys

import numpy as np

from constants import (REWARD_HOLE_RANDOM, REWARD_HOLE_SIMPLE, REWARD_HOLE_Q,
                       IS_STOCHASTIC_RANDOM, IS_STOCHASTIC_SIMPLE,
                       IS_STOCHASTIC_Q)
from q_agent import QLearningAgent, process_data_q
from random_agent import RandomAgent, process_data_random
from simple_agent import SimpleAgent, process_data_simple
from uofgsocsai import LochLomondEnv

# Reads command line argument and stores in PROBLEM_ID,
# to specify the problem; if this hasn't been provided, then
# just set a default of 0
if len(sys.argv) == 2:
    PROBLEM_ID = int(sys.argv[1])
else:
    PROBLEM_ID = 0


env_random = LochLomondEnv(
    problem_id=PROBLEM_ID, is_stochastic=IS_STOCHASTIC_RANDOM, reward_hole=REWARD_HOLE_RANDOM)
env_simple = LochLomondEnv(
    problem_id=PROBLEM_ID, is_stochastic=IS_STOCHASTIC_SIMPLE, reward_hole=REWARD_HOLE_SIMPLE)
env_qlearn = LochLomondEnv(
    problem_id=PROBLEM_ID, is_stochastic=IS_STOCHASTIC_Q, reward_hole=REWARD_HOLE_Q)


start_index = np.where(env_qlearn.desc == b'S')
row, col = start_index[0][0], start_index[1][0]
start = row*8 + col
end_index = np.where(env_qlearn.desc == b'G')
row, col = end_index[0][0], end_index[1][0]
goal = row*8 + col
holes = np.where(env_qlearn.desc == b'H')
terminals = []
for i in range(len(holes[0])):
Example #30
try:
    temp_id = int(sys.argv[1])
except IndexError:
    print("There is no input number so the problem id is set to default 0.")
    temp_id = 0

# Setup the parameters for the specific problem (you can change all of these if you want to)
# problem_id \in [0:7] generates 8 different problems on which you can train/fine-tune your agent
problem_id = temp_id
# should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
reward_hole = 0.0
# should be False for A-star (deterministic search) and True for the RL agent
is_stochastic = True

# Load Environment and Q-table structure
env = LochLomondEnv(problem_id=problem_id,
                    is_stochastic=is_stochastic,
                    reward_hole=reward_hole)

# number of learning episodes
max_episodes = 10000
# you decide how many iterations/actions can be executed per episode
max_iter_per_episode = 2000


# random agent
def random_agent(env, problem_id, max_episodes):
    output_file = f'out_random_{problem_id}.pkl'
    n_states = env.observation_space.n
    random_agent_dict = {}  # define a dict to save random actions
    # set random action to a dict
    for state in range(n_states):