Example 1
import json

import gym
import matplotlib.pyplot as plt

# qTable, epsilonGreedy, LEARNING_RATE and DISCOUNT come from the surrounding
# project; a sketch of what the listings assume follows this example.


def main():
    env = gym.make('FrozenLake-v0')
    rewardWindow = [0 for _ in range(100)]
    qtab = qTable(env.observation_space.n, env.action_space.n)
    
    # Exercise 5 specific: load the Q-table exported by exercise 3
    ex3qtab = qTable(env.observation_space.n, env.action_space.n)
    with open("ex3qtable.json", 'r') as f:
        ex3qtab.table = json.loads(f.read())
    epsilon = 0.1
    ep = []
    rew = []
    for i_episode in range(8000):
        observation = env.reset()
        accumulatedReward = 0
        for t in range(10000):
            #Render environment
            #env.render()
            #Select action
            action = epsilonGreedy(epsilon, env, observation, qtab)
            #Perform action
            prevObs = observation
            observation, reward, done, info = env.step(action)
            accumulatedReward += reward
            #Update Q
            oldQ = qtab.getQ(prevObs, action)
            currQ = qtab.getQ(observation, ex3qtab.getMaxQAction(observation))  # ex5: action chosen by the fixed ex3 table, value taken from the table being learned
            newQ = oldQ + LEARNING_RATE*(reward + DISCOUNT*currQ - oldQ)
            qtab.setQ(prevObs, action, newQ)
            #Check if episode is done
            if done:
                rewardWindow[i_episode % 100] = accumulatedReward
                ep.append(i_episode)
                break
        #Decrease exploration rate
        epsilon *= 0.9995  # decays from 0.1 to roughly 0.002 over 8000 episodes
        windowSum = sum(rewardWindow)  # total reward over the last 100 episodes
        rew.append(windowSum / 100)
        print(i_episode, " ", windowSum, end='\r')
        if windowSum >= 78:  # stop once the 100-episode average reaches 0.78
            break
    plt.plot(ep, rew)
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.title('Frozen Lake on policy')
    plt.grid(True)
    plt.savefig("op.png")
    plt.show()
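None of the listings define qTable, epsilonGreedy, LEARNING_RATE or DISCOUNT; they come from the surrounding exercise code. Below is a minimal sketch of what the examples appear to assume, inferred from the calls above. The class and function names are taken from the listings, but the bodies and the two constant values are reconstructions, not the original implementation.

import random

LEARNING_RATE = 0.1  # assumed value; the listings only reference the constant
DISCOUNT = 0.99      # assumed value

class qTable:
    """Tabular Q-function backed by a JSON-serializable list of lists."""
    def __init__(self, nStates, nActions):
        self.table = [[0.0 for _ in range(nActions)] for _ in range(nStates)]

    def getQ(self, state, action):
        return self.table[state][action]

    def setQ(self, state, action, value):
        self.table[state][action] = value

    def getMaxQ(self, state):
        return max(self.table[state])

    def getMaxQAction(self, state):
        row = self.table[state]
        return row.index(max(row))

def epsilonGreedy(epsilon, env, observation, qtab):
    """With probability epsilon take a random action, otherwise act greedily on qtab."""
    if random.random() < epsilon:
        return env.action_space.sample()
    return qtab.getMaxQAction(observation)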
Example 2
import json

import gym
import matplotlib.pyplot as plt

# qTable, epsilonGreedy, LEARNING_RATE and DISCOUNT as in Example 1.


def main():
    env = gym.make('FrozenLake-v0')
    rewardWindow = [0 for _ in range(100)]
    qtab = qTable(env.observation_space.n, env.action_space.n)
    epsilon = 0.1
    ep = []
    rew = []
    for i_episode in range(8000):
        observation = env.reset()
        accumulatedReward = 0
        for t in range(10000):
            #Render environment
            #env.render()
            #Select action
            action = epsilonGreedy(epsilon, env, observation, qtab)
            #Perform action
            prevObs = observation
            observation, reward, done, info = env.step(action)
            accumulatedReward += reward
            #Update Q
            oldQ = qtab.getQ(prevObs, action)
            maxCurrQ = qtab.getMaxQ(observation)
            newQ = oldQ + LEARNING_RATE * (reward + DISCOUNT * maxCurrQ - oldQ)
            qtab.setQ(prevObs, action, newQ)
            #Check if episode is done
            if done:
                rewardWindow[i_episode % 100] = accumulatedReward
                ep.append(i_episode)
                break
        #Decrease exploration rate
        epsilon *= 0.9995  # decays from 0.1 to roughly 0.002 over 8000 episodes
        windowSum = sum(rewardWindow)  # total reward over the last 100 episodes
        print(i_episode, " ", windowSum, end='\r')
        rew.append(windowSum / 100)
        if windowSum >= 78:  # stop once the 100-episode average reaches 0.78
            break
    plt.plot(ep, rew)
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.title('Frozen Lake Q learning')
    plt.grid(True)
    plt.savefig("qlrn.png")
    plt.show()
    """
    Export qtable to json
    """
    f = open("ex3qtable.json", 'w')
    f.write(json.dumps(qtab.table))
    f.close()
Example 3
import gym
import matplotlib.pyplot as plt

# qTable, epsilonGreedy, LEARNING_RATE and DISCOUNT as in Example 1.


def main():
    env = gym.make('FrozenLake-v0')
    rewardWindow = [0 for _ in range(100)]
    qtab = qTable(env.observation_space.n, env.action_space.n)
    epsilon = 0.1
    ep = []
    rew = []
    for i_episode in range(8000):
        observation = env.reset()
        action = epsilonGreedy(epsilon, env, observation, qtab)
        accumulatedReward = 0
        for t in range(100):
            #Render environment
            #env.render()
            #Perform action
            prevObs = observation
            observation, reward, done, info = env.step(action)
            accumulatedReward += reward
            #Select action
            prevAct = action
            action = epsilonGreedy(epsilon, env, observation, qtab)
            #Update Q
            oldQ = qtab.getQ(prevObs, prevAct)
            actQ = qtab.getQ(observation, action)
            newQ = oldQ + LEARNING_RATE * (reward + DISCOUNT * actQ - oldQ)
            qtab.setQ(prevObs, prevAct, newQ)
            #Check if episode is done
            if done:
                rewardWindow[i_episode % 100] = accumulatedReward
                ep.append(i_episode)
                break
        #Decrease exploration rate
        epsilon *= 0.9995
        windowSum = sum(rewardWindow)  # total reward over the last 100 episodes
        rew.append(windowSum / 100)
        print(i_episode, " ", windowSum, end='\r')
        if windowSum >= 78:  # stop once the 100-episode average reaches 0.78
            break
    plt.plot(ep, rew)
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.title('Frozen Lake SARSA')
    plt.grid(True)
    plt.savefig("sarsa.png")
    plt.show()
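The only difference from Example 2 is the bootstrap term. Q-learning (Example 2) backs up the greedy value of the next state, while SARSA backs up the value of the action actually taken, with $\alpha$ = LEARNING_RATE and $\gamma$ = DISCOUNT:

$$Q(s,a) \leftarrow Q(s,a) + \alpha\,[\,r + \gamma \max_{a'} Q(s',a') - Q(s,a)\,] \qquad \text{(Q-learning, Example 2)}$$
$$Q(s,a) \leftarrow Q(s,a) + \alpha\,[\,r + \gamma\, Q(s',a') - Q(s,a)\,] \qquad \text{(SARSA, this example)}$$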
Example 4
import gym
import matplotlib.pyplot as plt

# qTable, epsilonGreedy, LEARNING_RATE and DISCOUNT as in Example 1.


def main():
    env = gym.make('Taxi-v1')
    rewardWindow = [0 for _ in range(100)]
    qtab = qTable(env.observation_space.n, env.action_space.n)
    epsilon = 0.1
    ep = []
    rew = []
    for i_episode in range(80000):
        observation = env.reset()
        accumulatedReward = 0
        for t in range(10000):
            #Render environment
            #env.render()
            #Select action
            action = epsilonGreedy(epsilon, env, observation, qtab)
            #Perform action
            prevObs = observation
            observation, reward, done, info = env.step(action)
            accumulatedReward += reward
            #Update Q
            oldQ = qtab.getQ(prevObs, action)
            maxCurrQ = qtab.getMaxQ(observation)
            newQ = oldQ + LEARNING_RATE * (reward + DISCOUNT * maxCurrQ - oldQ)
            qtab.setQ(prevObs, action, newQ)
            #Check if episode is done
            if done:
                rewardWindow[i_episode % 100] = accumulatedReward
                ep.append(i_episode)
                rew.append(accumulatedReward)  # plots the raw per-episode reward, not the window average
                break
        #Decrease exploration rate
        epsilon *= 0.9995
        windowSum = sum(rewardWindow)  # total reward over the last 100 episodes
        print(i_episode, " ", windowSum, end='\r')
        if windowSum >= 970:  # stop once the average reaches 9.7 per episode
            break
    plt.plot(ep, rew)
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.title('Taxi Q learning')
    plt.grid(True)
    plt.savefig("qlrn.png")
    plt.show()
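Once training stops, the learned table can be sanity-checked by running the greedy policy with exploration turned off. A minimal sketch under the same old-gym API as the listings; evaluateGreedy is a hypothetical helper, not part of the original code:

def evaluateGreedy(env, qtab, episodes=100):
    """Run the greedy policy (epsilon = 0) and return the mean episode reward."""
    total = 0.0
    for _ in range(episodes):
        observation = env.reset()
        done = False
        while not done:
            action = qtab.getMaxQAction(observation)
            observation, reward, done, info = env.step(action)
            total += reward
    return total / episodes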