Example #1
def test_q_learning_slots():
    """
    Tests that the QLearning implementation successfully finds the slot
    machine with the largest expected reward.
    """
    import gym
    import numpy as np

    from code import QLearning

    np.random.seed(0)

    env = gym.make('SlotMachines-v0',
                   n_machines=10,
                   mean_range=(-10, 10),
                   std_range=(5, 10))
    env.seed(0)
    means = np.array([m.mean for m in env.machines])

    agent = QLearning(epsilon=0.2, discount=0)
    state_action_values, rewards = agent.fit(env, steps=10000)

    assert state_action_values.shape == (1, 10)
    assert len(rewards) == 100
    assert np.argmax(means) == np.argmax(state_action_values)

    states, actions, rewards = agent.predict(env, state_action_values)
    assert len(actions) == 1 and actions[0] == np.argmax(means)
    assert len(states) == 1 and states[0] == 0
    assert len(rewards) == 1
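Taken together, the assertions above pin down the interface the tests expect from code.QLearning: fit(env, steps) returns a Q-table of shape (n_states, n_actions) plus the per-step rewards averaged over 100 equal bins, and predict(env, state_action_values) rolls out a single greedy episode. Below is a minimal sketch consistent with that contract; the learning rate alpha and the exact binning scheme are assumptions, not things the tests specify.

import numpy as np

class QLearning:
    """Minimal sketch inferred from the tests; the graded
    code.QLearning implementation may differ in its details."""

    def __init__(self, epsilon=0.2, discount=0.95, alpha=0.5):
        self.epsilon = epsilon    # exploration probability
        self.discount = discount  # future-reward discount factor
        self.alpha = alpha        # learning rate (never set by the tests)

    def fit(self, env, steps=1000):
        Q = np.zeros((env.observation_space.n, env.action_space.n))
        # The tests assert len(rewards) == 100, so average the reward
        # over 100 equal bins of steps.
        bin_size = max(steps // 100, 1)
        rewards = np.zeros(100)
        s = env.reset()
        for step in range(steps):
            if np.random.random() < self.epsilon:
                a = env.action_space.sample()  # explore
            else:
                a = np.argmax(Q[s])            # exploit
            s_next, r, done, _ = env.step(a)
            # Standard one-step Q-learning update.
            Q[s, a] += self.alpha * (r + self.discount * np.max(Q[s_next]) - Q[s, a])
            rewards[min(step // bin_size, 99)] += r / bin_size
            s = env.reset() if done else s_next
        return Q, rewards

    def predict(self, env, state_action_values):
        # One purely greedy episode; record the successor state,
        # the action, and the reward at each step.
        states, actions, rewards = [], [], []
        s, done = env.reset(), False
        while not done:
            a = np.argmax(state_action_values[s])
            s, r, done, _ = env.step(a)
            states.append(s)
            actions.append(a)
            rewards.append(r)
        return np.array(states), np.array(actions), np.array(rewards)

With discount=0, as in this test, the update collapses to an exponential moving average of immediate rewards, which is all a single-state slot-machine environment needs.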
Example #2
def test_q_learning_deterministic():
    """
    Tests that the QLearning implementation successfully navigates a
    deterministic environment using provided state-action values.
    """
    import gym
    import numpy as np

    from code import QLearning

    np.random.seed(0)

    env = gym.make('FrozenLakeNoSlippery-v0')
    env.seed(0)

    agent = QLearning(epsilon=0.5, discount=0.95)
    state_action_values = np.array([[0.0, 0.7, 0.3, 0.0], [0.0, 1.0, 0.0, 0.0],
                                    [0.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 0.0],
                                    [0.0, 0.51, 0.49, 0.0],
                                    [0.0, 0.0, 0.0, 0.0], [0.5, 0.0, 0.5, 0.0],
                                    [0.0, 0.0, 0.0, 0.0], [0.0, 0.2, 0.8, 0.0],
                                    [0.0, 0.2, 0.8, 0.0], [0.0, 0.6, 0.4, 0.0],
                                    [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0],
                                    [1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0],
                                    [0.0, 0.0, 0.0, 0.0]])

    states, actions, rewards = agent.predict(env, state_action_values)
    assert np.all(states == np.array([4, 8, 9, 10, 14, 15]))
    assert np.all(actions == np.array([1, 1, 2, 2, 1, 2]))
    assert np.all(rewards == np.array([0, 0, 0, 0, 0, 1]))
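FrozenLakeNoSlippery-v0 is not a built-in gym id, so the test suite has to register it before gym.make can find it. A typical registration, assuming the standard 4x4 map (the entry point and kwargs here are an educated guess, not confirmed by the tests):

from gym.envs.registration import register

register(
    id='FrozenLakeNoSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name': '4x4', 'is_slippery': False},
)

With is_slippery=False every action moves the agent deterministically, which is why the test can assert an exact sequence of visited states from a single greedy rollout.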
Example #3
def test_q_learning_frozen_lake():
    """
    Tests that the QLearning implementation successfully learns the
    FrozenLake-v0 environment.
    """
    import gym
    import numpy as np

    from code import QLearning

    np.random.seed(0)

    env = gym.make('FrozenLake-v0')
    env.seed(0)

    agent = QLearning(epsilon=0.2, discount=0.95)
    state_action_values, rewards = agent.fit(env, steps=10000)

    state_values = np.max(state_action_values, axis=1)

    assert state_action_values.shape == (16, 4)
    assert len(rewards) == 100

    assert np.allclose(state_values[np.array([5, 7, 11, 12, 15])], np.zeros(5))
    assert np.all(state_values[np.array([0, 1, 2, 3, 4, 6, 8, 9, 10, 13, 14])] > 0)
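The last two assertions single out the terminal cells of the standard 4x4 map, which is easy to confirm:

import gym

env = gym.make('FrozenLake-v0')
print(env.unwrapped.desc)
# [[b'S' b'F' b'F' b'F']
#  [b'F' b'H' b'F' b'H']
#  [b'F' b'F' b'F' b'H']
#  [b'H' b'F' b'F' b'G']]

Reading the grid row-major, the holes land on states 5, 7, 11, and 12 and the goal on state 15. All five are terminal, so no action taken from them ever collects future reward and their learned values stay at zero, while every non-terminal state can eventually reach the goal and picks up a positive value.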
Example #4
import gym
import numpy as np
from code import MultiArmedBandit
from code import QLearning
import matplotlib.pyplot as plt

env = gym.make('SlotMachines-v0')
agent1 = MultiArmedBandit()
rewardsarray1 = []
rewardsarray2 = []
agent2 = QLearning()
for i in range(10):
    action_values1, rewards1 = agent1.fit(env, steps=100000)
    rewardsarray1.append(rewards1)
    action_values2, rewards2 = agent2.fit(env, steps=100000)
    rewardsarray2.append(rewards2)

rewardsarray1 = np.asarray(rewardsarray1)
rewardsarray2 = np.asarray(rewardsarray2)
# Keep the first five trials to compare a 5-trial average against the
# full 10-trial average.
rewardsarray1half = rewardsarray1[:5]
rewardsarray2half = rewardsarray2[:5]

halfmean1 = np.mean(rewardsarray1half, axis=0)
halfmean2 = np.mean(rewardsarray2half, axis=0)
fullmean1 = np.mean(rewardsarray1, axis=0)
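The listing above cuts off after fullmean1. A plausible continuation, mirroring the plotting code of Example #5 below and reusing the snippet's own variable names (the label text is a guess at the author's intent):

fullmean2 = np.mean(rewardsarray2, axis=0)
x_vals = np.arange(100)

plt.plot(x_vals, halfmean1, label="MultiArmedBandit, mean of 5 trials")
plt.plot(x_vals, fullmean1, label="MultiArmedBandit, mean of 10 trials")
plt.plot(x_vals, halfmean2, label="QLearning, mean of 5 trials")
plt.plot(x_vals, fullmean2, label="QLearning, mean of 10 trials")
plt.ylabel("Average reward on SlotMachines")
plt.xlabel("reward bin")
plt.legend()
plt.show()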
Example #5
import gym
import numpy as np
from code import QLearning
import matplotlib.pyplot as plt

np.random.seed(0)
env = gym.make('FrozenLake-v0')
env.seed(0)

agent1 = QLearning(epsilon=0.2, discount=0.95)
agent2 = QLearning(epsilon=0.5, discount=0.95)
rewardsarray1 = []
rewardsarray2 = []
for i in range(10):
    action_values1, rewards1 = agent1.fit(env, steps=100000)
    rewardsarray1.append(rewards1)
    action_values2, rewards2 = agent2.fit(env, steps=100000)
    rewardsarray2.append(rewards2)

rewardsarray1 = np.asarray(rewardsarray1)
rewardsarray2 = np.asarray(rewardsarray2)
fullmean1 = np.mean(rewardsarray1, axis=0)
fullmean2 = np.mean(rewardsarray2, axis=0)
x_vals = np.arange(100)

plt.plot(x_vals, fullmean1, label="Average reward, epsilon=0.2")
plt.plot(x_vals, fullmean2, label="Average reward, epsilon=0.5")
plt.ylabel("Average reward (QLearning on FrozenLake)")
plt.xlabel("reward bin")
plt.legend()
plt.show()