def test_q_learning_slots():
    """
    Tests that the QLearning implementation successfully finds the slot
    machine with the largest expected reward.
    """
    from code import QLearning

    np.random.seed(0)
    env = gym.make('SlotMachines-v0', n_machines=10,
                   mean_range=(-10, 10), std_range=(5, 10))
    env.seed(0)
    machine_means = np.array([m.mean for m in env.machines])

    learner = QLearning(epsilon=0.2, discount=0)
    q_values, fit_rewards = learner.fit(env, steps=10000)

    # One state, ten actions (one per machine); fit reports 100 reward bins.
    assert q_values.shape == (1, 10)
    assert len(fit_rewards) == 100
    # The greedy action must be the machine with the highest true mean.
    assert np.argmax(machine_means) == np.argmax(q_values)

    visited, chosen, gained = learner.predict(env, q_values)
    assert len(chosen) == 1 and chosen[0] == np.argmax(machine_means)
    assert len(visited) == 1 and visited[0] == 0
    assert len(gained) == 1
def test_q_learning_deterministic():
    """
    Tests that the QLearning implementation successfully navigates a
    deterministic environment with provided state-action-values.
    """
    from code import QLearning

    np.random.seed(0)
    # NOTE(review): 'FrozonLakeNoSlippery-v0' looks misspelled, but it is
    # presumably the id this custom env was registered under — confirm
    # against the registration code before "fixing" it.
    env = gym.make('FrozonLakeNoSlippery-v0')
    env.seed(0)

    agent = QLearning(epsilon=0.5, discount=0.95)

    # Hand-crafted Q-table: one row per grid cell, one column per action.
    q_table = np.array([
        [0.0, 0.7, 0.3, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.51, 0.49, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.5, 0.0, 0.5, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.2, 0.8, 0.0],
        [0.0, 0.2, 0.8, 0.0],
        [0.0, 0.6, 0.4, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ])

    states, actions, rewards = agent.predict(env, q_table)

    # Greedy rollout follows the unique path to the goal cell (state 15).
    assert np.all(states == np.array([4, 8, 9, 10, 14, 15]))
    assert np.all(actions == np.array([1, 1, 2, 2, 1, 2]))
    assert np.all(rewards == np.array([0, 0, 0, 0, 0, 1]))
def test_q_learning_frozen_lake():
    """
    Tests that the QLearning implementation successfully learns the
    FrozenLake-v0 environment.
    """
    from code import QLearning

    np.random.seed(0)
    env = gym.make('FrozenLake-v0')
    env.seed(0)

    learner = QLearning(epsilon=0.2, discount=0.95)
    q_table, reward_curve = learner.fit(env, steps=10000)
    best_per_state = np.max(q_table, axis=1)

    # 16 grid cells x 4 actions; fit reports 100 reward bins.
    assert q_table.shape == (16, 4)
    assert len(reward_curve) == 100

    # Terminal cells (holes and the goal) never get updated, so their
    # values remain zero; every other cell should have positive value.
    terminal_states = np.array([5, 7, 11, 12, 15])
    nonterminal_states = np.array([0, 1, 2, 3, 4, 6, 8, 9, 10, 13, 14])
    assert np.allclose(best_per_state[terminal_states], np.zeros(5))
    assert np.all(best_per_state[nonterminal_states] > 0)
import gym
import numpy as np
from code import MultiArmedBandit
from code import QLearning
import matplotlib.pyplot as plt

# Compare reward curves of a MultiArmedBandit agent and a QLearning agent
# on the slot-machine environment, averaged over 10 independent fits.
# NOTE(review): this chunk ends after fullmean1 — fullmean2 and the
# plotting calls presumably follow outside this view.
env = gym.make('SlotMachines-v0')

agent1 = MultiArmedBandit()
agent2 = QLearning()
# NOTE(review): the same agent objects are refit 10 times; this assumes
# fit() resets internal state each call — confirm in the implementation.

rewardsarray1 = []  # one reward curve per trial, bandit agent
rewardsarray2 = []  # one reward curve per trial, Q-learning agent
for _ in range(10):
    action_values1, rewards1 = agent1.fit(env, steps=100000)
    rewardsarray1.append(rewards1)
    action_values2, rewards2 = agent2.fit(env, steps=100000)
    rewardsarray2.append(rewards2)

rewardsarray1 = np.asarray(rewardsarray1)
rewardsarray2 = np.asarray(rewardsarray2)

# First five trials only (idiomatic slice replaces the manual append loop).
rewardsarray1half = rewardsarray1[:5]
rewardsarray2half = rewardsarray2[:5]

# Average the reward curves across trials (element-wise over the 100 bins).
halfmean1 = np.mean(rewardsarray1half, axis=0)
halfmean2 = np.mean(rewardsarray2half, axis=0)
fullmean1 = np.mean(rewardsarray1, axis=0)
import gym
import numpy as np
from code import QLearning
import matplotlib.pyplot as plt

# Compare Q-learning reward curves on FrozenLake-v0 for two exploration
# rates (epsilon = 0.2 vs 0.5), averaged over 10 fits each.
np.random.seed(0)
env = gym.make('FrozenLake-v0')
env.seed(0)

agent1 = QLearning(epsilon=0.2, discount=0.95)
agent2 = QLearning(epsilon=0.5, discount=0.95)

rewardsarray1 = []  # one reward curve per trial, epsilon = 0.2
rewardsarray2 = []  # one reward curve per trial, epsilon = 0.5
for _ in range(10):
    action_values1, rewards1 = agent1.fit(env, steps=100000)
    rewardsarray1.append(rewards1)
    action_values2, rewards2 = agent2.fit(env, steps=100000)
    rewardsarray2.append(rewards2)

rewardsarray1 = np.asarray(rewardsarray1)
rewardsarray2 = np.asarray(rewardsarray2)

# Average the reward curves across trials (element-wise over the 100 bins).
fullmean1 = np.mean(rewardsarray1, axis=0)
fullmean2 = np.mean(rewardsarray2, axis=0)

x_vals = np.arange(100)
plt.plot(x_vals, fullmean1, label="Rewards Average for epsilon .2")
plt.plot(x_vals, fullmean2, label="Rewards Average for epsilon .5")
plt.ylabel("rewards for QLearner on Frozen Lake")
plt.xlabel("index")
plt.legend()
# Bug fix: `plt.show` was referenced without calling it, so the figure
# was never displayed; it must be invoked as a function.
plt.show()