import gym
import numpy as np


def test_q_learning_slots():
    """
    Tests that the QLearning implementation successfully finds the slot
    machine with the largest expected reward.
    """
    from src import QLearning

    np.random.seed(0)
    env = gym.make('SlotMachines-v0', n_machines=10,
                   mean_range=(-10, 10), std_range=(5, 10))
    env.seed(0)
    means = np.array([m.mean for m in env.machines])

    agent = QLearning(epsilon=0.2, discount=0)
    state_action_values, rewards = agent.fit(env, steps=10000)

    assert state_action_values.shape == (1, 10)
    assert len(rewards) == 100
    assert np.argmax(means) == np.argmax(state_action_values)

    states, actions, rewards = agent.predict(env, state_action_values)
    assert len(actions) == 1 and actions[0] == np.argmax(means)
    assert len(states) == 1 and states[0] == 0
    assert len(rewards) == 1
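# For reference, the interface the test above assumes from src.QLearning can be
# sketched as follows. This is a minimal illustration under assumptions, not the
# graded implementation: the `alpha` learning rate, the discrete gym spaces, and
# the episode-reset handling are guesses, while the Q-table shape and the
# 100-bin reward averaging come straight from the assertions.
def fit_sketch(env, epsilon=0.2, discount=0.95, alpha=0.5, steps=1000):
    """One plausible tabular epsilon-greedy Q-learning loop matching the tests."""
    n_states, n_actions = env.observation_space.n, env.action_space.n
    q = np.zeros((n_states, n_actions))  # tests assert shape (n_states, n_actions)
    step_rewards = np.zeros(steps)
    state = env.reset()
    for step in range(steps):
        # Explore with probability epsilon; otherwise act greedily.
        if np.random.random() < epsilon:
            action = np.random.randint(n_actions)
        else:
            action = int(np.argmax(q[state]))
        next_state, reward, done, _ = env.step(action)
        # One-step Q-learning update: Q(s,a) += alpha * (r + gamma * max Q(s',.) - Q(s,a)).
        q[state, action] += alpha * (reward + discount * np.max(q[next_state])
                                     - q[state, action])
        step_rewards[step] = reward
        state = env.reset() if done else next_state
    # Average per-step rewards into 100 equal bins (tests assert len(rewards) == 100);
    # this assumes steps is a multiple of 100.
    rewards = step_rewards.reshape(100, -1).mean(axis=1)
    return q, rewards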
def test_q_learning_deterministic():
    """
    Tests that the QLearning implementation successfully navigates a
    deterministic environment with provided state-action values.
    """
    from src import QLearning

    np.random.seed(0)
    env = gym.make('FrozenLakeNoSlippery-v0')
    env.seed(0)

    agent = QLearning(epsilon=0.5, discount=0.95)
    state_action_values = np.array([
        [0.0, 0.7, 0.3, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.51, 0.49, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.5, 0.0, 0.5, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.2, 0.8, 0.0],
        [0.0, 0.2, 0.8, 0.0],
        [0.0, 0.6, 0.4, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 0.0],
    ])

    states, actions, rewards = agent.predict(env, state_action_values)
    assert np.all(states == np.array([4, 8, 9, 10, 14, 15]))
    assert np.all(actions == np.array([1, 1, 2, 2, 1, 2]))
    assert np.all(rewards == np.array([0, 0, 0, 0, 0, 1]))
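# A greedy predict consistent with both tests might look like the sketch below.
# Note the asserted sequence [4, 8, 9, 10, 14, 15]: the returned states are the
# states observed after each action, excluding the initial state. Everything
# beyond what the assertions pin down is an assumption.
def predict_sketch(env, state_action_values):
    """Greedy rollout: always take argmax_a Q(s, a) until the episode ends."""
    states, actions, rewards = [], [], []
    state, done = env.reset(), False
    while not done:
        action = int(np.argmax(state_action_values[state]))  # greedy, no exploration
        state, reward, done, _ = env.step(action)
        states.append(state)  # post-step state, matching the asserted sequence
        actions.append(action)
        rewards.append(reward)
    return np.array(states), np.array(actions), np.array(rewards)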
from src import MultiArmedBandit, QLearning
import matplotlib.pyplot as plt


def FRQ_2_e():
    env = gym.make('FrozenLake-v0')

    # Average MultiArmedBandit reward curves over 10 independent trials.
    multiarmed_rewards_ls = []
    for i in range(10):
        agent = MultiArmedBandit()
        action_values, multiarmed_rewards = agent.fit(env, steps=100000)
        multiarmed_rewards_ls.append(multiarmed_rewards)
    average_10_trials = np.mean(multiarmed_rewards_ls, axis=0)
    plt.plot(np.linspace(0, 99, 100), average_10_trials)

    # Average QLearning reward curves over 10 independent trials, using the
    # same number of steps so the two curves are directly comparable.
    qlearning_rewards_ls = []
    for i in range(10):
        agent = QLearning()
        action_values, qlearning_rewards = agent.fit(env, steps=100000)
        qlearning_rewards_ls.append(qlearning_rewards)
    average_10_trials = np.mean(qlearning_rewards_ls, axis=0)
    plt.plot(np.linspace(0, 99, 100), average_10_trials)

    plt.legend(('MultiArmedBandit', 'QLearning'), loc='lower right')
    plt.title('FRQ 2e: QLearning vs. MultiArmedBandit rewards')
    plt.xlabel('Steps')
    plt.ylabel('Average reward')
    plt.show()
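# FRQ_2_e and FRQ_3_a repeat the same average-over-10-trials pattern; it could
# be factored into a helper like this (a refactoring sketch, not part of the
# assignment code):
def average_reward_curve(make_agent, env, steps, trials=10):
    """Fit a fresh agent `trials` times and average the 100-bin reward curves."""
    curves = np.zeros((trials, 100))
    for trial in range(trials):
        _, rewards = make_agent().fit(env, steps=steps)
        curves[trial, :] = rewards
    return curves.mean(axis=0)

# Example usage: average_reward_curve(lambda: QLearning(epsilon=0.01), env, steps=100000)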
import torch
from prettytable import PrettyTable


def test_agent(agent, env, n_runs):
    """Tabulates labeling accuracy for each object count, averaged over n_runs episodes."""
    t = PrettyTable()
    t.field_names = ['# objects', 'avg accuracy']
    env.generate_random_nobj = False

    for n_objects in range(1, env.max_CL_objects + 1):
        env.max_episode_objects = n_objects
        correct_label_bools = []
        for n_run in range(n_runs):
            env.reset()
            done = False
            while not done:
                state = torch.Tensor(env.state)
                q_values = QLearning.get_qvalues(state, agent)
                # state_visit_history is assumed to be defined at module scope.
                __, __, done, correct_label = env.step(
                    q_values, env.max_train_iters, state_visit_history)
            # Record one accuracy sample per episode.
            correct_label_bools.append(1 if correct_label else 0)
        t.add_row([n_objects, np.mean(correct_label_bools)])
    print(t)
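# test_agent relies on QLearning.get_qvalues, which is not shown in this file.
# One plausible reading, assuming `agent` is a torch.nn.Module mapping a state
# tensor to per-action Q-values, is the hypothetical stand-in below:
def get_qvalues_sketch(state, agent):
    """Hypothetical stand-in for QLearning.get_qvalues: a no-grad forward pass."""
    with torch.no_grad():
        q_values = agent(state.unsqueeze(0))  # add a batch dimension
    return q_values.squeeze(0)  # per-action Q-values for this single state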
def FRQ_3_a():
    env = gym.make('FrozenLake-v0')

    rewards_001_ls = []
    for i in range(10):
        agent = QLearning(epsilon=0.01)
        action_values, qlearning_rewards = agent.fit(env, steps=100000)
        rewards_001_ls.append(qlearning_rewards)
    average_10_trials = np.mean(rewards_001_ls, axis=0)
    plt.plot(np.linspace(0, 99, 100), average_10_trials)

    rewards_05_ls = []
    for i in range(10):
        agent = QLearning(epsilon=0.5)
        action_values, qlearning_rewards = agent.fit(env, steps=100000)
        rewards_05_ls.append(qlearning_rewards)
    average_10_trials = np.mean(rewards_05_ls, axis=0)
    plt.plot(np.linspace(0, 99, 100), average_10_trials)

    plt.legend(('epsilon=0.01', 'epsilon=0.5'), loc='lower right')
    plt.title('FRQ 3a: rewards for different epsilon values')
    plt.xlabel('Steps')
    plt.ylabel('Average reward')
    plt.show()
def test_q_learning_frozen_lake():
    """
    Tests that the QLearning implementation successfully learns the
    FrozenLake-v0 environment.
    """
    from src import QLearning

    np.random.seed(0)
    env = gym.make('FrozenLake-v0')
    env.seed(0)

    agent = QLearning(epsilon=0.2, discount=0.95)
    state_action_values, rewards = agent.fit(env, steps=10000)
    state_values = np.max(state_action_values, axis=1)

    assert state_action_values.shape == (16, 4)
    assert len(rewards) == 100

    # Holes (5, 7, 11, 12) and the goal (15) are terminal: no action is ever
    # taken from them, so their Q-values are never updated and stay at zero.
    assert np.allclose(state_values[np.array([5, 7, 11, 12, 15])], np.zeros(5))
    assert np.all(
        state_values[np.array([0, 1, 2, 3, 4, 6, 8, 9, 10, 13, 14])] > 0)
import time

import gym
import gym_game
import numpy as np

from src import QLearning

start_time = time.time()

env = gym.make("RoobetCrash-v0")
agent = QLearning(epsilon=0.2, discount=0.6, adaptive=True)
state_action_values, observation, N = agent.fit(env)

env.crash.train_or_test = "test"
agent.predict(env=env, state_action_values=state_action_values,
              observation=observation, N=N)

elapsed_time = time.time() - start_time
print("Time elapsed: ", elapsed_time)
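# QLearning is constructed with adaptive=True here, which presumably decays
# epsilon as training progresses. The schedule below is only one common choice,
# shown as an assumption rather than this repository's actual behavior:
def adaptive_epsilon(initial_epsilon, step, total_steps):
    """Decay epsilon linearly with training progress."""
    progress = step / total_steps  # 0.0 at the start of training, 1.0 at the end
    return (1.0 - progress) * initial_epsilon

# Early in training the agent explores at nearly the full rate; by the final
# step it acts almost entirely greedily.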
import gym
import numpy as np
import matplotlib.pyplot as plt

from src import QLearning

print('Starting example experiment')

env = gym.make('FrozenLake-v0')
epsilon1 = 0.01
epsilon2 = 0.5
steps = 100000
trials = 10
discount = 0.95

reward_matrix_q1 = np.zeros((trials, 100))
reward_matrix_q2 = np.zeros((trials, 100))

for trial in range(trials):
    agent_q1 = QLearning(epsilon=epsilon1, discount=discount)
    agent_q2 = QLearning(epsilon=epsilon2, discount=discount)
    action_values_q1, rewards_q1 = agent_q1.fit(env, steps=steps)
    action_values_q2, rewards_q2 = agent_q2.fit(env, steps=steps)
    reward_matrix_q1[trial, :] = rewards_q1
    reward_matrix_q2[trial, :] = rewards_q2

average_rewards_q1 = np.mean(reward_matrix_q1, axis=0)
average_rewards_q2 = np.mean(reward_matrix_q2, axis=0)

plt.figure()
plt.plot(range(len(average_rewards_q1)), average_rewards_q1,
         color='blue', label='Average rewards of QLearning for epsilon = 0.01')
import gym
import numpy as np
import matplotlib.pyplot as plt

from src import MultiArmedBandit, QLearning

print('Starting example experiment')

env = gym.make('SlotMachines-v0')
steps = 100000
trials = 10

action_matrix_bandits = np.zeros((trials, 10))
reward_matrix_bandits = np.zeros((trials, 100))
action_matrix_q = np.zeros((trials, 10))
reward_matrix_q = np.zeros((trials, 100))

for trial in range(trials):
    agent_bandits = MultiArmedBandit()
    agent_q = QLearning()
    action_values_bandits, rewards_bandits = agent_bandits.fit(env, steps=steps)
    action_values_q, rewards_q = agent_q.fit(env, steps=steps)
    action_matrix_bandits[trial, :] = action_values_bandits
    reward_matrix_bandits[trial, :] = rewards_bandits
    # QLearning returns a (1, n_actions) table for the single-state slot-machine
    # environment (see test_q_learning_slots), so flatten it into a row.
    action_matrix_q[trial, :] = action_values_q.flatten()
    reward_matrix_q[trial, :] = rewards_q

average_5_rewards_bandits = np.mean(reward_matrix_bandits[:5], axis=0)
average_10_rewards_bandits = np.mean(reward_matrix_bandits, axis=0)
average_10_rewards_q = np.mean(reward_matrix_q, axis=0)

plt.figure()
plt.plot(range(len(reward_matrix_bandits[0])), average_10_rewards_bandits)