### PART III: MDP 1 epsilon experiments
epsilon_list = [0.1, 0.25, 0.5, 0.75]
learning_rate = 0.01
epoch_list = []
avg_reward_list = []
for e, epsilon in enumerate(epsilon_list):
    print "Epsilon: {0}".format(epsilon)
    qlearner = QLearner(mdp1, initial_state1, epsilon=epsilon, alpha=learning_rate)
    epoch_list.append(range(num_learning_epochs))
    avg_reward_list.append([])
    for epoch in epoch_list[e]:
        # Train for a batch of learning trials, then measure average
        # simulation reward for this epoch.
        for trial in range(num_learning_trials):
            qlearner.run_learning_trial()
        avg_reward = 0
        for trial in range(num_simulation_trials):
            (total_reward, state_seq, action_seq) = qlearner.run_simulation_trial()
            avg_reward += total_reward
        avg_reward = 1. * avg_reward / num_simulation_trials
        avg_reward_list[e].append(avg_reward)
        print "MDP1 epoch {0}: {1}".format(epoch, avg_reward)
Plot.plot_multiple(epoch_list, avg_reward_list, [str(e) for e in epsilon_list],
                   'epsilon', 'MDP1 Learning: Epsilon', 'mdp1_epsilon_plot.png')
print

### PART III: MDP 1 alpha experiments
epsilon = 0.25
alpha_list = [0.01, 0.05, 0.1, 0.5]  # learning rates to sweep (illustrative values; originals not preserved)
epoch_list = []
avg_reward_list = []
for a, alpha in enumerate(alpha_list):
    print "Alpha: {0}".format(alpha)
    qlearner = QLearner(mdp1, initial_state1, epsilon=epsilon, alpha=alpha)
    epoch_list.append(range(num_learning_epochs))
    avg_reward_list.append([])
    for epoch in epoch_list[a]:
        for trial in range(num_learning_trials):
            qlearner.run_learning_trial()
        avg_reward = 0
        for trial in range(num_simulation_trials):
            (total_reward, state_seq, action_seq) = qlearner.run_simulation_trial()
            avg_reward += total_reward
        avg_reward = 1. * avg_reward / num_simulation_trials
        avg_reward_list[a].append(avg_reward)
        print "MDP1 epoch {0}: {1}".format(epoch, avg_reward)
Plot.plot_multiple(epoch_list, avg_reward_list, [str(a) for a in alpha_list],
                   'alpha', 'MDP1 Learning: Alpha', 'mdp1_alpha_plot.png')
print
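
### For reference: a minimal sketch of the epsilon-greedy step that
### QLearner.run_learning_trial is assumed to perform internally. This is
### an illustrative stand-in, not the QLearner implementation used above;
### the transition/reward callables and the gamma default are assumptions.
import random

def epsilon_greedy_q_step(Q, state, actions, transition, reward,
                          epsilon, alpha, gamma=0.9):
    # Explore with probability epsilon; otherwise act greedily on Q.
    if random.random() < epsilon:
        action = random.choice(actions)
    else:
        action = max(actions, key=lambda act: Q.get((state, act), 0.0))
    next_state = transition(state, action)
    r = reward(state, action, next_state)
    # One-step temporal-difference update toward r + gamma * max_a' Q(s', a').
    best_next = max(Q.get((next_state, act), 0.0) for act in actions)
    old = Q.get((state, action), 0.0)
    Q[(state, action)] = old + alpha * (r + gamma * best_next - old)
    return next_state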