        rl_glue.rl_episode(0)
    else:
        # Runs an episode while keeping track of visited states
        state, action = rl_glue.rl_start()
        state_visits[state] += 1
        is_terminal = False
        while not is_terminal:
            # # stop the program
            # line = sys.stdin.readline()
            # print('line=', line)
            # if line == 'q':
            #     sys.exit()
            reward, state, action, is_terminal = rl_glue.rl_step()
            state_visits[state] += 1

    reward_sums.append(rl_glue.rl_return())
    # last_episode_total_reward = rl_glue.rl_return()

    # Per-episode timing; start_time is assumed to be set earlier with the same timer
    end_time = time.perf_counter()
    print("The time of", episode, "episode:", end_time - start_time)
    print('q_table:', rl_glue.agent.q)

all_reward_sums[algorithm].append(reward_sums)
all_state_visits[algorithm].append(state_visits)

# save the learned action values for this algorithm
name = 'results/' + algorithm + '_q_table_r5_e100.npy'
np.save(name, rl_glue.agent.q)

# save results
np.save('results/q_learning_r5_e100.npy', all_reward_sums['Q-learning'])
np.save('results/expected_sarsa_r5_e100.npy', all_reward_sums['Expected Sarsa'])
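The arrays saved above can be reloaded later for offline analysis. A minimal sketch, assuming the files were just written by the np.save calls above and that every run records the same number of episodes (so the saved reward arrays are 2-D); the variable names below are illustrative only:

# Reload results saved by the experiment above (paths match the np.save calls)
q_learning_returns = np.load('results/q_learning_r5_e100.npy')          # shape: (runs, episodes)
expected_sarsa_returns = np.load('results/expected_sarsa_r5_e100.npy')
q_learning_q_table = np.load('results/Q-learning_q_table_r5_e100.npy')  # learned action values

# Example: average return over the last 10 episodes of each run
print('Q-learning, last-10-episode mean return:', q_learning_returns[:, -10:].mean())
print('Expected Sarsa, last-10-episode mean return:', expected_sarsa_returns[:, -10:].mean())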
state_visits = np.zeros(48)
last_episode_total_reward = 0
for episode in range(num_episodes):
    if episode < num_episodes - 10:
        # Runs an episode
        rl_glue.rl_episode(10000)
    else:
        # Runs an episode while keeping track of visited states
        state, action = rl_glue.rl_start()
        state_visits[state] += 1
        is_terminal = False
        while not is_terminal:
            reward, state, action, is_terminal = rl_glue.rl_step()
            state_visits[state] += 1

    reward_sums.append(rl_glue.rl_return() - last_episode_total_reward)
    last_episode_total_reward = rl_glue.rl_return()

all_reward_sums[algorithm].append(reward_sums)
all_state_visits[algorithm].append(state_visits)

# plot results
for algorithm in ["Q-learning", "Expected Sarsa"]:
    plt.plot(np.mean(all_reward_sums[algorithm], axis=0), label=algorithm)

plt.xlabel("Episodes")
plt.ylabel("Sum of\n rewards\n during\n episode", rotation=0, labelpad=40)
plt.xlim(0, 100)
plt.ylim(-30, 0)
plt.legend()
plt.show()
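The state-visit counts gathered in all_state_visits are collected but not plotted above. A minimal sketch of one way to visualize them, assuming the 48 states index a 4x12 grid (as in a Cliff World layout); the subplot arrangement here is illustrative only:

# Heatmap of average state visits over the tracked episodes, one panel per algorithm
for algorithm, position in [("Q-learning", 211), ("Expected Sarsa", 212)]:
    plt.subplot(position)
    average_state_visits = np.array(all_state_visits[algorithm]).mean(axis=0)
    grid_state_visits = average_state_visits.reshape((4, 12))  # assumed rows x columns of the grid world
    plt.pcolormesh(grid_state_visits, edgecolors='gray', linewidth=1)
    plt.title(algorithm)
    plt.axis('off')
plt.colorbar()
plt.show()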