trail_results = np.zeros((trail, epoch)) for x in range(trail): trail_results[x] = sarsa_cartpole(lr, baseparams, epoch=epoch, eps=eps, base=base) # (epoch, ) std_error = np.std(trail_results, axis=0) mean_rewards = np.mean(trail_results, axis=0) return mean_rewards, std_error def draw_plot(data, error, epoch=100, filename='tests.png'): fig, ax = plt.subplots() plt.xlabel('episode') plt.ylabel('reward') ax.errorbar(np.array(range(epoch)), data, yerr=error, fmt='o') plt.savefig(filename, dpi=200) plt.show() rewards, error = sarsa_grid_trail(0.5, lambda x: 5, trail=100) ah.save_cp_csvdata(rewards, error, 'sarsa_grid_f_1.csv') draw_plot(rewards, error, filename='testsarsagridf.png') # easy to go inf; should be careful when tuning; difficult to converge; chaos at the first several loops rewards, err = sarsa_cp_trail(5e-4, {'order': 3}, trail=100, eps=0.05) ah.save_cp_csvdata(rewards, err, 'sarsa_cartpole_f_1.csv') draw_plot(rewards, err, filename='testsarsacpf.png')
trail_results[x] = qlearning_cartpole(lr, baseparams, decaylambda, epoch=epoch, base=base) # (epoch, ) std_error = np.std(trail_results, axis=0) mean_rewards = np.mean(trail_results, axis=0) return mean_rewards, std_error def draw_plot(data, error, epoch=100, filename='tests.png'): fig, ax = plt.subplots() plt.xlabel('episode') plt.ylabel('reward') ax.errorbar(np.array(range(epoch)), data, yerr=error, fmt='o') plt.savefig(filename, dpi=200) plt.show() rewards, error = qlearning_grid_trail(0.5, lambda x: 5, trail=100) ah.save_cp_csvdata(rewards, error, 'qlearning_grid_f_1.csv') draw_plot(rewards, error, filename='testqgridf.png') # easy to go inf; should be careful when tuning; difficult to converge; chaos at the first several loops rewards, err = qlearning_cp_trail(2e-3, {'order': 3}, lambda x: 0.1 * (0.8**(x - 1)) + 0.01, trail=100) ah.save_cp_csvdata(rewards, err, 'q_cartpole_f.csv') draw_plot(rewards, err, filename='testqcpf.png')
trail_results[x] = qlearning_mountaincar(lr, baseparams, eps, epoch=epoch, base=base) # (epoch, ) std_error = np.std(trail_results, axis=0) mean_rewards = np.mean(trail_results, axis=0) return mean_rewards, std_error def draw_plot(data, err, epoch=100, filename='testq.png'): fig, ax = plt.subplots() plt.xlabel('episode') plt.ylabel('reward') ax.errorbar(np.array(range(epoch)), data, yerr=err, fmt='o') plt.savefig(filename, dpi=200) plt.show() rewards, err = sarsa_mc_trail(2e-2, {'order': 5}, trail=100, eps=lambda x: 0.01) ah.save_cp_csvdata(rewards, err, 'sarsa_mountaincar.csv') draw_plot(rewards, err, filename='testsarsamc.png') rewards, err = qlearning_mc_trail(1e-2, {'order': 5}, trail=100, eps=lambda x: 0.2) ah.save_cp_csvdata(rewards, err, 'qlearning_mountaincar.csv') draw_plot(rewards, err, filename='testqlearningmc.png')
dtheta[:, idx * phi.shape[1]:(idx + 1) * phi.shape[1]] = (1 - pi[idx]) * phi else: dtheta[:, idx * phi.shape[1]:(idx + 1) * phi.shape[1]] = -pi[idx] * phi return dtheta def draw_plot(data, error, epoch=100, filename='tests.png'): plt.xlabel('episode') plt.ylabel('reward') plt.plot(np.array(range(epoch)), data) plt.fill_between(range(epoch), data - error, data + error, alpha=0.3) plt.savefig(filename, dpi=200) plt.show() rewards, error = reinforce_grid_trail(0.047266, lambda x: 2, epoch=200) # 0.097866 ah.save_cp_csvdata(rewards, error, 'rf_grid_alt.csv') draw_plot(rewards, error, filename='rf_grid.png', epoch=200) rewards, err = reinforce_mc_trail(1.675643e-3, 2.124e-3, 0.8, {'order': 7}, lambda x: 0.5, trail=100) ah.save_cp_csvdata(rewards, err, 'rf_mc.csv') draw_plot(rewards, err, filename='rf_mc.png')
def draw_plot(data, error, epoch=100, filename='tests.png'):
    """Plot per-episode rewards with error bars, save the figure, then show it.

    Args:
        data: y-values to plot, one per episode.
        error: error-bar half-heights, passed to ``errorbar`` as ``yerr``.
        epoch: number of x positions (0 .. epoch-1); expected to match the
            length of ``data``.
        filename: path the figure is saved to (200 dpi).
    """
    figure, axes = plt.subplots()
    axes.set_xlabel('episode')
    axes.set_ylabel('reward')
    episodes = np.arange(epoch)
    axes.errorbar(episodes, data, yerr=error, fmt='o')
    figure.savefig(filename, dpi=200)
    plt.show()


# Earlier experiment configurations, kept for reference.
# rewards, error = sarsa_grid_trail(1e-2, lambda x: 0.3 if x < 80 else 0.01, trail=100)
# ah.save_cp_csvdata(rewards, error, 'sarsa_grid_f_1.csv')
# draw_plot(rewards, error, filename='testsarsagridf.png')
#
# rewards, err = sarsa_cp_trail(8e-3, {'order': 3}, trail=100)
# ah.save_cp_csvdata(rewards, err, 'sarsa_cartpole_f_1.csv')
# draw_plot(rewards, err, filename='testsarsacpf.png')

# rewards, err = (sarsa_cp_trail(1e-2, {'num_tilings': 10, 'tiles_per_tiling': 11}, eps=0.1, base='tile', trail=100))
# ah.save_cp_csvdata(rewards, err, 'sarsa_cartpole_tile.csv')
# draw_plot(rewards, err, filename='testscptile1.png')

# Active run: base='rbf' with order 6 and trail=10; the results are handed to
# ah.save_cp_csvdata and then plotted.
rewards, err = sarsa_cp_trail(1e-3, {'order': 6}, base='rbf', eps=0.01, trail=10)
ah.save_cp_csvdata(rewards, err, 'sarsa_cartpole_rbf.csv')
draw_plot(rewards, err, filename='testsarsacprbf.png')
def sarsa_lambda_mc_trail(lr, l, baseparams, eps, base='fourier', epoch=100, trail=100):
    """Repeat ``sarsa_lambda_mc`` ``trail`` times and aggregate per episode.

    Args:
        lr, l, baseparams, eps: forwarded unchanged to ``sarsa_lambda_mc``
            (presumably learning rate, trace-decay lambda, basis settings and
            exploration schedule -- confirm against ``sarsa_lambda_mc``).
        base: basis identifier forwarded to ``sarsa_lambda_mc``.
        epoch: number of episodes per run; each run must yield ``epoch`` values.
        trail: number of independent runs to aggregate.

    Returns:
        ``(mean_rewards, std_error)``, both of shape ``(epoch,)``. Note that
        ``std_error`` is the standard deviation across runs (``np.std``), not
        the standard error of the mean.
    """
    trail_results = np.zeros((trail, epoch))
    for run in range(trail):
        # One row per run; each run contributes a vector of shape (epoch,).
        trail_results[run] = sarsa_lambda_mc(lr, l, baseparams, eps, epoch=epoch, base=base)
    mean_rewards = np.mean(trail_results, axis=0)
    std_error = np.std(trail_results, axis=0)
    return mean_rewards, std_error


def draw_plot(data, error, epoch=100, filename='tests.png'):
    """Plot the reward curve with a shaded +/- error band, save it, then show it.

    A new figure is created on every call. Drawing on pyplot's implicit
    current figure would let an earlier curve leak into a later image whenever
    ``plt.show()`` leaves the figure open (e.g. on a non-interactive backend).

    Args:
        data: y-values to plot, one per episode.
        error: half-width of the shaded band around ``data``.
        epoch: number of x positions (0 .. epoch-1); expected to match the
            length of ``data``.
        filename: path the figure is saved to (200 dpi).
    """
    # asarray makes the band arithmetic below work for plain sequences as well.
    data = np.asarray(data)
    error = np.asarray(error)
    episodes = np.arange(epoch)

    fig, ax = plt.subplots()
    ax.set_xlabel('episode')
    ax.set_ylabel('reward')
    ax.plot(episodes, data, 'k')
    ax.fill_between(episodes, data - error, data + error, alpha=0.3)
    fig.savefig(filename, dpi=200)
    plt.show()


# Grid run: results are handed to ah.save_cp_csvdata and then plotted.
rewards, error = sarsa_lambda_grid_trail(5e-2, 0.95, lambda x: 0.3 if x < 20 else 0.01, trail=100)
ah.save_cp_csvdata(rewards, error, 'sarsa_grid.csv')
draw_plot(rewards, error, filename='sarsa_grid.png')

# Earlier configuration, kept for reference.
# rewards, err = sarsa_lambda_mc_trail(1e-2, 0.95, {'order': 3}, trail=100, eps=lambda x: 0.2 if x < 80 else 0.01)
# ah.save_cp_csvdata(rewards, err, 'sarsa_mountaincar_25.csv')
# draw_plot(rewards, err, filename='sarsa_mc_25.png')

# Active sarsa_lambda_mc run: order 7, l=0.8, eps drops from 0.3 to 0.01 at x == 20.
rewards, err = sarsa_lambda_mc_trail(1e-2, 0.8, {'order': 7}, trail=100, eps=lambda x: 0.3 if x < 20 else 0.01)
ah.save_cp_csvdata(rewards, err, 'sarsa_mc.csv')
draw_plot(rewards, err, filename='sarsa_mc.png')
print('episode: ', x, ', reward: ', estimated_rewards[x]) return estimated_rewards def qlearning_lambda_mc_trail(lr, l, baseparams, eps, base='fourier', epoch=100, trail=100): trail_results = np.zeros((trail, epoch)) for x in range(trail): trail_results[x] = qlearning_lambda_mc(lr, l, baseparams, eps, epoch=epoch, base=base) # (epoch, ) std_error = np.std(trail_results, axis=0) mean_rewards = np.mean(trail_results, axis=0) return mean_rewards, std_error def draw_plot(data, error, epoch=100, filename='tests.png'): plt.xlabel('episode') plt.ylabel('reward') plt.plot(np.array(range(epoch)), data, 'k') plt.fill_between(range(epoch), data - error, data + error, alpha=0.3) plt.savefig(filename, dpi=200) plt.show() rewards, error = qlearning_lambda_grid_trail(4e-2, 0.8, lambda x: 0.3 if x < 20 else 0.01, trail=100) ah.save_cp_csvdata(rewards, error, 'q_grid_2.csv') draw_plot(rewards, error, filename='q_grid.png') # rewards, err = qlearning_lambda_mc_trail(8e-3, 0.8, {'order': 5}, trail=1, eps=lambda x: 0.1 if x < 20 else 0.01) rewards, err = qlearning_lambda_mc_trail(8e-3, 0.8, {'order': 5}, trail=100, eps=lambda x: 0.1 if x < 20 else 0.01) ah.save_cp_csvdata(rewards, err, 'q_mc.csv') draw_plot(rewards, err, filename='q_mc.png')
fig, ax = plt.subplots() plt.xlabel('episode') plt.ylabel('reward') ax.errorbar(np.array(range(converge_count)), reward_avg, yerr=reward_std, fmt='o') plt.savefig('grid_ce.png', dpi=200) plt.show() toc = time.time() print('running time: ', (toc - tic) / 60, ' mins') return reward_avg, reward_std rewards, err = execute_grid(20, 100) ah.save_cp_csvdata(rewards, err, 'ce_grid.csv') # print('optimized theta: ', grid.pi_params) # theta, cm = cartpole_trail() # print('optimized reward: ', cartpole_evaluate(theta.reshape(4, 2), 50)) # print('optimized theta: ', theta.reshape(4, 2)) # pool = ThreadPoolExecutor(5) # futures = [] # for x in range(5): # futures.append(pool.submit(trail, x))
# trail_num = 3 # converge_count = 250 reward_plt_data = np.zeros((trail_num, converge_count)) for x in range(trail_num): reward_plt_data[x] = np.array(cartpole_trail(converge_count)[2]) reward_std = reward_plt_data.std(0) reward_avg = reward_plt_data.mean(0) fig, ax = plt.subplots() plt.xlabel('episode') plt.ylabel('reward') ax.errorbar(np.array(range(converge_count)), reward_avg, yerr=reward_std, fmt='o') plt.savefig('cartpole_ce.png', dpi=200) plt.show() toc = time.time() print('running time: ', (toc - tic) / 60, ' mins') return reward_avg, reward_std rewards, err = execute_cartpole(20, 100) # rewards_t = np.zeros(100) + 1010 # err_t = np.zeros(100) # rewards_t[: 10] = rewards # err_t[: 10] = err ah.save_cp_csvdata(rewards, err, 'ce_cartpole.csv')
trail_results[x] = actor_critic_mc(lr, l, baseparams, eps, epoch=epoch, base=base) # (epoch, ) std_error = np.std(trail_results, axis=0) mean_rewards = np.mean(trail_results, axis=0) return mean_rewards, std_error def draw_plot(data, error, epoch=100, filename='tests.png'): plt.xlabel('episode') plt.ylabel('reward') plt.plot(np.array(range(epoch)), data, 'k') plt.fill_between(range(epoch), data - error, data + error, alpha=0.3) plt.savefig(filename, dpi=200) plt.show() rewards, error = actor_critic_grid_trail(0.1, lambda x: 2, trail=100) ah.save_cp_csvdata(rewards, error, 'ac_grid.csv') draw_plot(rewards, error, filename='ac_grid.png') rewards, err = actor_critic_mc_trail(1e-2, 0.8, {'order': 5}, lambda x: 0.25, trail=100) ah.save_cp_csvdata(rewards, err, 'ac_mc.csv') draw_plot(rewards, err, filename='ac_mc.png')