# Stationary-bandit epsilon-greedy experiment: run many independent bandits
# in parallel and compare greedy vs. two epsilon values.
# NOTE(review): assumes `args` (list of step counts) and `epsilons`
# (iterable of 3 epsilon values) are defined earlier in the file, and that
# this runs under `if __name__ == "__main__"` or in a notebook -- 'spawn'
# workers re-import the main module.
# force=True: without it, set_start_method raises RuntimeError when the
# start method was already set (e.g. this section is executed twice).
mp.set_start_method('spawn', force=True)
print('Stationary greedy started...')
t1 = time.perf_counter()
with mp.Pool(mp.cpu_count()) as pool:

    def func(x):
        # Run every experiment in `args` for one epsilon; each worker
        # returns per-step rewards and optimal-action flags.
        return np.array(pool.map(EpsGreedy(eps=x).rews_opts_stat, args))

    # get 3 (2000, 2, 1000)-shaped arrays, axis=1 stands for rewards and optimals
    result = [func(eps) for eps in epsilons]
t2 = time.perf_counter()
print(f'Done in {round(t2 - t1, 3)} sec')
# get the average rewards (mean over the 2000 runs -> one curve per epsilon)
rewards = [pair[:, 0, :].mean(axis=0) for pair in result]
# get the percentage of the optimal actions
optimals = [Bandit.percent(pair[:, 1, :]) for pair in result]
# plotting
colors = ('green', 'blue', 'red')
labels = (r'$\varepsilon=0$ (greedy)', r'$\varepsilon=0.1$', r'$\varepsilon=0.01$')
# FIX: `colors` was defined but never used -- pass it to the plots,
# consistent with how the other sections call Bandit.plot.
Bandit.plot(rewards, labels, 'Average reward', colors=colors)
Bandit.plot(optimals, labels, '% Optimal action', colors=colors)
plt.show()
# Final timing report and the parameter-study figure (one curve per
# algorithm family, x-axis shared across the swept parameters).
print(f'done in {round(t2 - t1, 3)} sec')
t3 = time.perf_counter()
print(f'Overall execution time {round(t3 - t0, 3)} sec')

# plotting: one label/color per algorithm family
labels = (
    r'$\varepsilon$-greedy, $\varepsilon$',
    'constant step\n' r'$\varepsilon$-greedy $\alpha=0.1$, $\varepsilon$',
    r'gradient bandit, $\alpha$',
    r'UCB, $c$',
    'optimistic greedy\n' r'$\alpha=0.1, Q_0$',
)
ylabel = 'Average reward over\n last 100 000 steps'
xlabel = r'$\varepsilon, \alpha, c, Q_0$'
colors = ('red', 'purple', 'green', 'blue', 'black')
# x axis values to correspond with parameter slices: each family occupies
# its own sub-range of the shared 0..9 tick positions
x = [list(range(10))[lo:hi] for lo, hi in param_slices.values()]
# plots
ax = Bandit.plot(
    rewards.values(),
    labels,
    ylabel,
    datax=x,
    xlabel=xlabel,
    colors=colors,
    fig_size=(15, 8),
)
plt.xticks(range(10), x_ticks)
plt.show()
# UCB vs. epsilon-greedy on the stationary testbed: average reward per
# step over `runs` independent bandit experiments.
runs = int(2e3)  # the number of different bandit experiments
steps = int(1e3)  # number of learning iterations in a single experiment
args = [steps] * runs
# comment this line if run on windows or OS X (default method)
# force=True: without it, set_start_method raises RuntimeError when the
# start method was already set (e.g. this section is executed twice).
mp.set_start_method('spawn', force=True)
print('Start upper confidence bound...')
t1 = time.perf_counter()
with mp.Pool(mp.cpu_count()) as pool:
    ucb = np.array(pool.map(UCB(c=2).rewards_stat, args))
    greedy = np.array(pool.map(EpsGreedy(eps=0.1).rewards_stat, args))
t2 = time.perf_counter()
print(f'Done in {round(t2 - t1, 3)} sec')
# get the averages (mean over runs -> one curve of length `steps` each)
ucb = ucb.mean(axis=0)
greedy = greedy.mean(axis=0)
# plot
labels = (r'UCB, $c=2$', r'$\varepsilon$-greedy, $\varepsilon=0.1$')
Bandit.plot((ucb, greedy), labels, 'Average reward', colors=('blue', 'grey'))
plt.show()
# Gradient bandit with/without reward baseline, two step sizes each.
# NOTE(review): relies on `pool`, `args`, `t1` from the surrounding code.
# The four configurations, in the order their curves are labeled below.
grad_bandits = (
    GradientBaseline(true_value=4, alpha=0.1),
    GradientBaseline(true_value=4, alpha=0.4),
    GradientNoBaseline(true_value=4, alpha=0.1),
    GradientNoBaseline(true_value=4, alpha=0.4),
)
# One pool.map per configuration, same sequential order as before.
result = [np.array(pool.map(b.optimals_stat, args)) for b in grad_bandits]
t2 = time.perf_counter()
print(f'Done in {round(t2 - t1, 3)} sec')
# turn the per-step optimal-action flags into percentages
result = [Bandit.percent(arr) for arr in result]
# plotting
labels = (
    r'with baseline, $\alpha=0.1$',
    r'with baseline, $\alpha=0.4$',
    r'without baseline, $\alpha=0.1$',
    r'without baseline, $\alpha=0.4$',
)
colors = ('blue', 'cornflowerblue', 'sienna', 'tan')
Bandit.plot(result, labels, '% Optimal action', colors=colors)
plt.show()