def fig2_3():
    steps = int(1e3)
    runs = int(2e3)
    # comment this line out on Windows or macOS, where 'spawn' is already the default
    mp.set_start_method('spawn')
    print('Optimistic vs realistic started...')
    t1 = time.perf_counter()
    with mp.Pool(mp.cpu_count()) as pool:
        real = np.array(pool.starmap(realistic, [(steps, 0.1, 0.1)] * runs))
        opt = np.array(pool.starmap(optimistic, [(steps, 0.1)] * runs))
    t2 = time.perf_counter()
    print(f'Done in {round(t2 - t1, 3)} sec')
    # percentage of optimal actions
    real = percent(real)
    opt = percent(opt)
    # plotting
    labels = (r'Realistic, $\varepsilon$-greedy' '\n'
              r'$Q_1=0, \varepsilon=0.1$',
              'Optimistic, greedy\n'
              r'$Q_1=5, \varepsilon=0$')
    plot((real, opt), labels, '% Optimal action',
         colors=('grey', 'dodgerblue'))
    plt.show()
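# A minimal sketch of the workers fig2_3 relies on, assuming they implement
# the 10-armed Gaussian testbed of Sutton & Barto (Section 2.6) and return,
# per step, whether the optimal arm was pulled. Only the signatures
# realistic(steps, alpha, eps) and optimistic(steps, alpha) are taken from
# the calls above; _bandit_run, _percent, and their internals are
# illustrative assumptions (np is the module-level numpy import).
def _bandit_run(steps, alpha, eps=0.0, q_init=0.0):
    rng = np.random.default_rng()
    q_true = rng.normal(0, 1, 10)   # true action values q*(a)
    q_est = np.full(10, q_init)     # initial estimates (q_init=5 -> optimistic)
    optimal = np.zeros(steps, dtype=bool)
    for t in range(steps):
        # epsilon-greedy action selection
        a = rng.integers(10) if rng.random() < eps else int(np.argmax(q_est))
        reward = rng.normal(q_true[a], 1)
        q_est[a] += alpha * (reward - q_est[a])   # constant step-size update
        optimal[t] = a == int(np.argmax(q_true))
    return optimal


# realistic(steps, 0.1, 0.1) ~ _bandit_run(steps, 0.1, eps=0.1, q_init=0.0)
# optimistic(steps, 0.1)     ~ _bandit_run(steps, 0.1, eps=0.0, q_init=5.0)
# percent() then plausibly reduces the stacked (runs, steps) flags to the
# per-step share of optimal choices, e.g.:
def _percent(arr):
    return arr.mean(axis=0) * 100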
def fig2_5():
    runs = int(2e3)
    steps = int(1e3)
    # comment this line out on Windows or macOS, where 'spawn' is already the default
    mp.set_start_method('spawn')
    print('Started gradient bandit...')
    t1 = time.perf_counter()
    with mp.Pool(mp.cpu_count()) as pool:
        bl01 = np.array(pool.starmap(grad_bline, [(steps, 0.1)] * runs))
        bl04 = np.array(pool.starmap(grad_bline, [(steps, 0.4)] * runs))
        no_bl01 = np.array(pool.starmap(grad_no_bline, [(steps, 0.1)] * runs))
        no_bl04 = np.array(pool.starmap(grad_no_bline, [(steps, 0.4)] * runs))
    t2 = time.perf_counter()
    print(f'Done in {round(t2 - t1, 3)} sec')
    result = [bl01, bl04, no_bl01, no_bl04]
    # get percentages of optimal actions
    result = [percent(i) for i in result]
    # plotting
    labels = (r'with baseline, $\alpha=0.1$',
              r'with baseline, $\alpha=0.4$',
              r'without baseline, $\alpha=0.1$',
              r'without baseline, $\alpha=0.4$')
    colors = ('blue', 'cornflowerblue', 'sienna', 'tan')
    plot(result, labels, '% Optimal action', colors=colors)
    plt.show()
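# A minimal sketch of the gradient-bandit workers fig2_5 relies on, assuming
# they follow the preference update of Sutton & Barto Eq. 2.12 on a testbed
# shifted to mean +4, as Figure 2.5 prescribes. Only the signatures
# grad_bline(steps, alpha) and grad_no_bline(steps, alpha) come from the
# calls above; _gradient_bandit and its internals are illustrative assumptions.
def _gradient_bandit(steps, alpha, baseline=True):
    rng = np.random.default_rng()
    q_true = rng.normal(4, 1, 10)   # true values centred on +4 (Fig. 2.5)
    h = np.zeros(10)                # action preferences H_t(a)
    avg_reward = 0.0                # incremental average reward (the baseline)
    optimal = np.zeros(steps, dtype=bool)
    for t in range(steps):
        pi = np.exp(h - h.max())
        pi /= pi.sum()              # softmax action probabilities
        a = rng.choice(10, p=pi)
        reward = rng.normal(q_true[a], 1)
        avg_reward += (reward - avg_reward) / (t + 1)
        base = avg_reward if baseline else 0.0
        onehot = np.zeros(10)
        onehot[a] = 1.0
        h += alpha * (reward - base) * (onehot - pi)   # Eq. 2.12 update
        optimal[t] = a == int(np.argmax(q_true))
    return optimal


# grad_bline(steps, a)    ~ _gradient_bandit(steps, a, baseline=True)
# grad_no_bline(steps, a) ~ _gradient_bandit(steps, a, baseline=False)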
def fig2_4():
    runs = int(2e3)   # the number of different bandit experiments
    steps = int(1e3)  # number of learning iterations in a single experiment
    # comment this line out on Windows or macOS, where 'spawn' is already the default
    mp.set_start_method('spawn')
    print('Upper confidence bound started...')
    t1 = time.perf_counter()
    with mp.Pool(mp.cpu_count()) as pool:
        rewards_ucb = np.array(pool.starmap(ucb, [(steps, 2)] * runs))
        rewards_greedy = np.array(
            pool.starmap(eps_greedy, [(steps, 0.1)] * runs))
    t2 = time.perf_counter()
    print(f'Done in {round(t2 - t1, 3)} sec')
    # get the average reward per step across runs
    rewards_ucb = rewards_ucb.mean(axis=0)
    rewards_greedy = rewards_greedy.mean(axis=0)
    # plot
    labels = (r'UCB, $c=2$',
              r'$\varepsilon$-greedy, $\varepsilon=0.1$')
    plot((rewards_ucb, rewards_greedy), labels, 'Average reward',
         colors=('blue', 'grey'))
    plt.show()
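# A minimal sketch of the UCB worker fig2_4 relies on; judging by the plain
# mean(axis=0) above, ucb(steps, c) and eps_greedy(steps, eps) return the
# per-step rewards of a single run. Action selection follows Sutton & Barto
# Eq. 2.10, treating untried arms as maximizing; _ucb_run and its internals
# are illustrative assumptions.
def _ucb_run(steps, c):
    rng = np.random.default_rng()
    q_true = rng.normal(0, 1, 10)
    q_est = np.zeros(10)
    counts = np.zeros(10)
    rewards = np.zeros(steps)
    for t in range(steps):
        untried = np.flatnonzero(counts == 0)
        if untried.size:   # every arm is tried once before the bonus applies
            a = int(untried[0])
        else:
            a = int(np.argmax(q_est + c * np.sqrt(np.log(t + 1) / counts)))
        reward = rng.normal(q_true[a], 1)
        counts[a] += 1
        q_est[a] += (reward - q_est[a]) / counts[a]   # sample-average update
        rewards[t] = reward
    return rewards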
            print(f'{x}', end=' ')
            # mean reward per step across all runs; the worker function is
            # looked up by name at module scope
            arr = np.array(pool.starmap(globals()[method],
                                        [(steps, param)] * runs)).mean(axis=0)
            # overall mean reward for the last 100 000 steps
            rewards[method].append(arr[100000:].mean())
        t2 = time.perf_counter()
        print(f'done in {round(t2 - t1, 3)} sec')
    t3 = time.perf_counter()
    print(f'Overall execution time {round(t3 - t0, 3)} sec')
    # plotting
    # labels and colors
    labels = (r'$\varepsilon$-greedy, $\varepsilon$',
              'constant step\n' r'$\varepsilon$-greedy $\alpha=0.1$, $\varepsilon$',
              r'gradient bandit, $\alpha$',
              r'UCB, $c$',
              'optimistic greedy\n' r'$\alpha=0.1, Q_0$')
    ylabel = 'Average reward over\n last 100 000 steps'
    xlabel = r'$\varepsilon, \alpha, c, Q_0$'
    colors = ('red', 'purple', 'green', 'blue', 'black')
    # x axis values to correspond with parameter slices
    x = [list(range(10)[start:stop]) for (start, stop) in param_slices.values()]
    # plots
    ax = plot(rewards.values(), labels, ylabel, datax=x, xlabel=xlabel,
              colors=colors, fig_size=(15, 8))
    plt.xticks(range(10), x_ticks)
    plt.show()
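# The fragment above expects param_slices, x_ticks, rewards, t0, and the loop
# variables to be set up in its elided beginning. For orientation only:
# Figure 2.6 sweeps every algorithm over a slice of a shared parameter axis
# 1/128 ... 4 in powers of two, so the setup plausibly looks like the
# hypothetical sketch below; the exact slice bounds are assumptions, not the
# repo's values.
x_ticks = [r'$2^{%d}$' % e for e in range(-7, 3)]   # 1/128 ... 4
param_slices = {'eps_greedy': (0, 6),               # epsilon: 1/128 ... 1/4
                'constant_eps_greedy': (0, 6),
                'grad_bline': (2, 10),              # alpha: 1/32 ... 4
                'ucb': (3, 10),                     # c: 1/16 ... 4
                'optimistic_greedy': (5, 10)}       # Q_0: 1/4 ... 4
rewards = {method: [] for method in param_slices}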
    t1 = time.perf_counter()
    with mp.Pool(mp.cpu_count()) as pool:
        sample_av = np.array(
            pool.starmap(sample_average, [(steps, 0.1)] * runs))
        const_step = np.array(
            pool.starmap(constant_step, [(steps, 0.1, 0.1)] * runs))
        # got (2000, 2, 1000)-shaped arrays, axis=1 stands for rewards and optimals
    t2 = time.perf_counter()
    print(f'Done in {round(t2 - t1, 3)} sec')
    # reshape the arrays to distinguish rewards and optimal actions
    sample_av = np.transpose(sample_av, (1, 0, 2))
    const_step = np.transpose(const_step, (1, 0, 2))
    # get average rewards
    rewards = (sample_av[0].mean(axis=0), const_step[0].mean(axis=0))
    # get optimal action percentage
    optimals = (percent(sample_av[1]), percent(const_step[1]))
    # plot
    labels = ('Sample average\n' r'$\varepsilon=0.1$',
              'Constant step-size\n' r'$\varepsilon=0.1, \alpha=0.1$')
    plot(rewards, labels, 'Average reward')
    plot(optimals, labels, '% Optimal action')
    plt.show()
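# A minimal sketch of the workers the fragment above relies on; the
# (2000, 2, 1000) shape noted in the comments means sample_average(steps, eps)
# and constant_step(steps, eps, alpha) each return a (2, steps) array with
# rewards in row 0 and optimal-action flags in row 1. _eps_greedy_run and its
# internals are illustrative assumptions.
def _eps_greedy_run(steps, eps, alpha=None):
    rng = np.random.default_rng()
    q_true = rng.normal(0, 1, 10)
    q_est = np.zeros(10)
    counts = np.zeros(10)
    out = np.zeros((2, steps))   # row 0: rewards, row 1: optimal-action flags
    best = int(np.argmax(q_true))
    for t in range(steps):
        a = rng.integers(10) if rng.random() < eps else int(np.argmax(q_est))
        reward = rng.normal(q_true[a], 1)
        counts[a] += 1
        step = alpha if alpha is not None else 1 / counts[a]   # 1/n -> sample average
        q_est[a] += step * (reward - q_est[a])
        out[0, t] = reward
        out[1, t] = a == best
    return out


# sample_average(steps, 0.1)     ~ _eps_greedy_run(steps, 0.1)
# constant_step(steps, 0.1, 0.1) ~ _eps_greedy_run(steps, 0.1, alpha=0.1)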