# NOTE: the import paths below are assumptions modeled on the `algorithms.*` /
# `arms.*` package layout used by the Softmax demo further down; adjust the
# module names if the actual repo layout differs.
import numpy as np
from numpy import argmax

from arms.bernoulli import BernoulliArm
from algorithms.epsilon_greedy import EpsilonGreedy, AnnealingEpsilonGreedy
from algorithms.softmax import Softmax, AnnealingSoftmax
from algorithms.ucb1 import UCB1
from algorithms.ucb_bayesian import UCB_Bayesian
from algorithms.exp3 import Exp3
from algorithms.thompson_sampling import ThompsonSampling
from plotting import plot_regret, plot_graph, plot_cum_rewards  # assumed helper module


def main():
    # arm1 = NormalArm(0.2, 1)
    # arm2 = NormalArm(0.3, 1)
    # arm3 = NormalArm(0.4, 1)
    # arm4 = NormalArm(0.5, 1)
    # arm5 = NormalArm(0.6, 1)
    # arm6 = NormalArm(0.7, 1)
    # arm7 = NormalArm(0.6, 1)
    # arm8 = NormalArm(0.5, 1)
    # arm9 = NormalArm(0.4, 1)
    # arm10 = NormalArm(0.1, 1)
    arm1 = BernoulliArm(0.2)
    arm2 = BernoulliArm(0.5)
    arm3 = BernoulliArm(0.9)
    arm4 = BernoulliArm(0.4)
    arm5 = BernoulliArm(0.4)
    arm6 = BernoulliArm(0.3)
    arm7 = BernoulliArm(0.2)
    arm8 = BernoulliArm(0.1)
    arms = [arm1, arm2, arm3, arm4, arm5, arm6, arm7, arm8]
    max_mu = max([arm.mu for arm in arms])
    n_arms = len(arms)
    print(f'Optimal arm: #{argmax([arm.mu for arm in arms]) + 1}')

    param_dict = {"epsilon": 0.5, "sigma": 1.96, "tau": 0.2, "gamma": 0.2}
    algo_epsilon = EpsilonGreedy(n_arms, param_dict)
    algo_anneal_epsilon = AnnealingEpsilonGreedy(n_arms, param_dict)
    algo_ucb1 = UCB1(n_arms, param_dict)
    algo_ucb_bayesian = UCB_Bayesian(n_arms, param_dict)  # sigma = 1.96 -> 95% confidence
    algo_softmax = Softmax(n_arms, param_dict)
    algo_anneal_softmax = AnnealingSoftmax(n_arms, param_dict)
    algo_exp3 = Exp3(n_arms, param_dict)
    algo_thompson = ThompsonSampling(n_arms, param_dict)

    algorithms = [algo_ucb1]
    algorithm_rewards = []         # list[algo][t]: running avg. reward per algo at time-step t
    algorithm_cum_rewards = []     # list[algo][t]: cumulative reward per algo at time-step t
    algorithm_arm_selections = []  # list[algo][t]: arm selected per algo at time-step t

    timesteps = 1000     # number of time-steps (T)
    total_iteration = 1  # outer loop: independent runs averaged together

    for algo in algorithms:
        print(algo.get_name())
        # per-timestep rewards summed across iterations (reset for each algorithm)
        algorithm_timestep_reward_stacked = np.zeros(timesteps, dtype=int)
        for i in range(total_iteration):
            algo.initialize(n_arms)  # reinitialize the algorithm (clear previous memory)
            # reset per-run traces; only the last iteration's traces are kept
            avg_rewards, cum_rewards, arm_selections = [0], [0], []
            for t in range(timesteps):  # t is 0-based; step t is the (t + 1)-th pull
                chosen_arm = algo.select_arm()
                arm_selections.append(chosen_arm + 1)  # convert 0-based index to 1-based
                reward = arms[chosen_arm].draw_reward()
                algorithm_timestep_reward_stacked[t] += reward  # accumulates across iterations
                # incremental running mean over the t + 1 rewards seen so far
                new_avg = (avg_rewards[-1] * t + reward) / (t + 1)
                avg_rewards.append(new_avg)
                cum_rewards.append(new_avg * (t + 1))  # cumulative reward = mean * number of pulls
                algo.update(chosen_arm, reward)
        algorithm_rewards.append(avg_rewards)
        algorithm_cum_rewards.append(cum_rewards)
        algorithm_arm_selections.append(arm_selections)

        # Average the stacked rewards over iterations:
        # squash total_iteration x timesteps -> 1 x timesteps
        average_reward_in_each_round = np.zeros(timesteps, dtype=float)
        for t in range(timesteps):
            average_reward_in_each_round[t] = float(
                algorithm_timestep_reward_stacked[t]) / float(total_iteration)

        cumulative_optimal_reward = 0.0
        cumulative_reward = 0.0
        x_axis = np.zeros(timesteps, dtype=int)
        regrets = np.zeros(timesteps, dtype=float)  # cumulative regret after each round
        for t in range(timesteps):
            x_axis[t] = t
            cumulative_optimal_reward += max_mu
            cumulative_reward += average_reward_in_each_round[t]
            regrets[t] = cumulative_optimal_reward - cumulative_reward

        plot_regret(x_axis, regrets, cumulative_optimal_reward, cumulative_reward,
                    average_reward_in_each_round, timesteps, algo.get_name())
        print(f"The average cumulative regret for {algo.get_name()} is "
              f"{cumulative_optimal_reward - cumulative_reward}")

    max_cum_reward = max(
        [algorithm_cum_rewards[i][-1] for i in range(len(algorithms))])
    for i in range(len(algorithms)):
        print(f"{algorithms[i].get_name()}: {algorithm_cum_rewards[i][-1]:.2f}")

    plot_graph(timesteps, arms, algorithms, algorithm_rewards,
               algorithm_cum_rewards, algorithm_arm_selections, max_mu,
               max_cum_reward)
    plot_cum_rewards(algorithms, algorithm_cum_rewards, timesteps,
                     max_cum_reward)
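
# For reference, a minimal sketch of the arm interface the drivers in this repo
# assume: an `mu` attribute (expected reward) plus a `draw_reward()` method.
# This is an illustration inferred from how the drivers use the arms, not the
# repo's actual arms/bernoulli.py or arms/normal.py.
import random


class BernoulliArm:
    """Arm paying 1 with probability mu, else 0."""

    def __init__(self, mu):
        self.mu = mu  # expected reward; the driver uses this to find the optimal arm

    def draw_reward(self):
        return 1 if random.random() < self.mu else 0


class NormalArm:
    """Arm paying a Gaussian reward with mean mu and standard deviation sigma."""

    def __init__(self, mu, sigma):
        self.mu = mu
        self.sigma = sigma

    def draw_reward(self):
        return random.gauss(self.mu, self.sigma)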
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt

from algorithms.softmax import Softmax
from arms.bernoulli import BernoulliArm
from tests.test_framework import test_algorithm

algo = Softmax(0.1, 5)  # assumed (temperature, n_arms)
means = [0.1, 0.1, 0.1, 0.1, 0.9]
arms = [BernoulliArm(mu) for mu in means]  # a list, not map(): the framework may iterate it more than once
times, chosen_arms, rewards, cumulative_rewards = test_algorithm(algo, arms, 500)

# Accuracy of the Softmax algorithm: running fraction of pulls that chose the
# best arm (index 4, mu = 0.9)
best_arms = [0.0 for _ in range(len(times))]
for t in times:
    if chosen_arms[t-1] == 4:
        if t == 1:
            best_arms[t-1] = 1.0
        else:
            best_arms[t-1] = 1.0 * (best_arms[t-2] * (t-1) + 1) / t
    else:
        if t == 1:
            best_arms[t-1] = 0.0
        else:
            best_arms[t-1] = 1.0 * best_arms[t-2] * (t-1) / t

plt.subplot(221)
plt.plot(times, best_arms)
plt.grid()

# Performance of the Softmax algorithm: running average of the observed rewards
# (this loop was truncated in the source; the body follows the same running-average
# pattern as best_arms above)
average_rewards = [0.0 for _ in range(len(times))]
for t in times:
    if t == 1:
        average_rewards[t-1] = rewards[t-1]
    else:
        average_rewards[t-1] = (average_rewards[t-2] * (t-1) + rewards[t-1]) / t

plt.subplot(222)  # assumed to mirror the accuracy plot above
plt.plot(times, average_rewards)
plt.grid()
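
# A minimal sketch of the contract tests/test_framework.test_algorithm appears
# to satisfy, inferred from the unpacking above: run the algorithm for `horizon`
# steps and return 1-based step indices, 0-based chosen arms, per-step rewards,
# and cumulative rewards. This illustrates the assumed contract; it is not the
# repo's actual implementation, hence the `_sketch` suffix.
def test_algorithm_sketch(algo, arms, horizon):
    times, chosen_arms, rewards, cumulative_rewards = [], [], [], []
    total = 0.0
    for t in range(1, horizon + 1):
        arm = algo.select_arm()           # 0-based arm index
        reward = arms[arm].draw_reward()  # sample a reward from the chosen arm
        algo.update(arm, reward)          # feed the observation back to the algorithm
        total += reward
        times.append(t)
        chosen_arms.append(arm)
        rewards.append(reward)
        cumulative_rewards.append(total)
    return times, chosen_arms, rewards, cumulative_rewards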
# A second driver variant; it relies on the same (assumed) imports as the one above.
def main():
    # arm1 = NormalArm(0.2, 1)
    # arm2 = NormalArm(0.3, 1)
    # arm3 = NormalArm(0.4, 1)
    # arm4 = NormalArm(0.5, 1)
    # arm5 = NormalArm(0.6, 1)
    # arm6 = NormalArm(0.7, 1)
    # arm7 = NormalArm(0.6, 1)
    # arm8 = NormalArm(0.5, 1)
    # arm9 = NormalArm(0.4, 1)
    # arm10 = NormalArm(0.1, 1)
    arm1 = BernoulliArm(0.2)
    arm2 = BernoulliArm(0.5)
    arm3 = BernoulliArm(0.9)
    arm4 = BernoulliArm(0.4)
    arm5 = BernoulliArm(0.4)
    arm6 = BernoulliArm(0.3)
    arm7 = BernoulliArm(0.2)
    arm8 = BernoulliArm(0.1)
    arms = [arm1, arm2, arm3, arm4, arm5, arm6, arm7, arm8]
    n_arms = len(arms)
    print(f'Optimal arm: #{argmax([arm.mu for arm in arms]) + 1}')

    change_of_distribution = False

    algo_epsilon = EpsilonGreedy(0.05, n_arms)
    algo_anneal_epsilon = AnnealingEpsilonGreedy(n_arms)
    algo_ucb1 = UCB1(n_arms)
    algo_ucb_bayesian = UCB_Bayesian(1.96, n_arms)  # 95% confidence
    algo_softmax = Softmax(.2, n_arms)
    algo_anneal_softmax = AnnealingSoftmax(n_arms)
    algo_exp3 = Exp3(.2, n_arms)
    algo_thompson = ThompsonSampling(n_arms)

    algorithms = [algo_epsilon, algo_ucb1, algo_thompson]
    algorithm_rewards = []         # list[algo][t]: running avg. reward per algo at time-step t
    algorithm_cum_rewards = []     # list[algo][t]: cumulative reward per algo at time-step t
    algorithm_arm_selections = []  # list[algo][t]: arm selected per algo at time-step t

    timesteps = 5000  # number of time-steps

    for algo in algorithms:
        avg_rewards, cum_rewards, arm_selections = [0], [0], []
        for t in range(1, timesteps):
            if change_of_distribution and t == timesteps // 2:
                # change the reward distributions at half-time
                arms = change_distribution()
                print(f'Optimal arm: {argmax([arm.mu for arm in arms]) + 1}')
                algo.initialize(len(arms))
            chosen_arm = algo.select_arm()
            arm_selections.append(chosen_arm + 1)  # convert 0-based index to 1-based
            reward = arms[chosen_arm].draw_reward()
            new_avg = (avg_rewards[-1] * (t - 1) + reward) / t  # incremental running mean
            avg_rewards.append(new_avg)
            cum_rewards.append(new_avg * t)  # cumulative reward = mean * number of pulls
            algo.update(chosen_arm, reward)
        algorithm_rewards.append(avg_rewards)
        algorithm_cum_rewards.append(cum_rewards)
        algorithm_arm_selections.append(arm_selections)

    max_mu = max([arm.mu for arm in arms])
    max_cum_reward = max(
        [algorithm_cum_rewards[i][-1] for i in range(len(algorithms))])
    for i in range(len(algorithms)):
        print(f"{algorithms[i].get_name()}: {algorithm_cum_rewards[i][-1]:.2f}")

    plot_graph(timesteps, arms, algorithms, algorithm_rewards,
               algorithm_cum_rewards, algorithm_arm_selections, max_mu,
               max_cum_reward, change_of_distribution)
    plot_cum_rewards(algorithms, algorithm_cum_rewards, timesteps,
                     max_cum_reward)
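
# `change_distribution()` is called above but not defined in this section. A
# minimal sketch of what it is assumed to return: a fresh list of Bernoulli arms
# with different means, so the optimal arm moves at half-time. The means below
# are hypothetical, for illustration only.
def change_distribution():
    new_means = [0.1, 0.2, 0.3, 0.9, 0.4, 0.4, 0.5, 0.2]  # hypothetical post-change means
    return [BernoulliArm(mu) for mu in new_means]


if __name__ == '__main__':
    main()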