import numpy as np


def ucb(mean_rewards, horizon):
    num_arms = len(mean_rewards)
    calculated_means = [0] * num_arms
    num_pulls = [0] * num_arms
    ucb_list = [0] * num_arms
    expected_reward = 0
    # sample each arm once so every confidence term below is well-defined
    for arm in range(num_arms):
        reward = pull_arm(mean_rewards[arm])
        num_pulls[arm] += 1
        calculated_means[arm] = reward
        # these initialization pulls consume part of the horizon,
        # so their rewards must count toward the total as well
        expected_reward += reward
    for i in range(num_arms, horizon):
        # calculate the UCB for each arm: the empirical mean plus an
        # exploration bonus that shrinks as the arm is pulled more often
        for arm in range(num_arms):
            p = calculated_means[arm]
            u = num_pulls[arm]
            ucb_list[arm] = p + np.sqrt(2 * np.log(i) / u)
        # pull the arm with the maximum UCB
        arm_idx = np.argmax(ucb_list)
        reward = pull_arm(mean_rewards[arm_idx])
        expected_reward += reward
        # update the running mean of the pulled arm
        calculated_means[arm_idx] = (num_pulls[arm_idx] * calculated_means[arm_idx]
                                     + reward) / (num_pulls[arm_idx] + 1)
        num_pulls[arm_idx] += 1
    ideal_reward = max(mean_rewards) * horizon
    regret = round(ideal_reward - expected_reward, 3)
    return regret
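# None of the strategies here define pull_arm. A minimal sketch, assuming the
# arms pay Bernoulli rewards of 0 or 1, which is consistent with the
# success/failure counting used in thompson_sampling below:
def pull_arm(mean_reward):
    # returns 1 with probability mean_reward, otherwise 0
    return np.random.binomial(1, mean_reward)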
def thompson_sampling(mean_rewards, horizon):
    num_arms = len(mean_rewards)
    expected_reward = 0
    successes = [0] * num_arms
    failures = [0] * num_arms
    betas = [0] * num_arms
    for i in range(horizon):
        for arm in range(num_arms):
            arm_success = successes[arm]
            arm_failure = failures[arm]
            # sample from the Beta posterior with
            # alpha = arm_success + 1, beta = arm_failure + 1
            betas[arm] = np.random.beta(arm_success + 1, arm_failure + 1)
        # pull the arm with the maximum sampled value
        arm_idx = np.argmax(betas)
        reward = pull_arm(mean_rewards[arm_idx])
        if reward == 0:
            # failure occurs with reward 0
            failures[arm_idx] += 1
        else:
            # success occurs with reward 1
            successes[arm_idx] += 1
        expected_reward += reward
    ideal_reward = max(mean_rewards) * horizon
    regret = round(ideal_reward - expected_reward, 3)
    return regret
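# A quick illustration of why the posterior sampling works, with hypothetical
# counts: after 90 successes and 10 failures the posterior is Beta(91, 11),
# whose draws concentrate near the empirical mean 0.9, so a well-explored good
# arm keeps winning the argmax, while a rarely pulled arm still draws a wide
# range of values and gets occasional exploration.
draws = np.random.beta(91, 11, size=5)
print(draws)  # values clustered around ~0.9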
def round_robin(mean_rewards, horizon):
    # pulls all arms in a round-robin manner
    num_arms = len(mean_rewards)
    expected_reward = 0
    for i in range(horizon):
        arm_idx = i % num_arms
        reward = pull_arm(mean_rewards[arm_idx])
        expected_reward += reward
    ideal_reward = max(mean_rewards) * horizon
    regret = round(ideal_reward - expected_reward, 3)
    return regret
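# Round robin pulls every arm equally often, so its expected per-pull reward
# is the plain average of the arm means and its regret grows linearly with
# the horizon. A worked example with hypothetical arm means:
arm_means = [0.2, 0.5, 0.8]  # hypothetical
T = 3000
expected_rr_regret = (max(arm_means) - sum(arm_means) / len(arm_means)) * T
print(expected_rr_regret)  # (0.8 - 0.5) * 3000 = 900.0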
def epsilon_greedy(mean_rewards, epsilon, horizon):
    num_arms = len(mean_rewards)
    calculated_means = [0] * num_arms
    num_pulls = [0] * num_arms
    expected_reward = 0
    for i in range(horizon):
        # with probability epsilon explore, otherwise exploit
        if np.random.random() < epsilon:
            # explore: pick an arm index uniformly at random
            # (choosing a value with np.random.choice and mapping it back via
            # .index() would bias the pick toward the first of any duplicate means)
            arm_idx = np.random.randint(num_arms)
        else:
            # exploit: pick the arm with the highest empirical mean
            arm_idx = np.argmax(calculated_means)
        reward = pull_arm(mean_rewards[arm_idx])
        expected_reward += reward
        # update the running mean of the pulled arm
        calculated_means[arm_idx] = (num_pulls[arm_idx] * calculated_means[arm_idx]
                                     + reward) / (num_pulls[arm_idx] + 1)
        num_pulls[arm_idx] += 1
    ideal_reward = max(mean_rewards) * horizon
    regret = round(ideal_reward - expected_reward, 3)
    return regret
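# A sketch of how the four strategies might be compared; the arm means,
# epsilon, and horizon below are illustrative, not from the original. Note
# that each function reports the regret of a single realized run, so the
# numbers are noisy; averaging over many repeats would give smoother estimates.
if __name__ == "__main__":
    np.random.seed(0)  # reproducible single run
    means = [0.2, 0.5, 0.8]
    horizon = 3000
    print("round robin:   ", round_robin(means, horizon))
    print("epsilon greedy:", epsilon_greedy(means, 0.1, horizon))
    print("UCB:           ", ucb(means, horizon))
    print("Thompson:      ", thompson_sampling(means, horizon))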