def plot_suboptimal_arm(self):
    """Plot pull counts of every arm except ``self.best_arm`` against time.

    For each such arm, one curve is added from the matching column of
    ``self.theoretical_bounds_arm_pulls`` and one curve per entry of
    ``self.algorithms_to_run`` from the matching column of
    ``self.cum_pulls``.  The figure uses a linear x axis and a log y axis.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source;
    # confirm that the per-algorithm curves are meant to be skipped for the
    # best arm as well.
    self.ph.initiate_figure(
        "#Pulls of sub-optimal arms vs Time",
        "Time T",
        "#Pulls",
        x_log=False,
        y_log=True,
    )

    for arm in range(self.K):
        # The column is sliced for every arm, the best one included.
        bound_for_arm = self.theoretical_bounds_arm_pulls[:, arm]
        if arm == self.best_arm:
            continue

        theo_label = mh.stringify(self.true_means[arm]) + " Theo"
        self.ph.add_curve(bound_for_arm, theo_label, arm, 0)

        for algo in range(self.algo_count):
            pulls_of_arm = self.cum_pulls[algo][:, arm]
            algo_label = (mh.stringify(self.true_means[arm])
                          + self.algorithms_to_run[algo][0])
            self.ph.add_curve(pulls_of_arm, algo_label, arm, algo + 1)

    self.ph.plot_curves()
def analyse_suboptimal_arm_pulls(self):
    """Compute the gaps and the theoretical bound on pulls of each arm.

    Sets ``self.best_arm``, fills ``self.deltas[i]`` with the best mean
    minus ``self.true_means[i]``, and stores in
    ``self.theoretical_bounds_arm_pulls`` the product of a log-time column
    with a row of per-arm values from
    ``mh.get_instance_dependent_square_inverses``, shifted by a constant
    from ``rvh.func_of_pi``.
    """
    self.best_arm = mh.get_maximum_index(self.true_means)
    best_mean = self.true_means[self.best_arm]
    for arm in range(self.K):
        self.deltas[arm] = best_mean - self.true_means[arm]

    inverse_sq_gaps = mh.get_instance_dependent_square_inverses(
        self.deltas, self.best_arm)
    offset = rvh.func_of_pi(add=1, power=2, mult=1 / 3)

    # One row per time step 0..T (assuming natural_logarithm keeps the
    # length of its input -- TODO confirm), one column per gap entry.
    log_of_time = rvh.natural_logarithm(np.arange(self.T + 1))
    gap_row = np.reshape(np.array(inverse_sq_gaps), (1, -1))
    log_column = np.reshape(log_of_time, (-1, 1))

    self.theoretical_bounds_arm_pulls = np.dot(log_column, gap_row) + offset
def play_arms(self):
    """Play the arms in growing batches and return the collected rewards.

    Each of the ``self.K`` arms is pulled once first.  Then, for
    ``t = 1 .. mh.ciel_root(self.N)``, the UCBs are revised with the number
    of pulls made so far and the arm with the highest UCB is pulled up to
    ``2t - 1`` times, never pulling once ``self.N`` pulls have been made.

    Returns:
        A list whose first element is the placeholder ``0`` followed by one
        reward per pull, in pull order.
    """
    rewards = [0]
    pulls_done = 0

    # Initial sweep: one pull per arm.
    for arm_number in range(self.K):
        rewards.append(super().pull_arm(arm_number))
        pulls_done += 1

    for t in range(1, mh.ciel_root(self.N) + 1):
        self.revise_ucbs(pulls_done)
        left_in_batch = 2 * t - 1
        chosen_arm = mh.get_maximum_index(self.upper_confidence_bound)

        # The same arm is kept for the whole batch; the UCBs are only
        # revised again at the start of the next batch.
        while left_in_batch > 0 and pulls_done < self.N:
            rewards.append(super().pull_arm(chosen_arm))
            pulls_done += 1
            left_in_batch -= 1

    return rewards
def plot_regret(self):
    """Plot the theoretical regret bound and each algorithm's empirical regret.

    Clears previously added curves, starts a linear-linear figure whose
    title includes the stringified true means, adds
    ``self.cum_regret_theo_bound`` and one curve per entry of
    ``self.cum_regret_empirical``, then renders the figure.
    """
    self.ph.clear_curves()

    means_caption = "True means of arms: " + mh.stringify_list(self.true_means)
    figure_title = "Regret of algorithms vs Time\n" + means_caption
    self.ph.initiate_figure(figure_title, "Time T", "Regret",
                            x_log=False, y_log=False)

    # The third argument starts at 4; values 1-3 are not used here
    # (presumably once taken by reward curves that are no longer plotted).
    self.ph.add_curve(self.cum_regret_theo_bound, "Theoretical Upper Bound", 4)

    for algo in range(self.algo_count):
        regret_curve = self.cum_regret_empirical[algo]
        algo_label = self.algorithms_to_run[algo][0]
        self.ph.add_curve(regret_curve, algo_label, 5 + algo)

    self.ph.plot_curves()
def analyse_common_stats(self):
    """Compute the gaps, the theoretical regret bound and the optimal reward.

    Sets ``self.best_arm``, fills ``self.deltas[i]`` with the best mean
    minus ``self.true_means[i]``, and stores:

    * ``self.cum_regret_theo_bound``: a multiplicative constant times the
      natural log of the time series plus an additive constant, both
      constants coming from ``mh.get_theoretical_constants``;
    * ``self.cum_optimal_reward``: the time series ``0..T`` scaled by the
      mean of the best arm.
    """
    self.best_arm = mh.get_maximum_index(self.true_means)
    best_mean = self.true_means[self.best_arm]
    for arm in range(self.K):
        self.deltas[arm] = best_mean - self.true_means[arm]

    gap_inverse_sum, gap_sum = mh.get_instance_dependent_values(
        self.best_arm, self.deltas)
    scale, offset = mh.get_theoretical_constants(gap_inverse_sum, gap_sum)

    steps = np.arange(self.T + 1)
    log_of_steps = rvh.natural_logarithm(steps)
    self.cum_regret_theo_bound = scale * log_of_steps + offset
    self.cum_optimal_reward = steps * best_mean
def __init__(self, k=10, t=10**6):
    """Set up helpers, store the problem size and create the arms.

    Args:
        k: stored as ``self.K``; also the length of ``self.deltas``.
        t: stored as ``self.T``.

    Both values are forwarded to ``mh.get_arms``, whose two results are
    stored as ``self.true_means`` and ``self.arms``.
    """
    self.ph = PlotHelper()
    self.logger = LogHelper.get_logger(__name__)

    # Problem size: number of arms and time horizon.
    self.K, self.T = k, t
    self.deltas = [0 for _ in range(self.K)]

    arm_means, arm_objects = mh.get_arms(self.K, self.T)
    self.true_means = arm_means
    self.arms = arm_objects
def play_arms(self):
    """Pull every arm once, then always pull the arm with the highest UCB.

    Steps ``1..K`` pull arms ``0..K-1`` in order.  For each later step
    ``t`` up to ``self.T`` the UCBs are revised with ``t`` and the arm with
    the highest UCB is pulled.

    Returns:
        A list whose first element is the placeholder ``0`` followed by one
        reward per pull, in pull order.
    """
    rewards = [0]

    # Initial sweep: one pull per arm.
    for arm_number in range(self.K):
        rewards.append(super().pull_arm(arm_number))

    for t in range(self.K + 1, self.T + 1):
        self.revise_ucbs(t)
        chosen_arm = mh.get_maximum_index(self.upper_confidence_bound)
        rewards.append(super().pull_arm(chosen_arm))

    return rewards
def revise_ucbs(self, t):
    """Recompute ``self.upper_confidence_bound`` for the first ``self.K`` arms.

    Each entry becomes ``mh.textbook_radius(t, pull_count)`` plus the arm's
    ``empirical_mean``.

    Args:
        t: value passed through as the first argument of
            ``mh.textbook_radius``.
    """
    for index in range(self.K):
        arm = self.arms[index]
        radius = mh.textbook_radius(t, arm.pull_count)
        self.upper_confidence_bound[index] = radius + arm.empirical_mean
def test_get_arms(self):
    """Check the sizes returned by ``mh.get_arms`` and the best-arm index.

    Assumes the first argument of ``get_arms`` is the number of arms and
    that ``get_maximum_index`` returns the position of the largest value
    -- confirm against the helper module.
    """
    arm_count = 10
    true_means, arms = mh.get_arms(arm_count, 100)
    best_arm = mh.get_maximum_index(true_means)

    # Plain asserts work under both unittest and pytest runners.
    assert len(true_means) == arm_count
    assert len(arms) == arm_count
    assert 0 <= best_arm < arm_count
    assert true_means[best_arm] == max(true_means)