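# All strategies below rely on numpy and on a BernoulliBandit environment class
# that is not defined in this section. The sketch here is an assumption about
# its interface (a pull(a) method returning a 0/1 reward drawn with probability
# means[a]); the actual class used by the author may differ.
import numpy as np


class BernoulliBandit(object):
    def __init__(self, means):
        self.means = means  # true success probability of each arm

    def pull(self, a):
        # Draw a Bernoulli reward for arm a.
        return np.random.binomial(1, self.means[a])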
class KL_UCB(object):
    def __init__(self, m, means):
        self.m = m
        self.k = len(means)
        self.mu_hat = np.zeros(self.k)
        self.estimate = np.zeros(self.k)  # KL-UCB index of each arm
        self.T = np.zeros(self.k)         # number of pulls of each arm
        self.means = means
        self.bandit = BernoulliBandit(means=self.means)

    def run_KL_UCB(self, n):
        # d and f are assumed to be defined elsewhere: d(p, q) is the Bernoulli
        # KL divergence and f(t) is the exploration-rate function of KL-UCB.
        for t in range(n):
            for i in range(self.k):
                if self.T[i] == 0:
                    # Unpulled arms get an effectively infinite index.
                    self.estimate[i] = 10 ** 10
                else:
                    thresh = np.log(f(t)) / self.T[i]
                    # Largest mu_tilde on a grid of [0, 1] with
                    # d(mu_hat, mu_tilde) <= thresh.
                    self.estimate[i] = -1
                    for j in range(101):
                        mu_tilde = j / 100
                        if d(self.mu_hat[i], mu_tilde) <= thresh:
                            self.estimate[i] = mu_tilde
            a_t = np.argmax(self.estimate)
            reward_t = self.bandit.pull(a_t)
            self.update_mean(a_t, reward_t)
            self.T[a_t] += 1

    def update_mean(self, a_t, reward_t):
        # Incremental update of the empirical mean of arm a_t.
        self.mu_hat[a_t] = (self.mu_hat[a_t] * self.T[a_t] + reward_t) / (self.T[a_t] + 1)
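# KL_UCB above assumes two helpers, d and f, that are not defined in this
# section. A minimal sketch under the usual KL-UCB conventions: d(p, q) is the
# Bernoulli KL divergence and f(t) = 1 + t * log(t)^2 is a common choice of
# exploration-rate function. The exact definitions used by the author are
# assumptions here.
def d(p, q, eps=1e-12):
    # Bernoulli KL divergence KL(Ber(p) || Ber(q)), clipped away from 0 and 1.
    p = min(max(p, eps), 1 - eps)
    q = min(max(q, eps), 1 - eps)
    return p * np.log(p / q) + (1 - p) * np.log((1 - p) / (1 - q))


def f(t):
    # Exploration-rate function; 1 + t * log(t)^2 for t >= 1, and 1 at t = 0.
    return 1 + max(t, 1) * np.log(max(t, 1)) ** 2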
class Explore_then_commit(object):
    def __init__(self, m, means):
        self.m = m                      # exploration pulls per arm
        self.K = len(means)
        self.mu_hat = np.zeros(self.K)
        self.T = np.zeros(self.K)
        self.means = means
        self.bandit = BernoulliBandit(means=self.means)

    def run_ETC(self, n):
        for t in range(n):
            if t < self.m * self.K:
                # Exploration phase: pull the arms in round-robin order.
                a_t = t % self.K
            else:
                # Commitment phase: always play the empirically best arm.
                a_t = np.argmax(self.mu_hat)
            reward_t = self.bandit.pull(a_t)
            if t < self.m * self.K:
                # Statistics are only updated during exploration.
                self.mu_hat[a_t] = (self.mu_hat[a_t] * self.T[a_t] + reward_t) / (self.T[a_t] + 1)
                self.T[a_t] += 1
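# A possible way to choose the exploration length m for Explore_then_commit,
# following the standard two-armed analysis m ~ (4 / gap^2) * ln(n * gap^2 / 4).
# This helper is an illustration added here, not part of the original code, and
# it assumes the suboptimality gap is known.
def etc_exploration_length(n, gap):
    return max(1, int(np.ceil(4.0 / gap ** 2 * np.log(n * gap ** 2 / 4.0))))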
class Minimax_optimal_Strategy_in_stochastic_case(object):
    def __init__(self, m, means):
        self.m = m
        self.K = len(means)
        self.mu_hat = np.zeros(self.K)
        self.estimate = np.zeros(self.K)  # MOSS index of each arm
        self.T = np.zeros(self.K)
        self.means = means
        self.bandit = BernoulliBandit(means=self.means)

    def run_Moss(self, n):
        # log_star is assumed to be defined elsewhere as the clipped logarithm
        # log+(x) = log(max(1, x)) used by the MOSS index.
        for t in range(n):
            for i in range(self.K):
                if self.T[i] == 0:
                    self.estimate[i] = 10 ** 10
                else:
                    self.estimate[i] = self.mu_hat[i] + np.sqrt(
                        4 * log_star(n / (self.K * self.T[i])) / self.T[i])
            a_t = np.argmax(self.estimate)
            reward_t = self.bandit.pull(a_t)
            self.mu_hat[a_t] = (self.mu_hat[a_t] * self.T[a_t] + reward_t) / (self.T[a_t] + 1)
            self.T[a_t] += 1
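# run_Moss above assumes a helper log_star that is not defined in this section.
# A minimal sketch under the usual MOSS convention log+(x) = log(max(1, x));
# the name log_star and this exact definition are assumptions.
def log_star(x):
    return np.log(max(x, 1.0))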
class Upper_Confidence_bound(object):
    def __init__(self, m, means, delta):
        self.m = m
        self.k = len(means)
        self.mu_hat = np.zeros(self.k)
        self.UCB = np.zeros(self.k)     # upper confidence bound of each arm
        self.T = np.zeros(self.k)
        self.means = means
        self.delta = delta
        self.bandit = BernoulliBandit(means=self.means)

    def run_UCB(self, n):
        for t in range(n):
            # Recompute the confidence bounds from the current statistics
            # before choosing an arm.
            for i in range(self.k):
                if self.T[i] == 0:
                    self.UCB[i] = 10 ** 10
                else:
                    self.UCB[i] = self.mu_hat[i] + np.sqrt(
                        2 * np.log(1 / self.delta) / self.T[i])
            a_t = np.argmax(self.UCB)
            reward_t = self.bandit.pull(a_t)
            self.mu_hat[a_t] = (self.mu_hat[a_t] * self.T[a_t] + reward_t) / (self.T[a_t] + 1)
            self.T[a_t] += 1
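# A hedged usage sketch showing how the strategies above might be run on the
# same Bernoulli instance. The arm means, horizon, and parameter values are
# illustration choices, not from the source; m is unused by UCB, MOSS, and
# KL-UCB and is passed as 0 only to satisfy the constructors.
if __name__ == "__main__":
    means = [0.3, 0.5, 0.7]
    n = 10000

    etc = Explore_then_commit(m=50, means=means)
    etc.run_ETC(n)

    ucb = Upper_Confidence_bound(m=0, means=means, delta=1.0 / n ** 2)
    ucb.run_UCB(n)

    moss = Minimax_optimal_Strategy_in_stochastic_case(m=0, means=means)
    moss.run_Moss(n)

    kl_ucb = KL_UCB(m=0, means=means)
    kl_ucb.run_KL_UCB(n)

    for name, algo in [("ETC", etc), ("UCB", ucb), ("MOSS", moss), ("KL-UCB", kl_ucb)]:
        # Pseudo-regret: pulls of suboptimal arms weighted by their gaps.
        regret = np.sum((max(means) - np.array(means)) * algo.T)
        print(name, "pseudo-regret:", regret)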