import random

import numpy as np
from scipy.stats import norm
import cvxpy as cp

# NOTE: `Bandit` (exposing pull_arm(arm) and calculate_regret(counts)) is assumed
# to be defined or imported elsewhere in this repo.


class UCB():
    """Standard UCB algorithm run against one task drawn from `models`."""

    def __init__(self, models, n, alpha_ucb, task=None):
        self.models = models              # (num_models, num_arms) matrix of arm means
        self.n = n                        # horizon
        self.alpha = alpha_ucb            # exploration coefficient
        self.num_arms = models.shape[1]
        self.num_models = models.shape[0]
        self.counts = np.zeros([self.num_arms])
        self.means = np.zeros([self.num_arms])
        if task is None:
            self.task = np.random.choice(self.num_models)
        else:
            self.task = task
        self.bandit = Bandit(self.models[self.task])

    def select_arm_ucb(self, t):
        # pull each arm once before using the UCB index
        for a in range(self.num_arms):
            if self.counts[a] == 0:
                return a
        ucb_values = self.means + np.sqrt((self.alpha * np.log(t)) / self.counts)
        return np.argmax(ucb_values)

    def update_arm(self, action, reward):
        # incremental update of the empirical mean of the pulled arm
        self.counts[action] += 1
        n = self.counts[action]
        value = self.means[action]
        self.means[action] = ((n - 1) / n) * value + (1. / n) * reward

    def run(self, T_list):
        regrets = []
        counts = np.zeros([max(T_list), self.num_arms])
        for t in range(self.n):
            action = self.select_arm_ucb(t)
            reward = self.bandit.pull_arm(action)
            self.update_arm(action, reward)
            if (t + 1) in T_list:
                regret = self.bandit.calculate_regret(self.counts)  # keep track of regrets
                regrets.append(regret)
            counts[t, :] = self.counts
        return regrets, np.asarray(counts)
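
# Example usage for UCB -- a minimal sketch, not part of the original experiments.
# The 2x3 mean matrix, horizon, alpha_ucb, and T_list below are hypothetical placeholders.
def _demo_ucb():
    models = np.array([[0.1, 0.5, 0.9],
                       [0.9, 0.5, 0.1]])          # rows: candidate models, columns: arm means
    agent = UCB(models, n=1000, alpha_ucb=2.0, task=0)
    regrets, counts = agent.run(T_list=[500, 1000])
    return regrets, counts
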
class HypoTest():
    """Explore-then-commit strategy based on a hypothesis test on a single arm."""

    def __init__(self, models, n, alpha, beta, task=None):
        self.models = models
        self.n = n
        self.num_arms = models.shape[1]
        self.num_models = models.shape[0]
        self.counts = np.zeros([self.num_arms])
        self.means = np.zeros([self.num_arms])
        if task is None:
            self.task = np.random.choice(self.num_models)
        else:
            self.task = task
        self.bandit = Bandit(self.models[self.task])
        self.alpha = alpha    # significance level of the test
        self.beta = beta      # target power

    def get_num_pulls(self, arm):
        # number of pulls needed to separate the candidate means of `arm`
        num = norm.ppf(self.beta) - norm.ppf(1. - self.alpha)
        den = np.abs(np.min(self.models[:, arm]) - np.max(self.models[:, arm]))
        return (num / den)**2

    def calculate_c(self, arm, num_pulls):
        # rejection threshold for the test on the empirical mean of `arm`
        c = np.min(self.models[:, arm]) + 1. / np.sqrt(num_pulls) * norm.ppf(1. - self.alpha)
        return c

    def calculate_power(self, arm, num_pulls, c):
        beta = norm.cdf(np.sqrt(num_pulls) * (c - self.models[1, arm]))
        return beta

    def update(self, arm_t, r):
        self.counts[arm_t] += 1
        n = self.counts[arm_t]
        value = self.means[arm_t]
        self.means[arm_t] = ((n - 1) / n) * value + (1. / n) * r

    def run(self, arm, num_pulls=None, c=None):
        # derive the test parameters when the caller does not supply them
        if num_pulls is None:
            num_pulls = np.ceil(self.get_num_pulls(arm))
            c = self.calculate_c(arm, num_pulls)
        action = arm
        total = 0
        for t in range(self.n):
            if t < num_pulls:
                # exploration phase: keep pulling the tested arm
                reward = self.bandit.pull_arm(action)
                total += reward
            elif t == num_pulls:
                # run the test and commit to an arm for the rest of the horizon
                average = total / num_pulls
                if average > c:
                    action = np.argmax(self.models[:, arm])
                else:
                    action = np.argmin(self.models[:, arm])
                reward = self.bandit.pull_arm(action)
            else:
                reward = self.bandit.pull_arm(action)
            self.update(action, reward)
        regret = self.bandit.calculate_regret(self.counts)
        return regret
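
# Example usage for HypoTest -- a minimal sketch, not part of the original experiments.
# The mean matrix, alpha, beta, horizon, and tested arm are hypothetical placeholders.
def _demo_hypotest():
    models = np.array([[0.1, 0.5, 0.9],
                       [0.9, 0.5, 0.1]])
    agent = HypoTest(models, n=1000, alpha=0.05, beta=0.8, task=0)
    regret = agent.run(arm=0)    # num_pulls and c are derived internally when omitted
    return regret
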
class mUCB():
    """UCB over a finite set of candidate models (mUCB)."""

    def __init__(self, models, n, alpha_eps, task=None):
        self.models = models
        self.n = n
        self.delta = 1. / self.n
        self.alpha_eps = alpha_eps
        self.num_arms = models.shape[1]
        self.num_models = models.shape[0]
        self.counts = np.zeros([self.num_arms])
        self.means = np.zeros([self.num_arms])
        if task is None:
            self.task = np.random.choice(self.num_models)
        else:
            self.task = task
        self.bandit = Bandit(self.models[self.task])

    # calculate confidence radius epsilon_{i,t}
    def calculate_e(self):
        eps = np.sqrt(np.log(self.num_models * self.n**self.alpha_eps / self.delta) / (2 * self.counts))
        return eps

    # calculate set of compatible models Theta_t
    def get_Theta_t(self, eps):
        indices = []
        for i in range(self.num_models):
            model = self.models[i]
            check = True
            for j in range(self.num_arms):
                if np.abs(model[j] - self.means[j]) > eps[j]:
                    check = False
            if check:
                indices.append(i)
        return self.models[indices, :]

    # get arm with highest reward over all compatible models
    def get_arm_t(self, Theta_t):
        index = np.unravel_index(Theta_t.argmax(), Theta_t.shape)
        return index[1]

    # update empirical estimates
    def update(self, arm_t, r):
        self.counts[arm_t] += 1
        n = self.counts[arm_t]
        value = self.means[arm_t]
        self.means[arm_t] = ((n - 1) / n) * value + (1. / n) * r

    def run(self, T_list):
        regrets = []
        counts = np.zeros([max(T_list), self.num_arms])
        # pull each arm once
        for t in range(self.num_arms):
            r = self.bandit.pull_arm(t)
            self.update(t, r)
            counts[t] = self.counts
        # main mUCB loop
        for t in range(self.num_arms, self.n):
            eps = self.calculate_e()            # calculate confidence radii
            Theta_t = self.get_Theta_t(eps)     # get compatible models
            if len(Theta_t) == 0:
                return False, np.asarray(counts)
            else:
                arm_t = self.get_arm_t(Theta_t)  # get arm with maximum reward
                r = self.bandit.pull_arm(arm_t)  # pull the arm
                self.update(arm_t, r)
            if (t + 1) in T_list:
                regret = self.bandit.calculate_regret(self.counts)  # keep track of regrets
                regrets.append(regret)
            counts[t, :] = self.counts
        return regrets, np.asarray(counts)
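
# Example usage for mUCB -- a minimal sketch, not part of the original experiments.
# alpha_eps scales the log term inside the confidence radius; the value here is hypothetical.
def _demo_mucb():
    models = np.array([[0.1, 0.5, 0.9],
                       [0.9, 0.5, 0.1]])
    agent = mUCB(models, n=1000, alpha_eps=2.0, task=0)
    out = agent.run(T_list=[500, 1000])   # returns (False, counts) if no model remains compatible
    return out
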
class StructuredBandit():
    """Structured bandit: certify the empirical best arm via constraints,
    otherwise explore according to a linear-program allocation."""

    def __init__(self, models, n, a, task=None):
        self.models = models
        self.n = n
        self.a = a                         # exploration scaling constant
        self.num_arms = models.shape[1]
        self.num_models = models.shape[0]
        self.counts = np.zeros([self.num_arms])
        self.means = np.zeros([self.num_arms])
        if task is None:
            self.task = np.random.choice(self.num_models)
        else:
            self.task = task
        self.bandit = Bandit(self.models[self.task])

    def C_theta(self, theta):
        # squared gaps to every model whose optimal arm differs from theta's
        i_star = np.argmax(theta)
        i_list = []
        for m in range(len(self.models)):
            j_star = np.argmax(self.models[m])
            if i_star != j_star:
                i_list.append(np.square(self.models[m] - theta))
        return 0.5 * np.array(i_list)

    def get_alpha(self, theta):
        # integer exploration allocation; cvxpy needs a mixed-integer-capable
        # solver available for this problem
        d_theta = np.max(theta) - theta
        i_list = self.C_theta(theta)
        alpha = cp.Variable(self.num_arms, integer=True)
        objective = cp.Minimize(d_theta @ alpha)
        constraints = [
            alpha >= 0,
            i_list @ alpha >= 1,
        ]
        prob = cp.Problem(objective, constraints)
        prob.solve()
        return np.array(alpha.value)

    def get_constraints(self, theta):
        # feasibility-only version: any allocation satisfying the constraints
        d_theta = np.max(theta) - theta
        i_list = self.C_theta(theta)
        alpha = cp.Variable(self.num_arms)
        objective = cp.Minimize(0)
        constraints = [
            alpha >= 0,
            i_list @ alpha >= 1,
        ]
        prob = cp.Problem(objective, constraints)
        prob.solve()
        return np.array(alpha.value)

    def run(self, T_list, method=None):
        counts = np.zeros([self.num_arms])
        totals = np.zeros([self.num_arms])
        regrets = []
        n_e = 0                               # number of exploration rounds
        temp = [0, 0, 0]                      # how often each branch (exploit / least-pulled / LP) fires
        counts_all = np.zeros([max(T_list), self.num_arms])
        exps = np.zeros([self.num_models])    # cumulative squared errors per model (for MLE)
        for t in range(self.n):
            if t < self.num_arms:
                # pull each arm once to initialize
                reward = self.bandit.pull_arm(t)
                exps += np.square(self.models[:, t] - reward)
                totals[t] += reward
                counts[t] += 1
            else:
                if method == 'mle':
                    # pick the candidate model with the smallest cumulative squared error
                    estimate = self.models[np.argmin(exps)]
                else:
                    # plug-in estimate: empirical mean of each arm
                    estimate = np.divide(totals, counts)
                # line 6
                i_list = self.C_theta(estimate)
                test = counts / (self.a * np.log(t))
                if np.all(np.dot(i_list, test) >= 1):
                    # exploitation: constraints satisfied, play the empirical best arm
                    arm = np.argmax(estimate)
                    temp[0] += 1
                # line 9
                else:
                    # line 10
                    if np.min(counts) < (n_e / (2 * self.num_arms)):
                        # forced exploration of the least-pulled arm
                        arm = np.argmin(counts)
                        temp[1] += 1
                    # line 12
                    else:
                        alpha = self.get_alpha(estimate)
                        if alpha.size == self.num_arms:
                            # explore an arm whose allocation has not been met yet
                            indices = np.where(test < alpha)[0]
                            arm = np.random.choice(indices)
                        else:
                            # solver returned nothing usable: explore uniformly at random
                            arm = random.choice(range(self.num_arms))
                        temp[2] += 1
                    n_e += 1
                reward = self.bandit.pull_arm(arm)
                exps += np.square(self.models[:, arm] - reward)
                totals[arm] += reward
                counts[arm] += 1
            if (t + 1) in T_list:
                regret = self.bandit.calculate_regret(counts)  # keep track of regrets
                regrets.append(regret)
            counts_all[t, :] = counts
        return regrets, np.asarray(counts_all)
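
# Example usage for StructuredBandit -- a minimal sketch, not part of the original
# experiments. The mean matrix, horizon, and exploration constant `a` are hypothetical,
# and the allocation step relies on cvxpy having a mixed-integer-capable solver installed.
def _demo_structured_bandit():
    models = np.array([[0.1, 0.5, 0.9],
                       [0.9, 0.5, 0.1]])
    agent = StructuredBandit(models, n=1000, a=1.0, task=0)
    regrets, counts_all = agent.run(T_list=[500, 1000], method='mle')
    return regrets, counts_all
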