def IDSAction(self, delta, g):
    """
    Implementation of IDSAction algorithm as defined in Russo & Van Roy, p. 242
    :param delta: np.array, instantaneous regrets
    :param g: np.array, information gains
    :return: int, arm to pull
    """
    Q = np.zeros((self.nb_arms, self.nb_arms))
    IR = np.ones((self.nb_arms, self.nb_arms)) * np.inf
    q = np.linspace(0, 1, 1000)
    for a in range(self.nb_arms - 1):
        for ap in range(a + 1, self.nb_arms):
            if g[a] < 1e-6 or g[ap] < 1e-6:
                return rd_argmax(-g)
            da, dap, ga, gap = delta[a], delta[ap], g[a], g[ap]
            qaap = q[rd_argmax(-(q * da + (1 - q) * dap) ** 2 / (q * ga + (1 - q) * gap))]
            IR[a, ap] = (qaap * (da - dap) + dap) ** 2 / (qaap * (ga - gap) + gap)
            Q[a, ap] = qaap
    amin = rd_argmax(-IR.reshape(self.nb_arms * self.nb_arms))
    a, ap = amin // self.nb_arms, amin % self.nb_arms
    b = np.random.binomial(1, Q[a, ap])
    arm = int(b * a + (1 - b) * ap)
    if self.store_IDS:
        self.IDS_results['arms'].append(arm)
        policy = np.zeros(self.nb_arms)
        policy[a], policy[ap] = Q[a, ap], (1 - Q[a, ap])
        self.IDS_results['policy'].append(policy)
        self.IDS_results['delta'].append(delta)
        self.IDS_results['g'].append(g)
        self.IDS_results['IR'].append(np.inner(delta ** 2, policy) / np.inner(g, policy))
    return arm
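# A minimal standalone sketch (not part of the class) of the inner-loop computation in IDSAction:
# for a single pair of arms, minimize the information ratio
# (q * da + (1 - q) * dap)**2 / (q * ga + (1 - q) * gap) over the mixing probability q on a uniform
# grid. The function name and default values are illustrative assumptions; numpy is assumed to be
# imported as np, as in the rest of this module.
def _pairwise_information_ratio_sketch(da=0.3, dap=0.1, ga=0.02, gap=0.05, n_grid=1000):
    q = np.linspace(0, 1, n_grid)
    ratio = (q * da + (1 - q) * dap) ** 2 / (q * ga + (1 - q) * gap)
    i = np.argmin(ratio)
    return q[i], ratio[i]  # optimal mixing probability for this pair and its information ratio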
def computeVIDS(self, mu_t, sigma_t, M):
    """
    Implementation of linearSampleVIR (algorithm 6 in Russo & Van Roy, p. 244) applied for Linear Bandits
    with multivariate normal prior. Here integrals are approximated by sampling thetas according to their
    respective posterior distributions.
    :param mu_t: np.array, posterior mean vector at time t
    :param sigma_t: np.array, posterior covariance matrix at time t
    :param M: int, number of samples
    :return: int, np.array, chosen arm and p*
    """
    thetas = np.random.multivariate_normal(mu_t, sigma_t, M)
    mu = np.mean(thetas, axis=0)
    theta_hat = np.argmax(np.dot(self.features, thetas.T), axis=0)
    theta_hat_ = [thetas[np.where(theta_hat == a)] for a in range(self.n_a)]
    p_a = np.array([len(theta_hat_[a]) for a in range(self.n_a)]) / M
    if np.max(p_a) >= self.threshold:
        # Stop learning policy
        self.optimal_arm = np.argmax(p_a)
        arm = self.optimal_arm
    else:
        mu_a = np.nan_to_num(np.array([np.mean([theta_hat_[a]], axis=1).squeeze() for a in range(self.n_a)]))
        L_hat = np.sum(np.array([p_a[a] * np.outer(mu_a[a] - mu, mu_a[a] - mu) for a in range(self.n_a)]), axis=0)
        rho_star = np.sum(np.array([p_a[a] * np.dot(self.features[a], mu_a[a]) for a in range(self.n_a)]), axis=0)
        v = np.array([np.dot(np.dot(self.features[a], L_hat), self.features[a].T) for a in range(self.n_a)])
        delta = np.array([rho_star - np.dot(self.features[a], mu) for a in range(self.n_a)])
        arm = rd_argmax(-delta ** 2 / v)
    return arm, p_a
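# Standalone sketch of the Monte-Carlo step used in computeVIDS: estimate p_a, the posterior
# probability that each arm is optimal, by sampling parameter vectors from N(mu_t, sigma_t) and
# counting how often each arm attains the largest predicted reward. The function name and its
# arguments (plain arrays, not the class attributes) are illustrative assumptions.
def _estimate_p_star_sketch(features, mu_t, sigma_t, M=10000):
    thetas = np.random.multivariate_normal(mu_t, sigma_t, M)    # M posterior samples of theta
    best = np.argmax(features @ thetas.T, axis=0)               # optimal arm under each sample
    return np.bincount(best, minlength=features.shape[0]) / M   # empirical estimate of p*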
def TS(self, T):
    """
    Implementation of Thompson Sampling (TS) algorithm for Linear Bandits with multivariate normal prior
    :param T: int, time horizon
    :return: np.arrays, reward obtained by the policy and sequence of chosen arms
    """
    arm_sequence, reward = np.zeros(T), np.zeros(T)
    mu_t, sigma_t = self.initPrior()
    for t in range(T):
        theta_t = np.random.multivariate_normal(mu_t, sigma_t, 1).T
        a_t = rd_argmax(np.dot(self.features, theta_t))
        r_t, mu_t, sigma_t = self.updatePosterior(a_t, mu_t, sigma_t)
        reward[t], arm_sequence[t] = r_t, a_t
    return reward, arm_sequence
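# Hedged usage sketch: assuming `model` is an instance of the linear-bandit class that defines
# TS/initPrior/updatePosterior above, a run over a horizon of 1000 steps could look like this.
#
#   reward, arm_sequence = model.TS(T=1000)
#   cumulative_reward = np.cumsum(reward)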
def BayesUCB(self, T):
    """
    Implementation of Bayesian Upper Confidence Bounds (BayesUCB) algorithm for Linear Bandits with
    multivariate normal prior
    :param T: int, time horizon
    :return: np.arrays, reward obtained by the policy and sequence of chosen arms
    """
    arm_sequence, reward = np.zeros(T), np.zeros(T)
    mu_t, sigma_t = self.initPrior()
    for t in range(T):
        a_t = rd_argmax(np.dot(self.features, mu_t) + norm.ppf(t / (t + 1)) *
                        np.sqrt(np.diagonal(np.dot(np.dot(self.features, sigma_t), self.features.T))))
        r_t, mu_t, sigma_t = self.updatePosterior(a_t, mu_t, sigma_t)
        reward[t], arm_sequence[t] = r_t, a_t
    return reward, arm_sequence
def UCB1(self, T, rho):
    """
    Implementation of UCB1 algorithm
    :param T: int, time horizon
    :param rho: float, parameter for balancing between exploration and exploitation
    :return: np.arrays, reward obtained by the policy and sequence of chosen arms
    """
    Sa, Na, reward, arm_sequence = self.init_lists(T)
    for t in range(T):
        if t < self.nb_arms:
            arm = t
        else:
            arm = rd_argmax(Sa / Na + rho * np.sqrt(np.log(t + 1) / 2 / Na))
        self.update_lists(t, arm, Sa, Na, reward, arm_sequence)
    return reward, arm_sequence
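# Worked toy example of the UCB1 index above (all values are illustrative assumptions):
# with Sa = [5., 3.], Na = [10., 4.], t = 13 and rho = 1,
#   means = Sa / Na                       -> [0.50, 0.75]
#   bonus = sqrt(log(t + 1) / (2 * Na))   -> [~0.363, ~0.574]
#   index = means + rho * bonus           -> [~0.863, ~1.324], so arm 1 is pulled.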
def GPUCB(self, T):
    """
    Implementation of GPUCB, Srinivas (2010) 'Gaussian Process Optimization in the Bandit Setting: No
    Regret and Experimental Design', for Linear Bandits with multivariate normal prior
    :param T: int, time horizon
    :return: np.arrays, reward obtained by the policy and sequence of chosen arms
    """
    arm_sequence, reward = np.zeros(T), np.zeros(T)
    mu_t, sigma_t = self.initPrior()
    for t in range(T):
        # Exploration coefficient beta_t = 2 * log(n_a * (t + 1)^2 * pi^2 / (6 * delta)), with delta = 0.1
        beta_t = 2 * np.log(self.n_a * ((t + 1) * np.pi) ** 2 / 6 / 0.1)
        a_t = rd_argmax(np.dot(self.features, mu_t) +
                        np.sqrt(beta_t * np.diagonal(np.dot(np.dot(self.features, sigma_t), self.features.T))))
        r_t, mu_t, sigma_t = self.updatePosterior(a_t, mu_t, sigma_t)
        reward[t], arm_sequence[t] = r_t, a_t
    return reward, arm_sequence
def Tuned_GPUCB(self, T, c=0.9):
    """
    Implementation of Tuned GPUCB described in Russo & Van Roy's paper, for Linear Bandits with
    multivariate normal prior
    :param T: int, time horizon
    :param c: float, tunable parameter. Default 0.9
    :return: np.arrays, reward obtained by the policy and sequence of chosen arms
    """
    arm_sequence, reward = np.zeros(T), np.zeros(T)
    mu_t, sigma_t = self.initPrior()
    for t in range(T):
        beta_t = c * np.log(t + 1)
        a_t = rd_argmax(np.dot(self.features, mu_t) +
                        np.sqrt(beta_t * np.diagonal(np.dot(np.dot(self.features, sigma_t), self.features.T))))
        r_t, mu_t, sigma_t = self.updatePosterior(a_t, mu_t, sigma_t)
        reward[t], arm_sequence[t] = r_t, a_t
    return reward, arm_sequence
def MOSS(self, T, rho):
    """
    Implementation of Minimax Optimal Strategy in the Stochastic case (MOSS).
    :param T: int, time horizon
    :param rho: float, parameter for balancing between exploration and exploitation
    :return: np.arrays, reward obtained by the policy and sequence of chosen arms
    """
    Sa, Na, reward, arm_sequence = self.init_lists(T)
    for t in range(T):
        if t < self.nb_arms:
            arm = t
        else:
            root_term = np.maximum(T / (self.nb_arms * Na), 1)
            arm = rd_argmax(Sa / Na + rho * np.sqrt(4 / Na * np.log(root_term)))
        self.update_lists(t, arm, Sa, Na, reward, arm_sequence)
    return reward, arm_sequence
def LinUCB(self, T, lbda=10e-4, alpha=10e-1):
    """
    Implementation of Linear UCB algorithm for Linear Bandits with multivariate normal prior
    :param T: int, time horizon
    :param lbda: float, regression regularization parameter
    :param alpha: float, tunable parameter to balance exploration and exploitation
    :return: np.arrays, reward obtained by the policy and sequence of chosen arms
    """
    arm_sequence, reward = np.zeros(T), np.zeros(T)
    a_t, A_t, b_t = np.random.randint(0, self.n_a - 1, 1)[0], lbda * np.eye(self.d), np.zeros(self.d)
    r_t = self.reward(a_t)
    for t in range(T):
        A_t += np.outer(self.features[a_t, :], self.features[a_t, :])
        b_t += r_t * self.features[a_t, :]
        inv_A = np.linalg.inv(A_t)
        theta_t = np.dot(inv_A, b_t)
        beta_t = alpha * np.sqrt(np.diagonal(np.dot(np.dot(self.features, inv_A), self.features.T)))
        a_t = rd_argmax(np.dot(self.features, theta_t) + beta_t)
        r_t = self.reward(a_t)
        arm_sequence[t], reward[t] = a_t, r_t
    return reward, arm_sequence
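# Standalone sketch of the regularized least-squares update behind LinUCB: after pulling an arm
# with feature vector x and observing reward r, the design matrix and response vector are updated
# and the point estimate is theta = A^{-1} b (solved here with np.linalg.solve rather than an
# explicit inverse). The function name and arguments are illustrative assumptions.
def _ridge_update_sketch(A, b, x, r):
    A = A + np.outer(x, x)          # A_t = lbda * I + sum of x x^T over pulled arms
    b = b + r * x                   # b_t = sum of r * x over pulled arms
    theta = np.linalg.solve(A, b)   # ridge-regression estimate of the parameter vector
    return A, b, theta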
def UCB_Tuned(self, T):
    """
    Implementation of UCB-Tuned algorithm
    :param T: int, time horizon
    :return: np.arrays, reward obtained by the policy and sequence of chosen arms
    """
    Sa, Na, reward, arm_sequence = self.init_lists(T)
    S, m = np.zeros(self.nb_arms), np.zeros(self.nb_arms)
    for t in range(T):
        if t < self.nb_arms:
            arm = t
        else:
            for arm in range(self.nb_arms):
                S[arm] = sum([r ** 2 for r in reward[np.where(arm_sequence == arm)]]) / Na[arm] - (Sa[arm] / Na[arm]) ** 2
                m[arm] = min(0.25, S[arm] + np.sqrt(2 * np.log(t + 1) / Na[arm]))
            arm = rd_argmax(Sa / Na + np.sqrt(np.log(t + 1) / Na * m))
        self.update_lists(t, arm, Sa, Na, reward, arm_sequence)
    return reward, arm_sequence
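# Standalone sketch of the per-arm quantity used in UCB_Tuned: the empirical variance of an arm's
# rewards plus an exploration term, capped at 1/4 (the maximum variance of a reward bounded in
# [0, 1]). The function name and arguments are illustrative assumptions.
def _ucb_tuned_variance_sketch(rewards_of_arm, n_pulls, t):
    r = np.asarray(rewards_of_arm, dtype=float)
    var = np.mean(r ** 2) - np.mean(r) ** 2                   # empirical variance estimate
    return min(0.25, var + np.sqrt(2 * np.log(t + 1) / n_pulls))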