def IDSAction(self, delta, g):
     """
     Implementation of IDSAction algorithm as defined in Russo & Van Roy, p. 242
     :param delta: np.array, instantaneous regrets
     :param g: np.array, information gains
     :return: int, arm to pull
     """
     Q = np.zeros((self.nb_arms, self.nb_arms))
     IR = np.ones((self.nb_arms, self.nb_arms)) * np.inf
     q = np.linspace(0, 1, 1000)
     for a in range(self.nb_arms - 1):
         for ap in range(a + 1, self.nb_arms):
             if g[a] < 1e-6 or g[ap] < 1e-6:
                 return rd_argmax(-g)
             da, dap, ga, gap = delta[a], delta[ap], g[a], g[ap]
             qaap = q[rd_argmax(-(q * da + (1 - q) * dap)**2 /
                                (q * ga + (1 - q) * gap))]
             IR[a, ap] = (qaap * (da - dap) + dap)**2 / (qaap *
                                                         (ga - gap) + gap)
             Q[a, ap] = qaap
     amin = rd_argmax(-IR.reshape(self.nb_arms * self.nb_arms))
     a, ap = amin // self.nb_arms, amin % self.nb_arms
     b = np.random.binomial(1, Q[a, ap])
     arm = int(b * a + (1 - b) * ap)
     if self.store_IDS:
         self.IDS_results['arms'].append(arm)
         policy = np.zeros(self.nb_arms)
         policy[a], policy[ap] = Q[a, ap], (1 - Q[a, ap])
         self.IDS_results['policy'].append(policy)
         self.IDS_results['delta'].append(delta)
         self.IDS_results['g'].append(g)
         self.IDS_results['IR'].append(
             np.inner(delta, policy)**2 / np.inner(g, policy))
     return arm
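Every snippet on this page calls a rd_argmax helper from the same repository that is not shown here. A minimal sketch consistent with how it is used (argmax with uniform random tie-breaking) could look as follows; this is an assumption about its behaviour, not the repository's own code.

import numpy as np

def rd_argmax(vector):
    # Index of the maximum of `vector`, breaking ties uniformly at random
    # (a plain np.argmax would always favour the first maximal entry).
    m = np.max(vector)
    indices = np.nonzero(vector == m)[0]
    return int(np.random.choice(indices))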
Code example #2
 def computeVIDS(self, mu_t, sigma_t, M):
     """
      Implementation of linearSampleVIR (Algorithm 6 in Russo & Van Roy, p. 244) applied to Linear Bandits with a
      multivariate normal prior. Integrals are approximated by sampling thetas from their respective
      posterior distributions.
     :param mu_t: np.array, posterior mean vector at time t
     :param sigma_t: np.array, posterior covariance matrix at time t
     :param M: int, number of samples
      :return: int, np.array, arm chosen and p*
     """
     thetas = np.random.multivariate_normal(mu_t, sigma_t, M)
     mu = np.mean(thetas, axis=0)
     theta_hat = np.argmax(np.dot(self.features, thetas.T), axis=0)
     theta_hat_ = [thetas[np.where(theta_hat==a)] for a in range(self.n_a)]
     p_a = np.array([len(theta_hat_[a]) for a in range(self.n_a)])/M
     if np.max(p_a) >= self.threshold:
         # Stop learning policy
         self.optimal_arm = np.argmax(p_a)
         arm = self.optimal_arm
     else:
         mu_a = np.nan_to_num(np.array([np.mean([theta_hat_[a]], axis=1).squeeze() for a in range(self.n_a)]))
         L_hat = np.sum(np.array([p_a[a]*np.outer(mu_a[a]-mu, mu_a[a]-mu) for a in range(self.n_a)]), axis=0)
         rho_star = np.sum(np.array([p_a[a]*np.dot(self.features[a], mu_a[a]) for a in range(self.n_a)]), axis=0)
         v = np.array([np.dot(np.dot(self.features[a], L_hat), self.features[a].T) for a in range(self.n_a)])
         delta = np.array([rho_star - np.dot(self.features[a], mu) for a in range(self.n_a)])
         arm = rd_argmax(-delta**2/v)
     return arm, p_a
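computeVIDS only selects the arm for a single round. A hypothetical driver loop, assuming the same initPrior / updatePosterior interface used by TS and BayesUCB below, might look like this (the method name VIDS_sample and the default M are assumptions, not the repository's code):

def VIDS_sample(self, T, M=10000):
    # Hypothetical wrapper: at each step pick the arm returned by computeVIDS,
    # then update the Gaussian posterior (interface assumed from TS/BayesUCB below).
    arm_sequence, reward = np.zeros(T), np.zeros(T)
    mu_t, sigma_t = self.initPrior()
    for t in range(T):
        arm, p_a = self.computeVIDS(mu_t, sigma_t, M)
        r_t, mu_t, sigma_t = self.updatePosterior(arm, mu_t, sigma_t)
        reward[t], arm_sequence[t] = r_t, arm
    return reward, arm_sequence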
Code example #3
 def TS(self, T):
     """
      Implementation of the Thompson Sampling (TS) algorithm for Linear Bandits with multivariate normal prior
     :param T: int, time horizon
     :return: np.arrays, reward obtained by the policy and sequence of chosen arms
     """
     arm_sequence, reward = np.zeros(T), np.zeros(T)
     mu_t, sigma_t = self.initPrior()
     for t in range(T):
         theta_t = np.random.multivariate_normal(mu_t, sigma_t, 1).T
         a_t = rd_argmax(np.dot(self.features, theta_t))
         r_t, mu_t, sigma_t = self.updatePosterior(a_t, mu_t, sigma_t)
         reward[t], arm_sequence[t] = r_t, a_t
     return reward, arm_sequence
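TS, BayesUCB, GPUCB and Tuned_GPUCB all rely on initPrior and updatePosterior, which are not shown on this page. A minimal sketch of what they could look like for a linear-Gaussian model with known noise level is given below; the attribute self.eta and the default prior parameters are assumptions, not the repository's values.

def initPrior(self, mu_0=0.0, s_0=10.0):
    # Assumed isotropic Gaussian prior N(mu_0 * 1, s_0^2 * I) over theta in R^d.
    return mu_0 * np.ones(self.d), s_0 ** 2 * np.eye(self.d)

def updatePosterior(self, a, mu_t, sigma_t):
    # One conjugate (Kalman-style) update for r = <x_a, theta> + noise,
    # with noise ~ N(0, self.eta^2); self.eta is assumed to be known.
    x = self.features[a]
    r = self.reward(a)
    sigma_x = np.dot(sigma_t, x)
    denom = self.eta ** 2 + np.dot(x, sigma_x)
    mu_new = mu_t + sigma_x * (r - np.dot(x, mu_t)) / denom
    sigma_new = sigma_t - np.outer(sigma_x, sigma_x) / denom
    return r, mu_new, sigma_new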
Code example #4
 def BayesUCB(self, T):
     """
     Implementation of Bayesian Upper Confidence Bounds (BayesUCB) algorithm for Linear Bandits with multivariate
     normal prior
     :param T: int, time horizon
     :return: np.arrays, reward obtained by the policy and sequence of chosen arms
     """
     arm_sequence, reward = np.zeros(T), np.zeros(T)
     mu_t, sigma_t = self.initPrior()
     for t in range(T):
         a_t = rd_argmax(np.dot(self.features, mu_t) + norm.ppf(t/(t+1)) *
                         np.sqrt(np.diagonal(np.dot(np.dot(self.features, sigma_t), self.features.T))))
         r_t, mu_t, sigma_t = self.updatePosterior(a_t, mu_t, sigma_t)
         reward[t], arm_sequence[t] = r_t, a_t
     return reward, arm_sequence
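In BayesUCB above, norm.ppf(t/(t+1)) is the (1 - 1/(t+1))-quantile of the standard normal, so the index is that quantile of each arm's Gaussian posterior mean reward. For instance:

from scipy.stats import norm

# At t = 99 the index adds roughly 2.33 posterior standard deviations,
# i.e. approximately a 99% upper confidence bound.
print(norm.ppf(99 / 100))   # ~2.326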
 def UCB1(self, T, rho):
     """
     Implementation of UCB1 algorithm
     :param T: int, time horizon
     :param rho: float, parameter for balancing between exploration and exploitation
     :return: np.arrays, reward obtained by the policy and sequence of chosen arms
     """
     Sa, Na, reward, arm_sequence = self.init_lists(T)
     for t in range(T):
         if t < self.nb_arms:
             arm = t
         else:
             arm = rd_argmax(Sa / Na +
                             rho * np.sqrt(np.log(t + 1) / 2 / Na))
         self.update_lists(t, arm, Sa, Na, reward, arm_sequence)
     return reward, arm_sequence
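UCB1, MOSS and UCB_Tuned share the bookkeeping helpers init_lists and update_lists, which are not shown here. A minimal sketch consistent with how they are used follows; the self.MAB[arm].sample() call is an assumption about the arm interface, not the repository's code.

def init_lists(self, T):
    # Per-arm cumulative reward Sa and pull count Na, plus per-round logs.
    Sa, Na = np.zeros(self.nb_arms), np.zeros(self.nb_arms)
    reward, arm_sequence = np.zeros(T), np.zeros(T)
    return Sa, Na, reward, arm_sequence

def update_lists(self, t, arm, Sa, Na, reward, arm_sequence):
    # Pull `arm`, record the sampled reward, and update running statistics in place.
    r = self.MAB[arm].sample()  # assumed reward oracle; any sampler would do
    Na[arm] += 1
    Sa[arm] += r
    reward[t], arm_sequence[t] = r, arm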
Code example #6
 def GPUCB(self, T):
     """
      Implementation of GPUCB, Srinivas et al. (2010), 'Gaussian Process Optimization in the Bandit Setting: No Regret and
      Experimental Design', for Linear Bandits with multivariate normal prior
     :param T: int, time horizon
     :return: np.arrays, reward obtained by the policy and sequence of chosen arms
     """
     arm_sequence, reward = np.zeros(T), np.zeros(T)
     mu_t, sigma_t = self.initPrior()
     for t in range(T):
         beta_t = 2 * np.log(self.n_a * ((t+1)*np.pi)**2 / 6 / 0.1)
         a_t = rd_argmax(np.dot(self.features, mu_t) +
                         np.sqrt(beta_t * np.diagonal(np.dot(np.dot(self.features, sigma_t), self.features.T))))
         r_t, mu_t, sigma_t = self.updatePosterior(a_t, mu_t, sigma_t)
         reward[t], arm_sequence[t] = r_t, a_t
     return reward, arm_sequence
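The schedule in GPUCB above matches the finite-arm choice beta_t = 2 * log(|A| * t^2 * pi^2 / (6 * delta)) from Srinivas et al., with the confidence parameter delta hard-coded to 0.1 and t shifted by one. A hypothetical variant exposing delta as an argument:

def gpucb_beta(self, t, delta=0.1):
    # Same schedule as in GPUCB above, with the confidence level delta made explicit.
    return 2 * np.log(self.n_a * ((t + 1) * np.pi) ** 2 / (6 * delta))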
Code example #7
 def Tuned_GPUCB(self, T, c=0.9):
     """
      Implementation of Tuned GPUCB as described in the Russo & Van Roy paper under study, for Linear Bandits with
      multivariate normal prior
     :param T: int, time horizon
     :param c: float, tunable parameter. Default 0.9
     :return: np.arrays, reward obtained by the policy and sequence of chosen arms
     """
     arm_sequence, reward = np.zeros(T), np.zeros(T)
     mu_t, sigma_t = self.initPrior()
     for t in range(T):
         beta_t = c * np.log(t+1)
         a_t = rd_argmax(np.dot(self.features, mu_t) +
                         np.sqrt(beta_t*np.diagonal(np.dot(np.dot(self.features, sigma_t), self.features.T))))
         r_t, mu_t, sigma_t = self.updatePosterior(a_t, mu_t, sigma_t)
         reward[t], arm_sequence[t] = r_t, a_t
     return reward, arm_sequence
 def MOSS(self, T, rho):
     """
     Implementation of Minimax Optimal Strategy in the Stochastic case (MOSS).
     :param T: int, time horizon
     :param rho: float, parameter for balancing between exploration and exploitation
     :return: np.arrays, reward obtained by the policy and sequence of chosen arms
     """
     Sa, Na, reward, arm_sequence = self.init_lists(T)
     for t in range(T):
         if t < self.nb_arms:
             arm = t
         else:
             root_term = np.array(
                 list(map(lambda x: max(x, 1), T / (self.nb_arms * Na))))
             arm = rd_argmax(Sa / Na +
                             rho * np.sqrt(4 / Na * np.log(root_term)))
         self.update_lists(t, arm, Sa, Na, reward, arm_sequence)
     return reward, arm_sequence
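The map/lambda construction in MOSS above can be expressed with vectorized NumPy; the helper below is an equivalent, purely stylistic alternative (not the repository's code):

def moss_index(Sa, Na, T, nb_arms, rho):
    # Vectorized form of the MOSS index computed above.
    root_term = np.maximum(T / (nb_arms * Na), 1.0)
    return Sa / Na + rho * np.sqrt(4 / Na * np.log(root_term))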
Code example #9
 def LinUCB(self, T, lbda=10e-4, alpha=10e-1):
     """
     Implementation of Linear UCB algorithm for Linear Bandits with multivariate normal prior
     :param T: int, time horizon
     :param lbda: float, regression regularization parameter
      :param alpha: float, tunable parameter controlling the trade-off between exploration and exploitation
     :return: np.arrays, reward obtained by the policy and sequence of chosen arms
     """
     arm_sequence, reward = np.zeros(T), np.zeros(T)
     a_t, A_t, b_t = np.random.randint(0, self.n_a - 1, 1)[0], lbda * np.eye(self.d), np.zeros(self.d)
     r_t = self.reward(a_t)
     for t in range(T):
         A_t += np.outer(self.features[a_t, :], self.features[a_t, :])
         b_t += r_t * self.features[a_t, :]
         inv_A = np.linalg.inv(A_t)
         theta_t = np.dot(inv_A, b_t)
         beta_t = alpha * np.sqrt(np.diagonal(np.dot(np.dot(self.features, inv_A), self.features.T)))
         a_t = rd_argmax(np.dot(self.features, theta_t) + beta_t)
         r_t = self.reward(a_t)
         arm_sequence[t], reward[t] = a_t, r_t
     return reward, arm_sequence
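LinUCB above recomputes np.linalg.inv(A_t) from scratch every round, which costs O(d^3). Since A_t only changes by a rank-one term per round, a hypothetical refinement could maintain the inverse incrementally with the Sherman-Morrison identity at O(d^2) per round; this is an optional optimization, not part of the snippet above.

def sherman_morrison_update(inv_A, x):
    # Returns (A + x x^T)^{-1} given A^{-1}, via the Sherman-Morrison identity.
    Ax = np.dot(inv_A, x)
    return inv_A - np.outer(Ax, Ax) / (1.0 + np.dot(x, Ax))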
 def UCB_Tuned(self, T):
     """
      Implementation of the UCB-Tuned algorithm
      :param T: int, time horizon
     :return: np.arrays, reward obtained by the policy and sequence of chosen arms
     """
     Sa, Na, reward, arm_sequence = self.init_lists(T)
     S, m = np.zeros(self.nb_arms), np.zeros(self.nb_arms)
     for t in range(T):
         if t < self.nb_arms:
             arm = t
         else:
             for arm in range(self.nb_arms):
                 S[arm] = sum([
                     r**2 for r in reward[np.where(arm_sequence == arm)]
                 ]) / Na[arm] - (Sa[arm] / Na[arm])**2
                 m[arm] = min(0.25,
                              S[arm] + np.sqrt(2 * np.log(t + 1) / Na[arm]))
             arm = rd_argmax(Sa / Na + np.sqrt(np.log(t + 1) / Na * m))
         self.update_lists(t, arm, Sa, Na, reward, arm_sequence)
     return reward, arm_sequence
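All of the policies on this page return a (reward, arm_sequence) pair. A small helper to turn the arm sequence into a cumulative pseudo-regret curve, assuming the true mean reward of each arm is known to the experimenter, might look like this (the helper is an illustration, not part of the repository):

def cumulative_regret(arm_sequence, true_means):
    # Pseudo-regret: gap between the best mean and the mean of each pulled arm, accumulated.
    true_means = np.asarray(true_means)
    pulls = np.asarray(arm_sequence, dtype=int)
    return np.cumsum(true_means.max() - true_means[pulls])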