def update(self, x: np.ndarray, chosen_arm: int, reward: Union[int, float]) -> None:
    """Update the reward and parameter information about each arm.

    Parameters
    ----------
    x : array-like, shape = (n_features, )
        A test sample.

    chosen_arm: int
        The chosen arm.

    reward: int, float
        The observed reward value from the chosen arm.
    """
    x = _check_x_input(x)
    self.data_size += 1
    self.counts[chosen_arm] += 1
    self.rewards += reward
    # Sherman-Morrison rank-one update of the inverse design matrix.
    self._A_inv[chosen_arm] -= (
        self._A_inv[chosen_arm] @ x @ x.T @ self._A_inv[chosen_arm]
        / (1 + x.T @ self._A_inv[chosen_arm] @ x))  # d * d
    self._b[:, chosen_arm] += np.ravel(x) * reward  # d * 1
    if self.data_size % self.batch_size == 0:
        self.A_inv, self.b = np.copy(self._A_inv), np.copy(self._b)  # d * d, d * 1
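# ---------------------------------------------------------------------
# Hedged sketch (not part of the class): a quick standalone check that the
# rank-one update above matches direct inversion via the Sherman-Morrison
# formula. All names below (d, M, A, x) are local to this example.
import numpy as np

d = 3
rng = np.random.default_rng(0)
M = rng.normal(size=(d, d))
A = np.eye(d) + M @ M.T                      # a symmetric positive-definite design matrix
x = rng.normal(size=(d, 1))                  # one context vector, as a column
A_inv = np.linalg.inv(A)

# (A + x x^T)^{-1} = A^{-1} - A^{-1} x x^T A^{-1} / (1 + x^T A^{-1} x)
A_inv_updated = A_inv - A_inv @ x @ x.T @ A_inv / (1 + x.T @ A_inv @ x)
assert np.allclose(A_inv_updated, np.linalg.inv(A + x @ x.T))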
def pull(self, chosen_arm: int, x: Optional[np.ndarray] = None) -> Union[int, float]:
    """Pull arms.

    Parameters
    ----------
    chosen_arm: int
        The chosen arm.

    x : array-like, shape = (n_features, ), optional (default=None)
        A test sample.

    Returns
    -------
    reward: int, float
        The observed reward value from the chosen arm.
    """
    if self.contextual:
        x, e = _check_x_input(x), np.random.normal(loc=0, scale=self.noise)
        mu = np.ravel(x.T @ self.params)
        reward, regret, self.best_arm = \
            np.random.binomial(n=1, p=sigmoid(mu[chosen_arm] + e)), \
            np.max(mu) - mu[chosen_arm], np.argmax(mu)
    else:
        reward, regret = \
            np.random.binomial(n=1, p=self.mu[chosen_arm]), \
            self.mu_max - self.mu[chosen_arm]
    self.rewards += reward
    self.regrets += regret
    return reward
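# ---------------------------------------------------------------------
# Hedged usage sketch: how `pull` typically fits into a simulation loop
# together with a policy's `select_arm` / `update`. The function and its
# arguments (`run_simulation`, `env`, `policy`, `contexts`) are hypothetical
# names; `env` is assumed to expose `pull`, and `policy` the methods shown
# in this module.
def run_simulation(env, policy, contexts) -> None:
    """Run one simulation; rewards and regrets accumulate inside `env` and `policy`."""
    for x in contexts:                   # contexts: iterable of (n_features, ) arrays
        arm = policy.select_arm(x)       # the policy picks an arm for this context
        reward = env.pull(arm, x=x)      # the environment returns the observed reward
        policy.update(x, arm, reward)    # the policy updates its sufficient statistics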
def select_arm(self, x: np.ndarray) -> int:
    """Select arms according to the policy for new data.

    Parameters
    ----------
    x : array-like, shape = (n_features, )
        A test sample.

    Returns
    -------
    result: int
        The selected arm.
    """
    if np.any(self.counts < self.warmup):
        # During warm-up, pull the first arm that has fewer than `warmup` pulls.
        result = np.argmax(np.array(self.counts < self.warmup, dtype=int))
    else:
        x = _check_x_input(x)
        self.theta_hat = np.concatenate([
            self.A_inv[i] @ np.expand_dims(self.b[:, i], axis=1)
            for i in np.arange(self.n_arms)
        ], axis=1)  # user_dim * n_arms
        sigma_hat = np.concatenate([
            np.sqrt(x.T @ self.A_inv[i] @ x)
            for i in np.arange(self.n_arms)
        ], axis=1)  # 1 * n_arms
        result = np.argmax(x.T @ self.theta_hat + self.alpha * sigma_hat)
    return result
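# ---------------------------------------------------------------------
# Hedged sketch: during warm-up, the argmax over the boolean mask above
# returns the first arm whose pull count is still below `warmup`.
# The toy values below are local to this example.
import numpy as np

counts, warmup = np.array([2, 1, 0, 2]), 2
print(np.argmax(np.array(counts < warmup, dtype=int)))  # -> 1, the first under-pulled arm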
def update(self, x: np.ndarray, chosen_arm: int, reward: float) -> None:
    """Update the reward and parameter information about each arm.

    Parameters
    ----------
    x : array-like, shape = (n_features, )
        A test sample.

    chosen_arm: int
        The chosen arm.

    reward: int, float
        The observed reward value from the chosen arm.
    """
    x = _check_x_input(x)
    self.counts[chosen_arm] += 1
    self.rewards += reward
    self.data_stock[chosen_arm].append(x)  # (user_dim + arm_dim) * 1
    self.reward_stock[chosen_arm].append(reward)
    self.data_size += 1
    if self.data_size % self.batch_size == 0:
        # Re-estimate the chosen arm's parameters with `n_iter` Newton iterations.
        for _ in np.arange(self.n_iter):
            self.theta_hat[:, chosen_arm], self.hessian_inv[chosen_arm] = \
                self._update_theta_hat(chosen_arm, self.theta_hat[:, chosen_arm])
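# ---------------------------------------------------------------------
# Hedged sketch: `_update_theta_hat` is not shown in this section. Under the
# assumption that it performs a penalized Newton step for the per-arm logistic
# model (Laplace approximation), one iteration could look like the standalone
# function below; `lam` (prior precision) and the name `newton_step` are
# illustrative assumptions, not the repository's actual helper.
import numpy as np

def sigmoid(a: np.ndarray) -> np.ndarray:
    return 1.0 / (1.0 + np.exp(-a))

def newton_step(X: np.ndarray, r: np.ndarray, theta: np.ndarray, lam: float = 1.0):
    """One Newton update of `theta` for Bernoulli rewards `r` given contexts `X`."""
    p = sigmoid(X @ theta)                                    # predicted reward probabilities
    grad = X.T @ (p - r) + lam * theta                        # gradient of the penalized NLL
    hessian = X.T @ (X * (p * (1 - p))[:, None]) + lam * np.eye(X.shape[1])
    hessian_inv = np.linalg.inv(hessian)
    return theta - hessian_inv @ grad, hessian_inv            # new theta, posterior covariance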
def select_arm(self, x: np.ndarray) -> int:
    """Select arms according to the policy for new data.

    Parameters
    ----------
    x : array-like, shape = (n_features, )
        A test sample.

    Returns
    -------
    result: int
        The selected arm.
    """
    if np.any(self.counts < self.warmup):
        result = np.argmax(np.array(self.counts < self.warmup, dtype=int))
    else:
        x = _check_x_input(x)
        if self.data_size % self.sample_batch == 0:
            # Thompson sampling: draw one parameter vector per arm from the
            # Gaussian (Laplace) approximation of its posterior.
            self.theta_tilde = np.concatenate([
                np.expand_dims(np.random.multivariate_normal(
                    self.theta_hat[:, i], self.hessian_inv[i]), axis=1)
                for i in np.arange(self.n_arms)
            ], axis=1)
        result = np.argmax(x.T @ self.theta_tilde)
    return result
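# ---------------------------------------------------------------------
# Hedged sketch: the posterior draw above, for a single arm with toy
# parameters (names local to this example). With sample_batch = 1 a fresh
# draw happens every round; larger values reuse the sampled parameters for
# `sample_batch` consecutive rounds.
import numpy as np

theta_hat_arm = np.zeros(3)              # posterior mean for one arm
hessian_inv_arm = 0.1 * np.eye(3)        # posterior covariance (Laplace approximation)
theta_tilde_arm = np.random.multivariate_normal(theta_hat_arm, hessian_inv_arm)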
def select_arm(self, x: np.ndarray) -> int:
    """Select arms according to the policy for new data.

    Parameters
    ----------
    x : array-like, shape = (n_features, )
        A test sample.

    Returns
    -------
    result: int
        The selected arm.
    """
    if np.any(self.counts < self.warmup):
        result = np.argmax(np.array(self.counts < self.warmup, dtype=int))
    else:
        # Split the input into shared features z and arm-specific features x.
        z, x = _check_x_input(x[:self.z_dim]), _check_x_input(x[self.z_dim:])
        self.beta = np.linalg.inv(self.A_zero) @ self.b_zero  # k * 1
        self.theta_hat = np.concatenate([
            self.A_inv[i] @ (np.expand_dims(self.b[:, i], axis=1) - self.B[i] @ self.beta)
            for i in np.arange(self.n_arms)
        ], axis=1)
        # Terms of the confidence width s_{t,a}.
        s1 = z.T @ np.linalg.inv(self.A_zero) @ z
        s2 = -2 * np.concatenate([
            z.T @ np.linalg.inv(self.A_zero) @ self.B[i].T @ self.A_inv[i] @ x
            for i in np.arange(self.n_arms)
        ], axis=1)
        s3 = np.concatenate([
            x.T @ self.A_inv[i] @ x for i in np.arange(self.n_arms)
        ], axis=1)
        s4 = np.concatenate([
            x.T @ self.A_inv[i] @ self.B[i] @ np.linalg.inv(self.A_zero)
            @ self.B[i].T @ self.A_inv[i] @ x
            for i in np.arange(self.n_arms)
        ], axis=1)
        sigma_hat = np.sqrt(s1 + s2 + s3 + s4)  # 1 * n_arms
        result = np.argmax(z.T @ self.beta + x.T @ self.theta_hat + self.alpha * sigma_hat)
    return result
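# ---------------------------------------------------------------------
# For reference, s1-s4 above are the four terms of the hybrid LinUCB
# confidence width (Li et al., 2010, "A Contextual-Bandit Approach to
# Personalized News Article Recommendation", Algorithm 2):
#   s_{t,a} = z^T A_0^{-1} z
#             - 2 z^T A_0^{-1} B_a^T A_a^{-1} x
#             + x^T A_a^{-1} x
#             + x^T A_a^{-1} B_a A_0^{-1} B_a^T A_a^{-1} x
# and the score of arm a is z^T beta_hat + x^T theta_hat_a + alpha * sqrt(s_{t,a}).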
def update(self, x: np.ndarray, chosen_arm: int, reward: float) -> None:
    """Update the reward and parameter information about each arm.

    Parameters
    ----------
    x : array-like, shape = (n_features, )
        A test sample.

    chosen_arm: int
        The chosen arm.

    reward: int, float
        The observed reward value from the chosen arm.
    """
    z, x = _check_x_input(x[:self.z_dim]), _check_x_input(x[self.z_dim:])
    self.data_size += 1
    self.counts[chosen_arm] += 1
    self.rewards += reward
    # Move the chosen arm's contribution out of the shared statistics
    # before its own statistics are updated.
    self._A_zero += self._B[chosen_arm].T @ self._A_inv[chosen_arm] @ self._B[chosen_arm]
    self._b_zero += self._B[chosen_arm].T @ self._A_inv[chosen_arm] \
        @ np.expand_dims(self._b[:, chosen_arm], axis=1)
    # Sherman-Morrison rank-one update of the chosen arm's inverse design matrix.
    self._A_inv[chosen_arm] -= (
        self._A_inv[chosen_arm] @ x @ x.T @ self._A_inv[chosen_arm]
        / (1 + x.T @ self._A_inv[chosen_arm] @ x))
    self._B[chosen_arm] += x @ z.T
    self._b[:, chosen_arm] += np.ravel(x) * reward
    # Fold the updated arm statistics back into the shared statistics.
    self._A_zero += z @ z.T - self._B[chosen_arm].T @ self._A_inv[chosen_arm] @ self._B[chosen_arm]
    self._b_zero += z * reward - self._B[chosen_arm].T @ self._A_inv[chosen_arm] \
        @ np.expand_dims(self._b[:, chosen_arm], axis=1)
    if self.data_size % self.batch_size == 0:
        self.A_zero, self.b_zero = np.copy(self._A_zero), np.copy(self._b_zero)
        self.A_inv, self.B, self.b = np.copy(self._A_inv), np.copy(self._B), np.copy(self._b)
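# ---------------------------------------------------------------------
# Hedged usage sketch: `select_arm` and `update` above expect the shared
# (user) features and the arm features concatenated into a single vector,
# with the first `z_dim` entries being z. The variable names below are
# hypothetical and local to this example (z_dim assumed to be 2).
import numpy as np

user_features = np.array([0.2, 0.5])                       # shared features z
arm_features = np.array([1.0, 0.0, 0.3])                   # arm features x
x_input = np.concatenate([user_features, arm_features])    # passed to select_arm / update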