def pick_action(self, context):
    # context contains all context_vectors for all arms
    self.context = context
    self.means = np.array([
        self._compute_mean(self.theta_hat[i], context[i])
        for i in range(self.k_arms)
    ])
    return random_argmax(self.means)
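random_argmax and _compute_mean are defined elsewhere; a minimal sketch, assuming random tie-breaking among equal maxima and a linear reward model (both are assumptions, not necessarily the source implementation):

import numpy as np

def random_argmax(values):
    # break ties uniformly at random among the maximal entries
    values = np.asarray(values)
    return np.random.choice(np.flatnonzero(values == values.max()))

def compute_mean_linear(theta_hat, context_vector):
    # assumed linear reward model: the estimated mean is the dot product of the
    # learned parameters with the arm's context vector (the source's
    # _compute_mean may use another link, e.g. logistic)
    return float(np.dot(theta_hat, context_vector))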
def observe_reward(self, arm_idx, reward):
    self.arms_data[arm_idx].append(reward)
    n = len(self.arms_data[arm_idx])
    # incremental running-mean update of the arm's estimated mean
    self.estimated_means[arm_idx] = reward / n + (
        n - 1) * self.estimated_means[arm_idx] / n
    self.pulls[arm_idx] += 1
    self.best_arm = random_argmax(self.estimated_means)
    self._t += 1
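The assignment above is the standard incremental (running) mean, new_mean = reward / n + (n - 1) * old_mean / n; a quick standalone check of the recurrence:

import numpy as np

rewards = [1, 0, 1, 1]
mean = 0.0
for n, r in enumerate(rewards, start=1):
    mean = r / n + (n - 1) * mean / n
assert np.isclose(mean, np.mean(rewards))  # both give 0.75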
def pick_action(self):
    # try each arm once then compute UCBs
    if self._t > self.k_arms:
        self._UCBs = [
            self._bound_function(arm_idx) for arm_idx in self._arm_idxs
        ]
        arm_idx = random_argmax(self._UCBs)
    else:
        arm_idx = next(self._try_each_arm)
    return arm_idx
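_bound_function, _arm_idxs and _try_each_arm are defined elsewhere in the class; presumably _arm_idxs is range(k_arms) and _try_each_arm yields each arm index once for the initial round. A minimal sketch of one plausible bound (the classic UCB1 form; the source's actual bound may differ):

import numpy as np

def ucb1_bound(estimated_mean, t, pulls):
    # classic UCB1 index: empirical mean plus a sqrt(2 ln t / n) exploration bonus
    return estimated_mean + np.sqrt(2 * np.log(t) / pulls)

print(ucb1_bound(0.6, t=100, pulls=10))  # ~1.56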
def pick_action(self):
    # try each arm once then compute UCBs
    if self._t > self.k_arms:
        self.Vs = self.sq_sums - self.estimated_means**2 + self.radius(
            self._t, self.pulls)
        self._UCBs = self.estimated_means + np.sqrt(
            (np.log(self._t) / self.pulls) * self.Vs)
        arm_idx = random_argmax(self._UCBs)
    else:
        arm_idx = next(self._try_each_arm)
    return arm_idx
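Assuming sq_sums holds the per-arm running mean of squared rewards, Vs above is the empirical variance (E[X^2] - E[X]^2) plus the radius term, as in the UCB-Tuned index; radius(t, pulls) presumably returns a vectorized exploration bonus of the sqrt(ln t / n) family. A quick standalone check of the variance identity used here:

import numpy as np

rewards = np.array([1, 0, 1, 1, 0])
mean_of_squares = np.mean(rewards**2)   # the role assumed for sq_sums
mean = np.mean(rewards)
assert np.isclose(mean_of_squares - mean**2, np.var(rewards))  # 0.24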
def pick_action(self):
    # try each arm once then compute UCBs
    if self._t > self.k_arms:
        self._UCBs = self.estimated_means + self.radius(self._t, self.pulls)
        arm_idx = random_argmax(self._UCBs)
        if self.keep_history:
            self.UCB_history.append(self._UCBs.copy())
            self.means_history.append(self.estimated_means.copy())
            self.pulls_history.append(self.pulls.copy())
    else:
        arm_idx = next(self._try_each_arm)
    return arm_idx
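A sketch of how an agent with this pick_action/observe_reward interface could be driven; the class name, constructor and Bernoulli rewards below are illustrative assumptions, not taken from the source:

import numpy as np

true_means = np.array([0.2, 0.5, 0.8])
agent = UCBAgent(k_arms=len(true_means))  # hypothetical constructor
for _ in range(1000):
    arm_idx = agent.pick_action()
    reward = np.random.binomial(1, true_means[arm_idx])
    agent.observe_reward(arm_idx, reward)
print(agent.best_arm)  # usually arm 2 after enough pulls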
def generate_context(self):
    """Generates a context vector of indicators and computes each arm's
    current real reward probability.
    """
    context = []
    context_vector = bernuolli(self.context_options)
    if self.add_bias:
        context_vector = np.append([1], context_vector)
    for i in range(self.k_arms):
        context.append(context_vector)
    # pull all arms to generate the current means and rewards and find the
    # optimal arm; the agent/policy/algorithm sees only the context, the
    # means are not revealed
    self.current_rewards = [
        arm.pull(context_vector)
        for arm, context_vector in zip(self.arms, context)
    ]
    self.current_means = [arm.get_current_mean() for arm in self.arms]
    self.current_optimal_arm = random_argmax(self.current_means)
    self.current_optimal_mean = self.current_means[self.current_optimal_arm]
    return context
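bernuolli (spelling as in the source) is not shown; presumably it draws one Bernoulli indicator per entry of context_options. A minimal sketch under that assumption:

import numpy as np

def bernuolli(probabilities):
    # assumed helper: one independent 0/1 indicator per feature probability
    probabilities = np.asarray(probabilities, dtype=float)
    return (np.random.random(probabilities.shape) < probabilities).astype(int)

print(bernuolli([0.5, 0.2, 0.9]))  # e.g. array([1, 0, 1])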
def pick_action(self):
    if self.keep_history:
        self.prior_data_history.append(self.prior_data.copy())
        self.means_history.append(self.estimated_means.copy())
        self.pulls_history.append(self.pulls.copy())
    return random_argmax(self._sample_from_arms())
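_sample_from_arms and prior_data are defined elsewhere; a minimal sketch assuming Beta-Bernoulli Thompson sampling, where prior_data holds per-arm (alpha, beta) success/failure counts (one common choice, not necessarily the source's):

import numpy as np

def sample_from_arms(prior_data):
    # assumed Beta-Bernoulli posteriors: prior_data[i] = (alpha_i, beta_i)
    # counts for arm i; draw one posterior sample per arm
    return np.array([np.random.beta(a, b) for a, b in prior_data])

print(sample_from_arms([(1, 1), (5, 2), (2, 8)]))  # one draw per arm

pick_action then selects the arm whose posterior draw is largest, which is the Thompson-sampling rule.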