def pick_action(self, observation):
    """Epsilon-greedy: explore uniformly with prob epsilon, else exploit.

    With probability ``self.epsilon`` a uniformly random arm is played;
    otherwise the arm with the highest posterior mean is chosen (ties
    broken at random by ``random_argmax``).
    """
    explore = np.random.rand() < self.epsilon
    if explore:
        return np.random.randint(self.n_arm)
    # Greedy branch: act on the posterior mean reward of each arm.
    return random_argmax(self.get_posterior_mean())
def pick_action(self, observation):
    """Thompson sampling with Beta posterior for action selection."""
    # Draw one posterior sample per arm.
    sampled_means = self.get_posterior_sample()
    if random.random() < .2:
        # NOTE(review): if sampled_means is a 1-D vector, sampled_means[-1]
        # is a scalar and np.argmax(...) always returns 0 — this branch would
        # then pick arm 0 with probability 0.2 regardless of the samples.
        # Presumably the last entry has special meaning (e.g. a reserved arm);
        # confirm the intended behavior and the shape of sampled_means.
        action = np.argmax(sampled_means[-1])
    else:
        # Greedy over the posterior draws of all arms except the last one.
        action = random_argmax(sampled_means[:-1])
    return action
def pick_action(self, observation):
    """Take random action prob epsilon, else be greedy."""
    if np.random.rand() < self.epsilon:
        action = np.random.randint(self.n_arm)  # pick one of the n arms uniformly at random
    else:
        # With probability 1 - epsilon, act greedily.
        # The "reward" here is the average success rate per arm.
        posterior_means = self.get_posterior_mean()  # shape: [arm, 1]; pick the arm with the largest mean reward
        action = random_argmax(posterior_means)
    return action
def find_optimal_assortment(self, theta_hat):
    """Return the profit-maximizing assortment for a sampled parameter.

    Exhaustively enumerates every 0/1 inclusion vector over the products,
    scores each by expected profit under ``theta_hat``, and returns the
    best one (ties broken at random by ``random_argmax``).
    """
    # Every subset of products, encoded as a 0/1 inclusion vector.
    candidates = [np.array(a)
                  for a in itertools.product([0, 1], repeat=self.num_products)]
    expected_profits = []
    for inclusion in candidates:
        # Demand per product: exp(noise_var/2 + theta_hat . a), masked to
        # the products actually included in the assortment.
        demand = inclusion * np.exp(self.noise_var / 2 + theta_hat.dot(inclusion))
        expected_profits.append(demand.dot(self.profits))
    best = random_argmax(np.array(expected_profits))
    return candidates[best]
def _find_optimal_assortment(self):
    """Compute and cache the optimal assortment under the true parameter.

    Enumerates all 0/1 product-inclusion vectors, scores each by expected
    profit under ``self.theta``, and stores the winner in
    ``self.optimal_assortment`` / ``self.optimal_profit``.
    """
    candidates = [np.array(a)
                  for a in itertools.product([0, 1], repeat=self.num_products)]
    # Expected profit of each candidate: demand is exp(noise_var/2 + theta . a),
    # zeroed for excluded products, then weighted by per-product profits.
    profits = [
        (a * np.exp(self.noise_var / 2 + self.theta.dot(a))).dot(self.profits)
        for a in candidates
    ]
    best = random_argmax(np.array(profits))
    self.optimal_assortment = candidates[best]
    self.optimal_profit = profits[best]
def pick_action(self, observation):
    """Thompson sampling: play the arm with the largest posterior draw.

    One sample is drawn from each arm's Beta posterior and the arm with
    the highest sampled mean is selected (ties broken at random).
    """
    posterior_draws = self.get_posterior_sample()
    return random_argmax(posterior_draws)
def pick_action(self, observation): """Thompson sampling with Beta posterior for action selection.""" # 注意: 只有此处不一样, 即TS里是从后验分布中采样,而epsilon-greedy是计算期望 sampled_means = self.get_posterior_sample() # 每个arm都采样一个reward均值, [arm, 1] action = random_argmax(sampled_means) # 选择产生最大的均值的action return action