Example #1
    def pick_action(self, observation):
        """Take random action prob epsilon, else be greedy."""
        if np.random.rand() < self.epsilon:
            action = np.random.randint(self.n_arm)
        else:
            posterior_means = self.get_posterior_mean()
            action = random_argmax(posterior_means)

        return action
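
Every example below relies on a random_argmax helper that is not shown in these excerpts. A minimal sketch of what such a helper typically looks like, assuming its purpose is to break ties among maximal entries uniformly at random (the exact implementation in the source may differ):

import numpy as np

def random_argmax(values):
  """Argmax that breaks ties uniformly at random."""
  values = np.asarray(values).ravel()
  top = np.flatnonzero(values == values.max())  # indices of all maxima
  return int(np.random.choice(top))

With a plain np.argmax, ties always resolve to the lowest index, which biases action selection early on when many arms share the same posterior statistic.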
Example #2
    def pick_action(self, observation):
        """Thompson sampling with Beta posterior for action selection."""
        # Draw one sampled mean reward per arm from its Beta posterior.
        sampled_means = self.get_posterior_sample()
        # Act greedily on the sampled means, breaking ties at random.
        action = random_argmax(sampled_means)
        return action
Example #3
  def pick_action(self, observation):
    """Take random action prob epsilon, else be greedy."""
    if np.random.rand() < self.epsilon:
      action = np.random.randint(self.n_arm)  # pick one of the n arms uniformly at random
    else:  # with probability 1 - epsilon, act greedily
      # the reward estimate for each arm is its posterior mean success rate
      posterior_means = self.get_posterior_mean()  # shape: [arm, 1]; pick the arm with the largest mean
      action = random_argmax(posterior_means)

    return action
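
Examples #1 and #3 call get_posterior_mean, and the Thompson sampling variants call get_posterior_sample; neither method appears in these excerpts. A minimal sketch of a Beta-Bernoulli agent that would support both, assuming independent Beta(a, b) posteriors per arm updated from 0/1 rewards (all names beyond the two methods called above are assumptions, not taken from the source):

import numpy as np

class BetaBernoulliAgent:
  """Keeps an independent Beta(a, b) posterior for each arm."""

  def __init__(self, n_arm, a0=1.0, b0=1.0):
    self.n_arm = n_arm
    self.prior_success = np.full(n_arm, a0)  # Beta alpha per arm
    self.prior_failure = np.full(n_arm, b0)  # Beta beta per arm

  def update_observation(self, action, reward):
    # Bernoulli reward: 1 counts as a success, 0 as a failure.
    self.prior_success[action] += reward
    self.prior_failure[action] += 1 - reward

  def get_posterior_mean(self):
    # Mean of Beta(a, b) is a / (a + b).
    return self.prior_success / (self.prior_success + self.prior_failure)

  def get_posterior_sample(self):
    # One independent draw per arm from its Beta posterior (Thompson sampling).
    return np.random.beta(self.prior_success, self.prior_failure)

Beta-Bernoulli conjugacy is what keeps both methods one-liners: a success increments a, a failure increments b, and the posterior stays a Beta distribution.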
Example #4
 def find_optimal_assortment(self, theta_hat):
   '''Finds the optimal assortment, given a sampled parameter.'''
   # enumerate all 2^num_products candidate assortments
   assortment_tuples = list(itertools.product([0, 1], repeat=self.num_products))
   total_profit = []
   for assortment in assortment_tuples:
     # expected demand per offered product; for Gaussian noise eps,
     # E[exp(m + eps)] = exp(m + noise_var / 2)
     expected_demand = np.array(assortment) * np.exp(
         self.noise_var / 2 + theta_hat.dot(np.array(assortment)))
     total_profit.append(expected_demand.dot(self.profits))
   optimal_ind = random_argmax(np.array(total_profit))
   return np.array(assortment_tuples[optimal_ind])
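
Example #4 scores assortments under a sampled parameter theta_hat (the agent's step), while Example #5 below repeats the identical search with the true self.theta to cache the benchmark optimum. For context, a self-contained toy run of the same exhaustive search; every number here is illustrative, and it assumes theta_hat is a [num_products, num_products] array of cross-effects so that theta_hat.dot(x) gives each product's mean log-demand (a plain vector would flow through .dot the same way):

import itertools

import numpy as np

num_products = 3                      # toy size; the search is O(2^n)
noise_var = 0.04                      # illustrative noise variance
profits = np.array([1.0, 0.8, 1.2])  # illustrative per-product profits
theta_hat = np.array([[0.20, -0.10, 0.00],
                      [-0.10, 0.30, -0.05],
                      [0.00, -0.05, 0.25]])

best_profit, best_assortment = -np.inf, None
for assortment in itertools.product([0, 1], repeat=num_products):
  x = np.array(assortment)
  # lognormal mean identity: E[exp(m + eps)] = exp(m + noise_var / 2)
  expected_demand = x * np.exp(noise_var / 2 + theta_hat.dot(x))
  profit = expected_demand.dot(profits)
  if profit > best_profit:
    best_profit, best_assortment = profit, x

print(best_assortment, best_profit)

Because the search enumerates all 2^num_products binary vectors, it is only practical for small product counts.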
Example #5
    def _find_optimal_assortment(self):
        '''Finds the optimal assortment of the products under self.theta.'''

        # generating all possible assortments
        assortment_tuples = list(
            itertools.product([0, 1], repeat=self.num_products))
        total_profit = []
        for assortment in assortment_tuples:
            expected_demand = np.array(assortment) * np.exp(
                self.noise_var / 2 + self.theta.dot(np.array(assortment)))
            total_profit.append(expected_demand.dot(self.profits))
        optimal_ind = random_argmax(np.array(total_profit))
        # cache the benchmark optimum and its profit
        self.optimal_assortment = np.array(assortment_tuples[optimal_ind])
        self.optimal_profit = total_profit[optimal_ind]
Example #6
 def pick_action(self, observation):
     """Thompson sampling with Beta posterior for action selection."""
     sampled_means = self.get_posterior_sample()
     action = random_argmax(sampled_means)
     return action
Example #7
 def pick_action(self, observation):
   """Thompson sampling with Beta posterior for action selection."""
   # Note: this is the only place that differs from epsilon-greedy:
   # TS samples from the posterior, while epsilon-greedy uses the posterior mean.
   sampled_means = self.get_posterior_sample()  # one sampled mean reward per arm, shape [arm, 1]
   action = random_argmax(sampled_means)  # pick the action with the largest sampled mean
   return action
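
Finally, a short simulation loop showing where pick_action sits in an experiment. It reuses the random_argmax helper sketched after Example #1 and the BetaBernoulliAgent sketched after Example #3; the arm probabilities and horizon are illustrative:

import numpy as np

np.random.seed(0)
true_means = np.array([0.3, 0.5, 0.7])   # illustrative Bernoulli arm probabilities
agent = BetaBernoulliAgent(n_arm=len(true_means))

for t in range(1000):
  sampled_means = agent.get_posterior_sample()  # Thompson sampling draw
  action = random_argmax(sampled_means)         # greedy on the sample
  reward = np.random.binomial(1, true_means[action])
  agent.update_observation(action, reward)

print(agent.get_posterior_mean())  # concentrates near true_means over time

As the posteriors sharpen, samples for clearly inferior arms rarely win the argmax, so exploration tapers off automatically without an epsilon schedule.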