def actions(self, context=None) -> Actions:
  """Pulls to execute for the current round.

  Args:
    context: context of the bandit environment; unused and deleted.

  Returns:
    Actions: arm pulls. In the main stage an empty ``Actions`` (no pulls)
    is returned when some pseudo arm already satisfies the stopping
    condition.
  """
  del context
  if self.__stage == 'initialization':
    actions = Actions()  # default state is normal
    # 1 pull each for every assigned arm
    for arm_id in self.__assigned_arms:
      arm_pull = actions.arm_pulls.add()
      arm_pull.arm.id = arm_id
      arm_pull.times = 1
    return actions

  # self.__stage == 'main'
  actions = Actions()
  for pseudo_arm in self.__pseudo_arms:
    if pseudo_arm.total_pulls >= (
        1 + self.__a * (self.__total_pulls - pseudo_arm.total_pulls)):
      # Stopping condition met: signal no further pulls this round.
      return actions
  arm_pull = actions.arm_pulls.add()
  # map local arm index to the bandits arm index
  arm_pull.arm.id = self.__assigned_arms[int(np.argmax(self.__ucb))]
  arm_pull.times = 1
  return actions
def test_simple_run(self):
  """UCB pulls each arm once, then always the arm with the largest UCB.

  The private __UCB index computation is mocked so that arm 0 has the
  strictly largest score after initialization.
  """
  arm_num = 5
  horizon = 10
  learner = UCB(arm_num=arm_num)
  learner.reset()
  mock_ucb = np.array([1.2, 1, 1, 1, 1])
  # pylint: disable=protected-access
  learner._UCB__UCB = MagicMock(return_value=mock_ucb)

  # During the initial time steps, each arm is pulled once
  for time in range(1, arm_num + 1):
    assert learner.actions(
        Context()).SerializeToString() == text_format.Parse(
            """
            arm_pulls <
              arm <
                id: {arm_id}
              >
              times: 1
            >
            """.format(arm_id=time - 1), Actions()).SerializeToString()
    learner.update(
        text_format.Parse(
            """
            arm_feedbacks <
              arm <
                id: {arm_id}
              >
              rewards: 0
            >
            """.format(arm_id=time - 1), Feedback()))

  # For the left time steps, arm 0 is always the choice
  for _ in range(arm_num + 1, horizon + 1):
    assert learner.actions(
        Context()).SerializeToString() == text_format.Parse(
            """
            arm_pulls <
              arm <
                id: 0
              >
              times: 1
            >
            """, Actions()).SerializeToString()
    learner.update(
        text_format.Parse(
            """
            arm_feedbacks <
              arm <
                id: 0
              >
              rewards: 0
            >
            """, Feedback()))
def test_simple_run(self):
  """EpsGreedy pulls each arm exactly once during the initial steps."""
  means = [0, 0.5, 0.7, 1]
  arms = [BernoulliArm(mean) for mean in means]
  learner = EpsGreedy(arm_num=len(arms))
  learner.reset()

  # Pull each arm once during the initial steps
  for time in range(1, len(arms) + 1):
    assert learner.actions(
        Context()).SerializeToString() == text_format.Parse(
            """
            arm_pulls <
              arm <
                id: {arm_id}
              >
              times: 1
            >
            """.format(arm_id=time - 1), Actions()).SerializeToString()
    learner.update(
        text_format.Parse(
            """
            arm_feedbacks <
              arm <
                id: {arm_id}
              >
              rewards: 0
            >
            """.format(arm_id=time - 1), Feedback()))
def actions(self, context: Context) -> Actions:
  """Pulls to execute for the current round of budgeted exploration.

  Once the number of active arms drops to the threshold, the remaining
  budget is spread over them by a single multinomial draw and the learner
  stops; otherwise every active arm gets an equal floor share of the
  per-round budget.

  Args:
    context: context of the bandit environment; unused and deleted.

  Returns:
    Actions: arm pulls; empty once ``__stop`` has been set.
  """
  del context
  actions = Actions()
  if self.__stop:
    return actions
  if len(self.__active_arms) <= self.__threshold:
    # Uniform sampling: split the remaining budget uniformly at random.
    pulls = np.random.multinomial(
        self.__budget_left,
        np.ones(len(self.__active_arms)) / len(self.__active_arms),
        size=1)[0]
    i = 0
    for arm_id in self.__active_arms:
      arm_pull = actions.arm_pulls.add()
      arm_pull.arm.id = arm_id
      arm_pull.times = pulls[i]
      i = i + 1
    self.__stop = True
  else:
    # Pulls assigned to each arm.
    # NOTE(review): this divides the TOTAL budget (self.budget), not
    # self.__budget_left — confirm this is intended by the algorithm.
    pulls = math.floor(self.budget /
                       (len(self.__active_arms) * self.__total_rounds))
    for arm_id in self.__active_arms:
      arm_pull = actions.arm_pulls.add()
      arm_pull.arm.id = arm_id
      arm_pull.times = pulls
  return actions
def actions(self, context: Context) -> Actions:
  """Assortment to serve for the current time step.

  If the last customer feedback was a purchase, the current episode
  continues and the previously served assortment is returned unchanged.
  Otherwise a new episode starts: optimistic (UCB) preference parameters
  are computed and the revenue-maximizing assortment under them is served.

  Args:
    context: context of the bandit environment; unused and deleted.

  Returns:
    Actions: a single serve of the chosen assortment.
  """
  del context
  # Check if last observation is a purchase. Do this BEFORE allocating a
  # new Actions: the original code built an Actions/arm_pull and then
  # discarded them on this early return.
  if self.__last_customer_feedback and self.__last_customer_feedback != 0:
    return self.__last_actions
  actions = Actions()
  arm_pull = actions.arm_pulls.add()
  # When a non-purchase observation happens, a new episode is started and
  # a new assortment to be served is calculated
  self.reward.set_preference_params(self.__UCB())
  # Calculate assortment with the maximum reward using optimistic
  # preference parameters
  if self.use_local_search:
    _, best_assortment = local_search_best_assortment(
        reward=self.reward,
        random_neighbors=self.random_neighbors,
        card_limit=self.card_limit,
        init_assortment=(set(self.__last_actions.arm_pulls[0].arm.set.id)
                         if self.__last_actions else None))
  else:
    _, best_assortment = search_best_assortment(
        reward=self.reward, card_limit=self.card_limit)
  arm_pull.arm.set.id.extend(list(best_assortment))
  arm_pull.times = 1
  self.__last_actions = actions
  return actions
def test_simple_run(self):
  """Uniform learner cycles through the arms round-robin over the horizon."""
  arm_num = 5
  horizon = 10
  learner = Uniform(arm_num=arm_num)
  learner.reset()

  for time in range(1, horizon + 1):
    # At step t the learner must pull arm (t - 1) mod arm_num exactly once.
    assert learner.actions(
        Context()).SerializeToString() == text_format.Parse(
            """
            arm_pulls <
              arm <
                id: {arm_id}
              >
              times: 1
            >
            """.format(arm_id=(time - 1) % arm_num),
            Actions()).SerializeToString()
    learner.update(
        text_format.Parse(
            """
            arm_feedbacks <
              arm <
                id: 0
              >
              rewards: 0
            >
            """, Feedback()))
def actions(self, context: Context) -> Actions:
  """Round-robin policy: pull arm (t - 1) mod arm_num once per step.

  Args:
    context: context of the bandit environment; unused and deleted.

  Returns:
    Actions: a single pull of the next arm in the cycle.
  """
  del context
  pending_actions = Actions()
  pull = pending_actions.arm_pulls.add()
  pull.times = 1
  pull.arm.id = (self.__time - 1) % self.arm_num
  return pending_actions
def test_simple_run(self):
  """Warm start serves product 1 first; afterwards the sampled best set.

  With warm start mocked off and uniform preference parameters, the best
  CVaR assortment is {1, 2, 3, 4}.
  """
  revenues = np.array([0, 0.7, 0.8, 0.9, 1.0])
  horizon = 100
  reward = CvarReward(0.7)
  learner = ThompsonSampling(revenues=revenues,
                             horizon=horizon,
                             reward=reward)

  # Test warm start
  learner.reset()
  assert learner.actions(
      Context()).SerializeToString() == text_format.Parse(
          """
          arm_pulls {
            arm {
              set {
                id: 1
              }
            }
            times: 1
          }
          """, Actions()).SerializeToString()

  learner.reset()
  # pylint: disable=protected-access
  learner._ThompsonSampling__within_warm_start = MagicMock(
      return_value=False)
  mock_preference_params = np.array([1, 1, 1, 1, 1])
  learner._ThompsonSampling__correlated_sampling = MagicMock(
      return_value=mock_preference_params)
  assert learner.actions(
      Context()).SerializeToString() == text_format.Parse(
          """
          arm_pulls {
            arm {
              set {
                id: 1
                id: 2
                id: 3
                id: 4
              }
            }
            times: 1
          }
          """, Actions()).SerializeToString()
def actions(self, context: Context) -> Actions:
  """Sample an arm from the posterior distribution and pull it once.

  Args:
    context: context of the bandit environment; unused and deleted.

  Returns:
    Actions: a single pull of the sampled arm.
  """
  del context
  actions = Actions()
  pull = actions.arm_pulls.add()
  # Dispatch on the configured prior family.
  if self.__prior_dist == 'beta':
    pull.arm.id = self.__sample_from_beta_prior()
  else:
    pull.arm.id = self.__sample_from_gaussian_prior()
  pull.times = 1
  return actions
def actions(self, context: Context) -> Actions:
  """Pulls to execute for the current time step.

  During the first ``arm_num`` steps each arm is pulled once in order;
  afterwards the arm minimizing the internal metrics is pulled.

  Args:
    context: context of the bandit environment; unused and deleted.

  Returns:
    Actions: a single arm pull.
  """
  del context  # added for consistency with the sibling actions() methods
  actions = Actions()
  arm_pull = actions.arm_pulls.add()
  if self.__time <= self.arm_num:
    # Initialization: pull each arm once in order.
    arm_pull.arm.id = self.__time - 1
  else:
    arm_pull.arm.id = int(np.argmin(self.__metrics()))
  arm_pull.times = 1
  return actions
def actions(self, context: Context) -> Actions:
  """Pull the arm with the highest LinUCB score once.

  Args:
    context: context of the bandit environment; unused and deleted.

  Returns:
    Actions: a single pull of the highest-scoring arm.
  """
  del context
  scores = self.__LinUCB()
  chosen_arm = int(np.argmax(scores, axis=0))
  actions = Actions()
  pull = actions.arm_pulls.add()
  pull.arm.id = chosen_arm
  pull.times = 1
  return actions
def actions(self, context: Context) -> Actions:
  """Sample one arm from the Exp3 exploration/exploitation mixture.

  Args:
    context: context of the bandit environment; unused and deleted.

  Returns:
    Actions: a single pull of the sampled arm.
  """
  del context
  actions = Actions()
  pull = actions.arm_pulls.add()
  # Mix the normalized weights with uniform exploration of rate gamma.
  total_weight = sum(self.__weights)
  self.__probabilities = ((1 - self.__gamma) * self.__weights / total_weight
                          + self.__gamma / self.arm_num)
  pull.arm.id = np.random.choice(self.arm_num, 1,
                                 p=self.__probabilities)[0]
  pull.times = 1
  return actions
def actions(self, context: Context) -> Actions:
  """Pulls to execute for the current elimination round.

  Args:
    context: context of the bandit environment.

  Returns:
    Actions: pulls of every active arm in the main loop, the pulls
    computed by median elimination otherwise; empty when only one active
    arm remains (identification finished).
  """
  if len(self.__active_arms) == 1:
    # A single arm left: nothing more to do.
    return Actions()
  actions: Actions
  if self.__stage == 'main_loop':
    actions = Actions()
    # Reset the statistics of every active arm for this round.
    for arm_id in self.__active_arms:
      self.__active_arms[arm_id] = PseudoArm()
    # Pulls per arm required for the (eps_r, delta_r) accuracy guarantee.
    pulls = math.ceil(2 / (self.__eps_r**2) *
                      (math.log(2) - self.__log_delta_r))
    for arm_id in self.__active_arms:
      arm_pull = actions.arm_pulls.add()
      arm_pull.arm.id = arm_id
      arm_pull.times = pulls
  else:
    # self.__stage == 'median_elimination'
    actions = self.__median_elimination()
  return actions
def actions(self, context: Context) -> Actions:
  """Pulls to execute for the current time step.

  Args:
    context: context of the bandit environment.

  Returns:
    Actions: arm pulls. During initialization each arm is pulled once;
    in the main stage an empty ``Actions`` is returned once some pseudo
    arm satisfies the stopping condition.
  """
  if self.__stage == 'initialization':
    actions = Actions()
    # Pull each arm once during initialization.
    for arm_id in range(self.arm_num):
      arm_pull = actions.arm_pulls.add()
      arm_pull.arm.id = arm_id
      arm_pull.times = 1
    return actions

  # self.__stage == 'main'
  actions = Actions()
  for pseudo_arm in self.__pseudo_arms:
    if pseudo_arm.total_pulls >= (
        1 + self.__a * (self.__total_pulls - pseudo_arm.total_pulls)):
      # Stopping condition met: signal no further pulls.
      return actions
  arm_pull = actions.arm_pulls.add()
  arm_pull.arm.id = int(np.argmax(self.__ucb))
  arm_pull.times = 1
  return actions
def actions(self, context: Context) -> Actions:
  """Explore round-robin up to T', then commit to the identified best arm.

  Args:
    context: context of the bandit environment; unused and deleted.

  Returns:
    Actions: a single arm pull.
  """
  del context
  actions = Actions()
  pull = actions.arm_pulls.add()
  pull.times = 1
  if self.__time <= self.__T_prime:
    # Exploration phase: cycle through the arms.
    pull.arm.id = (self.__time - 1) % self.arm_num
  else:
    # Commitment phase: always the empirically best arm.
    pull.arm.id = self.__best_arm
  return actions
def actions(self, context: Context) -> Actions:
  """Pull the next unexplored arm, else the arm maximizing the MOSS index.

  Args:
    context: context of the bandit environment; unused and deleted.

  Returns:
    Actions: a single arm pull.
  """
  del context
  actions = Actions()
  pull = actions.arm_pulls.add()
  pull.times = 1
  if self.__time <= self.arm_num:
    # Initialization: sweep through the arms one by one.
    pull.arm.id = self.__time - 1
  else:
    pull.arm.id = int(np.argmax(self.__MOSS()))
  return actions
def actions(self, context: Context = None) -> Actions:
  """Actions of the agent for its current protocol stage.

  The agent is a state machine: CENTRALIZED_LEARNING delegates to the
  central algorithm (with several early-stop transitions into LEARNING),
  LEARNING broadcasts pulls of a single chosen arm, COMMUNICATION emits a
  WAIT state, and TERMINATION emits a STOP state.

  Args:
    context: context of the bandit environment; ignored.

  Returns:
    Actions: arm pulls to execute, or a WAIT/STOP state signal.

  Raises:
    Exception: when called in stage UNASSIGNED, or when centralized
      learning is attempted outside round 0.
  """
  if self.__stage == self.UNASSIGNED:
    raise Exception("%s: I can\'t act in stage unassigned." % self.name)
  if self.__stage == self.CENTRALIZED_LEARNING:
    if self.__round_index > 0:
      raise Exception("Expected centralized learning in round 0. Got %d." %
                      self.__round_index)
    if self.__central_algo.get_total_pulls() >= self.__num_pulls_per_round[0]:
      # Early stop the centralized algorithm when it uses more than horizon
      # / 2 pulls.
      self.__stage = self.LEARNING
      self.__arm_to_broadcast = np.random.choice(self.__assigned_arms)
      self.__round_index += 1
      # Re-dispatch in the new stage.
      return self.actions()
    if len(self.__assigned_arms) == 1:
      # A single assigned arm needs no centralized exploration.
      self.__stage = self.LEARNING
      self.__arm_to_broadcast = self.__assigned_arms[0]
      self.__round_index += 1
      return self.actions()
    central_algo_actions = self.__central_algo.actions()
    if not central_algo_actions.arm_pulls:
      # Centralized algorithm terminates before using up horizon / 2 pulls
      self.__stage = self.LEARNING
      self.__arm_to_broadcast = self.__central_algo.best_arm
      self.__round_index += 1
      return self.actions()
    return central_algo_actions
  elif self.__stage == self.LEARNING:
    actions = Actions()
    arm_pull = actions.arm_pulls.add()
    arm_pull.arm.id = self.__arm_to_broadcast
    arm_pull.times = self.__num_pulls_per_round[self.__round_index]
    return actions
  elif self.__stage == self.COMMUNICATION:
    actions = Actions()
    actions.state = Actions.WAIT
    return actions
  else:
    # self.__stage == self.TERMINATION
    actions = Actions()
    actions.state = Actions.STOP
    return actions
def test_simple_run(self):
  """With the beta-prior sampler mocked to 1, arm 1 is always pulled."""
  ts_learner = ThompsonSampling(arm_num=4)
  ts_learner.reset()
  # pylint: disable=protected-access
  ts_learner._ThompsonSampling__sample_from_beta_prior = MagicMock(
      return_value=1)
  # always pull arm 1
  assert ts_learner.actions(
      Context()).SerializeToString() == text_format.Parse(
          """
          arm_pulls <
            arm <
              id: 1
            >
            times: 1
          >
          """, Actions()).SerializeToString()
def __warm_start(self) -> Actions: """Initial warm start stage Returns: assortments to serve in the warm start stage """ # Check if last observation is a purchase if self.__last_customer_feedback and self.__last_customer_feedback != 0: # Continue serving the same assortment return self.__last_actions actions = Actions() arm_pull = actions.arm_pulls.add() arm_pull.arm.set.id.append(self.__next_product_in_warm_start) arm_pull.times = 1 self.__next_product_in_warm_start += 1 return actions
def actions(self, context: Context) -> Actions:
  """Assortment to serve for the current time step.

  In the warm start stage, singleton assortments are served. Afterwards:
  while the last feedback is a purchase the previous assortment is served
  again; on a non-purchase a new episode starts and a new assortment is
  computed from Thompson-sampled preference parameters.

  Args:
    context: context of the bandit environment; unused and deleted.

  Returns:
    Actions: a single serve of the chosen assortment.
  """
  del context
  actions: Actions
  # Check if still in warm start stage
  if self.__within_warm_start():
    actions = self.__warm_start()
  else:
    # Check if last observation is a purchase. Checked BEFORE allocating a
    # new Actions: the original code built an Actions/arm_pull and then
    # discarded them on this early return.
    if self.__last_customer_feedback and self.__last_customer_feedback != 0:
      # Continue serving the same assortment
      return self.__last_actions
    actions = Actions()
    arm_pull = actions.arm_pulls.add()
    # When a non-purchase observation happens, a new episode is started. Also
    # a new assortment to be served using new estimate of preference
    # parameters is generated.
    # Set preference parameters generated by thompson sampling
    self.reward.set_preference_params(self.__correlated_sampling())
    # Calculate best assortment using the generated preference parameters
    if self.use_local_search:
      # Initial assortment to start for local search
      if self.__last_actions is not None:
        init_assortment = set(self.__last_actions.arm_pulls[0].arm.set.id)
      else:
        init_assortment = None
      _, best_assortment = local_search_best_assortment(
          reward=self.reward,
          random_neighbors=self.random_neighbors,
          card_limit=self.card_limit,
          init_assortment=init_assortment)
    else:
      _, best_assortment = search_best_assortment(
          reward=self.reward, card_limit=self.card_limit)
    arm_pull.arm.set.id.extend(list(best_assortment))
    arm_pull.times = 1
  self.__last_actions = actions
  return actions
def actions(self, context: Context) -> Actions:
  """Epsilon-greedy pull: explore with probability eps/t, else exploit.

  Args:
    context: context of the bandit environment; unused and deleted.

  Returns:
    Actions: a single arm pull.
  """
  del context
  actions = Actions()
  pull = actions.arm_pulls.add()
  pull.times = 1
  if self.__time <= self.arm_num:
    # Initialization: pull each arm once in order.
    pull.arm.id = self.__time - 1
  elif np.random.random() <= self.__eps / self.__time:
    # With probability eps/t, randomly select an arm to pull.
    pull.arm.id = np.random.randint(0, self.arm_num)
  else:
    # Exploit: the arm with the best empirical mean so far.
    empirical_means = np.array(
        [arm.em_mean for arm in self.__pseudo_arms])
    pull.arm.id = int(np.argmax(empirical_means))
  return actions
def actions(self, context: Context) -> Actions:
  """Softmax (Boltzmann) pull over empirical means with temperature gamma.

  Args:
    context: context of the bandit environment; unused and deleted.

  Returns:
    Actions: a single arm pull.
  """
  del context
  actions = Actions()
  arm_pull = actions.arm_pulls.add()
  if self.__time <= self.arm_num:
    # Initialization: pull each arm once in order.
    arm_pull.arm.id = self.__time - 1
  else:
    weights = np.array([
        math.exp(self.__pseudo_arms[arm_id].em_mean / self.__gamma)
        for arm_id in range(self.arm_num)
    ])
    # Normalize with a single sum; the original recomputed sum(weights)
    # inside the per-element list comprehension (O(n^2)).
    arm_pull.arm.id = np.random.choice(self.arm_num, 1,
                                       p=weights / weights.sum())[0]
  arm_pull.times = 1
  return actions
def actions(self, context: Context) -> Actions:
  """Spend the whole budget in one shot, guaranteeing one pull per arm.

  Args:
    context: context of the bandit environment; unused and deleted.

  Returns:
    Actions: pulls for every arm; empty after the budget has been spent.
  """
  del context
  actions = Actions()
  if self.__stop:
    return actions
  # Make sure each arm is sampled at least once: reserve arm_num pulls,
  # then split the remainder uniformly at random.
  extra_pulls = np.random.multinomial(
      self.budget - self.arm_num,
      np.ones(self.arm_num) / self.arm_num,
      size=1)[0]
  for arm_id in range(self.arm_num):
    pull = actions.arm_pulls.add()
    pull.arm.id = arm_id
    pull.times = extra_pulls[arm_id] + 1
  self.__stop = True
  return actions
def test_one_product(self):
  """A product with zero preference weight always yields no purchase (0)."""
  preference_params = [1.0, 0.0]
  revenues = [0.0, 1.0]
  bandit = MNLBandit(preference_params, revenues)
  bandit.reset()

  # Always get no purchase
  assert set(
      bandit.feed(
          text_format.Parse(
              """
              arm_pulls {
                arm {
                  set {
                    id: 1
                  }
                }
                times: 5
              }
              """, Actions())).arm_feedbacks[0].customer_feedbacks) == {0}
def actions(self, context: Context) -> Actions:
  """Pulls to execute for the current elimination round.

  Args:
    context: context of the bandit environment; unused and deleted.

  Returns:
    Actions: the per-round quota for every active arm, or the split of
    the remaining budget between the final two arms; empty once all
    rounds are exhausted.
  """
  del context
  actions = Actions()
  if self.__round < self.arm_num:
    if self.__round < self.arm_num - 1:
      # Regular round: every active arm gets this round's quota.
      for arm_id in self.__active_arms:
        arm_pull = actions.arm_pulls.add()
        arm_pull.arm.id = arm_id
        arm_pull.times = self.__pulls_per_round[self.__round]
    else:
      # Use up the remaining budget when there are only two arms left
      pulls = [self.__budget_left // 2]
      pulls.append(self.__budget_left - pulls[0])
      for i in range(2):
        arm_pull = actions.arm_pulls.add()
        arm_pull.arm.id = list(self.__active_arms.keys())[i]
        arm_pull.times = pulls[i]
  return actions
def test_simple_run(self):
  """Pulling the worst arm 100 times gives regret 100; arm 1 is best."""
  means = [0, 1]
  arms = [BernoulliArm(mean) for mean in means]
  ordinary_bandit = MultiArmedBandit(arms)
  ordinary_bandit.reset()
  # Pull arm 0 for 100 times
  actions = text_format.Parse(
      """
      arm_pulls {
        arm {
          id: 0
        }
        times: 100
      }
      """, Actions())
  ordinary_bandit.feed(actions)
  # Best mean is 1, pulled mean is 0: total-reward regret = 100 * (1 - 0).
  assert ordinary_bandit.regret(MaximizeTotalRewards()) == 100

  arm = Arm()
  arm.id = 1
  # Arm 1 really is the best arm, so identification regret is 0.
  assert ordinary_bandit.regret(IdentifyBestArm(best_arm=arm)) == 0
def test_contextual_bandit(self):
  """Regret of 20 single pulls is bounded by the number of rounds (20)."""
  arm_num = 10
  dimension = 10
  contextual_bandit = ContextualBandit(
      RandomContextGenerator(arm_num=arm_num, dimension=dimension))
  contextual_bandit.reset()
  for _ in range(20):
    # Fetch a context and then always pull arm 1 once.
    _ = contextual_bandit.context
    contextual_bandit.feed(
        text_format.Parse(
            """
            arm_pulls {
              arm {
                id: 1
              }
              times: 1
            }
            """, Actions()))
  # Each round contributes at most a reward gap of 1.
  assert contextual_bandit.regret(MaximizeTotalRewards()) <= 20
def test_regret(self):
  """Serving the best assortment {1} repeatedly yields zero regret."""
  preference_params = np.array(
      [1.0, 1.0, 1.0, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5])
  revenues = np.array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
  card_limit = 1
  bandit = MNLBandit(preference_params, revenues, card_limit)
  bandit.reset()

  # Serve best assortment {1} for 3 times
  bandit.feed(
      text_format.Parse(
          """
          arm_pulls {
            arm {
              set {
                id: 1
              }
            }
            times: 3
          }
          """, Actions()))
  assert bandit.regret(MaximizeTotalRewards()) == 0.0
def test_simple_run(self):
  """With mocked LinUCB scores, arm 0 (the largest score) is always pulled."""
  horizon = 10
  features = [
      np.array([1, 0]),
      np.array([1, 0]),
      np.array([1, 0]),
      np.array([1, 0]),
      np.array([0, 1])
  ]
  learner = LinUCB(features, 0.1, 1e-3)
  learner.reset()
  mock_ucb = np.array([1.2, 1, 1, 1, 1])
  # pylint: disable=protected-access
  learner._LinUCB__LinUCB = MagicMock(return_value=mock_ucb)

  # Always 0th arm is picked
  # not the most efficient test
  for _ in range(1, horizon + 1):
    assert learner.actions(
        Context()).SerializeToString() == text_format.Parse(
            """
            arm_pulls <
              arm <
                id: 0
              >
              times: 1
            >
            """, Actions()).SerializeToString()
    learner.update(
        text_format.Parse(
            """
            arm_feedbacks <
              arm <
                id: 0
              >
              rewards: 0
            >
            """, Feedback()))
def test_simple_run(self):
  """With the random-assortment helper mocked, its output is served once."""
  revenues = np.array([0, 0.45, 0.8, 0.9, 1.0])
  reward = MeanReward()
  learner = EpsGreedy(revenues=revenues, reward=reward)
  learner.reset()
  mock_random_assortment = {2, 3, 4}
  # pylint: disable=protected-access
  # NOTE(review): "ramdom" matches the (typoed) private method name on
  # EpsGreedy — do not rename here without renaming the method itself.
  learner._EpsGreedy__select_ramdom_assort = MagicMock(
      return_value=mock_random_assortment)
  assert learner.actions(
      Context()).SerializeToString() == text_format.Parse(
          """
          arm_pulls {
            arm {
              set {
                id: 2
                id: 3
                id: 4
              }
            }
            times: 1
          }
          """, Actions()).SerializeToString()