# Shared imports assumed by the snippets below (inferred from usage; the module
# behind the `mab` alias is the project's own bandit library and its import
# line is left as in the original sources).
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from tqdm import trange


def _multiple_env(n_arms, n_step, n_test, prob_of_change, type_change):
    np.random.seed()

    # Build Agents
    ts = mab.BernoulliThompsonSampling(n_arms)
    discounted_ts = mab.DiscountedBernoulliTS(n_arms, gamma=0.98)
    sw_ts = mab.BernoulliSlidingWindowTS(n_arms, n=100)
    max_dsw_ts = mab.MaxDSWTS(n_arms, gamma=0.99, n=25)
    min_dsw_ts = mab.MinDSWTS(n_arms, gamma=0.95, n=100)
    mean_dsw_ts = mab.MeanDSWTS(n_arms, gamma=0.95, n=25)
    agents = [ts, discounted_ts, sw_ts, max_dsw_ts, min_dsw_ts, mean_dsw_ts]

    # Build Env with replay
    replay_env = mab.BernoulliReplayBandit(n_step=n_step,
                                           n_arms=n_arms,
                                           prob_of_change=prob_of_change,
                                           fixed_action_prob=0.0,
                                           type_change=type_change)

    # Build session
    replay_session = mab.Session(replay_env, agents)

    # Run session
    replay_session.run(n_step=n_step, n_test=n_test, use_replay=True)

    results = {
        agent: replay_session.get_reward_sum(agent) / n_step
        for agent in agents
    }
    results.update(
        {"Oracle": replay_session.get_reward_sum("Oracle") / n_step})
    return results
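
# --- Hedged usage sketch (not part of the original code) --------------------
# Shows one way _multiple_env might be called and its output inspected. The
# parameter values below are illustrative assumptions; type_change is passed
# through unchanged because its admissible values depend on
# BernoulliReplayBandit.
def _example_multiple_env_usage(type_change):
    results = _multiple_env(n_arms=4, n_step=1000, n_test=1,
                            prob_of_change=0.001, type_change=type_change)
    # Keys are the agent objects plus the "Oracle" baseline; values are the
    # cumulative rewards normalized by the number of steps.
    for agent, normalized_reward in results.items():
        print(agent, normalized_reward)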
def _find_params(self, f_gamma, f_n, sw_n, d_ts_gamma):
    n_arms = self._train_env._n_arms

    ########## BUILD AGENTS ###########
    agent_list = [
        mab.MaxDSWTS(n_arms=n_arms, gamma=f_gamma, n=f_n,
                     store_estimates=False),
        mab.MinDSWTS(n_arms=n_arms, gamma=f_gamma, n=f_n,
                     store_estimates=False),
        mab.MeanDSWTS(n_arms=n_arms, gamma=f_gamma, n=f_n,
                      store_estimates=False),
        mab.BernoulliSlidingWindowTS(n_arms=n_arms, n=sw_n,
                                     store_estimates=False),
        mab.DiscountedBernoulliTS(n_arms=n_arms, gamma=d_ts_gamma,
                                  store_estimates=False)
    ]

    np.random.seed()
    session = mab.Session(env=self._train_env, agent=agent_list)
    session.run(n_step=self._n_step_train, n_test=1, use_replay=True)

    return {str(agent): session.get_reward_sum(agent) for agent in agent_list}
def _run(self, fake) -> Dict:

    ########## BUILD AGENTS ###########
    max_dsw_ts = mab.MaxDSWTS(n_arms=self._n_arms, gamma=0.99999, n=2000,
                              store_estimates=False)
    min_dsw_ts = mab.MinDSWTS(n_arms=self._n_arms, gamma=0.99999, n=2000,
                              store_estimates=False)
    mean_dsw_ts = mab.MeanDSWTS(n_arms=self._n_arms, gamma=0.99999, n=2000,
                                store_estimates=False)
    ts = mab.BernoulliThompsonSampling(n_arms=self._n_arms,
                                       store_estimates=False)
    sw_ts = mab.BernoulliSlidingWindowTS(n_arms=self._n_arms, n=240000,
                                         store_estimates=False)
    d_ts = mab.DiscountedBernoulliTS(n_arms=self._n_arms, gamma=0.99999,
                                     store_estimates=False)
    agent_list = [max_dsw_ts, min_dsw_ts, mean_dsw_ts, ts, sw_ts, d_ts,
                  "random"]

    np.random.seed()
    c = self._compression
    reward_trace = {agent: [0] for agent in agent_list}
    reward_sum = {agent: 0 for agent in agent_list}

    for step in trange(self._n_step):
        for agent in agent_list:
            if agent == "random":
                # Uniform random arm; np.random.randint excludes the upper bound
                action = np.random.randint(6)
            else:
                action = agent.select_action()

            cluster, click = self.select_cluster(step)
            if (cluster == action) and (click == 1):
                reward = 1
            else:
                reward = 0

            # Update statistics
            reward_sum[agent] += reward
            if step % c == 0:
                reward_trace[agent].append(reward_trace[agent][-1] + reward / c)
            else:
                reward_trace[agent][-1] += reward / c

            # Update agent estimates
            if agent != "random":
                agent.update_estimates(action, reward)

    for agent in agent_list:
        reward_sum[agent] /= self._n_step

    return (reward_trace, reward_sum)
def test_bernoulli_algorithms():
    n_arms = 4
    env = mab.BernoulliBandit(n_arms)

    greedy_agent = mab.BernoulliGreedy(n_arms)
    ts_agent = mab.BernoulliThompsonSampling(n_arms)
    ucb_agent = mab.BernoulliUCB(n_arms, c=1)
    discounted_ts_agent = mab.DiscountedBernoulliTS(n_arms, gamma=0.99)

    session = mab.Session(
        env, [greedy_agent, ts_agent, ucb_agent, discounted_ts_agent])
    session.run(3000)
def custom_environments(n_arms, n_test, test_number: int) -> Tuple:

    # Build environment
    replay = {}
    if test_number == 1:  # TEST 1
        replay = {
            'probabilities': [0.9, 0.7, 0.1, 0.3],
            250: [(0, 0.0)],
            500: [(1, 0.0)]
        }
    elif test_number == 2:  # TEST 2
        replay = {
            'probabilities': [0.0, 0.0, 0.1, 0.3],
            250: [(0, 0.7)],
            500: [(1, 0.9)]
        }
    elif test_number == 3:  # TEST 3
        replay = {'probabilities': [0.2, 0.3, 0.4, 0.5]}

    replay_env = mab.BernoulliReplayBandit(replay=replay)

    # Build Agents
    ts = mab.BernoulliThompsonSampling(n_arms)
    discounted_ts = mab.DiscountedBernoulliTS(n_arms, gamma=0.99)
    sw_ts = mab.BernoulliSlidingWindowTS(n_arms, n=100)
    max_dsw_ts = mab.MaxDSWTS(n_arms, gamma=0.99, n=50)
    min_dsw_ts = mab.MinDSWTS(n_arms, gamma=0.95, n=75)
    mean_dsw_ts = mab.MeanDSWTS(n_arms, gamma=0.99, n=50)
    agents = [ts, discounted_ts, sw_ts, max_dsw_ts, min_dsw_ts, mean_dsw_ts]

    # Build session
    replay_session = mab.Session(replay_env, agents)

    # Run session
    replay_session.run(n_step=1000, n_test=n_test, use_replay=True)

    return (pd.DataFrame.from_dict(replay_session._regrets),
            pd.DataFrame.from_dict(replay_session._real_reward_trace))
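
# --- Hedged usage sketch (not part of the original code) --------------------
# Runs the first scripted scenario and summarizes the two returned DataFrames.
# Their column layout is whatever Session stores in _regrets and
# _real_reward_trace; only the mean over the n_test repetitions is printed.
def _example_custom_environments_usage():
    regrets, reward_trace = custom_environments(n_arms=4, n_test=10,
                                                test_number=1)
    print(regrets.mean())
    print(reward_trace.mean())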
def _find_params(n_arms, n_step, n_test, prob_of_change, type_change, f_gamma,
                 f_n, sw_n, d_ts_gamma):

    ########## BUILD AGENTS ###########
    max_dsw_ts = mab.MaxDSWTS(n_arms=n_arms, gamma=f_gamma, n=f_n,
                              store_estimates=False)
    min_dsw_ts = mab.MinDSWTS(n_arms=n_arms, gamma=f_gamma, n=f_n,
                              store_estimates=False)
    mean_dsw_ts = mab.MeanDSWTS(n_arms=n_arms, gamma=f_gamma, n=f_n,
                                store_estimates=False)
    sw_ts = mab.BernoulliSlidingWindowTS(n_arms=n_arms, n=sw_n,
                                         store_estimates=False)
    d_ts = mab.DiscountedBernoulliTS(n_arms=n_arms, gamma=d_ts_gamma,
                                     store_estimates=False)
    agents = [max_dsw_ts, min_dsw_ts, mean_dsw_ts, sw_ts, d_ts]

    np.random.seed()

    # Build Env with replay
    replay_env = mab.BernoulliReplayBandit(n_step=n_step,
                                           n_arms=n_arms,
                                           prob_of_change=prob_of_change,
                                           fixed_action_prob=0.0,
                                           type_change=type_change)

    # Build session
    replay_session = mab.Session(replay_env, agents)

    # Run session
    replay_session.run(n_step=n_step, n_test=n_test, use_replay=True)

    return {
        str(agent): replay_session.get_reward_sum(agent) /
        replay_session.get_reward_sum("Oracle")
        for agent in agents
    }
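
# --- Hedged usage sketch (not part of the original code) --------------------
# A minimal grid search built on top of _find_params. The candidate values for
# f_gamma and f_n, and the fixed sw_n / d_ts_gamma, are assumptions made purely
# for illustration.
def _example_param_sweep(n_arms, n_step, n_test, prob_of_change, type_change):
    sweep = {}
    for f_gamma in (0.95, 0.99, 0.999):
        for f_n in (25, 50, 100):
            scores = _find_params(n_arms, n_step, n_test, prob_of_change,
                                  type_change, f_gamma=f_gamma, f_n=f_n,
                                  sw_n=100, d_ts_gamma=0.99)
            sweep[(f_gamma, f_n)] = scores
    return sweep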
def _best_agents(self, n_arms) -> List:
    return [
        mab.MaxDSWTS(n_arms=n_arms, gamma=0.9999, n=800,
                     store_estimates=False),
        mab.MinDSWTS(n_arms=n_arms, gamma=0.99, n=800,
                     store_estimates=False),
        mab.MeanDSWTS(n_arms=n_arms, gamma=0.9999, n=800,
                      store_estimates=False),
        mab.BernoulliThompsonSampling(n_arms=n_arms, store_estimates=False),
        mab.BernoulliSlidingWindowTS(n_arms=n_arms, n=12800,
                                     store_estimates=False),
        mab.DiscountedBernoulliTS(n_arms=n_arms, gamma=0.9999,
                                  store_estimates=False),
        mab.RandomAlgo(n_arms=n_arms)
    ]
def _run_mod(self, fake) -> Dict:

    ########## BUILD AGENTS ###########
    max_dsw_ts = mab.MaxDSWTS(n_arms=self._n_arms, gamma=0.99999, n=2000,
                              store_estimates=False)
    min_dsw_ts = mab.MinDSWTS(n_arms=self._n_arms, gamma=0.99999, n=2000,
                              store_estimates=False)
    mean_dsw_ts = mab.MeanDSWTS(n_arms=self._n_arms, gamma=0.99999, n=2000,
                                store_estimates=False)
    ts = mab.BernoulliThompsonSampling(n_arms=self._n_arms,
                                       store_estimates=False)
    sw_ts = mab.BernoulliSlidingWindowTS(n_arms=self._n_arms, n=240000,
                                         store_estimates=False)
    d_ts = mab.DiscountedBernoulliTS(n_arms=self._n_arms, gamma=0.99999,
                                     store_estimates=False)
    agent_list = [max_dsw_ts, min_dsw_ts, mean_dsw_ts, ts, sw_ts, d_ts,
                  "random"]

    np.random.seed()
    reward_trace = {agent: [0] for agent in agent_list}
    reward_sum = {agent: 0 for agent in agent_list}
    effective_steps = {agent: 0 for agent in agent_list}

    for step in trange(self._n_step):
        # Iterate over a copy so agents can be removed from agent_list safely
        for agent in list(agent_list):
            # Skip agents that have already reached the required number of
            # effective (matching-cluster) iterations
            if effective_steps[agent] < self._termination_step:
                if agent == "random":
                    # Uniform random arm; np.random.randint excludes the upper bound
                    action = np.random.randint(6)
                else:
                    action = agent.select_action()

                cluster, click = self.select_cluster(step)

                reward = 0
                if cluster == action:
                    effective_steps[agent] += 1
                    if click == 1:
                        reward = 1

                # Update statistics
                reward_sum[agent] += reward
                reward_trace[agent].append(reward_trace[agent][-1] + reward)

                # Update agent estimates
                if agent != "random":
                    agent.update_estimates(action, reward)
            else:
                print(agent, step)
                agent_list.remove(agent)

    for key in reward_sum:
        reward_sum[key] /= self._termination_step

    return (reward_trace, reward_sum)