Example #1
def greedy_policy():
    # Agent always chooses the greedy (highest-estimate) action, so epsilon is irrelevant;
    # the loop below simply repeats the experiment once per entry in the module-level `epsilons`.
    # Also relies on module-level n, num_trials and num_sessions.
    rewards = np.zeros((len(epsilons), num_sessions, num_trials))
    num_best = np.zeros((len(epsilons), num_sessions, num_trials))

    for i in range(len(epsilons)):
        policy = GreedyPolicy()
        bandit = GaussianBandit(n)
        agent = Agent(n, policy, num_trials)
        env = Environment(bandit, agent, num_trials, num_sessions)
        rewards[i, :, :], num_best[i, :, :] = env.run()

    # Average over sessions for the last repetition (avoid relying on the leaked loop index)
    ave_reward = rewards[-1].mean(axis=0)
    plt.plot(ave_reward)
    plt.title("Average Reward")
    plt.xlabel('Trial')
    plt.ylabel('Reward')
    plt.show()

    ave_percent_best = num_best[-1].mean(axis=0)
    plt.plot(ave_percent_best)
    plt.title("Average Percent Best Option")
    plt.xlabel('Trial')
    plt.ylabel('Percent Best Option')
    plt.show()
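These examples lean on helper classes (GaussianBandit, Agent, Environment, GreedyPolicy) defined elsewhere in the repository. The greedy choice itself is simple enough to sketch; a minimal, assumed version of what GreedyPolicy might do is shown below (the choose method and the value_estimates attribute are illustrative assumptions, not names taken from the source):

import numpy as np

class GreedyPolicy:
    # Sketch (assumption): always pick the arm with the highest current value estimate,
    # breaking ties uniformly at random
    def choose(self, agent):
        estimates = np.asarray(agent.value_estimates)
        best = np.flatnonzero(estimates == estimates.max())
        return int(np.random.choice(best))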
Example #2
def run_bandit(epsilon, n, num_trials, num_sessions):
    # Run the bandit for a single (epsilon, n) pair and plot the results
    policy = EpsilonGreedyPolicy(epsilon)
    bandit = GaussianBandit(n)
    agent = Agent(n, policy, num_trials)
    env = Environment(bandit, agent, num_trials, num_sessions)
    rewards, num_best = env.run()

    plot_ave_reward(rewards)
    plt.show()

    plot_percent_best_action(num_best)
    plt.show()
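A hypothetical call, just to show the expected argument shapes (the specific values are illustrative):

run_bandit(epsilon=0.1, n=10, num_trials=1000, num_sessions=200)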
Example #3
def compare_epsilons(n, epsilons):
    # Compare several epsilon values for a fixed number of arms n
    # (relies on module-level num_trials and num_sessions)

    # maximizer: epsilon = 1, pure exploration (always choose at random)
    # satisficer: epsilon = 0, pure exploitation (always choose greedily)
    rewards = np.zeros((len(epsilons), num_sessions, num_trials))
    num_best = np.zeros((len(epsilons), num_sessions, num_trials))
    ave_reward = np.zeros((len(epsilons), num_trials))
    cum_reward = np.zeros(num_sessions)
    ave_cum_reward = np.zeros((len(epsilons), 2))

    for i in range(len(epsilons)):
        policy = EpsilonGreedyPolicy(epsilons[i])
        bandit = GaussianBandit(n)
        agent = Agent(n, policy, num_trials)
        env = Environment(bandit, agent, num_trials, num_sessions)
        rewards[i, :, :], num_best[i, :, :] = env.run()

    # Compare average reward across values of epsilon
    color = iter(cm.rainbow(np.linspace(0, 1, len(epsilons))))
    for i in range(len(epsilons)):
        c = next(color)
        ave_reward[i, :] = rewards[i, :, :].mean(axis=0)
        plt.plot(ave_reward[i, :], label="Epsilon:" + str(epsilons[i]), c=c)
        plt.title("Average Reward" + ", n: " + str(n))
        plt.xlabel('Trial')
        plt.ylabel('Reward')
        plt.legend(loc="upper left")
        plt.rc('legend', fontsize='x-small')
    plt.show()

    color2 = iter(cm.rainbow(np.linspace(0, 1, len(epsilons))))
    for i in range(len(epsilons)):
        c = next(color2)
        ave_percent_best = num_best[i, :, :].mean(axis=0)
        plt.plot(ave_percent_best, label="Epsilon:" + str(epsilons[i]), c=c)
        plt.title("Average Percent Best Option" + ", n: " + str(n))
        plt.xlabel('Trial')
        plt.ylabel('Percent Best Option')
        plt.legend(loc="upper left")
        plt.rc('legend', fontsize='x-small')
    plt.show()

    # Mean cumulative reward per epsilon: each row of ave_cum_reward is [epsilon, mean cumulative reward]
    for i in range(len(epsilons)):
        for j in range(num_sessions):
            cum_reward[j] = rewards[i, j, :].sum()
        ave_cum_reward[i, :] = [epsilons[i], np.mean(cum_reward)]
    print(np.shape(cum_reward))
    print(np.shape(ave_cum_reward))
    print(ave_cum_reward)
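For reference, a minimal sketch of the epsilon-greedy choice that EpsilonGreedyPolicy presumably implements, assuming the same agent interface as the greedy sketch above (the method and attribute names are illustrative assumptions):

import numpy as np

class EpsilonGreedyPolicy:
    # Sketch (assumption): explore with probability epsilon, otherwise exploit
    def __init__(self, epsilon):
        self.epsilon = epsilon

    def choose(self, agent):
        estimates = np.asarray(agent.value_estimates)
        if np.random.random() < self.epsilon:
            # explore: pick an arm uniformly at random
            return int(np.random.randint(len(estimates)))
        # exploit: pick the arm with the highest current estimate
        return int(np.argmax(estimates))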
Example #4
    def __init__(self, n: int, mu: float, std: float, noise: float) -> None:
        """
		Constructs an N-armed bandit environment.

		@param n: The number of bandit arms.
		@param mu: The mean of the bandits' true rewards.
		@param std: The standard deviation of the bandits' true rewards.
		@param noise: The standard deviation of the Gaussian noise around rewards.
		"""
        super(GaussBanditEnvironment, self).__init__()
        self._mu = mu
        self._std = std
        self._noise = noise
        self._rng = np.random.default_rng()
        self._bandits = [
            GaussianBandit(self._rng.normal(self._mu, self._std), self._noise)
            for _ in range(n)
        ]
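Unlike the n-armed GaussianBandit used in Examples 1-3 and 5, the GaussianBandit constructed here models a single arm (a true mean plus reward noise). A minimal sketch of the single-arm class this constructor appears to assume (the pull method name is an assumption):

import numpy as np

class GaussianBandit:
    # Sketch (assumption): one arm with a fixed true mean and Gaussian reward noise
    def __init__(self, mean: float, noise: float) -> None:
        self._mean = mean
        self._noise = noise
        self._rng = np.random.default_rng()

    def pull(self) -> float:
        # Sample a reward around the arm's true mean
        return float(self._rng.normal(self._mean, self._noise))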
Example #5
def compare_n(n_list):
    # Compare across values of n (the number of arms)
    # Relies on module-level epsilon, num_trials and num_sessions
    rewards = np.zeros((len(n_list), num_sessions, num_trials))
    num_best = np.zeros((len(n_list), num_sessions, num_trials))
    cum_reward = np.zeros(num_sessions)
    ave_cum_reward = np.zeros((len(n_list), 2))

    for i in range(len(n_list)):
        policy = EpsilonGreedyPolicy(epsilon)
        bandit = GaussianBandit(n_list[i])
        agent = Agent(n_list[i], policy, num_trials)
        env = Environment(bandit, agent, num_trials, num_sessions)
        rewards[i, :, :], num_best[i, :, :] = env.run()

    # Compare average reward across values of n
    color = iter(cm.rainbow(np.linspace(0, 1, len(n_list))))
    for i in range(len(n_list)):
        c = next(color)
        ave_reward = rewards[i, :, :].mean(axis=0)
        plt.plot(ave_reward, label="n:" + str(n_list[i]), c=c)
        plt.title("Average Reward")
        plt.xlabel('Trial')
        plt.ylabel('Reward')
        plt.legend(loc="upper left")
    plt.show()

    color2 = iter(cm.rainbow(np.linspace(0, 1, len(n_list))))
    for i in range(len(n_list)):
        c = next(color2)
        ave_percent_best = num_best[i, :, :].mean(axis=0)
        plt.plot(ave_percent_best, label="n:" + str(n_list[i]), c=c)
        plt.title("Average Percent Best Option")
        plt.xlabel('Trial')
        plt.ylabel('Percent Best Option')
        plt.legend(loc="upper left")
    plt.show()

    # Mean cumulative reward per n: each row of ave_cum_reward is [n, mean cumulative reward]
    for i in range(len(n_list)):
        for j in range(num_sessions):
            cum_reward[j] = rewards[i, j, :].sum()
        ave_cum_reward[i, :] = [n_list[i], np.mean(cum_reward)]
    print(np.shape(cum_reward))
    print(np.shape(ave_cum_reward))
    print(ave_cum_reward)
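A hypothetical call, assuming the module-level epsilon, num_trials and num_sessions the function expects are already defined:

compare_n([2, 5, 10, 20])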
Example #6
    def reset(self) -> None:
        # Re-draw each arm's true mean so the environment starts fresh
        self._bandits = [
            GaussianBandit(self._rng.normal(self._mu, self._std), self._noise)
            for _ in range(len(self._bandits))
        ]

    ax1.grid()
    ax1.legend(['alpha = {}'.format(i) for i in alpha], loc='upper right')
    ax1.set_title('Average reward vs. alpha (learning rate)')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Average reward')
    # plt.show()


if __name__ == "__main__":

    NUM_ARMS = 5
    SIG = 5.0
    AMP = 2.0
    INTERVAL = 5000
    EPOCH = 100000
    SEED = 2020

    toy_bandit = GaussianBandit(num_arms=NUM_ARMS, sig=SIG, seed=SEED)
    print(toy_bandit.centers)
    print("Testing epsilon greedy method ... ")
    test_epsilon_greedy(toy_bandit)
    print("Testing UCB method ... ")
    test_ucb_select(toy_bandit)

    ALPHA = [0, 0.05, 0.1, 0.2, 1.0]
    # ALPHA = [0.1]
    print("Testing different alpha (learning rate) in unstable bandit ... ")
    unstable_bandit = UnstableGaussianBandit(num_arms=NUM_ARMS, sig=SIG, change_interval=INTERVAL, change_amp=AMP, seed=SEED)
    test_unstable_bandit(unstable_bandit, alpha=ALPHA, n_epoch=EPOCH)
    plt.show()
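The alpha sweep above presumably compares constant step-size value updates on the non-stationary bandit; a minimal sketch of that rule (names are illustrative, not taken from the repository):

def update_estimate(q: float, reward: float, alpha: float) -> float:
    # Constant step-size update: alpha = 0 never moves the estimate,
    # alpha = 1 tracks only the latest reward; intermediate values trade
    # stability against responsiveness when the arm means drift.
    return q + alpha * (reward - q)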
Example #8
            regret[i][:temp, :] = reg_i[:temp, :]
        else:
            temp = 0
        start_it = np.min((start_it, temp))
    if start_it == (n_runs + 1):
        start_it = 0
    if start_it == 0:
        regret = {i: np.zeros((n_runs, H * T)) for i in range(len(policies))}
else:
    start_it = 0

# switch_sequences = np.random.randint(0, M, (n_runs-start_it, H)).astype('int').reshape((n_runs-start_it, H))
switch_sequences = np.array([[(i + j) % M for j in range(H)]
                             for i in range(n_runs)])
# switch_sequences = np.zeros((n_runs, H)).astype('int')
bandits = [GaussianBandit(mu) for mu in models]
datasets = [build_dataset(bandits, T, H, seq) for seq in switch_sequences]
agent = {i: SwitchingAgent(bandits, pi, T) for i, pi in enumerate(policies)}
FPR = np.zeros((n_runs, T - K))
TPR = np.zeros((n_runs, T - K))
NEG = np.zeros((n_runs, T - K))

for it in tqdm(range(start_it, n_runs)):
    data = datasets[it - start_it]
    switch_seq = switch_sequences[it - start_it]
    for i, pi in enumerate(policies):
        agent[i].run(data, switch_seq)
        regret[i][it] = agent[i].regret
        if str(pi) == 'KLUCB-RB':
            FPR[it] = agent[i].fp_rate
            TPR[it] = agent[i].tp_rate
    ax2 = fig.add_subplot(212)
    fig.subplots_adjust(wspace=None, hspace=0.3)
    for i in range(num_arms):
        ax1.plot(ls[i])
    ax1.grid()
    ax1.set_title('Rewards of Each Arm')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Reward')
    # reward mean change curve
    for i in range(num_arms):
        ax2.plot(cntr[i])
    ax2.grid()
    ax2.set_title('Mean of Each Arm of Gaussian Bandit (showing instability)')
    ax2.set_xlabel('Iterations')
    ax2.set_ylabel('Current Mean of Reward')
    # plt.show()


if __name__ == "__main__":

    # test normal Gaussian Bandit
    toy_bandit = GaussianBandit(num_arms=8)
    print(toy_bandit.centers)
    print(toy_bandit.get_reward(1))
    print(toy_bandit.get_reward(2))

    # test unstable Gaussian Bandit
    toy_bandit = UnstableGaussianBandit(num_arms=3, sig=2.0, change_interval=100)
    draw_unstable_bandit(toy_bandit)
    plt.show()
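The GaussianBandit interface visible here exposes centers and get_reward; a minimal sketch of how the unstable variant constructed above might extend that interface (the internal update rule is an assumption): the arm means are perturbed every change_interval pulls, so the reward distribution drifts over time.

import numpy as np

class UnstableGaussianBandit:
    # Sketch (assumption): arm means jump by up to change_amp every change_interval pulls
    def __init__(self, num_arms, sig=1.0, change_interval=100, change_amp=1.0, seed=None):
        self._rng = np.random.default_rng(seed)
        self.centers = self._rng.normal(0.0, 1.0, num_arms)  # current true means
        self._sig = sig
        self._interval = change_interval
        self._amp = change_amp
        self._t = 0

    def get_reward(self, arm_id):
        self._t += 1
        if self._t % self._interval == 0:
            # non-stationarity: shift every arm's mean by a random amount
            self.centers += self._rng.uniform(-self._amp, self._amp, len(self.centers))
        return float(self._rng.normal(self.centers[arm_id], self._sig))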
Example #10
        # aver_reward_list.append(toy_bandit.centers[0])
        # Running count of how often each arm has been selected up to trial i
        for each_arm in range(num_arms):
            if each_arm == arm_id:
                act_selection_aver[each_arm, i] = act_selection_aver[each_arm, max(i - 1, 0)] + 1
            else:
                act_selection_aver[each_arm, i] = act_selection_aver[each_arm, max(i - 1, 0)]
    # Convert running counts into running selection frequencies
    act_selection_aver = act_selection_aver / (np.arange(n_epoch) + 1)

    return q_list, aver_reward_list, act_selection_aver


if __name__ == "__main__":

    NUM_ARMS = 5
    SIG = 1.0
    toy_bandit = GaussianBandit(num_arms=NUM_ARMS, sig=SIG)
    print(toy_bandit.centers)
    q_list, aver_reward_list, act_selection_aver = bandit_algorithm(
        toy_bandit, n_epoch=200, warm_up=True, epsilon=0.1)
    fig = plt.figure(figsize=(10,10))
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)
    fig.subplots_adjust(wspace=None, hspace=0.3)
    ax1.plot(aver_reward_list)
    ax1.grid()
    ax1.set_title('Average reward vs. Iter')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Average reward')
    for each_arm in range(NUM_ARMS):
        ax2.plot(act_selection_aver[each_arm])
    ax2.grid()