def sample_reward(self, arm):
    """Return the sampled reward of the arm if it is a Bernoulli arm.
    Otherwise, draw a reward from a Bernoulli distribution with the
    sampled reward as a parameter.

    Parameters
    ----------
    arm : arms.AbstractArm
        Arm to sample a reward from.

    Returns
    -------
    r : int
        Reward (0 or 1).
    """
    if isinstance(arm, arms.ArmBernoulli):
        r = int(arm.sample())
    else:
        # Bernoulli trial if the arm is not Bernoulli
        r_observed = arm.sample()
        bernoulli = arms.ArmBernoulli(r_observed)
        r = int(bernoulli.sample())
    return r
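# Illustrative check (not part of the original code): binarizing a reward
# r in [0, 1] through a Bernoulli(r) trial, as sample_reward does above,
# leaves the expected reward unchanged, so algorithms designed for
# Bernoulli rewards still target the right means. A minimal sketch in
# plain numpy:
import numpy as np

np.random.seed(0)
r_observed = np.random.beta(2., 5., size=100_000)   # bounded rewards in [0, 1]
binarized = np.random.rand(100_000) < r_observed    # one Bernoulli trial each
print(r_observed.mean(), binarized.mean())          # both close to 2/7 ~ 0.286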
def construct_non_parametric_MAB():
    arm1 = arms.ArmBernoulli(0.30, random_state=np.random.randint(1, 312414))
    arm2 = arms.ArmBeta(0.5, 0.5, random_state=np.random.randint(1, 312414))
    arm3 = arms.ArmBeta(1., 3., random_state=np.random.randint(1, 312414))
    arm4 = arms.ArmExp(1., random_state=np.random.randint(1, 312414))
    arm5 = arms.ArmFinite(np.array([0., 0.1, 0.5, 0.8]),
                          np.array([0.2, 0.3, 0.4, 0.1]))
    return [arm1, arm2, arm3, arm4, arm5]
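# Example use of the constructor above (illustrative; assumes the same
# `arms` and `numpy` imports as the surrounding code, and that every arm
# class exposes a `.mean` attribute as in the scripts below):
MAB = construct_non_parametric_MAB()
print('means: {}'.format([arm.mean for arm in MAB]))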
def strong_obs_graph(n_nodes, alpha, beta, graph_arms=None):
    """Generate a strongly connected observation graph."""
    # alpha influences the number of self-edges removed:
    #   alpha=1 --> all self-edges will be removed.
    # beta influences the number of "peer" edges removed:
    #   beta=1 --> only peer edges will be removed.
    G = nx.DiGraph()
    # Width of each bucket of u-values mapping to a count k of edges to
    # remove; this keeps at least one incoming peer edge per node.
    m = beta / (n_nodes - 2)
    if graph_arms is None:
        graph_arms = [arms.ArmBernoulli(0.5) for _ in range(n_nodes)]
    # Step 1: generate fully connected graph
    for nodeID in range(n_nodes):
        G.add_node(nodeID, arm=graph_arms[nodeID])
    for node1 in range(n_nodes):
        for node2 in range(n_nodes):
            G.add_edge(node1, node2)
    # Step 2: randomly remove self-edges or incoming peer edges
    for nodeID in G.nodes():
        u = random.random()
        if u < alpha:
            G.remove_edge(nodeID, nodeID)
        elif u < alpha + beta:
            k = int((u - alpha) / m)
            # Let's find k+1 elements at random among edges (j, i)
            edges = list(range(n_nodes))
            edges.remove(nodeID)
            edgesToRemove = random.sample(edges, k + 1)
            for neighID in edgesToRemove:
                G.remove_edge(neighID, nodeID)
    return G
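# Quick sanity check for strong_obs_graph (illustrative; assumes the
# `arms` module used above is importable): for moderate alpha and beta
# the pruned graph should typically stay strongly connected, since every
# node keeps at least one incoming peer edge.
import random
import networkx as nx

G = strong_obs_graph(n_nodes=10, alpha=0.3, beta=0.5)
print(nx.is_strongly_connected(G))   # usually True for these parameters
print(G.number_of_edges())           # fewer than the 100 edges of the complete graph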
def sample(self, arm):
    if isinstance(arm, arms.ArmBernoulli):
        reward = arm.sample()
    else:
        # Binarize a bounded reward through a Bernoulli trial
        reward_obs = arm.sample()
        bernoulli_draw = arms.ArmBernoulli(reward_obs)
        reward = bernoulli_draw.sample()
    return reward
def __init__(self, means=None, n_arms=None):
    Bandit.__init__(self)
    if means is None:
        if n_arms is None:
            self.n_arms = 10
        else:
            self.n_arms = n_arms
        # Default harmonic means: with n_arms=4 this gives
        # [1/2, 1/3, 1/4, 1/5]
        self.means = [1 / (k + 1) for k in range(1, self.n_arms + 1)]
    else:
        self.means = means
        self.n_arms = len(self.means)
    self.arms = [arms.ArmBernoulli(p) for p in self.means]
def __init__(self, nr_arms, seed):
    """Initialize Bernoulli arms with random means, given the number of
    arms."""
    self.nr_arms_ = nr_arms
    self.seed_ = seed
    np.random.seed(self.seed_)
    bernoulli_param = [np.random.rand() for i in range(self.nr_arms_)]
    random_seed = [
        np.random.randint(1, 312414) for i in range(self.nr_arms_)
    ]
    # One independent seed per arm
    arms_ = [
        arms.ArmBernoulli(p=p, random_state=s)
        for (p, s) in zip(bernoulli_param, random_seed)
    ]
    super(BernoulliBandit, self).__init__(arms_)
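# Illustrative instantiation (the Bandit base class and the attribute
# holding the arms are assumptions about the surrounding code):
# bandit = BernoulliBandit(nr_arms=5, seed=42)
# print([arm.mean for arm in bandit.arms])   # five reproducible random means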
def TS(T, MAB):
    """Thompson Sampling with a Beta(1, 1) prior on each arm.

    Parameters
    ----------
    T : int
        Horizon.
    MAB : list of arms
        Bandit problem.

    Returns
    -------
    rews, draws : np.ndarray
        Reward received and arm drawn at each round.
    """
    rews = np.zeros(T)
    draws = np.zeros(T)
    K = len(MAB)
    nb_pulls = [1] * K
    cumreward_arm = []
    # Initialization: pull each arm once, binarizing non-Bernoulli rewards
    for k in range(K):
        sample = MAB[k].sample()
        if isinstance(MAB[k], arms.ArmBernoulli):
            reward = int(sample)
        else:
            reward = int(arms.ArmBernoulli(p=float(sample)).sample())
        cumreward_arm.append(reward)
    for t in range(1, T + 1):
        # Draw an index from each arm's Beta posterior
        mu = [
            np.random.beta(cumreward_arm[i] + 1,
                           nb_pulls[i] - cumreward_arm[i] + 1)
            for i in range(K)
        ]
        best_arm = np.argmax(mu)
        sample = MAB[best_arm].sample()
        if isinstance(MAB[best_arm], arms.ArmBernoulli):
            reward = int(sample)
            success = reward
        else:
            # Binarize the observed reward through a Bernoulli trial
            reward = float(sample)
            success = int(arms.ArmBernoulli(p=reward).sample())
        # Update the posterior of the drawn arm
        nb_pulls[best_arm] += 1
        cumreward_arm[best_arm] += success
        rews[t - 1] = reward
        draws[t - 1] = best_arm
    return rews, draws
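# Minimal run of TS on a small Bernoulli problem (illustrative; assumes
# the `arms` and `numpy` imports used elsewhere in this file):
MAB_demo = [arms.ArmBernoulli(p) for p in (0.2, 0.5, 0.8)]
rews_demo, draws_demo = TS(2000, MAB_demo)
regret_demo = 0.8 * np.arange(1, 2001) - np.cumsum(rews_demo)
print(regret_demo[-1])   # cumulative regret should grow only logarithmically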
regret_ucb /= n_samples
regret_ts /= n_samples
regret_general_ts /= n_samples
# regret_naive /= n_samples

opt = pd.Series(np.linspace(1, time_horizon, time_horizon)) * p_star
regret_ucb += opt
regret_ts += opt
regret_general_ts += opt
# regret_naive += opt
# Oracle (lower-bound) regret; start at t=1 to avoid log(0)
regret_oracle = pd.Series(
    [bandit.complexity() * np.log(t) for t in range(1, time_horizon + 1)])

fig = plt.figure()
regret_ucb.plot(label='UCB regret')
regret_ts.plot(label='Bernoulli Thompson Sampling regret')
regret_general_ts.plot(label='General Thompson Sampling regret')
# regret_naive.plot(label='Naive algorithm regret')
regret_oracle.plot(label='Oracle regret')
plt.legend(loc=4)
plt.title('Regret curves')
fig.savefig(figtitle + ".png")

start = time.time()
bandit = bandits.Bandit([arms.ArmExp(.4), arms.ArmExp(.5),
                         arms.ArmBernoulli(.8), arms.ArmBernoulli(.9)])
q2(bandit)
print(time.time() - start)
import numpy as np
import arms
import matplotlib.pyplot as plt
from ucb import UCB1, TS

# Build your own bandit problem
# this is an example, please change the parameters or arms!
arm1 = arms.ArmBernoulli(0.30, random_state=np.random.randint(1, 312414))
arm2 = arms.ArmBernoulli(0.25, random_state=np.random.randint(1, 312414))
arm3 = arms.ArmBernoulli(0.20, random_state=np.random.randint(1, 312414))
arm4 = arms.ArmBernoulli(0.10, random_state=np.random.randint(1, 312414))

MAB = [arm1, arm2, arm3, arm4]

# bandit : set of arms
nb_arms = len(MAB)
means = [el.mean for el in MAB]

# Display the means of your bandit (to find the best)
print('means: {}'.format(means))
mu_max = np.max(means)

# Comparison of the regret on one run of the bandit algorithm
# try to run this multiple times, you should observe different results
T = 5000  # horizon

rew1, draws1 = UCB1(T, MAB)
reg1 = mu_max * np.arange(1, T + 1) - np.cumsum(rew1)
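# Illustrative continuation: the same regret computation works for the
# TS routine imported above, and both curves can be plotted directly.
rew2, draws2 = TS(T, MAB)
reg2 = mu_max * np.arange(1, T + 1) - np.cumsum(rew2)
plt.plot(reg1, label='UCB1')
plt.plot(reg2, label='Thompson Sampling')
plt.xlabel('Rounds')
plt.ylabel('Cumulative regret')
plt.legend()
plt.show()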
def bernouilli_MAB(list_p):
    MAB = []
    for k in range(list_p.shape[0]):
        MAB.append(arms.ArmBernoulli(list_p[k],
                                     random_state=np.random.randint(1, 312414)))
    return MAB
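# Example call (illustrative): one Bernoulli arm per entry of list_p
MAB = bernouilli_MAB(np.array([0.1, 0.3, 0.5]))
print([arm.mean for arm in MAB])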
        rewards[t] = reward
    return rewards, draws


def avg_bandit_game(bandits, T, strategy='thompson', rho=.2, runs=20):
    return np.array([
        np.array(bandit_game(bandits, T, strategy=strategy, rho=rho))[:, :, 0]
        for i in range(runs)
    ]).mean(axis=0)


# Build your own bandit problem
# this is an example, please change the parameters or arms!
arm1 = arms.ArmBernoulli(0.65, random_state=np.random.randint(1, 312414))
arm2 = arms.ArmBernoulli(0.5, random_state=np.random.randint(1, 312414))
arm3 = arms.ArmBernoulli(0.45, random_state=np.random.randint(1, 312414))
arm4 = arms.ArmBernoulli(0.60, random_state=np.random.randint(1, 312414))
MAB = [arm1, arm2, arm3, arm4]

arm21 = arms.ArmBernoulli(0.43, random_state=np.random.randint(1, 312414))
arm22 = arms.ArmBernoulli(0.56, random_state=np.random.randint(1, 312414))
arm23 = arms.ArmBernoulli(0.51, random_state=np.random.randint(1, 312414))
arm24 = arms.ArmBernoulli(0.55, random_state=np.random.randint(1, 312414))
MAB2 = [arm21, arm22, arm23, arm24]

for mab in [MAB, MAB2]:
    # bandit : set of arms
    return x * np.log(x / y) + (1 - x) * np.log((1 - x) / (1 - y))


if __name__ == "__main__":
    # Comparison of the regret on one run of the bandit algorithm
    # try to run this multiple times, you should observe different results
    T = 6000  # horizon

    # Build your own bandit problem
    # random_state = np.random.randint(1, 312414)
    random_state = 0

    # this is an example, please change the parameters or arms!
    arm1 = arms.ArmBernoulli(0.30, random_state=random_state)
    arm2 = arms.ArmBernoulli(0.25, random_state=random_state)
    arm3 = arms.ArmBernoulli(0.20, random_state=random_state)
    arm4 = arms.ArmBernoulli(0.10, random_state=random_state)
    MAB = [arm1, arm2, arm3, arm4]

    mu_max = max(arm.mean for arm in MAB)

    # (Expected) regret curve for UCB and Thompson Sampling
    rew1, draws1 = UCB1(T, MAB)
    reg1 = mu_max * np.arange(1, T + 1) - np.cumsum(rew1)
    rew2, draws2 = TS(T, MAB)
    reg2 = mu_max * np.arange(1, T + 1) - np.cumsum(rew2)
    # rew3, draws3 = naive_strategy(T, MAB)
    # reg3 = mu_max * np.arange(1, T + 1) - np.cumsum(rew3)
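    # The Bernoulli KL divergence returned above gives the Lai & Robbins
    # complexity of the problem, C(MAB) = sum over suboptimal arms of
    # (mu* - mu_a) / kl(mu_a, mu*), which lower-bounds the regret as
    # C(MAB) * log(T). Illustrative computation; `kl` is an assumed name
    # for the enclosing function:
    complexity = sum((mu_max - arm.mean) / kl(arm.mean, mu_max)
                     for arm in MAB if arm.mean < mu_max)
    print('Lai & Robbins constant: {:.2f}'.format(complexity))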
def main():
    # Build your own bandit problem
    random_state = np.random.randint(1, 312414)
    delta = 0.1

    # Bernoulli loss arms
    arm1 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm2 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm3 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm4 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm5 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm6 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm7 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm8 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm9 = arms.ArmBernoulli(0.50 - delta, random_state=random_state)
    arm10_1 = arms.ArmBernoulli(0.50 + delta, random_state=random_state)
    arm10_2 = arms.ArmBernoulli(0.50 - 4 * delta, random_state=random_state)
    arm11 = arms.ArmPieceConstant(mean=0.5, delta=0.2, fre=500, random_state=0)
    arm12 = arms.ArmPieceIncrease(lower=0, upper=1, delta=0.1, prob=0.001,
                                  random_state=0)
    arm13 = arms.ArmPieceDecrease(lower=0, upper=1, delta=0.1, prob=0.001,
                                  random_state=0)
    arm14 = arms.ArmBeta(a=2, b=2, random_state=0)
    arm15 = arms.ArmBeta(a=0.5, b=0.5, random_state=0)

    MAB1 = [arm1, arm2, arm3, arm4, arm5, arm6, arm7, arm8, arm9, arm10_1,
            arm11, arm12, arm13, arm14, arm15]
    MAB2 = [arm1, arm2, arm3, arm4, arm5, arm6, arm7, arm8, arm9, arm10_2,
            arm11, arm12, arm13, arm14, arm15]

    # Reward arms
    # arm9_ = arms.ArmBernoulli(0.50 + delta, random_state=random_state)
    # arm10_1_ = arms.ArmBernoulli(0.50 - delta, random_state=random_state)
    # arm10_2_ = arms.ArmBernoulli(0.50 + 4 * delta, random_state=random_state)
    # MAB1_ = [arm1, arm2, arm3, arm4, arm5, arm6, arm7, arm8, arm9_, arm10_1_]
    # MAB2_ = [arm1, arm2, arm3, arm4, arm5, arm6, arm7, arm8, arm9_, arm10_2_]

    # bandit : set of arms
    T = int(1e4)
    K = len(MAB1)
    change_time = int(T / 2)
    loss_sequence = produce_loss_sequence(env=MAB1, T=T, env_change=True,
                                          new_env=MAB2,
                                          change_time=change_time)
    # Loss of the best single arm over the whole horizon and over the
    # first half (before the environment change)
    single_global_best = np.min(np.sum(loss_sequence, axis=0))
    single_global_best_2 = np.min(np.sum(loss_sequence[:int(T / 2)], axis=0))

    etas = [10 ** i for i in np.linspace(-2.5, 0, 8)]
    repeat = 50
    regrets_ix = []
    regrets_exp3 = []
    regrets_exp3p = []
    # regrets = []
    # regrets_2 = []
    for eta in etas:
        tmp_ix = [[], []]
        tmp_exp3 = [[], []]
        tmp_exp3p = [[], []]
        # gamma = np.min([0.6, 2 * np.sqrt(0.6 * K * np.log(K) / T)])
        gamma = 0.005
        # alpha = 2 * np.sqrt(np.log(K * T / 0.01))
        # beta = 0.006
        beta = gamma / K
        for _ in range(repeat):
            _, loss = EXP3_P(loss_sequence=loss_sequence, eta=eta,
                             gamma=gamma, beta=beta, T=T)
            tmp_exp3p[0].append(np.sum(loss) - single_global_best)
            tmp_exp3p[1].append(
                np.sum(loss[:change_time]) - single_global_best_2)
            _, loss = EXP3(loss_sequence=loss_sequence, eta=eta,
                           gamma=gamma, T=T)
            tmp_exp3[0].append(np.sum(loss) - single_global_best)
            tmp_exp3[1].append(
                np.sum(loss[:change_time]) - single_global_best_2)
            _, loss = EXP3_IX(loss_sequence=loss_sequence, eta=eta,
                              gamma=gamma, T=T)
            tmp_ix[0].append(np.sum(loss) - single_global_best)
            tmp_ix[1].append(
                np.sum(loss[:change_time]) - single_global_best_2)
        # print('eta: %0.3f, regret: %f' % (eta, np.mean(tmp)))
        regrets_ix.append(tmp_ix)
        regrets_exp3.append(tmp_exp3)
        regrets_exp3p.append(tmp_exp3p)

    regrets_ix = np.array(regrets_ix)
    regrets_exp3p = np.array(regrets_exp3p)
    regrets_exp3 = np.array(regrets_exp3)
    std_ix = np.std(regrets_ix, axis=2).T
    mean_ix = np.mean(regrets_ix, axis=2).T
    std_exp3 = np.std(regrets_exp3, axis=2).T
    mean_exp3 = np.mean(regrets_exp3, axis=2).T
    std_exp3p = np.std(regrets_exp3p, axis=2).T
    mean_exp3p = np.mean(regrets_exp3p, axis=2).T
    means = [mean_exp3, mean_exp3p, mean_ix]
    stds = [std_exp3, std_exp3p, std_ix]
    algos = ['EXP3', 'EXP3.P', 'EXP3-IX']

    # Two subplots, unpack the axes array immediately
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(14, 6))
    for i in range(len(algos)):
        ax1.errorbar(etas, means[i][1], yerr=stds[i][1], fmt='-o',
                     label=algos[i])
    ax1.set_xscale('log')
    ax1.set_xlabel(r'$\eta$ multiplier', fontsize=14)
    ax1.set_ylabel(r'Regret at $T/2$', fontsize=14)
    ax1.legend()
    for i in range(len(algos)):
        ax2.errorbar(etas, means[i][0], yerr=stds[i][0], fmt='-o',
                     label=algos[i])
    ax2.set_xscale('log')
    ax2.set_xlabel(r'$\eta$ multiplier', fontsize=14)
    ax2.set_ylabel(r'Regret at $T$', fontsize=14)
    ax2.legend()
    plt.show()
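    # A theoretically motivated baseline for the grid of etas above
    # (illustrative, assuming losses in [0, 1]): for vanilla EXP3 the
    # standard tuning is eta = sqrt(2 * log(K) / (K * T)), a natural
    # anchor point when reading the two regret-vs-eta panels.
    eta_theory = np.sqrt(2 * np.log(K) / (K * T))
    print('theoretical eta for EXP3: {:.4f}'.format(eta_theory))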
def construct_Bernoulli_MAB(difficulty='moderate'):
    if difficulty == 'moderate':
        arm1 = arms.ArmBernoulli(0.30, random_state=np.random.randint(1, 312414))
        arm2 = arms.ArmBernoulli(0.25, random_state=np.random.randint(1, 312414))
        arm3 = arms.ArmBernoulli(0.20, random_state=np.random.randint(1, 312414))
        arm4 = arms.ArmBernoulli(0.10, random_state=np.random.randint(1, 312414))
    elif difficulty == 'hard':
        arm1 = arms.ArmBernoulli(0.30, random_state=np.random.randint(1, 312414))
        arm2 = arms.ArmBernoulli(0.29, random_state=np.random.randint(1, 312414))
        arm3 = arms.ArmBernoulli(0.29, random_state=np.random.randint(1, 312414))
        arm4 = arms.ArmBernoulli(0.29, random_state=np.random.randint(1, 312414))
    elif difficulty == 'easy':
        arm1 = arms.ArmBernoulli(0.90, random_state=np.random.randint(1, 312414))
        arm2 = arms.ArmBernoulli(0.15, random_state=np.random.randint(1, 312414))
        arm3 = arms.ArmBernoulli(0.10, random_state=np.random.randint(1, 312414))
        arm4 = arms.ArmBernoulli(0.05, random_state=np.random.randint(1, 312414))
    else:
        raise ValueError('Difficulty {} is not supported'.format(difficulty))
    return [arm1, arm2, arm3, arm4]
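# Example (illustrative): the difficulty levels differ only in the gap
# between the best arm and the runners-up.
MAB_hard = construct_Bernoulli_MAB('hard')
print([arm.mean for arm in MAB_hard])   # [0.30, 0.29, 0.29, 0.29]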
import numpy as np
import arms
import matplotlib.pyplot as plt

# Build your own bandit problem

# First example (easy)
arm1 = arms.ArmBernoulli(0.80, random_state=np.random.randint(1, 312414))
arm2 = arms.ArmBernoulli(0.30, random_state=np.random.randint(1, 312414))
arm3 = arms.ArmBernoulli(0.10, random_state=np.random.randint(1, 312414))
MAB1 = [arm1, arm2, arm3]

# Second example
arm1 = arms.ArmBernoulli(0.48, random_state=np.random.randint(1, 312414))
arm2 = arms.ArmBernoulli(0.47, random_state=np.random.randint(1, 312414))
arm3 = arms.ArmBernoulli(0.50, random_state=np.random.randint(1, 312414))
arm4 = arms.ArmBernoulli(0.49, random_state=np.random.randint(1, 312414))
arm5 = arms.ArmBernoulli(0.49, random_state=np.random.randint(1, 312414))
MAB2 = [arm1, arm2, arm3, arm4, arm5]

# Bandit characteristics
nb_arms1 = len(MAB1)
nb_arms2 = len(MAB2)
means1 = [el.mean for el in MAB1]
means2 = [el.mean for el in MAB2]

# Display the means of your bandit (to find the best)
print('means1: {}'.format(means1))
mu_max1 = np.max(means1)
print('means2: {}'.format(means2))
mu_max2 = np.max(means2)
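# Illustrative gap computation: the suboptimality gap between the best
# and second-best arm is what makes MAB2 much harder than MAB1.
gap1 = mu_max1 - np.sort(means1)[-2]
gap2 = mu_max2 - np.sort(means2)[-2]
print('gaps: {:.2f} vs {:.2f}'.format(gap1, gap2))   # 0.50 vs 0.01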