def construct_non_parametric_MAB():
    """Return a list of five arms with heterogeneous reward distributions.

    The mix of Bernoulli, Beta, exponential and finite-support arms makes
    the resulting bandit problem non-parametric (no single family fits all
    arms).  Each parametric arm gets its own freshly drawn random seed.
    """
    def _seed():
        # Fresh seed for each arm's internal random generator.
        return np.random.randint(1, 312414)

    return [
        arms.ArmBernoulli(0.30, random_state=_seed()),
        arms.ArmBeta(0.5, 0.5, random_state=_seed()),
        arms.ArmBeta(1., 3., random_state=_seed()),
        arms.ArmExp(1., random_state=_seed()),
        # Finite-support arm: values and their probabilities.
        arms.ArmFinite(np.array([0., 0.1, 0.5, 0.8]),
                       np.array([0.2, 0.3, 0.4, 0.1])),
    ]
C=np.sum((p_star-list_pbis)/kl(list_pbis,p_star)) # we calcul C oracle = C*np.log(list_t) plt.figure(3) plt.clf() plt.plot(list_t, R[0], label='Expected regret of UCB1') plt.plot(list_t, R[1], label='Expected regret of TS') plt.plot(list_t, R[2], label='Eps_Greedy') plt.plot(list_t,oracle, label='Oracle') # we display plt.legend() ## Question 1: arm1 = arms.ArmBernoulli(0.30, random_state=np.random.randint(1, 312414)) arm2 = arms.ArmBeta(0.20, 0.30, random_state=np.random.randint(1, 312414)) arm3 = arms.ArmExp(0.25, random_state=np.random.randint(1, 312414)) arm4 = arms.ArmFinite(np.array([0.3,0.5,0.2]), np.array([0.5,0.1,0.4]), random_state=np.random.randint(1, 312414)) MAB = [arm1, arm2, arm3, arm4] def TS_non_binarity(T,MAB): nb_arms = len(MAB) rew, draw = np.zeros(T), np.zeros(T) N = np.zeros(nb_arms) # number of draws of arms up to time t S = np.zeros_like(N) # sum of rewards gathered up to time t tau = np.zeros(nb_arms) for t in range(T): for a in range(nb_arms): if N[a] == 0:
# Cumulative-regret plot for the first (parametric) problem, then the same
# experiment on a non-parametric arm set.
# NOTE(review): T, reg1, reg2, oracle and avg_bandit_game are defined
# earlier in the file (outside this excerpt).
plt.figure(1)
x = np.arange(1, T + 1)
plt.plot(x, reg1, label='UCB')
plt.plot(x, reg2, label='Thompson')
# plt.plot(x, reg3, label='Best arm')
plt.plot(x, oracle, label='Oracle')
plt.legend(['UCB', 'Thompson', 'Oracle'])
plt.xlabel('Rounds')
plt.ylabel('Cumulative Regret')
# plt.title('First problem')
plt.show()

# (Expected) regret curve for UCB and Thompson Sampling
npm_1 = arms.ArmBeta(0.7, 0.6)
npm_2 = arms.ArmBeta(0.5, 0.6)
npm_3 = arms.ArmExp(0.7)
# NOTE(review): npm_4 is constructed but never added to NPM — confirm
# whether it was meant to be part of the problem.
npm_4 = arms.ArmExp(0.35)
NPM = [npm_1, npm_2, npm_3]
means = [el.mean for el in NPM]
mu_max = np.max(means)

# Average over 100 runs, then regret = t * mu_max - cumulative reward.
rew4, draws4 = avg_bandit_game(NPM, T, strategy='ucb1', runs=100)
reg4 = mu_max * np.arange(1, T + 1) - np.cumsum(rew4)
rew5, draws5 = avg_bandit_game(NPM, T, strategy='thompson', runs=100)
reg5 = mu_max * np.arange(1, T + 1) - np.cumsum(rew5)

plt.figure(1)
x = np.arange(1, T + 1)
plt.plot(x, reg4, label='UCB')
plt.plot(x, reg5, label='Thompson')  # fixed label typo: 'Thomson' -> 'Thompson'
def main():
    """Adversarial-bandit experiment: sweep the learning rate eta and compare
    EXP3, EXP3.P and EXP3-IX on a loss sequence whose environment changes at
    T/2.  Plots mean regret (with error bars) at T/2 and at T.

    NOTE(review): arms, produce_loss_sequence, EXP3, EXP3_P and EXP3_IX are
    defined elsewhere in the project — their exact contracts are assumed
    from call sites below.
    """
    # Build your own bandit problem
    random_state = np.random.randint(1, 312414)
    delta = 0.1
    # Bernoulli loss arm — eight identical 0.5-mean arms plus slightly
    # better/worse ones (arm9, arm10_*), so the best arm differs between
    # MAB1 and MAB2.
    arm1 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm2 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm3 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm4 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm5 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm6 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm7 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm8 = arms.ArmBernoulli(0.50, random_state=random_state)
    arm9 = arms.ArmBernoulli(0.50 - delta, random_state=random_state)
    arm10_1 = arms.ArmBernoulli(0.50 + delta, random_state=random_state)
    arm10_2 = arms.ArmBernoulli(0.50 - 4 * delta, random_state=random_state)
    # Non-stationary / non-Bernoulli arms shared by both environments.
    arm11 = arms.ArmPieceConstant(mean=0.5, delta=0.2, fre=500,
                                  random_state=0)
    arm12 = arms.ArmPieceIncrease(lower=0, upper=1, delta=0.1, prob=0.001,
                                  random_state=0)
    arm13 = arms.ArmPieceDecrease(lower=0, upper=1, delta=0.1, prob=0.001,
                                  random_state=0)
    arm14 = arms.ArmBeta(a=2, b=2, random_state=0)
    arm15 = arms.ArmBeta(a=0.5, b=0.5, random_state=0)
    # The two environments differ only in the tenth arm (arm10_1 vs arm10_2).
    MAB1 = [
        arm1, arm2, arm3, arm4, arm5, arm6, arm7, arm8, arm9, arm10_1, arm11,
        arm12, arm13, arm14, arm15
    ]
    MAB2 = [
        arm1, arm2, arm3, arm4, arm5, arm6, arm7, arm8, arm9, arm10_2, arm11,
        arm12, arm13, arm14, arm15
    ]
    #reward arm
    #arm9_ = arms.ArmBernoulli(0.50+delta, random_state=random_state)
    #arm10_1_ = arms.ArmBernoulli(0.50-delta, random_state=random_state)
    #arm10_2_ = arms.ArmBernoulli(0.50+4*delta, random_state=random_state)
    #MAB1_ = [arm1, arm2, arm3, arm4, arm5, arm6, arm7, arm8, arm9_, arm10_1_]
    #MAB2_ = [arm1, arm2, arm3, arm4, arm5, arm6, arm7, arm8, arm9_, arm10_2_]

    # bandit : set of arms
    # NOTE(review): T is a float (1e4); downstream code casts with int(T/2)
    # but T itself is also passed to the algorithms — confirm they accept a
    # float horizon.
    T = 1e4
    K = len(MAB1)
    change_time = int(T / 2)  # environment switches from MAB1 to MAB2 here
    loss_sequence = produce_loss_sequence(env=MAB1,
                                          T=T,
                                          env_change=True,
                                          new_env=MAB2,
                                          change_time=change_time)
    # Best single arm in hindsight over the full horizon, and over the first
    # half only.  loss_sequence is presumably shaped (T, K) — time x arms;
    # verify against produce_loss_sequence.
    single_global_best = np.min(np.sum(loss_sequence, axis=0))
    single_global_best_2 = \
        np.min(np.sum(loss_sequence[:int(T / 2)], axis=0))

    # Learning-rate grid (log-spaced) and number of repetitions per eta.
    etas = [10**i for i in np.linspace(-2.5, 0, 8)]
    repeat = 50
    regrets_ix = []
    regrets_exp3 = []
    regrets_exp3p = []
    #regrets = []
    #regrets_2 = []
    for eta in etas:
        # tmp_*[0]: regrets at T; tmp_*[1]: regrets at T/2.
        tmp_ix = [[], []]
        tmp_exp3 = [[], []]
        tmp_exp3p = [[], []]
        #gamma = np.min([0.6, 2*np.sqrt(0.6 * K * np.log(K) / T)])
        gamma = 0.005  # exploration parameter, fixed across etas
        #alpha = 2 * np.sqrt(np.log(K * T / 0.01))
        #beta = 0.006
        beta = gamma / K  # EXP3.P bias term
        for _ in range(repeat):
            _, loss = EXP3_P(loss_sequence=loss_sequence,
                             eta=eta,
                             gamma=gamma,
                             beta=beta,
                             T=T)
            tmp_exp3p[0].append(np.sum(loss) - single_global_best)
            tmp_exp3p[1].append(
                np.sum(loss[:change_time]) - single_global_best_2)
            _, loss = EXP3(loss_sequence=loss_sequence,
                           eta=eta,
                           gamma=gamma,
                           T=T)
            tmp_exp3[0].append(np.sum(loss) - single_global_best)
            tmp_exp3[1].append(
                np.sum(loss[:change_time]) - single_global_best_2)
            _, loss = EXP3_IX(loss_sequence=loss_sequence,
                              eta=eta,
                              gamma=gamma,
                              T=T)
            tmp_ix[0].append(np.sum(loss) - single_global_best)
            tmp_ix[1].append(np.sum(loss[:change_time]) - single_global_best_2)
        #print('eta: %0.3f, regret: %f' % (eta, np.mean(tmp)))
        regrets_ix.append(tmp_ix)
        regrets_exp3.append(tmp_exp3)
        regrets_exp3p.append(tmp_exp3p)

    # Shape (len(etas), 2, repeat); reduce over the repeat axis, then
    # transpose so row 0 = full horizon, row 1 = half horizon.
    regrets_ix = np.array(regrets_ix)
    regrets_exp3p = np.array(regrets_exp3p)
    regrets_exp3 = np.array(regrets_exp3)
    std_ix = np.std(regrets_ix, axis=2).T
    mean_ix = np.mean(regrets_ix, axis=2).T
    std_exp3 = np.std(regrets_exp3, axis=2).T
    mean_exp3 = np.mean(regrets_exp3, axis=2).T
    std_exp3p = np.std(regrets_exp3p, axis=2).T
    mean_exp3p = np.mean(regrets_exp3p, axis=2).T
    means = [mean_exp3, mean_exp3p, mean_ix]
    stds = [std_exp3, std_exp3p, std_ix]
    algos = ['EXP3', 'EXP3.P', 'EXP3-IX']

    # Two subplots, unpack the axes array immediately
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(14, 6))
    # Left panel: regret at T/2 (index 1); right panel: regret at T (index 0).
    for i in range(len(algos)):
        ax1.errorbar(etas,
                     means[i][1],
                     yerr=stds[i][1],
                     fmt='-o',
                     label=algos[i])
    ax1.set_xscale('log')
    ax1.set_xlabel(r'$\eta$ multiplier', fontsize=14)
    ax1.set_ylabel(r'Regret at $T/2$', fontsize=14)
    ax1.legend()
    for i in range(len(algos)):
        ax2.errorbar(etas,
                     means[i][0],
                     yerr=stds[i][0],
                     fmt='-o',
                     label=algos[i])
    ax2.set_xscale('log')
    ax2.set_xlabel(r'$\eta$ multiplier', fontsize=14)
    ax2.set_ylabel(r'Regret at $T$', fontsize=14)
    ax2.legend()
    plt.show()