import numpy as np


def history(means, algorithm, seed, horizon, epsilon=0.02):
    '''Returns regret history from T = 0 to T = horizon.'''
    np.random.seed(seed)
    bandit = BanditInstance(means)
    optimal_mean = np.amax(means)  # mean reward of the best arm
    reward_sum = 0
    regret = np.zeros(horizon + 1)
    strategy = None
    if algorithm == 'epsilon-greedy':
        strategy = EpsilonGreedy(means.shape[0], epsilon)
    elif algorithm == 'ucb':
        strategy = UCB(means.shape[0])
    elif algorithm == 'kl-ucb':
        strategy = KLUCB(means.shape[0])
    elif algorithm == 'thompson-sampling':
        strategy = ThompsonSampling(means.shape[0])
    elif algorithm == 'thompson-sampling-with-hint':
        strategy = ThompsonSamplingWithHint(means.shape[0], np.sort(means))
    for i in range(1, horizon + 1):
        arm = strategy.getArm()
        reward = bandit.pull(arm)
        strategy.getReward(arm, reward)
        reward_sum += reward
        regret[i] = i * optimal_mean - reward_sum  # expected regret after i pulls
    return regret
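# A minimal usage sketch (an illustration, not part of the original code). It
# assumes history() and the bandit/strategy classes above are available in this
# module, and averages the UCB regret curve over a handful of seeds. The means,
# horizon, and seed range below are hypothetical.
import numpy as np

means = np.array([0.4, 0.5, 0.6])   # hypothetical 3-armed Bernoulli instance
horizon = 1000
seeds = range(10)

curves = [history(means, 'ucb', seed, horizon) for seed in seeds]
avg_regret = np.mean(curves, axis=0)
print("average regret at T = %d: %.2f" % (horizon, avg_regret[-1]))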
# Experiment 1
experiment = Experiment(1)
experiment.log_code()

N = 50
epsilon = .3
simulations = 10000
T = 400
algorithms = [
    GeneralCausal(truncate='None'),
    ParallelCausal(),
    SuccessiveRejects(),
    AlphaUCB(2),
    ThompsonSampling(),
]

m_vals = range(2, N, 2)
regret, models = regret_vs_m(algorithms, m_vals, N, T, epsilon,
                             simulations=simulations)
experiment.plot_regret(regret, m_vals, "m", algorithms, legend_loc="lower right")
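# Illustrative sketch only (not the actual regret_vs_m implementation): the
# general shape of such a sweep is to re-run every algorithm `simulations`
# times for each value of m and record the mean regret. The run_once callback
# is a hypothetical stand-in for "simulate one bandit run and return its regret".
import numpy as np

def sweep_regret(algorithms, m_vals, simulations, run_once):
    """run_once(algorithm, m) -> regret of a single simulated run (hypothetical)."""
    mean_regret = np.zeros((len(algorithms), len(m_vals)))
    for a, algorithm in enumerate(algorithms):
        for j, m in enumerate(m_vals):
            mean_regret[a, j] = np.mean(
                [run_once(algorithm, m) for _ in range(simulations)])
    return mean_regret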
    return m_vals, regret, models  # closing line of the regret-sweep helper defined above


# Experiment 4
experiment = Experiment(4)
experiment.log_code()

N = 50
N1_vals = range(1, N, 3)
pz = .4
q = (0.00001, 0.00001, .4, .65)
epsilon = .3
simulations = 10000
T = 400
algorithms = [SuccessiveRejects(), GeneralCausal(), AlphaUCB(2), ThompsonSampling()]

pY = ParallelConfounded.pY_epsilon_best(q, pz, epsilon)
m_vals, regret, models = regret_vs_m_general(algorithms, N1_vals, N, T, pz, pY, q, epsilon,
                                             simulations=simulations)
experiment.plot_regret(regret, m_vals, "m", algorithms, legend_loc="lower right",
                       legend_extra=[ParallelCausal])
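# Optional follow-up sketch (an assumption, not in the original script): if
# `regret` comes back as a NumPy array, the raw results can be saved so the
# plot can be regenerated without re-running the 10000 simulations. The
# filenames below are hypothetical.
import numpy as np

np.save("experiment4_regret.npy", regret)
np.save("experiment4_m_vals.npy", np.asarray(list(m_vals)))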