def __init__(self):
    self.payoutModifier1 = 1.0
    self.payoutModifier2 = 2.0
    self.payoutModifier3 = 3.0
    self.iterations = 10000
    self.means = [10, 10, 10]
    self.bandits = [
        bandit.Bandit(self.payoutModifier1),
        bandit.Bandit(self.payoutModifier2),
        bandit.Bandit(self.payoutModifier3)
    ]
    self.data = np.empty(self.iterations)
def __init__(self, net, n_envs, n_bandits, bandit_prob, bootstrap=True):
    self.net = net  # PyTorch Module
    self.pi_space = n_bandits
    self.prob = bandit_prob
    self.n_envs = n_envs
    self.softmax = bandit.Bandit().softmax
    self.bootstrap = bootstrap
def run_experiment(mu, N, agent):
    """
    Runs the experiment.

    Inputs:
        mu    - numpy array of means for the bandits
        N     - number of turns
        agent - an agent instance with methods choose_bandit and update
    Output:
        cumulative_average - running average of the rewards after each turn
    """
    # Make bandits
    n_bandits = len(mu)
    bandits = list()
    for i in range(n_bandits):
        bandits.append(bandit.Bandit(mu[i]))

    # Reward vector (could leave this to the agent)
    rewards = np.zeros(N)

    # Run simulation
    for i in range(N):
        j = agent.choose_bandit()
        reward = bandits[j].pull()
        agent.update(reward)
        rewards[i] = reward

    # Calculate running average
    cumulative_average = np.cumsum(rewards) / (np.arange(N) + 1)
    return cumulative_average
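# run_experiment only assumes the agent exposes choose_bandit() and
# update(reward). The class below is an illustrative epsilon-greedy sketch of
# that interface; the class name and attributes are assumptions, not part of
# the original module.
class EpsilonGreedyAgent:
    def __init__(self, n_bandits, epsilon=0.1):
        self.epsilon = epsilon
        self.estimates = np.zeros(n_bandits)  # running mean reward per arm
        self.counts = np.zeros(n_bandits)     # number of pulls per arm
        self.last_choice = 0

    def choose_bandit(self):
        # Explore with probability epsilon, otherwise exploit the best estimate
        if np.random.random() < self.epsilon:
            self.last_choice = np.random.randint(len(self.estimates))
        else:
            self.last_choice = int(np.argmax(self.estimates))
        return self.last_choice

    def update(self, reward):
        # Incremental mean update for the arm that was just pulled
        j = self.last_choice
        self.counts[j] += 1
        self.estimates[j] += (reward - self.estimates[j]) / self.counts[j]

# Example: cumulative_average = run_experiment(np.array([1.0, 2.0, 3.0]), 10000,
#                                              EpsilonGreedyAgent(n_bandits=3))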
def form():
    """
    Provide a form with cumulative trial and success inputs for multiple
    options, return the options with a suggested budget share for the next
    period
    """
    if request.method == 'POST':
        entries = [value for value in list(request.form.values()) if value]
        num_options = int(len(entries) / 2)
        options = pd.DataFrame([{'option': str(i + 1)}
                                for i in range(num_options)])
        trials = [int(entries[i * 2]) for i in range(num_options)]
        successes = [int(entries[i * 2 + 1]) for i in range(num_options)]
        bandit = ban.Bandit(num_options=num_options, memory=False)
        for i in range(num_options):
            bandit.add_results(option_id=i, trials=trials[i],
                               successes=successes[i])
        shares = choose(bandit=bandit, accelerate=False)
        options = format_results(options, shares)
        records = options.to_dict('records')
        columns = options.columns.values
        save_plot(bandit)
        return render_template('form_result.html', records=records,
                               columns=columns,
                               plot='/static/images/plot.png')
    return render_template('form.html')
def __init__(self, epsilon):
    self.payoutModifier1 = 1.0
    self.payoutModifier2 = 2.0
    self.payoutModifier3 = 3.0
    self.iterations = 10000
    self.epsilon = epsilon
    self.results = [0, 0, 0]
    self.bandits = [
        bandit.Bandit(self.payoutModifier1),
        bandit.Bandit(self.payoutModifier2),
        bandit.Bandit(self.payoutModifier3)
    ]
    self.data = np.empty(self.iterations)
def new_envs(self):
    """ Makes a new list of bandit environments. """
    envs = []
    for i in range(self.n_envs):
        # Randomly decide which of the two arms gets the higher payout probability
        rand = np.random.random()
        probs = [self.prob, 1 - self.prob] if rand <= 0.5 else [1 - self.prob, self.prob]
        envs.append(bandit.Bandit(probs=probs))
    return envs
def main(): print("---- Starting.... ----") Nexp = 1000 Npulls = 2000 #=========== Epsilon Greedy Experiments (Nonstationary) ========== if(1): avg_outcome_RC1 = np.zeros(Npulls) avg_optimal_arm_RC1 = np.zeros(Npulls) avg_outcome_eps1 = np.zeros(Npulls) avg_optimal_arm_eps1 = np.zeros(Npulls) for i in range(Nexp): bandit = bndt.Bandit(10) #10 armed bandit outcome_RC, arms_RC = experiment_RC(bandit,Npulls, alpha=0.1, beta=0.2) avg_outcome_RC1 += outcome_RC avg_optimal_arm_RC1 += arms_RC bandit = bndt_eps.Bandit(10) #10 armed bandit outcome_eps1, arms_eps1 = experiment_epsilonGreedy(bandit, 0.1, Npulls) avg_outcome_eps1 += outcome_eps1 avg_optimal_arm_eps1 += arms_eps1 avg_outcome_RC1 /= np.float(Nexp) avg_optimal_arm_RC1 /= np.float(Nexp) avg_outcome_eps1 /= np.float(Nexp) avg_optimal_arm_eps1 /= np.float(Nexp) # plot results plt.plot(avg_outcome_RC1,label="RC: a=0.1 b=0.2") plt.plot(avg_outcome_eps1,label="Eps: eps=0.1 a=1/k") plt.legend(loc=0) plt.title('Average Reward: Eps-Greedy vs Reinf. Comp. (Stationary Problem)') plt.ylabel('Average Reward') plt.xlabel('Number of pulls/plays') plt.figure() plt.plot(avg_optimal_arm_RC1*100.0, label='RC a=0.1 b=0.2') plt.plot(avg_optimal_arm_eps1*100.0, label='Eps eps=0.1 a=1/k') plt.ylim(0,100) plt.legend(loc=0) plt.title('Average %Optimal Arm Chosen: Eps-Greedy vs Reinf. Comp.(Stationary Problem)') plt.xlabel('Number of pulls/plays') plt.ylabel('Percent Optimal Arm') plt.show()
def add_daily_results(data, num_options, memory, shape, cutoff, cut_level):
    """ For each day, add a period with its option results to the Bandit """
    bandit = ban.Bandit(num_options, memory, shape, cutoff, cut_level)
    for i in range(cutoff + 1):
        bandit.add_period()
        daily_results = data.loc[data['date'] == datetime.date.today() -
                                 datetime.timedelta(days=cutoff - i)]
        for j in range(len(daily_results)):
            bandit.add_results(int(daily_results.iloc[j]['option_id']),
                               daily_results.iloc[j]['trials'],
                               daily_results.iloc[j]['successes'])
    return bandit
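# Illustrative usage sketch: add_daily_results expects `data` to be a pandas
# DataFrame with 'date', 'option_id', 'trials' and 'successes' columns, one row
# per option per day. The values and parameters below are made-up assumptions,
# chosen only to show the expected shape of the inputs.
import datetime
import pandas as pd

example_data = pd.DataFrame({
    'date': [datetime.date.today() - datetime.timedelta(days=1),
             datetime.date.today() - datetime.timedelta(days=1)],
    'option_id': [0, 1],
    'trials': [500, 500],
    'successes': [12, 19],
})
example_bandit = add_daily_results(example_data, num_options=2, memory=True,
                                   shape='linear', cutoff=28, cut_level=0.5)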
def main():
    timesteps = int(sys.argv[1])
    b = bandit.Bandit()
    regret = 0.
    for t in range(timesteps):
        # Choose an arm
        a = 0
        # Pull the arm, obtain a reward
        ret = b.trigger(a)
        regret += b.opt() - ret
        # Learn from a and ret
        print('Reward', ret, 'regret', regret)
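# The loop in main() always pulls arm 0 and leaves the "learn from a and ret"
# step empty. The function below is an illustrative UCB1 fill-in, not part of
# the original script: n_arms is an assumed parameter, since the snippet does
# not say how many arms bandit.Bandit() exposes.
def main_ucb1(timesteps, n_arms):
    b = bandit.Bandit()
    regret = 0.
    values = np.zeros(n_arms)  # running mean reward per arm
    counts = np.zeros(n_arms)  # number of pulls per arm
    for t in range(timesteps):
        if t < n_arms:
            a = t  # play each arm once before applying the UCB rule
        else:
            a = int(np.argmax(values + np.sqrt(2 * np.log(t) / counts)))
        # Pull the arm, obtain a reward
        ret = b.trigger(a)
        regret += b.opt() - ret
        # Learn from a and ret: incremental mean update for the pulled arm
        counts[a] += 1
        values[a] += (ret - values[a]) / counts[a]
        print('Reward', ret, 'regret', regret)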
def create():
    return [bandit.Bandit(1.0), bandit.Bandit(2.0), bandit.Bandit(3.0)]
def main(): print("---- Starting.... ----") Nexp = 100 Npulls = 1000 #=========== Epsilon Greedy Experiments ========== if(1): avg_outcome_eps0p0 = np.zeros(Npulls) avg_outcome_eps0p01 = np.zeros(Npulls) avg_outcome_eps0p1 = np.zeros(Npulls) avg_optimal_arm_eps0p0 = np.zeros(Npulls) avg_optimal_arm_eps0p01 = np.zeros(Npulls) avg_optimal_arm_eps0p1 = np.zeros(Npulls) for i in range(Nexp): bandit = bndt.Bandit(10) #10 armed bandit outcome_eps0p0, arms_eps0p0 = experiment_epsilonGreedy(bandit,0.0,Npulls) avg_outcome_eps0p0 += outcome_eps0p0 avg_optimal_arm_eps0p0 += arms_eps0p0 bandit = bndt.Bandit(10) #10 armed bandit outcome_eps0p01, arms_eps0p01 = experiment_epsilonGreedy(bandit,0.01,Npulls) avg_outcome_eps0p01 += outcome_eps0p01 avg_optimal_arm_eps0p01 += arms_eps0p01 bandit = bndt.Bandit(10) #10 armed bandit outcome_eps0p1, arms_eps0p1 = experiment_epsilonGreedy(bandit,0.1,Npulls) avg_outcome_eps0p1 += outcome_eps0p1 avg_optimal_arm_eps0p1 += arms_eps0p1 avg_outcome_eps0p0 /= np.float(Nexp) avg_outcome_eps0p01 /= np.float(Nexp) avg_outcome_eps0p1 /= np.float(Nexp) avg_optimal_arm_eps0p0 /= np.float(Nexp) avg_optimal_arm_eps0p01 /= np.float(Nexp) avg_optimal_arm_eps0p1 /= np.float(Nexp) # plot results plt.plot(avg_outcome_eps0p0,label="eps = 0.0") plt.plot(avg_outcome_eps0p01,label="eps = 0.01") plt.plot(avg_outcome_eps0p1,label="eps = 0.1") plt.ylim(0,2) plt.legend() plt.title('N-arm bandit problem simulation (N=10) using epsilon-greedy') plt.ylabel('Average Reward') plt.xlabel('Number of pulls/plays') plt.figure() plt.plot(avg_optimal_arm_eps0p0*100.0, label='eps = 0.0') plt.plot(avg_optimal_arm_eps0p01*100.0, label='eps = 0.01') plt.plot(avg_optimal_arm_eps0p1*100.0, label='eps = 0.1') plt.ylim(0,100) plt.legend(loc=0) plt.title('Average Percent Optimal Arm Chosen') plt.xlabel('Number of pulls/plays') plt.ylabel('Percent Optimal Arm') plt.show() #========== Softmax experiments ========== if(0): print('Softmax with different temperatures') #avg_outcome_eps = np.zeros(Npulls) #avg_optimal_arm_eps = np.zeros(Npulls) avg_outcome_softmax0 = np.zeros(Npulls) avg_optimal_arm_softmax0 = np.zeros(Npulls) avg_outcome_softmax1 = np.zeros(Npulls) avg_optimal_arm_softmax1 = np.zeros(Npulls) avg_outcome_softmax2 = np.zeros(Npulls) avg_optimal_arm_softmax2 = np.zeros(Npulls) avg_outcome_softmax3 = np.zeros(Npulls) avg_optimal_arm_softmax3 = np.zeros(Npulls) for i in range(Nexp): # bandit = bndt.Bandit(10) #10 armed bandit # outcome_eps, arms_eps = experiment_epsilonGreedy(bandit,0.0,Npulls) # avg_outcome_eps += outcome_eps # avg_optimal_arm_eps += arms_eps bandit = bndt.Bandit(10) #10 armed bandit outcome_softmax, arms_softmax = experiment_softmax(bandit,0.01,Npulls) avg_outcome_softmax0 += outcome_softmax avg_optimal_arm_softmax0 += arms_softmax bandit = bndt.Bandit(10) #10 armed bandit outcome_softmax, arms_softmax = experiment_softmax(bandit,0.1,Npulls) avg_outcome_softmax1 += outcome_softmax avg_optimal_arm_softmax1 += arms_softmax bandit = bndt.Bandit(10) #10 armed bandit outcome_softmax, arms_softmax = experiment_softmax(bandit,1,Npulls) avg_outcome_softmax2 += outcome_softmax avg_optimal_arm_softmax2 += arms_softmax bandit = bndt.Bandit(10) #10 armed bandit outcome_softmax, arms_softmax = experiment_softmax(bandit,10,Npulls) avg_outcome_softmax3 += outcome_softmax avg_optimal_arm_softmax3 += arms_softmax # avg_outcome_eps /= np.float(Nexp) # avg_optimal_arm_eps /= np.float(Nexp) avg_outcome_softmax0 /= np.float(Nexp) avg_optimal_arm_softmax0 /= np.float(Nexp) avg_outcome_softmax1 /= 
np.float(Nexp) avg_optimal_arm_softmax1 /= np.float(Nexp) avg_outcome_softmax2 /= np.float(Nexp) avg_optimal_arm_softmax2 /= np.float(Nexp) avg_outcome_softmax3 /= np.float(Nexp) avg_optimal_arm_softmax3 /= np.float(Nexp) # plot results # plt.plot(avg_outcome_eps,label="eps = 0.1") plt.plot(avg_outcome_softmax0,label="temp = 0.01") plt.plot(avg_outcome_softmax1,label="temp = 0.1") plt.plot(avg_outcome_softmax2,label="temp = 1") plt.plot(avg_outcome_softmax3,label="temp = 10") plt.ylim(0,2) plt.legend() plt.title('N-arm bandit problem simulation (N=10) using softmax') plt.ylabel('Average Reward') plt.xlabel('Number of pulls/plays') plt.figure() # plt.plot(avg_optimal_arm_eps*100.0, label='eps = 0.1') plt.plot(avg_optimal_arm_softmax0*100.0, label='temp = 0.01') plt.plot(avg_optimal_arm_softmax1*100.0, label='temp = 0.1') plt.plot(avg_optimal_arm_softmax2*100.0, label='temp = 1') plt.plot(avg_optimal_arm_softmax3*100.0, label='temp = 10') plt.ylim(0,100) plt.legend(loc=0) plt.title('Average Percent Optimal Arm Chosen') plt.xlabel('Number of pulls/plays') plt.ylabel('Percent Optimal Arm') plt.show()
def simulate(method, periods, true_rates, deviation, change, trials, max_p=None,
             rounding=True, accelerate=True, memory=True, shape='linear',
             cutoff=28, cut_level=0.5):
    """
    Simulate option choosing and results adding for n periods and a given
    chooser, return respective successes with optimum and base
    """
    num_options = len(true_rates)
    rate_changes = [
        random.uniform(1 - change, 1 + change) for rate in true_rates
    ]

    # Initialize Split or Bandit instances
    if method == 'split':
        chooser = spl.Split(num_options=num_options)
    elif method == 'bandit':
        chooser = ban.Bandit(num_options=num_options, memory=memory,
                             shape=shape, cutoff=cutoff, cut_level=cut_level)

    # For each period calculate and add successes for methods as well as
    # the optimal (max) and the random choice (base)
    successes = []
    max_successes = []
    base_successes = []
    for period in range(periods):
        # Calculate success rates under uncertainty (with deviation)
        rates = [
            min(
                max(
                    np.random.RandomState((i + 1) * (period + 1)).normal(
                        loc=rate * rate_changes[i]**period,
                        scale=rate * rate_changes[i]**period * deviation), 0), 1)
            for i, rate in enumerate(true_rates)
        ]

        # Add results to Split or Bandit
        if method == 'split':
            successes.append(
                add_split_results(trials, max_p, rates, chooser, period,
                                  rounding))
        elif method == 'bandit':
            if memory:
                chooser.add_period()
            successes.append(
                add_bandit_results(num_options, trials, rates, chooser, period,
                                   rounding, accelerate))

        # Add results to max and base successes
        if period == 0:
            if rounding:
                max_successes = [round(trials * max(rates))]
                base_successes = [
                    np.sum([
                        round(trials / num_options * rates[i])
                        for i in range(num_options)
                    ])
                ]
            else:
                max_successes = [trials * max(rates)]
                base_successes = [
                    np.sum([
                        trials / num_options * rates[i]
                        for i in range(num_options)
                    ])
                ]
        else:
            if rounding:
                max_successes.append(max_successes[-1] +
                                     round(trials * max(rates)))
                base_successes.append(base_successes[-1] + np.sum([
                    round(trials / num_options * rates[i])
                    for i in range(num_options)
                ]))
            else:
                max_successes.append(max_successes[-1] + trials * max(rates))
                base_successes.append(base_successes[-1] + np.sum([
                    trials / num_options * rates[i] for i in range(num_options)
                ]))

    return [successes, max_successes, base_successes]
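# Illustrative usage sketch (the parameter values are arbitrary assumptions,
# chosen only to show the expected input shapes): three options with true
# success rates of 2-3%, per-period rate drift of up to 3%, and 1000 trials
# allocated each period.
successes, max_successes, base_successes = simulate(
    method='bandit', periods=26, true_rates=[0.02, 0.025, 0.03],
    deviation=0.1, change=0.03, trials=1000)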