def compare_all(runs=2000, time=1000):
    """Compare all algorithms."""
    labels = ['epsilon-greedy', 'gradient bandit', 'UCB', 'optimistic initialization']
    generators = [lambda epsilon: Bandit(epsilon=epsilon, sample_averages=True),
                  lambda alpha: Bandit(gradient=True, step_size=alpha, gradient_baseline=True),
                  lambda coef: Bandit(epsilon=0, UCB_param=coef, sample_averages=True),
                  lambda initial: Bandit(epsilon=0, optimistic_init=initial, step_size=0.1)]
    # Parameter exponents (the x-axis is 2**x); np.float was removed from recent NumPy,
    # so use the built-in float instead.
    parameters = [np.arange(-7, -1, dtype=float),
                  np.arange(-5, 2, dtype=float),
                  np.arange(-4, 3, dtype=float),
                  np.arange(-2, 3, dtype=float)]

    bandits = []
    for generator, parameter in zip(generators, parameters):
        for param in parameter:
            bandits.append(generator(pow(2, param)))

    _, average_rewards = simulate(bandits, runs, time)
    rewards = np.mean(average_rewards, axis=1)

    # Plot one curve per algorithm over its own parameter range.
    i = 0
    for label, parameter in zip(labels, parameters):
        l = len(parameter)
        plt.plot(parameter, rewards[i:i + l], label=label)
        i += l
    plt.xlabel('Parameter (2^x)')
    plt.ylabel('Average reward')
    plt.legend()
    plt.savefig(os.path.join(SAVING_PATH, "figure_2_6_compare_all.png"))
    plt.close()
def run_experiment(m1, m2, m3, eps, N, experiment_name):
    bandits = [Bandit(m1), Bandit(m2), Bandit(m3)]
    data = np.empty(N)

    for i in range(N):
        # epsilon greedy
        p = np.random.random()
        if p < eps:
            j = np.random.choice(3)
        else:
            j = np.argmax([b.mean for b in bandits])
        x = bandits[j].pull()
        bandits[j].update(x)

        # for the plot
        data[i] = x

    cumulative_average = np.cumsum(data) / (np.arange(N) + 1)

    # # plot moving average ctr
    # plt.plot(cumulative_average)
    # plt.plot(np.ones(N) * m1)
    # plt.plot(np.ones(N) * m2)
    # plt.plot(np.ones(N) * m3)
    # plt.xscale('log')
    # plt.show()

    for i, b in enumerate(bandits):
        print(f'{experiment_name}, bandit {i + 1} mean: {b.mean} win: {np.sum(data)}')

    return cumulative_average
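# run_experiment above assumes a Bandit class with a true mean, a running estimate `mean`,
# a pull count `N`, and pull()/update() methods. That class is not included in this section;
# the version below is a minimal sample-average sketch of what it might look like (an
# assumption for illustration, not the original implementation).
import numpy as np

class BanditSketch:
    def __init__(self, true_mean):
        self.true_mean = true_mean   # mean of the arm's Gaussian reward
        self.mean = 0.0              # current sample-average estimate
        self.N = 0                   # number of pulls so far

    def pull(self):
        """Draw a unit-variance Gaussian reward centred on the true mean."""
        return np.random.randn() + self.true_mean

    def update(self, x):
        """Incremental sample-average update of the estimated mean."""
        self.N += 1
        self.mean = self.mean + (x - self.mean) / self.N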
def gradient(runs=2000, time=1000):
    """Test the k-armed bandit with gradient.

    Args:
        runs (int): test each bandit with the # of runs.
        time (int): the # of the time-steps in each run.
    """
    bandits = []
    bandits.append(Bandit(gradient=True, step_size=0.1, gradient_baseline=True, true_reward=4))
    bandits.append(Bandit(gradient=True, step_size=0.1, gradient_baseline=False, true_reward=4))
    bandits.append(Bandit(gradient=True, step_size=0.4, gradient_baseline=True, true_reward=4))
    bandits.append(Bandit(gradient=True, step_size=0.4, gradient_baseline=False, true_reward=4))

    print("===== %s =====" % ("Gradient"))
    best_action_counts, _ = simulate(bandits, runs, time)

    labels = ['alpha = 0.1, with baseline',
              'alpha = 0.1, without baseline',
              'alpha = 0.4, with baseline',
              'alpha = 0.4, without baseline']

    for i in range(0, len(bandits)):
        plt.plot(best_action_counts[i], label=labels[i])
    plt.xlabel('Steps')
    plt.ylabel('% Optimal action')
    plt.legend()
    plt.savefig(os.path.join(SAVING_PATH, "figure_2_5_gradient.png"))
    plt.close()
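# The Bandit(gradient=True, ...) runs above presumably use the standard gradient-bandit
# update (soft-max over action preferences). The sketch below is a minimal, self-contained
# illustration of that rule under assumed Gaussian arms: H[a] += alpha * (R - baseline) * (1[a==A] - pi[a]).
# It is not the Bandit class used by these experiments, just the textbook update for reference.
import numpy as np

def gradient_bandit_sketch(k=10, steps=1000, alpha=0.1, use_baseline=True, true_reward=4.0, seed=0):
    rng = np.random.default_rng(seed)
    q_true = rng.normal(true_reward, 1.0, k)   # assumed true action values
    H = np.zeros(k)                            # action preferences
    avg_reward = 0.0                           # incremental reward baseline
    rewards = np.zeros(steps)
    for t in range(steps):
        pi = np.exp(H - H.max())
        pi /= pi.sum()                         # soft-max action probabilities
        a = rng.choice(k, p=pi)
        r = rng.normal(q_true[a], 1.0)
        avg_reward += (r - avg_reward) / (t + 1)
        baseline = avg_reward if use_baseline else 0.0
        one_hot = np.zeros(k)
        one_hot[a] = 1.0
        H += alpha * (r - baseline) * (one_hot - pi)
        rewards[t] = r
    return rewards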
def experiment2():
    params = [{"time_horizon": 500, "number_of_arms": 5},
              {"time_horizon": 5000, "number_of_arms": 5},
              {"time_horizon": 500, "number_of_arms": 10},
              {"time_horizon": 5000, "number_of_arms": 10},
              {"time_horizon": 500, "number_of_arms": 20},
              {"time_horizon": 5000, "number_of_arms": 20}]

    for i in range(len(params)):
        results = []
        time_horizon = params[i]["time_horizon"]
        number_of_arms = params[i]["number_of_arms"]

        agent1 = agentFactory("random", time_horizon, number_of_arms)

        epsilons = []
        for j in range(time_horizon):
            epsilons.append(math.pow((j + 1) * number_of_arms * math.log(j + 1), 1 / 3))
        agent2 = agentFactory("epsilon-greedy", time_horizon, number_of_arms, epsilons)

        agent3 = agentFactory("explore-then-exploit", time_horizon, number_of_arms, time_horizon / 100)
        agent4 = agentFactory("ucb1", time_horizon, number_of_arms)
        agent5 = agentFactory("successive-elimination", time_horizon, number_of_arms)

        bandit = Bandit(time_horizon, number_of_arms, agent1)
        results.append(mc_simulate(n_sim, bandit))
        bandit = Bandit(time_horizon, number_of_arms, agent2)
        results.append(mc_simulate(n_sim, bandit))
        bandit = Bandit(time_horizon, number_of_arms, agent3)
        results.append(mc_simulate(n_sim, bandit, "N=T/100"))
        bandit = Bandit(time_horizon, number_of_arms, agent4)
        results.append(mc_simulate(n_sim, bandit))
        bandit = Bandit(time_horizon, number_of_arms, agent5)
        results.append(mc_simulate(n_sim, bandit))

        plot(results, time_horizon, params[i])
def plot_figures(k, n_bandits, n_steps, eps_list, weight_fn=sample_average,
                 random_walk=False, y_bounds=[0, 1.5], Q_1=0, show=True,
                 method='epsilon-greedy', extra_label='', title=None, percentage=False):
    avg_rew_per_eps = [np.zeros(n_steps) for _ in range(len(eps_list))]
    avg_rew_in_perc = [np.zeros(n_steps) for _ in range(len(eps_list))]

    # Use a separate loop variable for the bandit problem so it does not shadow the eps index.
    for n in range(n_bandits):
        print(n)
        bandit_pb = Bandit(k)
        for i, eps in enumerate(eps_list):
            _, per, avg_rew, _ = a_simple_bandit_algorithm(
                bandit_pb, n_iterations=n_steps, eps=eps, weight_fn=weight_fn,
                random_walk=random_walk, Q_1=Q_1, method=method)
            avg_rew_per_eps[i] += avg_rew
            avg_rew_in_perc[i] += per

    to_plot = avg_rew_in_perc if percentage else avg_rew_per_eps
    bounds = [0, 100] if percentage else y_bounds
    plot_average(to_plot, eps_list, n_bandits, bounds, show, extra_label, title, percentage)
def epsilon_greedy(runs=2000, time=1000):
    """Test the k-armed bandit with the policy of epsilon greedy.

    Args:
        runs (int): test each bandit with the # of runs.
        time (int): the # of the time-steps in each run.
    """
    epsilons = [0, 0.01, 0.1, 0.2]
    bandits = [Bandit(epsilon=eps, sample_averages=True) for eps in epsilons]

    print("===== {} =====".format("Epsilon greedy"))
    best_action_counts, rewards = simulate(bandits, runs, time)

    plt.figure(figsize=(10, 20))

    plt.subplot(2, 1, 1)
    for eps, reward in zip(epsilons, rewards):
        plt.plot(reward, label="epsilon = %.02f" % (eps))
    plt.xlabel('steps')
    plt.ylabel('average reward')
    plt.legend()

    plt.subplot(2, 1, 2)
    for eps, counts in zip(epsilons, best_action_counts):
        plt.plot(counts, label="epsilon = %.02f" % (eps))
    plt.xlabel('steps')
    plt.ylabel('% optimal action')
    plt.legend()

    plt.savefig(os.path.join(SAVING_PATH, "figure_2_2_epsilon_greedy.png"))
    plt.close()
def fig_2_5(n_bandits=2000, n_steps=1000, k=10, alpha_list=[0.1, 0.4]):
    d = {}
    for baseline in [False, True]:
        for alpha in alpha_list:
            d[(baseline, alpha)] = np.zeros(n_steps)

    for n in range(n_bandits):
        print(n)
        bandit = Bandit(k, mean=4)
        for baseline in [False, True]:
            for alpha in alpha_list:
                result_arr, _ = gradient_bandit(bandit, n_steps=n_steps, alpha=alpha, baseline=baseline)
                d[(baseline, alpha)] += result_arr

    def label(baseline, alpha):
        return ("with" if baseline else "without") + f" baseline, alpha={alpha}"

    for key, avg_rew in d.items():
        plt.plot((avg_rew / n_bandits) * 100, label=label(key[0], key[1]))

    axes = plt.gca()
    axes.set_ylim([0, 100])
    plt.xlabel("Steps")
    plt.ylabel("Optimal Action %")
    plt.title("Figure 2.5")
    plt.legend()
    plt.show()
def create_bandit(means, variances):
    def reward_fn(mu, variance):
        stddev = np.sqrt(variance)
        return lambda: np.random.normal(mu, stddev)

    arms = list(map(lambda a: reward_fn(*a), zip(means, variances)))
    return Bandit(arms)
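# create_bandit and the *_bandit constructors below all pass a list of zero-argument reward
# callables to Bandit(arms). That class is not shown in this section; the sketch below is a
# minimal assumed version (pull(i) simply samples arm i), not the actual implementation.
class ArmListBanditSketch:
    def __init__(self, arms):
        self.arms = arms                  # list of callables, one per arm
        self.counts = [0] * len(arms)     # how often each arm has been pulled

    def pull(self, i):
        """Sample a reward from arm i and record the pull."""
        self.counts[i] += 1
        return self.arms[i]()

# Example use with Gaussian arms built the same way as in create_bandit:
# bandit = ArmListBanditSketch([lambda: np.random.normal(0.5, 1.0),
#                               lambda: np.random.normal(1.0, 1.0)])
# reward = bandit.pull(0)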
def run(self, name, agent_name, **agent_parameters):
    """
    :param name: Name of experiments
    :param agent_name: Name of Agent
    :param agent_parameters: Parameters of Agent
    """
    rewards = np.zeros(self.pulls)
    optimal_actions = np.zeros(self.pulls)

    for _ in tqdm(range(self.experiments)):
        bandit = Bandit(self.pulls, self.actions, agent_name, **agent_parameters)
        reward, optimal_action = bandit.experiment()
        rewards += reward
        optimal_actions += optimal_action

    # np.float was removed from recent NumPy releases; the built-in float behaves the same here.
    rewards /= float(self.experiments)
    optimal_actions /= float(self.experiments)

    self.values[name] = {}
    self.values[name]['rewards'] = rewards
    self.values[name]['optimal_actions'] = optimal_actions
def run_greedy(number_bandits, epsilon, iterations):
    bandits = [Bandit(i + 1, 0) for i in range(number_bandits)]
    current_best = bandits[0]
    data = np.empty(iterations)
    print(f'Starting with bandit {current_best.true_mean}.')

    for i in range(iterations):
        explore_exploit = np.random.rand()
        bandit = current_best

        # explore
        if explore_exploit < epsilon:
            selection = np.random.randint(0, number_bandits)
            # print(f'Machine {selection} selected.')
            bandit = bandits[selection]

        # exploit
        value = bandit.pull()
        bandit.update(value)
        data[i] = value

        # update
        if current_best.current_mean < bandit.current_mean:
            print(f'Updated to bandit {bandit.true_mean}')
            current_best = bandit

    print(f'Chose bandit {current_best.true_mean}')

    cumulative_average = np.cumsum(data) / (np.arange(iterations) + 1)
    plt.plot(cumulative_average)
    for i in range(number_bandits):
        plt.plot(np.ones(iterations) * (i + 1))
    plt.xscale('log')
    plt.show()
    return cumulative_average
def captions_bandit():
    data = captions_data

    def reward_fn(percentage):
        return lambda: np.random.choice((0, 0.5, 1), 1, p=percentage)[0]

    arms = list(map(reward_fn, data))
    return Bandit(arms)
def bandit():
    bandit = Bandit()
    game.init_bandit(bandit)
    player = game.get_player()
    return render_template('bandit.html',
                           credits_demanded=bandit.get_credits_demanded(),
                           fskills=player.get_fighter_skills(),
                           pskills=player.get_pilotskills())
def experiment1():
    params = [{"time_horizon": 1000, "number_of_arms": 5},
              {"time_horizon": 10000, "number_of_arms": 5},
              {"time_horizon": 1000, "number_of_arms": 10},
              {"time_horizon": 10000, "number_of_arms": 10},
              {"time_horizon": 1000, "number_of_arms": 20},
              {"time_horizon": 10000, "number_of_arms": 20}]

    for i in range(len(params)):
        results = []
        time_horizon = params[i]["time_horizon"]
        number_of_arms = params[i]["number_of_arms"]

        agent1 = agentFactory("explore-then-exploit", time_horizon, number_of_arms, 5)
        agent2 = agentFactory("explore-then-exploit", time_horizon, number_of_arms, time_horizon / 10)
        agent3 = agentFactory("explore-then-exploit", time_horizon, number_of_arms, time_horizon / 100)

        bandit = Bandit(time_horizon, number_of_arms, agent1)
        results.append(mc_simulate(n_sim, bandit, "N=5"))
        bandit = Bandit(time_horizon, number_of_arms, agent2)
        results.append(mc_simulate(n_sim, bandit, "N=T/10"))
        bandit = Bandit(time_horizon, number_of_arms, agent3)
        results.append(mc_simulate(n_sim, bandit, "N=T/100"))

        plot(results, time_horizon, params[i])
def main(): """ Constants """ k: int = 20 epsilon: float = 0.01 init_val: int = 10 c: float = 1.0 max_time: int = 1000 rounds: int = 1000 policy: Policy = EpsilonGreedyPolicy(epsilon) agent = Agent(k, policy) bandits = [Bandit() for _ in range(k)] def play_round(): """ Simulates one round of the game. """ # get the next action action = agent.choose_action() # get a reward from the bandit reward = bandits[action].get_reward() # play the action agent.play_action(action, reward) return reward def reset(): agent.reset() for bandit in bandits: bandit.reset() optimal_bandit = np.argmax([bandit.get_reward() for bandit in bandits]) def print_bandits(): for i, bandit in enumerate(bandits): print('Bandit {} reward={}'.format(i, bandit.get_reward())) def experiment(): scores = np.zeros(max_time, dtype=float) for _ in range(rounds): for t in range(max_time): scores[t] += play_round() reset() return scores / rounds def plot(label): print_bandits() scores = experiment() time = range(max_time) plt.title(label + " for k = " + str(k)) plt.ylim([0.0, 2.0]) plt.xlabel('Steps') plt.ylabel('Avg. Reward') plt.scatter(x=time, y=scores, s=0.5) plt.show() plot(policy.__str__())
def captions_bandit(n):
    categories = np.array([0, 0.5, 1])
    data = captions_data()[:n]

    def reward_fn(percentage):
        return lambda: np.random.choice(categories, 1, p=percentage)[0]

    arms = list(map(reward_fn, data))
    return Bandit(arms)
def poisson_exp_bandit():
    n = n_arms()
    means_ = poisson_exp_means()

    def reward_fn(lambda_):
        return lambda: np.random.poisson(lambda_)

    arms = list(map(reward_fn, means_))
    return Bandit(arms)
def sparse_bandit(n_arms, variance):
    means = sparse_means(n_arms)
    stddev = np.sqrt(variance)

    def reward_fn(mu):
        return lambda: np.random.normal(mu, stddev)

    arms = list(map(reward_fn, means))
    return Bandit(arms)
def polynomial_bandit(n, variance):
    means = polynomial_means(n)
    stddev = np.sqrt(variance)

    def reward_fn(mu):
        return lambda: np.random.normal(mu, stddev)

    arms = list(map(reward_fn, means))
    return Bandit(arms)
def upper_confidence_bound(runs=2000, time=1000):
    """Test the k-armed bandit with UCB.

    Args:
        runs (int): test each bandit with the # of runs.
        time (int): the # of the time-steps in each run.
    """
    bandits = []
    bandits.append(Bandit(epsilon=0, UCB_param=2, sample_averages=True))
    bandits.append(Bandit(epsilon=0.1, sample_averages=True))

    _, average_rewards = simulate(bandits, runs, time)

    plt.plot(average_rewards[0], label='UCB c = 2')
    plt.plot(average_rewards[1], label='epsilon greedy epsilon = 0.1')
    plt.xlabel('Steps')
    plt.ylabel('Average reward')
    plt.legend()
    plt.savefig(os.path.join(SAVING_PATH, "figure_2_4_UCB.png"))
    plt.close()
def get_market_data():
    global username
    if request.method == "GET":
        band1 = Bandit()
        trader1 = Trader()
        police1 = Police()
        cost1 = Cost()

        market_inventory = regions.child(currRegion).child('inventory').get()
        ship_inventory = users.child(username).child('ship').child('inventory').get()
        difficulty = users.child(username).child('difficulty').get()
        ship_cargo = users.child(username).child('ship').child('cargo').get()
        credit_val = users.child(username).child('credit').get()
        ship_health = users.child(username).child('ship').child('health').get()
        pilot_skill = users.child(username).child('skills').child('pilot').get()
        engineer = users.child(username).child('skills').child('engineer').get()
        fighter_skill = users.child(username).child('skills').child('fighter').get()
        merchant_skill = users.child(username).child('skills').child('merchant').get()
        fuel = users.child(username).child('ship').child('fuel').get()

        fuelcost = cost1.calculate_fuel(difficulty, credit_val)
        demand = band1.calculate_demand(difficulty, credit_val)
        repair = cost1.calculate_repair(difficulty, engineer, credit_val)
        price = trader1.item_to_sell(difficulty, credit_val)
        qty = trader1.qty
        item = trader1.item
        stolen = police1.stolen_item(ship_inventory)

        to_return = {
            'username': username,
            'currRegion': currRegion,
            'market_inventory': market_inventory,
            'ship_inventory': ship_inventory,
            'cargo': ship_cargo,
            'credit': credit_val,
            'health': ship_health,
            'demand': demand,
            'qty': qty,
            'item': item,
            'price': price,
            'eng': engineer,
            'stolen': stolen,
            'difficulty': difficulty,
            'pilot': pilot_skill,
            'fighter': fighter_skill,
            'fuel': fuel,
            'fuelcost': fuelcost,
            'merch': merchant_skill,
            'repair': repair
        }
        return to_return
    return None
def run_experiment(m1, m2, m3, N, experiment_name):
    bandits = [Bandit(m1), Bandit(m2), Bandit(m3)]
    data = np.empty(N)

    for i in range(N):
        j = np.argmax([ucb(b.mean, i + 1, b.N) for b in bandits])
        x = bandits[j].pull()
        bandits[j].update(x)

        # for the plot
        data[i] = x

    cumulative_average = np.cumsum(data) / (np.arange(N) + 1)

    for i, b in enumerate(bandits):
        print(f'{experiment_name}, bandit {i + 1} mean: {b.mean} win: {np.sum(data)}')

    return cumulative_average
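# The run_experiment above relies on a ucb() helper that is not shown in this section. A common
# choice is the UCB1 bound, mean + sqrt(2 * ln(n) / n_j); the function below sketches that form
# as an assumption about what ucb() computes, not the definition used by this code.
import numpy as np

def ucb_sketch(mean, n, n_j):
    """Upper confidence bound for an arm pulled n_j times after n total plays."""
    if n_j == 0:
        return float('inf')   # force every arm to be tried at least once
    return mean + np.sqrt(2 * np.log(n) / n_j)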
def __init__(self, predict_market, num_bids, trials, label='Multi-Armed Prediction Market Bandit'):
    self.predict_market = predict_market
    self.n_arms = predict_market.arms
    self.agents = predict_market.agents
    self.data = predict_market.dataframe
    self.num_bids = num_bids
    self.label = label
    self.bandit = Bandit(self.n_arms, self.data)
    self.trials = trials
    self.scores = None
    self.optimal = None
def init_bandits(self, holdout=True):
    """Specify some way to split up the indices?"""
    df_len = len(self.data_sim.df.index)
    num_divide = self.n_bandits + 1 if holdout else self.n_bandits
    increment = int(df_len / num_divide)

    for i in range(self.n_bandits):
        indices = [i * increment, (i + 1) * increment]
        self.bandits.append(Bandit(self.n_arms, self.data_sim.df, data_indices=indices))

    if holdout:
        self.df_holdout = self.data_sim.df[df_len - increment + 1:]

    self.trials = increment if self.trials is None else self.trials
def createBanditInstancesAndSimulate(params, n_mc_sim):
    n_sim = n_mc_sim
    for i in range(len(params)):
        results = []
        time_horizon = params[i]['time_horizon']
        number_of_arms = params[i]['number_of_arms']
        number_of_exploration_per_arm = params[i]['number_of_exploration_per_arm']

        exp_agent = ExploreThenExploit(time_horizon, number_of_arms, number_of_exploration_per_arm)
        epsilon_greedy_constant_half_epsilonAgent = EpsilonGreedy(
            time_horizon, number_of_arms, [1 / 2] * time_horizon)
        epsilon_greedy_constant_epsilonAgent = EpsilonGreedy(
            time_horizon, number_of_arms,
            [number_of_exploration_per_arm * number_of_arms / time_horizon] * time_horizon)
        ubc_agent = UBC1Agent(time_horizon, number_of_arms)
        se_agent = SuccessiveEliminationAgent(time_horizon, number_of_arms)
        random_agent = Agent()

        bandit = Bandit(time_horizon, number_of_arms, random_agent)
        results.append(mc_simulate(n_sim, bandit))
        bandit = Bandit(time_horizon, number_of_arms, exp_agent)
        results.append(mc_simulate(n_sim, bandit))
        bandit = Bandit(time_horizon, number_of_arms, epsilon_greedy_constant_epsilonAgent)
        results.append(mc_simulate(n_sim, bandit, "constant-epsilon=rate-of-explore-exploit"))
        bandit = Bandit(time_horizon, number_of_arms, epsilon_greedy_constant_half_epsilonAgent)
        results.append(mc_simulate(n_sim, bandit, "constant-epsilon=0.5"))
        bandit = Bandit(time_horizon, number_of_arms, ubc_agent)
        results.append(mc_simulate(n_sim, bandit))
        bandit = Bandit(time_horizon, number_of_arms, se_agent)
        results.append(mc_simulate(n_sim, bandit))

        plot(results, time_horizon, params[i])
def __init__(self, models, n, a, task=None):
    self.models = models
    self.n = n
    self.a = a
    self.num_arms = models.shape[1]
    self.num_models = models.shape[0]
    self.counts = np.zeros([self.num_arms])
    self.means = np.zeros([self.num_arms])
    if task is None:
        self.task = np.random.choice(self.num_models)
    else:
        self.task = task
    self.bandit = Bandit(self.models[self.task])
def run(algorithm, non_stationary=False):
    """Runs an algorithm with the specified parameters.

    algorithm: Instance of an algorithm from `algorithms.py`.
    non_stationary: Whether to run the stationary or non_stationary test bench.
    """
    with open('config.yml') as cfile:
        # yaml.load() without an explicit Loader is rejected by recent PyYAML; safe_load is equivalent here.
        config = y.safe_load(cfile)['run']

    runs, steps = config['runs'], config['steps']
    avg_rewards, optim_action_percent = np.zeros(steps), np.zeros(steps)

    for run in range(runs):
        bandit = Bandit(non_stationary)
        print(f'Run number {run + 1}.')

        # One-run rewards
        or_rewards = []
        # One-run actions
        or_actions = []

        optim_action = np.argmax([bandit.q_star(a) for a in range(10)])
        # One-run optimal actions
        or_optim_actions = [] if non_stationary else optim_action

        for step in range(1, steps + 1):
            action = algorithm.act(step)
            reward = bandit(action)
            if non_stationary:
                optim_action = np.argmax([bandit.q_star(a) for a in range(10)])
            algorithm.update(action, reward, step)

            or_rewards.append(reward)
            or_actions.append(action)
            if non_stationary:
                or_optim_actions.append(optim_action)

        avg_rewards += or_rewards
        if non_stationary:
            a, o = np.array(or_actions), np.array(or_optim_actions)
        else:
            a, o = np.array(or_actions), or_optim_actions
        optim_action_percent += (a == o)

        algorithm.reset()

    avg_rewards /= runs
    optim_action_percent = (optim_action_percent / runs) * 100.0
    return avg_rewards, optim_action_percent
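# run() above expects a config.yml with a top-level `run` section providing `runs` and `steps`.
# The project's actual file is not part of this section; the snippet below writes an assumed
# example (made-up values, separate filename) and reads it back the same way run() does, just to
# illustrate the expected shape of the config.
import yaml as y

example = "run:\n  runs: 2000\n  steps: 1000\n"   # assumed values, not the real config

with open('config_example.yml', 'w') as cfile:
    cfile.write(example)

with open('config_example.yml') as cfile:
    config = y.safe_load(cfile)['run']
print(config['runs'], config['steps'])            # -> 2000 1000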
def __init__(self, agent, k=10, stationary=True):
    print(f'Initialising {k} Bandits')
    self.k = k
    self.timestep = 0
    self.player = agent
    self.stationary = stationary

    if agent.character == 'optimistic':
        self.values = [np.random.randint(50, 100)] * self.k
    else:
        self.values = np.zeros(self.k)

    mu = [np.random.randint(10) for _ in range(self.k)]
    sig = [np.random.rand() for _ in range(self.k)]
    print(f"Initial actual average rewards are {mu}")

    self.bandits = [
        Bandit(mean, sd, stationary) for (mean, sd) in zip(mu, sig)
    ]
def run_optimistic(number_bandits, iterations):
    bandits = [Bandit(i + 1, 10) for i in range(number_bandits)]
    data = np.empty(iterations)

    for i in range(iterations):
        # exploit
        bandit = bandits[np.argmax([bandit.current_mean for bandit in bandits])]
        value = bandit.pull()
        bandit.update(value)
        data[i] = value

    cumulative_average = np.cumsum(data) / (np.arange(iterations) + 1)
    plt.plot(cumulative_average)
    for i in range(number_bandits):
        plt.plot(np.ones(iterations) * (i + 1))
    plt.xscale('log')
    plt.show()
    return cumulative_average
def compare_epsilons(
    epsilons: List[float],
    bandits_true_means: List[float],
    iterations: int,
) -> Tuple[List[EpsilonGreedyAgent], List[float]]:
    """Compare different epsilons for the epsilon-greedy algorithm."""
    agents = []
    bandits = [Bandit(m) for m in bandits_true_means]

    for epsilon in epsilons:
        logger.info("Running epsilon-greedy for epsilon = %f", epsilon)
        agent = EpsilonGreedyAgent(bandits=bandits, epsilon=epsilon)
        agent.take_actions(iterations)
        agents.append(agent)

    return agents, epsilons
def test_greedy(epsilon, num_iterations):
    # Problem setup
    num_bandits = 5                                                   # set number of bandits
    m_vals = [np.random.randint(30, 41) for _ in range(num_bandits)]  # create true mean values
    bandits = [Bandit(m=i) for i in m_vals]                           # create bandits
    data = np.empty(num_iterations)                                   # empty array for output data

    # Epsilon-Greedy algorithm
    for i in range(num_iterations):
        probability = np.random.rand()
        if probability < epsilon:
            bandit = bandits[np.random.choice(num_bandits)]
        else:
            bandit = bandits[np.argmax([bandit.mean for bandit in bandits])]
        output = bandit.pull()
        bandit.update(output)
        data[i] = output

    # Get cumulative average of all spins
    cumulative_average = np.cumsum(data) / (np.arange(num_iterations) + 1)
    return cumulative_average