Example no. 1
def compare_all(runs=2000, time=1000):
    """Compare all algorithms.
    """
    labels = ['epsilon-greedy', 'gradient bandit',
              'UCB', 'optimistic initialization']
    generators = [lambda epsilon: Bandit(epsilon=epsilon, sample_averages=True),
                  lambda alpha: Bandit(gradient=True, step_size=alpha, gradient_baseline=True),
                  lambda coef: Bandit(epsilon=0, UCB_param=coef, sample_averages=True),
                  lambda initial: Bandit(epsilon=0, optimistic_init=initial, step_size=0.1)]
    parameters = [np.arange(-7, -1, dtype=float),
                  np.arange(-5, 2, dtype=float),
                  np.arange(-4, 3, dtype=float),
                  np.arange(-2, 3, dtype=float)]

    bandits = []
    for generator, parameter in zip(generators, parameters):
        for param in parameter:
            bandits.append(generator(pow(2, param)))

    _, average_rewards = simulate(bandits, runs, time)
    rewards = np.mean(average_rewards, axis=1)

    i = 0
    for label, parameter in zip(labels, parameters):
        l = len(parameter)
        plt.plot(parameter, rewards[i:i+l], label=label)
        i += l
    plt.xlabel('Parameter(2^x)')
    plt.ylabel('Average reward')
    plt.legend()

    plt.savefig(os.path.join(SAVING_PATH, "figure_2_6_compare_all.png"))
    plt.close()
def run_experiment(m1, m2, m3, eps, N, experiment_name):
    bandits = [Bandit(m1), Bandit(m2), Bandit(m3)]

    data = np.empty(N)

    for i in range(N):
        # epsilon greedy
        p = np.random.random()
        if p < eps:
            j = np.random.choice(3)
        else:
            j = np.argmax([b.mean for b in bandits])
        x = bandits[j].pull()
        bandits[j].update(x)

        # for the plot
        data[i] = x
    cumulative_average = np.cumsum(data) / (np.arange(N) + 1)

    # # plot moving average ctr
    # plt.plot(cumulative_average)
    # plt.plot(np.ones(N) * m1)
    # plt.plot(np.ones(N) * m2)
    # plt.plot(np.ones(N) * m3)
    # plt.xscale('log')
    # plt.show()

    for i, b in enumerate(bandits):
        print(f'{experiment_name}, bandit {i + 1} mean: {b.mean} win: {np.sum(data)}')

    return cumulative_average
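
A minimal driver for this epsilon-greedy experiment, assuming the Bandit class used here takes its true mean as the constructor argument and exposes pull(), update(), and mean as the loop above relies on (the means, epsilon, and N below are illustrative only):

if __name__ == '__main__':
    # compare two exploration rates on the same three hypothetical arms
    c_10 = run_experiment(1.0, 2.0, 3.0, eps=0.10, N=100000, experiment_name='eps=0.10')
    c_05 = run_experiment(1.0, 2.0, 3.0, eps=0.05, N=100000, experiment_name='eps=0.05')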
Example no. 3
def gradient(runs=2000, time=1000):
    """Test the k-armed bandit with gradient.
    Args:
        runs (int): test each bandit with the # of runs.
        time (int): the # of the time-steps in each run.
    """
    bandits = []
    bandits.append(Bandit(gradient=True, step_size=0.1,
                          gradient_baseline=True, true_reward=4))
    bandits.append(Bandit(gradient=True, step_size=0.1,
                          gradient_baseline=False, true_reward=4))
    bandits.append(Bandit(gradient=True, step_size=0.4,
                          gradient_baseline=True, true_reward=4))
    bandits.append(Bandit(gradient=True, step_size=0.4,
                          gradient_baseline=False, true_reward=4))

    print("===== %s =====" % ("Gradient"))
    best_action_counts, _ = simulate(bandits, runs, time)
    labels = ['alpha = 0.1, with baseline',
              'alpha = 0.1, without baseline',
              'alpha = 0.4, with baseline',
              'alpha = 0.4, without baseline', ]

    for i in range(0, len(bandits)):
        plt.plot(best_action_counts[i], label=labels[i])
    plt.xlabel('Steps')
    plt.ylabel('% Optimal action')
    plt.legend()

    plt.savefig(os.path.join(SAVING_PATH, "figure_2_5_gradient.png"))
    plt.close()
Example no. 4
    def run(self, name, agent_name, **agent_parameters):
        """

        :param name: Name of experiments
        :param agent_name: Name of Agent
        :param agent_parameters: Parameters of Agent
        """
        rewards = np.zeros(self.pulls)
        optimal_actions = np.zeros(self.pulls)

        for _ in tqdm(range(self.experiments)):
            bandit = Bandit(self.pulls, self.actions, agent_name,
                            **agent_parameters)
            reward, optimal_action = bandit.experiment()
            rewards += reward
            optimal_actions += optimal_action

        rewards /= float(self.experiments)
        optimal_actions /= float(self.experiments)

        self.values[name] = {}
        self.values[name]['rewards'] = rewards
        self.values[name]['optimal_actions'] = optimal_actions

Example no. 5
def bandit():
    bandit = Bandit()
    game.init_bandit(bandit)
    player = game.get_player()
    return render_template('bandit.html',
                           credits_demanded=bandit.get_credits_demanded(),
                           fskills=player.get_fighter_skills(),
                           pskills=player.get_pilotskills())
Example no. 6
def experiment2():
    params = [{
        "time_horizon" : 500,
        "number_of_arms" : 5
    },
    {
        "time_horizon" : 5000,
        "number_of_arms" : 5
    },
    {
        "time_horizon" : 500,
        "number_of_arms" : 10
    },
    {
        "time_horizon" : 5000,
        "number_of_arms" : 10
    },
    {
        "time_horizon" : 500,
        "number_of_arms" : 20
    },
    {
        "time_horizon" : 5000,
        "number_of_arms" : 20
    },
    ]
    
    for i in range(len(params)):
        results = []
        time_horizon = params[i]["time_horizon"]
        number_of_arms = params[i]["number_of_arms"]
        agent1 = agentFactory("random",time_horizon,number_of_arms)
        
        epsilons = []
        # decaying exploration schedule: eps_t = (K * log t / t) ** (1/3)
        for j in range(time_horizon):
            epsilons.append(math.pow(number_of_arms * math.log(j + 1) / (j + 1), 1 / 3))
        
        agent2 = agentFactory("epsilon-greedy",time_horizon,number_of_arms,epsilons)
        agent3 = agentFactory("explore-then-exploit",time_horizon,number_of_arms,time_horizon/100)
        agent4 = agentFactory("ucb1",time_horizon,number_of_arms)
        agent5 = agentFactory("successive-elimination",time_horizon,number_of_arms)

        bandit = Bandit(time_horizon,number_of_arms,agent1)
        results.append(mc_simulate(n_sim,bandit))
 
        bandit = Bandit(time_horizon,number_of_arms,agent2)
        results.append(mc_simulate(n_sim,bandit))
        
        bandit = Bandit(time_horizon,number_of_arms,agent3)
        results.append(mc_simulate(n_sim,bandit,"N=T/100"))

        bandit = Bandit(time_horizon,number_of_arms,agent4)
        results.append(mc_simulate(n_sim,bandit))
        
        bandit = Bandit(time_horizon,number_of_arms,agent5)
        results.append(mc_simulate(n_sim,bandit))

        plot(results,time_horizon,params[i])
Example no. 7
def get_market_data():
    global username
    if request.method == "GET":
        band1 = Bandit()
        trader1 = Trader()
        police1 = Police()
        cost1 = Cost()

        market_inventory = regions.child(currRegion).child('inventory').get()
        ship_inventory = users.child(username).child('ship').child(
            'inventory').get()
        difficulty = users.child(username).child('difficulty').get()
        ship_cargo = users.child(username).child('ship').child('cargo').get()
        credit_val = users.child(username).child('credit').get()
        ship_health = users.child(username).child('ship').child('health').get()
        pilot_skill = users.child(username).child('skills').child(
            'pilot').get()
        engineer = users.child(username).child('skills').child(
            'engineer').get()
        fighter_skill = users.child(username).child('skills').child(
            'fighter').get()
        merchant_skill = users.child(username).child('skills').child(
            'merchant').get()
        fuel = users.child(username).child('ship').child('fuel').get()
        fuelcost = cost1.calculate_fuel(difficulty, credit_val)
        demand = band1.calculate_demand(difficulty, credit_val)
        repair = cost1.calculate_repair(difficulty, engineer, credit_val)
        price = trader1.item_to_sell(difficulty, credit_val)
        qty = trader1.qty
        item = trader1.item
        stolen = police1.stolen_item(ship_inventory)
        to_return = {
            'username': username,
            'currRegion': currRegion,
            'market_inventory': market_inventory,
            'ship_inventory': ship_inventory,
            'cargo': ship_cargo,
            'credit': credit_val,
            'health': ship_health,
            'demand': demand,
            'qty': qty,
            'item': item,
            'price': price,
            'eng': engineer,
            'stolen': stolen,
            'difficulty': difficulty,
            'pilot': pilot_skill,
            'fighter': fighter_skill,
            'fuel': fuel,
            'fuelcost': fuelcost,
            'merch': merchant_skill,
            'repair': repair
        }
        return to_return
    return None
Example no. 8
    def __init__(self, predict_market, num_bids, trials,
                 label='Multi-Armed Prediction Market Bandit'):
        self.predict_market = predict_market
        self.n_arms = predict_market.arms
        self.agents = predict_market.agents
        self.data = predict_market.dataframe
        self.num_bids = num_bids
        self.label = label
        self.bandit = Bandit(self.n_arms, self.data)
        self.trials = trials
        self.scores = None
        self.optimal = None
    def __init__(self, models, n, a, task=None):
        self.models = models
        self.n = n
        self.a = a
        self.num_arms = models.shape[1]
        self.num_models = models.shape[0]
        self.counts = np.zeros([self.num_arms])
        self.means = np.zeros([self.num_arms])
        if task is None:
            self.task = np.random.choice(self.num_models)
        else:
            self.task = task
        self.bandit = Bandit(self.models[self.task])
Example no. 10
def run(algorithm, non_stationary=False):
    """Runs an algorithm with the specified paramters.

    algorithm: Instance of an algorithm from `algorithms.py`.
    non_stationary: Whether to run the stationary or non_stationary test bench.
    """
    with open('config.yml') as cfile:
        config = y.safe_load(cfile)['run']
    runs, steps = config['runs'], config['steps']

    avg_rewards, optim_action_percent = np.zeros(steps), np.zeros(steps)

    for run in range(runs):
        bandit = Bandit(non_stationary)
        print(f'Run number {run + 1}.')

        # One-run rewards
        or_rewards = []
        # One-run actions
        or_actions = []
        optim_action = np.argmax([bandit.q_star(a) for a in range(10)])
        # One-run optimal actions
        or_optim_actions = [] if non_stationary else optim_action

        for step in range(1, steps + 1):
            action = algorithm.act(step)
            reward = bandit(action)
            if non_stationary:
                optim_action = np.argmax([bandit.q_star(a) for a in range(10)])
            algorithm.update(action, reward, step)

            or_rewards.append(reward)
            or_actions.append(action)
            if non_stationary:
                or_optim_actions.append(optim_action)
        
        avg_rewards += or_rewards

        if non_stationary:
            a, o = np.array(or_actions), np.array(or_optim_actions)
        else:
            a, o = np.array(or_actions), or_optim_actions
        optim_action_percent += (a == o)
        
        algorithm.reset()
    
    avg_rewards /= runs
    optim_action_percent = (optim_action_percent / runs) * 100.0

    return avg_rewards, optim_action_percent
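
A call sketch, assuming algorithms.py provides a class with the act/update/reset interface used above; the EpsilonGreedy name and its argument are assumptions, not the module's documented API:

from algorithms import EpsilonGreedy  # hypothetical class name

algorithm = EpsilonGreedy(epsilon=0.1)
avg_rewards, optim_action_percent = run(algorithm, non_stationary=False)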
Example no. 11
def create_bandit(means, variances):
    def reward_fn(mu, variance):
        stddev = np.sqrt(variance)
        return lambda: np.random.normal(mu, stddev)

    arms = list(map(lambda a: reward_fn(*a), zip(means, variances)))
    return Bandit(arms)
Example no. 12
def epsilon_greedy(runs=2000, time=1000):
    """Test the k-armed bandit with the policy of epsilon greedy. 
    Args:
        runs (int): test each bandit with the # of runs.
        time (int): the # of the time-steps in each run.
    """
    epsilons = [0, 0.01, 0.1, 0.2]
    bandits = [Bandit(epsilon=eps, sample_averages=True) for eps in epsilons]

    print("===== {} =====".format("Epsilon greedy"))
    best_action_counts, rewards = simulate(bandits, runs, time)

    plt.figure(figsize=(10, 20))
    plt.subplot(2, 1, 1)
    for eps, reward in zip(epsilons, rewards):
        plt.plot(reward, label="epsilon = %.02f" % (eps))
    plt.xlabel('steps')
    plt.ylabel('average reward')
    plt.legend()

    plt.subplot(2, 1, 2)
    for eps, counts in zip(epsilons, best_action_counts):
        plt.plot(counts, label="epsilon = %.02f" % (eps))
    plt.xlabel('steps')
    plt.ylabel('% optimal action')
    plt.legend()

    plt.savefig(os.path.join(SAVING_PATH, "figure_2_2_epsilon_greedy.png"))
    plt.close()
Example no. 13
def run_greedy(number_bandits, epsilon, iterations):
    bandits = [Bandit(i + 1, 0) for i in range(number_bandits)]
    current_best = bandits[0]
    data = np.empty(iterations)
    print(f'Starting with bandit {current_best.true_mean}.')
    for i in range(iterations):
        explore_exploit = np.random.rand()
        bandit = current_best
        # explore
        if explore_exploit < epsilon:
            selection = np.random.randint(0, number_bandits)
            #print(f'Machine {selection} selected.')
            bandit = bandits[selection]
        # exploit
        value = bandit.pull()
        bandit.update(value)
        data[i] = value
        #update
        if current_best.current_mean < bandit.current_mean:
            print(f'Updated to bandit {bandit.true_mean}')
            current_best = bandit

    print(f'Chose bandit {current_best.true_mean}')
    cumulative_average = np.cumsum(data) / (np.arange(iterations) + 1)
    plt.plot(cumulative_average)
    for i in range(number_bandits):
        plt.plot(np.ones(iterations) * (i + 1))
    plt.xscale('log')
    plt.show()
    return cumulative_average
Example no. 14
def plot_figures(k,
                 n_bandits,
                 n_steps,
                 eps_list,
                 weight_fn=sample_average,
                 random_walk=False,
                 y_bounds=[0, 1.5],
                 Q_1=0,
                 show=True,
                 method='epsilon-greedy',
                 extra_label='',
                 title=None,
                 percentage=False):
    avg_rew_per_eps = [np.zeros(n_steps) for _ in range(len(eps_list))]
    avg_rew_in_perc = [np.zeros(n_steps) for _ in range(len(eps_list))]
    for n in range(n_bandits):
        print(n)
        bandit_pb = Bandit(k)
        for i, eps in enumerate(eps_list):
            _, per, avg_rew, _ = a_simple_bandit_algorithm(
                bandit_pb,
                n_iterations=n_steps,
                eps=eps,
                weight_fn=weight_fn,
                random_walk=random_walk,
                Q_1=Q_1,
                method=method)
            avg_rew_per_eps[i] += avg_rew
            avg_rew_in_perc[i] += per

    to_plot = avg_rew_in_perc if percentage else avg_rew_per_eps
    bounds = [0, 100] if percentage else y_bounds
    plot_average(to_plot, eps_list, n_bandits, bounds, show, extra_label,
                 title, percentage)
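
A call sketch reproducing an epsilon-greedy comparison with sample-average updates; the concrete values are illustrative, and sample_average, a_simple_bandit_algorithm, and plot_average are taken from the surrounding module as used above:

plot_figures(k=10,
             n_bandits=2000,
             n_steps=1000,
             eps_list=[0, 0.01, 0.1],
             title='Epsilon-greedy, sample-average updates')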
Example no. 15
def fig_2_5(n_bandits=2000, n_steps=1000, k=10, alpha_list=[0.1, 0.4]):
  d = {}
  for baseline in [False, True]:
    for alpha in alpha_list:
      d[(baseline, alpha)] = np.zeros(n_steps)
  for n in range(n_bandits):
    print(n)
    bandit = Bandit(k, mean=4)
    for baseline in [False, True]:
      for alpha in alpha_list:
        result_arr, _ = gradient_bandit(bandit, n_steps=n_steps,
                                                alpha=alpha, baseline=baseline)
        d[(baseline, alpha)] += result_arr

  def label(baseline, alpha):
    return ("with" if baseline else "without") + f" baseline, alpha={alpha}"
  for key, avg_rew in d.items():
    plt.plot((avg_rew / n_bandits) * 100, label=label(key[0], key[1]))
  axes = plt.gca()
  axes.set_ylim([0, 100])
  plt.xlabel("Steps")
  plt.ylabel("Optimal Action %")
  plt.title("Figure 2.5")
  plt.legend()
  plt.show()
Example no. 16
    def step(self, bandit: Bandit, epsilon=0):
        '''
        Epsilon-greedy action selection: explore with probability epsilon,
        otherwise pick uniformly among the levers with the highest estimate.
        :param bandit: bandit to pull from
        :param epsilon: probability of choosing a random lever
        :return: (chosen_lever, reward)
        '''

        #flip coin to decide a greedy action or random
        flip_coin = self.rng.rand()

        if flip_coin < epsilon:
            #random action
            chosen_lever = self.rng.randint(self.n_arms)

        else:
            # find all the best levers and then sample randomly from them
            best_value = max(self.Q)
            best_levers = [lever for lever in range(self.n_arms) if self.Q[lever] == best_value]
            chosen_lever = best_levers[self.rng.randint(len(best_levers))]

        reward = bandit.pull(chosen_lever)

        self.N[chosen_lever] += 1
        # update the estimated expected reward of the chosen lever (sample average)
        self.Q[chosen_lever] = self.Q[chosen_lever] + (1/self.N[chosen_lever])*(reward - self.Q[chosen_lever])

        return chosen_lever, reward
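
A usage sketch, assuming the enclosing agent class initializes rng, n_arms, Q, and N as this method expects; the constructor names and arguments below are assumptions for illustration:

agent = EpsilonGreedyAgent(n_arms=10)   # hypothetical constructor
bandit = Bandit()                       # constructor arguments depend on the Bandit implementation
for _ in range(1000):
    lever, reward = agent.step(bandit, epsilon=0.1)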
Example no. 17
def captions_bandit():
    data = captions_data

    def reward_fn(percentage):
        return lambda: np.random.choice((0, 0.5, 1), 1, p=percentage)[0]

    arms = list(map(reward_fn, data))
    return Bandit(arms)
Example no. 18
def experiment1():
    params = [{
        "time_horizon" : 1000,
        "number_of_arms" : 5
    },
    {
        "time_horizon" : 10000,
        "number_of_arms" : 5
    },
    {
        "time_horizon" : 1000,
        "number_of_arms" : 10
    },
    {
        "time_horizon" : 10000,
        "number_of_arms" : 10
    },
    {
        "time_horizon" : 1000,
        "number_of_arms" : 20
    },
    {
        "time_horizon" : 10000,
        "number_of_arms" : 20
    },
    ]
    
    for i in range(len(params)):
        results = []
        time_horizon = params[i]["time_horizon"]
        number_of_arms = params[i]["number_of_arms"]
        agent1 = agentFactory("explore-then-exploit",time_horizon,number_of_arms,5)
        agent2 = agentFactory("explore-then-exploit",time_horizon,number_of_arms,time_horizon/10)
        agent3 = agentFactory("explore-then-exploit",time_horizon,number_of_arms,time_horizon/100)
        
        bandit = Bandit(time_horizon,number_of_arms,agent1)
        results.append(mc_simulate(n_sim,bandit,"N=5"))
 
        bandit = Bandit(time_horizon,number_of_arms,agent2)
        results.append(mc_simulate(n_sim,bandit,"N=T/10"))
        
        bandit = Bandit(time_horizon,number_of_arms,agent3)
        results.append(mc_simulate(n_sim,bandit,"N=T/100"))

        plot(results,time_horizon,params[i])
Example no. 19
def polynomial_bandit(n, variance):
    means = polynomial_means(n)
    stddev = np.sqrt(variance)

    def reward_fn(mu):
        return lambda: np.random.normal(mu, stddev)

    arms = list(map(reward_fn, means))
    return Bandit(arms)
def main():
    """
    Constants
    """
    k: int = 20
    epsilon: float = 0.01
    init_val: int = 10
    c: float = 1.0
    max_time: int = 1000
    rounds: int = 1000
    policy: Policy = EpsilonGreedyPolicy(epsilon)
    agent = Agent(k, policy)
    bandits = [Bandit() for _ in range(k)]

    def play_round():
        """
        Simulates one round of the game.
        """
        # get the next action
        action = agent.choose_action()
        # get a reward from the bandit
        reward = bandits[action].get_reward()
        # play the action
        agent.play_action(action, reward)
        return reward

    def reset():
        agent.reset()
        for bandit in bandits:
            bandit.reset()
        optimal_bandit = np.argmax([bandit.get_reward() for bandit in bandits])

    def print_bandits():
        for i, bandit in enumerate(bandits):
            print('Bandit {} reward={}'.format(i, bandit.get_reward()))

    def experiment():
        scores = np.zeros(max_time, dtype=float)
        for _ in range(rounds):
            for t in range(max_time):
                scores[t] += play_round()
            reset()

        return scores / rounds

    def plot(label):
        print_bandits()
        scores = experiment()
        time = range(max_time)
        plt.title(label + " for k = " + str(k))
        plt.ylim([0.0, 2.0])
        plt.xlabel('Steps')
        plt.ylabel('Avg. Reward')
        plt.scatter(x=time, y=scores, s=0.5)
        plt.show()

    plot(policy.__str__())
def sparse_bandit(n_arms, variance):
    means = sparse_means(n_arms)
    stddev = np.sqrt(variance)

    def reward_fn(mu):
        return lambda: np.random.normal(mu, stddev)

    arms = list(map(reward_fn, means))
    return Bandit(arms)
Example no. 22
def poisson_exp_bandit():
    n = n_arms()
    means_ = poisson_exp_means()

    def reward_fn(lambda_):
        return lambda: np.random.poisson(lambda_)

    arms = list(map(reward_fn, means_))
    return Bandit(arms)
Example no. 23
def captions_bandit(n):
    categories = np.array([0, 0.5, 1])
    data = captions_data()[:n]

    def reward_fn(percentage):
        return lambda: np.random.choice(categories, 1, p=percentage)[0]

    arms = list(map(reward_fn, data))
    return Bandit(arms)
Example no. 24
class UCB():
    def __init__(self, models, n, alpha_ucb, task=None):
        self.models = models
        self.n = n
        self.alpha = alpha_ucb
        self.num_arms = models.shape[1]
        self.num_models = models.shape[0]
        self.counts = np.zeros([self.num_arms])
        self.means = np.zeros([self.num_arms])
        if task is None:
            self.task = np.random.choice(self.num_models)
        else:
            self.task = task
        self.bandit = Bandit(self.models[self.task])

    def select_arm_ucb(self, t):
        for a in range(self.num_arms):
            if self.counts[a] == 0:
                return a
        ucb_values = self.means + np.sqrt(
            (self.alpha * np.log(t)) / self.counts)
        return np.argmax(ucb_values)

    def update_arm(self, action, reward):
        self.counts[action] += 1

        n = self.counts[action]
        value = self.means[action]
        self.means[action] = ((n - 1) / n) * value + (1. / n) * reward

    def run(self, T_list):
        regrets = []
        counts = np.zeros([max(T_list), self.num_arms])
        for t in range(self.n):
            action = self.select_arm_ucb(t)
            reward = self.bandit.pull_arm(action)
            self.update_arm(action, reward)
            if (t + 1) in T_list:
                regret = self.bandit.calculate_regret(
                    self.counts)  # keep track of regrets
                regrets.append(regret)
            counts[t, :] = self.counts

        return regrets, np.asarray(counts)
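
A run sketch, assuming models is a (num_models, num_arms) array of arm means and that Bandit(means) exposes the pull_arm and calculate_regret methods used above; the sizes and horizon below are illustrative:

models = np.random.rand(5, 10)            # 5 hypothetical tasks, 10 arms each
ucb = UCB(models, n=10000, alpha_ucb=2.0)
regrets, counts = ucb.run(T_list=[1000, 5000, 10000])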
Example no. 25
def upper_confidence_bound(runs=2000, time=1000):
    """Test the k-armed bandit with UCB.
    Args:
        runs (int): test each bandit with the # of runs.
        time (int): the # of the time-steps in each run.
    """
    bandits = []
    bandits.append(Bandit(epsilon=0, UCB_param=2, sample_averages=True))
    bandits.append(Bandit(epsilon=0.1, sample_averages=True))
    _, average_rewards = simulate(bandits, runs, time)

    plt.plot(average_rewards[0], label='UCB c = 2')
    plt.plot(average_rewards[1], label='epsilon greedy epsilon = 0.1')
    plt.xlabel('Steps')
    plt.ylabel('Average reward')
    plt.legend()

    plt.savefig(os.path.join(SAVING_PATH, "figure_2_4_UCB.png"))
    plt.close()
Example no. 26
def run_experiment(m1, m2, m3, N, experiment_name):
    bandits = [Bandit(m1), Bandit(m2), Bandit(m3)]

    data = np.empty(N)

    for i in range(N):
        j = np.argmax([ucb(b.mean, i + 1, b.N) for b in bandits])
        x = bandits[j].pull()
        bandits[j].update(x)

        # for the plot
        data[i] = x
    cumulative_average = np.cumsum(data) / (np.arange(N) + 1)

    for i, b in enumerate(bandits):
        print(
            f'{experiment_name}, bandit {i + 1} mean: {b.mean} win: {np.sum(data)}'
        )

    return cumulative_average
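
The ucb helper referenced above is defined elsewhere in the source; a standard UCB1 upper bound, offered here only as an assumption about what it computes, would be:

def ucb(mean, n, nj):
    # sample mean plus the UCB1 exploration bonus; the small constant avoids
    # division by zero before an arm has been pulled (an assumption, not the
    # original helper)
    return mean + np.sqrt(2 * np.log(n) / (nj + 1e-10))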
Example no. 27
    def init_bandits(self, holdout=True):
        """Specify some way to split up the indices?"""
        df_len = len(self.data_sim.df.index)
        num_divide = self.n_bandits + 1 if holdout else self.n_bandits
        increment = int(df_len / num_divide)
        for i in range(self.n_bandits):
            indices = [i * increment, (i + 1) * increment]
            self.bandits.append(Bandit(self.n_arms, self.data_sim.df,
                                       data_indices=indices))
        if holdout:
            self.df_holdout = self.data_sim.df[df_len - increment + 1:]
        self.trials = increment if self.trials is None else self.trials
Example no. 28
def createBanditInstancesAndSimulate(params,n_mc_sim):
    n_sim = n_mc_sim
    for i in range(len(params)):
        results = []
        time_horizon = params[i]['time_horizon']
        number_of_arms = params[i]['number_of_arms']
        number_of_exploration_per_arm = params[i]['number_of_exploration_per_arm']

        exp_agent = ExploreThenExploit(time_horizon,number_of_arms,number_of_exploration_per_arm)
        epsilon_greedy_constant_half_epsilonAgent = EpsilonGreedy(time_horizon,number_of_arms,[1/2]*time_horizon)
        epsilon_greedy_constant_epsilonAgent = EpsilonGreedy(time_horizon,number_of_arms,[number_of_exploration_per_arm*number_of_arms/time_horizon]*time_horizon)
        ubc_agent = UBC1Agent(time_horizon,number_of_arms)
        se_agent = SuccessiveEliminationAgent(time_horizon,number_of_arms) 
        random_agent = Agent()
        
        bandit = Bandit(time_horizon,number_of_arms,random_agent)
        results.append(mc_simulate(n_sim,bandit))

        bandit = Bandit(time_horizon,number_of_arms,exp_agent)
        results.append(mc_simulate(n_sim,bandit))

        bandit = Bandit(time_horizon,number_of_arms,epsilon_greedy_constant_epsilonAgent)
        results.append(mc_simulate(n_sim,bandit,"constant-epsilon=rate-of-explore-exploit"))

        bandit = Bandit(time_horizon,number_of_arms,epsilon_greedy_constant_half_epsilonAgent)
        results.append(mc_simulate(n_sim,bandit,"constant-epsilon=0.5"))

        bandit = Bandit(time_horizon,number_of_arms,ubc_agent)
        results.append(mc_simulate(n_sim,bandit))

        bandit = Bandit(time_horizon,number_of_arms,se_agent)
        results.append(mc_simulate(n_sim,bandit))

        plot(results,time_horizon,params[i])
Example no. 29
class Training(TrainingBase):
    def __init__(self):
        super(Training, self).__init__()
        self.steps = options.get('environment/steps', 1000)
        self.bandit = Bandit()

    def episode(self, number):
        # get first action from agent
        action = self.agent.update(reward=None, state=[])
        # update agent with state and reward
        for step in range(self.steps):
            reward = self.bandit.pull(action)
            action = self.agent.update(reward=reward, state=[])
            log.info('step: %s, action: %s' % (step, action))
Example no. 30
def param_study(n_bandits=2000,
                n_steps=1000,
                title='Figure 2.6',
                fn='fig2_6',
                nonstat=False,
                print_freq=10,
                start_timestep=np.inf):
    results = {(method, hyper): 0
               for (method, hyperparams) in HYPERPARMS.items()
               for hyper in hyperparams}
    y_label = (f"Average Reward over last {n_steps-start_timestep} steps"
               if nonstat else f"Average Reward over first {n_steps} steps")
    for t in range(1, n_bandits + 1):
        print(f"{t}/{n_bandits}")
        bandit = Bandit()
        for method, hyperparams in HYPERPARMS.items():
            for hyper in hyperparams:
                results[(method,
                         hyper)] += apply_method(bandit, n_steps, method,
                                                 hyper, nonstat,
                                                 start_timestep)[-1]
                bandit.reset()  # need to reset q values after random walk
        if (t % print_freq == 0):
            plot_current(n_steps, results, t, title, fn, y_label)
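
param_study depends on a module-level HYPERPARMS mapping and an apply_method helper; a plausible shape for the mapping, assuming the hyperparameter axis is powers of two as in the parameter-study figure (an assumption, not the original definition), is:

HYPERPARMS = {
    'epsilon-greedy': [2.0 ** p for p in range(-7, -1)],
    'gradient bandit': [2.0 ** p for p in range(-5, 2)],
    'UCB': [2.0 ** p for p in range(-4, 3)],
    'optimistic initialization': [2.0 ** p for p in range(-2, 3)],
}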
Example no. 31
def BanditExperiment(debug=True,
                     replications=100, # Posen & Levinthal used 25,000
                     arms=10,
                     turns=500,
                     payoff_fxn=[betadist_payoff],
                     turbulence_fxn=[randomshock],
                     strategy_fxn=[softmax_strategy],
                     turbulence=[0],
                     belief_fxn=[belief_with_latency_and_memory],
                     strategy=[0.5],
                     latency=[0],
                     initial_learning=0,
                     memory=[500],
                     experiment_name=""
                     ):
    """
    This function should set up a set of Bandit simulations with given
    parameters and run them as an experiment.  Its output will be a table
    of data, each row containing the variable inputs and the outcomes
    (averaged over replications within each experimental condition).
    
    Arguments except 'debug', 'experiment_name', 'replications', 'arms', and 
    'turns' are lists of values to experiment with.  For each combination
    of values, the simulation is repeated 'replications' times.
    So if you want to test the difference between turbulence of 0 and 0.1, 
    set turbulence=[0,0.1] and the experiment will be set up.
    """

    programstart = datetime.datetime.now()
    
    _experiment_name = experiment_name if experiment_name else programstart.strftime('%Y%m%d-%H%M%S')
    _logfile = open(("output/"+_experiment_name+"-log.txt"), 'w')
    _datafile = open(("output/"+_experiment_name+"-data.csv"), 'w')
    _datafile.write("EXPERIMENT,REPLICATION,PAYOFF_FXN,TURBULENCE_FXN,STRATEGY_FXN,BELIEF_FXN,TURBULENCE,STRATEGY,LATENCY,MEMORY,SCORE,KNOWLEDGE,OPINION,PROBEXPLORE\n") # CSV header row
    _summaryfile = open(("output/"+_experiment_name+"-summary.csv"), 'w')
    _summaryfile.write("EXPERIMENT,PAYOFF_FXN,TURBULENCE_FXN,STRATEGY_FXN,BELIEF_FXN,TURBULENCE,STRATEGY,LATENCY,MEMORY,MEAN_SCORE,MEAN_KNOWLEDGE,MEAN_OPINION,MEAN_PROBEXPLORE\n") # CSV header row

    def log(message):
        _logfile.write(message)
        if debug:
            print(message)  #only print to screen if user wants it wordy for debugging purposes
        
    _numexps = len(payoff_fxn)*len(turbulence_fxn)*len(strategy_fxn)*len(turbulence)*len(belief_fxn)*len(strategy)*len(latency)*len(memory)
    log("Planning "+str(_numexps)+" experiments with "+str(replications)+
             " replications x "+str(turns)+" turns each.\nI.e., a total of "+
             str(_numexps*replications*turns)+" turns of processing.\n\n")
    _currentexp = 0  # which experiment are we on currently?
    

    # Loop through all experimental conditions and run simulations:
    
    for pf in payoff_fxn:
        for tf in turbulence_fxn:
            for sf in strategy_fxn:
                for tb in turbulence:
                    for bf in belief_fxn:
                        for st in strategy:
                            for lt in latency:
                                for mm in memory:
                            
                                    # Run several replications of the simulation within one experimental condition:

                                    # hold the data from each replication (to be averaged later)
                                    finalscores = []
                                    finalknowledges = []
                                    finalopinions = []
                                    finalprobexplores = []

                                    _currentexp += 1

                                    log("Starting experiment " + str(_currentexp) +
                                        " of " + str(_numexps) +
                                        " with:\n payoff_fxn="+str(pf)+
                                        "\n turbulence_fxn="+str(tf)+
                                        "\n strategy_fxn="+str(sf)+
                                        "\n belief_fxn="+str(bf)+
                                        "\n turbulence="+str(tb)+
                                        "\n strategy="+str(st)+
                                        "\n latency="+str(lt)+
                                        "\n memory="+str(mm)+"\n")
                                    expstart = datetime.datetime.now()

                                    for i in range(replications):

                                        # Do one replication (of many) within an experimental condition:

                                        b = Bandit( arms=arms, turns=turns, payoff_fxn=pf, turbulence_fxn=tf, strategy_fxn=sf, turbulence=tb, belief_fxn=bf, strategy=st, latency=lt, initial_learning=initial_learning, memory=mm)
                                        b.simulate()
                                        finalscores.append(b.score())
                                        finalknowledges.append(b.knowledge())
                                        finalopinions.append(b.opinion())
                                        finalprobexplores.append(b.probexplore())
                                        # log the data
                                        _datafile.write('{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n'.format(
                                                _currentexp, # experiment number
                                                (i+1), # replication number
                                                str(pf),
                                                str(tf),
                                                str(sf),
                                                str(bf),
                                                str(tb),
                                                str(st),
                                                str(lt),
                                                str(mm),
                                                b.score(),
                                                b.knowledge(),
                                                b.opinion(),
                                                b.probexplore()
                                                ))
                                        #log("simulation "+str(i+1)+" of "+str(replications)+" took "+str(b._simtime))

                                    # Take average results from all replications (within one experimental condition)
                                    # and output them to a 'summary' data file.

                                    _summaryfile.write('{},{},{},{},{},{},{},{},{},{},{},{},{}\n'.format(
                                        _currentexp,
                                        str(pf),
                                        str(tf),
                                        str(sf),
                                        str(bf),
                                        str(tb),
                                        str(st),
                                        str(lt),
                                        str(mm),
                                        sum(finalscores)/replications,
                                        sum(finalknowledges)/replications,
                                        sum(finalopinions)/replications,
                                        sum(finalprobexplores)/replications
                                        ))

                                    log("FINISHED in "+str(datetime.datetime.now()-expstart)+"\n\n")

                                    # Loop goes to the next experimental condition.




    log("All experiments completed in " + str(datetime.datetime.now() - programstart))
    _logfile.close()
    _datafile.close()
    _summaryfile.close()
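
Following the docstring's own example, a run comparing turbulence levels of 0 and 0.1 could be set up as below; the strategy values, replication count, and experiment name are illustrative only:

BanditExperiment(debug=False,
                 replications=100,
                 turbulence=[0, 0.1],
                 strategy=[0.25, 0.5],
                 experiment_name="turbulence_0_vs_01")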
Example no. 32
def main():
    bandit = Bandit()
    bandit.setupBandit(10)

    for i in range(1000):
        greedyPlay(bandit, 500)
Example no. 33
    def run(self):
        programstart = datetime.datetime.now()
        _experiment_name = self.experiment_name if self.experiment_name else programstart.strftime('%Y%m%d-%H%M%S')
        _logfile = open(("output/"+_experiment_name+"-log.txt"), 'w', buffering=1)
        _datafile = open(("output/"+_experiment_name+"-data.csv"), 'w', buffering=1)
        _datafile.write("EXPERIMENT,REPLICATION,ARMS,TURNS,PAYOFF_FXN,TURBULENCE_FXN,STRATEGY_FXN,BELIEF_FXN,TURBULENCE,STRATEGY,LATENCY,INITIAL_LEARNING,MEMORY,SCORE,KNOWLEDGE,OPINION,PROBEXPLORE\n") # CSV header row
        _summaryfile = open(("output/"+_experiment_name+"-summary.csv"), 'w', buffering=1)
        _summaryfile.write("EXPERIMENT,ARMS,TURNS,PAYOFF_FXN,TURBULENCE_FXN,STRATEGY_FXN,BELIEF_FXN,TURBULENCE,STRATEGY,LATENCY,INITIAL_LEARNING,MEMORY,MEAN_SCORE,MEAN_KNOWLEDGE,MEAN_OPINION,MEAN_PROBEXPLORE\n") # CSV header row
        if self.timeseries: _timeseriesfile = open(("output/"+_experiment_name+"-timeseries.csv"), 'w', buffering=1)


        def log(message):
          _logfile.write(message)
          if self.debug:
              print(message)  #only print to screen if user wants it wordy for debugging purposes
        
        _numexps_without_turns = len(self.arms)*len(self.payoff_fxn)*len(self.turbulence_fxn)*len(self.strategy_fxn)*len(self.turbulence)*len(self.belief_fxn)*len(self.strategy)*len(self.latency)*len(self.memory)
        _numturns = sum([t*_numexps_without_turns for t in self.turns])
        _numexps = _numexps_without_turns*len(self.turns)

        log("Planning "+str(_numexps)+" experiments with "+str(self.replications)+
               " replications x "+" or ".join([str(t) for t in self.turns])+" turns each.\nI.e., a total of "+
               str(self.replications*_numturns)+" turns of processing.\n\n")
        _currentexp = 0  # which experiment are we on currently?
    

      # Loop through all experimental conditions and run simulations:
    
        for ar in self.arms:
            for tu in self.turns:
                #for il in self.initial_learning:
                    for pf in self.payoff_fxn:
                        for tf in self.turbulence_fxn:
                            for sf in self.strategy_fxn:
                                for tb in self.turbulence:
                                    for bf in self.belief_fxn:
                                        for st in self.strategy:
                                            for lt in self.latency:
                                                for mm in self.memory:

                                                    # Run several replications of the simulation within one experimental condition:

                                                    # hold the data from each replication (to be averaged later)
                                                    finalscores = []
                                                    finalknowledges = []
                                                    finalopinions = []
                                                    finalprobexplores = []

                                                    _currentexp += 1

                                                    il = lt #this should guarantee that latency conditions don't wait until turn lt+1 to start learning; they have some initial learning, it's just out of date

                                                    log("Starting experiment " + str(_currentexp) +
                                                        " of " + str(_numexps) +
                                                        " with:"+
                                                        "\n arms="+str(ar)+
                                                        "\n turns="+str(tu)+
                                                        "\n payoff_fxn="+str(pf)+
                                                        "\n turbulence_fxn="+str(tf)+
                                                        "\n strategy_fxn="+str(sf)+
                                                        "\n belief_fxn="+str(bf)+
                                                        "\n turbulence="+str(tb)+
                                                        "\n strategy="+str(st)+
                                                        "\n latency="+str(lt)+
                                                        "\n initial_learning="+str(il)+
                                                        "\n memory="+str(mm)+"\n")
                                                    expstart = datetime.datetime.now()

                                                    for i in range(self.replications):

                                                        # Do one replication (of many) within an experimental condition:

                                                        b = Bandit( arms=ar, turns=tu, payoff_fxn=pf, turbulence_fxn=tf, strategy_fxn=sf, turbulence=tb, belief_fxn=bf, strategy=st, latency=lt, initial_learning=il, memory=mm)
                                                        b.simulate()
                                                        finalscores.append(b.score())
                                                        finalknowledges.append(b.knowledge())
                                                        finalopinions.append(b.opinion())
                                                        finalprobexplores.append(b.probexplore())
                                                        # log the data
                                                        _datafile.write('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n'.format(
                                                                _currentexp, # experiment number
                                                                (i+1), # replication number
                                                                str(ar),
                                                                str(tu),
                                                                str(pf),
                                                                str(tf),
                                                                str(sf),
                                                                str(bf),
                                                                str(tb),
                                                                str(st),
                                                                str(lt),
                                                                str(il),
                                                                str(mm),
                                                                b.score(),
                                                                b.knowledge(),
                                                                b.opinion(),
                                                                b.probexplore()
                                                                ))
                                                        #log("simulation "+str(i+1)+" of "+str(replications)+" took "+str(b._simtime))
                                                        if self.timeseries:
                                                            _timeseriesfile.write(','.join([str(s) for s in b.allscores()])+'\n')

                                                    # Take average results from all replications (within one experimental condition)
                                                    # and output them to a 'summary' data file.

                                                    _summaryfile.write('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n'.format(
                                                        _currentexp,
                                                        str(ar),
                                                        str(tu),
                                                        str(pf),
                                                        str(tf),
                                                        str(sf),
                                                        str(bf),
                                                        str(tb),
                                                        str(st),
                                                        str(lt),
                                                        str(il),
                                                        str(mm),
                                                        sum(finalscores)/self.replications,
                                                        sum(finalknowledges)/self.replications,
                                                        sum(finalopinions)/self.replications,
                                                        sum(finalprobexplores)/self.replications
                                                        ))

                                                    log("FINISHED in "+str(datetime.datetime.now()-expstart)+"\n\n")

                                                    # Loop goes to the next experimental condition.




        log("All experiments completed in " + str(datetime.datetime.now() - programstart))
        _logfile.close()
        _datafile.close()
        _summaryfile.close()