Example #1
    def __init__(self):
        self.payoutModifier1 = 1.0
        self.payoutModifier2 = 2.0
        self.payoutModifier3 = 3.0
        self.iterations = 10000

        self.means = [10, 10, 10]

        self.bandits = [
            bandit.Bandit(self.payoutModifier1),
            bandit.Bandit(self.payoutModifier2),
            bandit.Bandit(self.payoutModifier3)
        ]
        self.data = np.empty(self.iterations)
Example #2
    def __init__(self, net, n_envs, n_bandits, bandit_prob, bootstrap=True):
        self.net = net  # PyTorch Module
        self.pi_space = n_bandits
        self.prob = bandit_prob
        self.n_envs = n_envs
        self.softmax = bandit.Bandit().softmax
        self.bootstrap = bootstrap
Example #3
def run_experiment(mu, N, agent):
    """
    Runs the expirement

    Inputs:
    mu - numpy array of means for bandaits
    N - number of turns
    agent - a class of agent with methods choose_bandit and update.

    Output
    """
    # Make bandits
    n_bandits = len(mu)
    bandits = [bandit.Bandit(mu[i]) for i in range(n_bandits)]

    # Reward vector (could leave this to the agent)
    rewards = np.zeros(N)

    # Run simulation
    for i in range(N):
        j = agent.choose_bandit()
        reward = bandits[j].pull()
        agent.update(reward)
        rewards[i] = reward

    # Calculate average
    cumulative_average = np.cumsum(rewards) / (np.arange(N) + 1)
    return cumulative_average
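
For reference, a minimal agent satisfying the choose_bandit/update interface that run_experiment expects could look like the sketch below. It is a hypothetical epsilon-greedy agent with sample-average estimates; GaussianBandit is only a stand-in for the project's bandit.Bandit, assumed here to expose the same pull() method.

import numpy as np

class GaussianBandit:
    """Stand-in for bandit.Bandit: a single arm paying a unit-variance Gaussian reward."""
    def __init__(self, mu):
        self.mu = mu

    def pull(self):
        return np.random.randn() + self.mu

class EpsilonGreedyAgent:
    """Hypothetical agent exposing the choose_bandit/update interface."""
    def __init__(self, n_bandits, epsilon=0.1):
        self.epsilon = epsilon
        self.estimates = np.zeros(n_bandits)  # sample-average reward estimates
        self.counts = np.zeros(n_bandits)     # number of pulls per arm
        self.last_choice = 0

    def choose_bandit(self):
        # Explore with probability epsilon, otherwise exploit the best estimate.
        if np.random.random() < self.epsilon:
            self.last_choice = np.random.randint(len(self.estimates))
        else:
            self.last_choice = int(np.argmax(self.estimates))
        return self.last_choice

    def update(self, reward):
        # Incremental sample-average update for the arm chosen last.
        j = self.last_choice
        self.counts[j] += 1
        self.estimates[j] += (reward - self.estimates[j]) / self.counts[j]

# Illustrative run mirroring run_experiment with the stand-in bandits:
mu = np.array([1.0, 2.0, 3.0])
agent = EpsilonGreedyAgent(n_bandits=len(mu), epsilon=0.1)
bandits = [GaussianBandit(m) for m in mu]
rewards = np.zeros(10000)
for t in range(10000):
    j = agent.choose_bandit()
    rewards[t] = bandits[j].pull()
    agent.update(rewards[t])
print('mean reward:', rewards.mean())  # tends toward the best arm's mean as exploitation dominates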
Example #4
def form():
    """
    Provide form with cumulated trial and success inputs for multiple options,
    return options with suggested budget share for next period
    """

    if request.method == 'POST':
        entries = [value for value in list(request.form.values()) if value]
        num_options = int(len(entries) / 2)
        options = pd.DataFrame([{
            'option': str(i + 1)
        } for i in range(num_options)])
        trials = [int(entries[i * 2]) for i in range(num_options)]
        successes = [int(entries[i * 2 + 1]) for i in range(num_options)]
        bandit = ban.Bandit(num_options=num_options, memory=False)
        for i in range(num_options):
            bandit.add_results(option_id=i,
                               trials=trials[i],
                               successes=successes[i])
        shares = choose(bandit=bandit, accelerate=False)
        options = format_results(options, shares)
        records = options.to_dict('records')
        columns = options.columns.values
        save_plot(bandit)
        return render_template('form_result.html',
                               records=records,
                               columns=columns,
                               plot='/static/images/plot.png')

    return render_template('form.html')
Example #5
    def __init__(self, epsilon):
        self.payoutModifier1 = 1.0
        self.payoutModifier2 = 2.0
        self.payoutModifier3 = 3.0
        self.iterations = 10000

        self.epsilon = epsilon

        self.results = [0, 0, 0]

        self.bandits = [
            bandit.Bandit(self.payoutModifier1),
            bandit.Bandit(self.payoutModifier2),
            bandit.Bandit(self.payoutModifier3)
        ]
        self.data = np.empty(self.iterations)
Example #6
    def new_envs(self):
        """
        Makes a new list of bandit environments.
        """
        envs = []
        for _ in range(self.n_envs):
            rand = np.random.random()
            probs = [self.prob, 1 - self.prob] if rand <= 0.5 else [1 - self.prob, self.prob]
            envs.append(bandit.Bandit(probs=probs))
        return envs
Example #7
def main():
    print("---- Starting.... ----")
    Nexp = 1000
    Npulls = 2000
    
    #=========== Epsilon Greedy Experiments (Nonstationary) ==========
    if True:
        avg_outcome_RC1 = np.zeros(Npulls) 
        avg_optimal_arm_RC1 = np.zeros(Npulls)
        
        avg_outcome_eps1 = np.zeros(Npulls) 
        avg_optimal_arm_eps1 = np.zeros(Npulls)
        


        for i in range(Nexp): 
            bandit = bndt.Bandit(10) #10 armed bandit
            outcome_RC, arms_RC = experiment_RC(bandit,Npulls, alpha=0.1, beta=0.2)
            avg_outcome_RC1 += outcome_RC
            avg_optimal_arm_RC1 += arms_RC
            
            bandit = bndt_eps.Bandit(10) #10 armed bandit
            outcome_eps1, arms_eps1 = experiment_epsilonGreedy(bandit, 0.1, Npulls)
            avg_outcome_eps1 += outcome_eps1
            avg_optimal_arm_eps1 += arms_eps1
            

        avg_outcome_RC1 /= float(Nexp)
        avg_optimal_arm_RC1 /= float(Nexp)

        avg_outcome_eps1 /= float(Nexp)
        avg_optimal_arm_eps1 /= float(Nexp)


        # plot results 
        plt.plot(avg_outcome_RC1,label="RC: a=0.1 b=0.2")
        plt.plot(avg_outcome_eps1,label="Eps: eps=0.1 a=1/k")
        plt.legend(loc=0)
        plt.title('Average Reward: Eps-Greedy vs Reinf. Comp. (Stationary Problem)')
        plt.ylabel('Average Reward')
        plt.xlabel('Number of pulls/plays')

        plt.figure()

        plt.plot(avg_optimal_arm_RC1*100.0, label='RC a=0.1 b=0.2')
        plt.plot(avg_optimal_arm_eps1*100.0, label='Eps eps=0.1 a=1/k')
        plt.ylim(0,100)
        plt.legend(loc=0)
        plt.title('Average %Optimal Arm Chosen: Eps-Greedy vs Reinf. Comp.(Stationary Problem)')
        plt.xlabel('Number of pulls/plays')
        plt.ylabel('Percent Optimal Arm')
        plt.show()
Example #8
def add_daily_results(data, num_options, memory, shape, cutoff, cut_level):
    """
    For each day, add a period with its option results to the Bandit
    """
    bandit = ban.Bandit(num_options, memory, shape, cutoff, cut_level)
    for i in range(cutoff + 1):
        bandit.add_period()
        daily_results = data.loc[data['date'] == datetime.date.today() -
                                 datetime.timedelta(days=cutoff - i)]
        for j in range(len(daily_results)):
            bandit.add_results(int(daily_results.iloc[j]['option_id']),
                               daily_results.iloc[j]['trials'],
                               daily_results.iloc[j]['successes'])
    return bandit
Example #9
def main():
    timesteps = int(sys.argv[1])
    b = bandit.Bandit()

    regret = 0.

    for t in range(timesteps):
        # Choose an arm (placeholder: always pulls arm 0)
        a = 0

        # Pull the arm, obtain a reward
        ret = b.trigger(a)
        regret += b.opt() - ret

        # Learn from a and ret (no learning step is implemented here)
        print('Reward', ret, 'regret', regret)
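
As written, the loop always pulls arm 0 and never learns; the comments only mark where those steps would go. Below is a minimal sketch of one way to fill them in, assuming (as this snippet suggests) that trigger(a) returns the reward for arm a and opt() returns the expected reward of the best arm; StubBandit is a hypothetical stand-in for bandit.Bandit.

import numpy as np

class StubBandit:
    """Hypothetical stand-in for bandit.Bandit with the trigger()/opt() calls used above."""
    def __init__(self, probs=(0.3, 0.5, 0.8)):
        self.probs = np.array(probs)

    def trigger(self, a):
        # Bernoulli reward for arm a.
        return float(np.random.random() < self.probs[a])

    def opt(self):
        # Expected reward of the best arm, used for regret accounting.
        return float(self.probs.max())

def run(timesteps, epsilon=0.1):
    b = StubBandit()
    n_arms = len(b.probs)
    counts = np.zeros(n_arms)
    values = np.zeros(n_arms)  # sample-average reward estimate per arm
    regret = 0.0
    for t in range(timesteps):
        # Choose an arm: epsilon-greedy over the current estimates.
        if np.random.random() < epsilon:
            a = np.random.randint(n_arms)
        else:
            a = int(np.argmax(values))

        # Pull the arm, obtain a reward.
        ret = b.trigger(a)
        regret += b.opt() - ret

        # Learn from a and ret.
        counts[a] += 1
        values[a] += (ret - values[a]) / counts[a]
    print('final regret:', regret)

run(2000)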
Example #10
    def create():
        return [bandit.Bandit(1.0), bandit.Bandit(2.0), bandit.Bandit(3.0)]
Example #11
def main():
    print("---- Starting.... ----")
    Nexp = 100
    Npulls = 1000
    
    #=========== Epsilon Greedy Experiments ==========
    if True:
        avg_outcome_eps0p0 = np.zeros(Npulls) 
        avg_outcome_eps0p01 = np.zeros(Npulls) 
        avg_outcome_eps0p1 = np.zeros(Npulls)
        avg_optimal_arm_eps0p0 = np.zeros(Npulls)
        avg_optimal_arm_eps0p01 = np.zeros(Npulls)
        avg_optimal_arm_eps0p1 = np.zeros(Npulls)

        for i in range(Nexp): 
            bandit = bndt.Bandit(10) #10 armed bandit 
            outcome_eps0p0, arms_eps0p0 = experiment_epsilonGreedy(bandit,0.0,Npulls)
            avg_outcome_eps0p0 += outcome_eps0p0
            avg_optimal_arm_eps0p0 += arms_eps0p0

            bandit = bndt.Bandit(10) #10 armed bandit 
            outcome_eps0p01, arms_eps0p01 = experiment_epsilonGreedy(bandit,0.01,Npulls)
            avg_outcome_eps0p01 += outcome_eps0p01
            avg_optimal_arm_eps0p01 += arms_eps0p01

            bandit = bndt.Bandit(10) #10 armed bandit 
            outcome_eps0p1, arms_eps0p1 = experiment_epsilonGreedy(bandit,0.1,Npulls)
            avg_outcome_eps0p1 += outcome_eps0p1
            avg_optimal_arm_eps0p1 += arms_eps0p1 

        avg_outcome_eps0p0 /= float(Nexp)
        avg_outcome_eps0p01 /= float(Nexp)
        avg_outcome_eps0p1 /= float(Nexp)
        avg_optimal_arm_eps0p0 /= float(Nexp)
        avg_optimal_arm_eps0p01 /= float(Nexp)
        avg_optimal_arm_eps0p1 /= float(Nexp)

        # plot results 
        plt.plot(avg_outcome_eps0p0,label="eps = 0.0") 
        plt.plot(avg_outcome_eps0p01,label="eps = 0.01") 
        plt.plot(avg_outcome_eps0p1,label="eps = 0.1") 
        plt.ylim(0,2) 
        plt.legend()
        plt.title('N-arm bandit problem simulation (N=10) using epsilon-greedy')
        plt.ylabel('Average Reward')
        plt.xlabel('Number of pulls/plays')

        plt.figure()

        plt.plot(avg_optimal_arm_eps0p0*100.0, label='eps = 0.0')
        plt.plot(avg_optimal_arm_eps0p01*100.0, label='eps = 0.01')
        plt.plot(avg_optimal_arm_eps0p1*100.0, label='eps = 0.1')
        plt.ylim(0,100)
        plt.legend(loc=0)
        plt.title('Average Percent Optimal Arm Chosen')
        plt.xlabel('Number of pulls/plays')
        plt.ylabel('Percent Optimal Arm')
        plt.show()
        
    #========== Softmax experiments ==========
    if False:
        print('Softmax with different temperatures')
        #avg_outcome_eps = np.zeros(Npulls) 
        #avg_optimal_arm_eps = np.zeros(Npulls)
        avg_outcome_softmax0 = np.zeros(Npulls)
        avg_optimal_arm_softmax0 = np.zeros(Npulls)
        avg_outcome_softmax1 = np.zeros(Npulls)
        avg_optimal_arm_softmax1 = np.zeros(Npulls)
        avg_outcome_softmax2 = np.zeros(Npulls)
        avg_optimal_arm_softmax2 = np.zeros(Npulls)
        avg_outcome_softmax3 = np.zeros(Npulls)
        avg_optimal_arm_softmax3 = np.zeros(Npulls)
        
        for i in range(Nexp): 
            # bandit = bndt.Bandit(10) #10 armed bandit 
            # outcome_eps, arms_eps = experiment_epsilonGreedy(bandit,0.0,Npulls)
            # avg_outcome_eps += outcome_eps
            # avg_optimal_arm_eps += arms_eps
            
            bandit = bndt.Bandit(10) #10 armed bandit 
            outcome_softmax, arms_softmax = experiment_softmax(bandit,0.01,Npulls)
            avg_outcome_softmax0 += outcome_softmax
            avg_optimal_arm_softmax0 += arms_softmax
            
            bandit = bndt.Bandit(10) #10 armed bandit 
            outcome_softmax, arms_softmax = experiment_softmax(bandit,0.1,Npulls)
            avg_outcome_softmax1 += outcome_softmax
            avg_optimal_arm_softmax1 += arms_softmax
            
            bandit = bndt.Bandit(10) #10 armed bandit 
            outcome_softmax, arms_softmax = experiment_softmax(bandit,1,Npulls)
            avg_outcome_softmax2 += outcome_softmax
            avg_optimal_arm_softmax2 += arms_softmax
            
            bandit = bndt.Bandit(10) #10 armed bandit 
            outcome_softmax, arms_softmax = experiment_softmax(bandit,10,Npulls)
            avg_outcome_softmax3 += outcome_softmax
            avg_optimal_arm_softmax3 += arms_softmax
            
        
        # avg_outcome_eps /= np.float(Nexp)
        # avg_optimal_arm_eps /= np.float(Nexp)
        
        avg_outcome_softmax0 /= float(Nexp)
        avg_optimal_arm_softmax0 /= float(Nexp)

        avg_outcome_softmax1 /= float(Nexp)
        avg_optimal_arm_softmax1 /= float(Nexp)

        avg_outcome_softmax2 /= float(Nexp)
        avg_optimal_arm_softmax2 /= float(Nexp)

        avg_outcome_softmax3 /= float(Nexp)
        avg_optimal_arm_softmax3 /= float(Nexp)

        
        # plot results 
        # plt.plot(avg_outcome_eps,label="eps = 0.1") 
        
        plt.plot(avg_outcome_softmax0,label="temp = 0.01") 
        plt.plot(avg_outcome_softmax1,label="temp = 0.1")
        plt.plot(avg_outcome_softmax2,label="temp = 1")
        plt.plot(avg_outcome_softmax3,label="temp = 10")
        plt.ylim(0,2) 
        plt.legend()
        plt.title('N-arm bandit problem simulation (N=10) using softmax')
        plt.ylabel('Average Reward')
        plt.xlabel('Number of pulls/plays')

        plt.figure()

        # plt.plot(avg_optimal_arm_eps*100.0, label='eps = 0.1')
        plt.plot(avg_optimal_arm_softmax0*100.0, label='temp = 0.01')
        plt.plot(avg_optimal_arm_softmax1*100.0, label='temp = 0.1')
        plt.plot(avg_optimal_arm_softmax2*100.0, label='temp = 1')
        plt.plot(avg_optimal_arm_softmax3*100.0, label='temp = 10')
        plt.ylim(0,100)
        plt.legend(loc=0)
        plt.title('Average Percent Optimal Arm Chosen')
        plt.xlabel('Number of pulls/plays')
        plt.ylabel('Percent Optimal Arm')
        plt.show()
Example #12
def simulate(method,
             periods,
             true_rates,
             deviation,
             change,
             trials,
             max_p=None,
             rounding=True,
             accelerate=True,
             memory=True,
             shape='linear',
             cutoff=28,
             cut_level=0.5):
    """
    Simulate option choosing and results adding for n periods
    and a given chooser, return respective successes with optimum and base
    """
    num_options = len(true_rates)

    rate_changes = [
        random.uniform(1 - change, 1 + change) for rate in true_rates
    ]

    # Initialize Split or Bandit instances
    if method == 'split':
        chooser = spl.Split(num_options=num_options)
    elif method == 'bandit':
        chooser = ban.Bandit(num_options=num_options,
                             memory=memory,
                             shape=shape,
                             cutoff=cutoff,
                             cut_level=cut_level)

    # For each period calculate and add successes for methods as well as
    # the optimal (max) and the random choice (base)
    successes = []
    max_successes = []
    base_successes = []
    for period in range(periods):
        # Calculate success rates under uncertainty (with deviation)
        rates = [
            min(
                max(
                    np.random.RandomState((i + 1) * (period + 1)).normal(
                        loc=rate * rate_changes[i]**period,
                        scale=rate * rate_changes[i]**period * deviation), 0),
                1) for i, rate in enumerate(true_rates)
        ]

        # Add results to Split or Bandit
        if method == 'split':
            successes.append(
                add_split_results(trials, max_p, rates, chooser, period,
                                  rounding))
        elif method == 'bandit':
            if memory:
                chooser.add_period()
            successes.append(
                add_bandit_results(num_options, trials, rates, chooser, period,
                                   rounding, accelerate))

        # Add results to max and base successes
        if period == 0:
            if rounding:
                max_successes = [round(trials * max(rates))]
                base_successes = [
                    np.sum([
                        round(trials / num_options * rates[i])
                        for i in range(num_options)
                    ])
                ]
            else:
                max_successes = [trials * max(rates)]
                base_successes = [
                    np.sum([
                        trials / num_options * rates[i]
                        for i in range(num_options)
                    ])
                ]
        else:
            if rounding:
                max_successes.append(max_successes[-1] +
                                     round(trials * max(rates)))
                base_successes.append(base_successes[-1] + np.sum([
                    round(trials / num_options * rates[i])
                    for i in range(num_options)
                ]))
            else:
                max_successes.append(max_successes[-1] + trials * max(rates))
                base_successes.append(base_successes[-1] + np.sum([
                    trials / num_options * rates[i] for i in range(num_options)
                ]))

    return [successes, max_successes, base_successes]
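
A hypothetical call to simulate, assuming the module defining it (and its ban/spl dependencies) is importable; all parameter values below are illustrative only:

successes, max_successes, base_successes = simulate(
    method='bandit',
    periods=56,
    true_rates=[0.05, 0.07],   # two options with 5% and 7% true success rates
    deviation=0.1,             # relative noise applied to each period's rates
    change=0.0,                # no drift in the underlying rates
    trials=1000,               # trials distributed across options each period
    memory=True,
    shape='linear',
    cutoff=28,
    cut_level=0.5)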