Example #1
def main(directory = '../../data/test'):
    num_actions = 2

    # init with inverse variance
    # models = [RLogReg(D=NUM_FEATURES, Lambda=1) for cond in range(num_actions)]

    """ 
    Generate rewards for 2 bandits from normal(gaussian) distributions
    given mean and variance pairs
    """
    mean_list = [1., 1.5, 3., 2.]
#     v_list = [1., 2., 2., 1.]
    v_list = [3., 2., 2., 2.]

    reward_data_file = directory + '/sim_thompson_1_{0}.csv'
    dest_file = directory + '/sim_thompson_one_bandit_{0}.csv'
    for i in range(1, 6):
        total_steps = (i + i) * 120
        input_thompson_reward_1 = generate_normal_distribution_file(
            mean_list[:num_actions], v_list[:num_actions], total_steps,
            reward_data_file.format(total_steps))
#         input_thompson_reward_2 = generate_normal_distribution_file(mean_list[num_actions:], v_list[num_actions:], total_steps,
#                                                 '../../data/test/sim_thompson_2_{0}.csv'.format(total_steps))

        # calculate_thompson_single_bandit('simulated_single_bandit_input.csv', 3, 'simulated_single_bandit_thompson.csv')
        # calculate_thompson_single_bandit('contextual_single_bandit.csv', 3, 'contextual_single_bandit_thompson.csv', models)
        
        models = [ng_normal.NGNormal(mu=0, k=1, alpha=1, beta=1) for _ in range(num_actions)]
        calculate_thompson_single_bandit(reward_data_file.format(total_steps), 
                                         num_actions=num_actions, 
                                         dest=dest_file.format(total_steps), 
                                         models=models, 
                                         action_mode=ActionSelectionMode.prob_is_best, 
                                         relearn=True)
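
For orientation, a minimal sketch of the kind of Gaussian reward file this example feeds to the bandit. The column names and exact layout are assumptions for illustration, not taken from generate_normal_distribution_file itself:

import csv
import numpy as np

def write_gaussian_rewards(means, variances, num_steps, path):
    # Hypothetical stand-in for generate_normal_distribution_file: one row per
    # time step, one sampled reward per action, plus the true means for regret.
    rng = np.random.default_rng(0)
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        num_actions = len(means)
        writer.writerow(['ActualRewardAction{0}'.format(a + 1) for a in range(num_actions)] +
                        ['TrueMeanAction{0}'.format(a + 1) for a in range(num_actions)])
        for _ in range(num_steps):
            rewards = [rng.normal(m, np.sqrt(v)) for m, v in zip(means, variances)]
            writer.writerow(rewards + list(means))

write_gaussian_rewards([1.0, 1.5], [3.0, 2.0], 240, 'sim_thompson_1_240.csv')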
Example #2
def switch_bandit_random_thompson(immediate_input, true_input, immediate_output,
                                  true_output, time_step, action_mode,
                                  relearn=True, use_regression=False,
                                  num_actions=3, Lambda=1):
    '''
    Similar to switch_bandit_thompson, except that a random policy is run on the immediate-reward data
    and Thompson sampling takes over once the switch happens.
    :param relearn: At switch time, whether the algorithm will relearn from the beginning.
    '''

    if use_regression:
        models = [RLogReg(D=NUM_FEATURES, Lambda=Lambda) for _ in range(num_actions)]
    else:
        # models = [BetaBern(success=1, failure=1) for _ in range(num_actions)]
        models = [ng_normal.NGNormal(mu=0, v=1, alpha=1, beta=1) for _ in range(num_actions)]

    chosen_actions = calculate_random_single_bandit(
        immediate_input,
        num_actions,
        immediate_output,
        forced=forced_actions(time_step))

    # Switch to true reward input, forcing actions taken previously
    calculate_thompson_single_bandit(
        true_input,
        num_actions,
        true_output,
        models,
        action_mode,
        forced_actions(actions=chosen_actions),
        relearn=relearn)
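
A hedged usage sketch; the file paths and time_step value are purely illustrative:

# Illustrative only: run the random policy for the first 50 steps on the
# immediate-reward file, then replay those actions against the true-reward file.
switch_bandit_random_thompson(
    immediate_input='../../data/test/immediate_rewards.csv',  # hypothetical path
    true_input='../../data/test/true_rewards.csv',            # hypothetical path
    immediate_output='random_immediate_out.csv',
    true_output='thompson_true_out.csv',
    time_step=50,
    action_mode=ActionSelectionMode.prob_is_best,
    relearn=True,
    num_actions=2)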
def run_simulations(num_sims,
                    mean_list,
                    variance,
                    step_sizes,
                    outfile_directory,
                    softmax_beta=None,
                    reordering_fn=None,
                    prior_mean=0,
                    forceActions=0):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in the list step_sizes).
    The bandit uses the thompson_ng sampling policy.
    '''

    for i in range(num_sims):
        for num_steps in step_sizes:
            if forceActions != 0:
                print("Forcing actions:", forceActions)
                forced = make_forced_actions(len(mean_list), num_steps,
                                             forceActions)
            else:
                forced = forced_actions()
            cur_reward_file = get_rewards_filename(outfile_directory,
                                                   num_steps, i)
            # Check if they've passed in one variance for everything or multiple variances
            if not hasattr(variance, '__len__'):
                # only one variance - turn into a list
                variances = [variance] * len(mean_list)
            else:
                # multiple variances - pass straight through
                variances = variance

            generate_single_bandit.generate_normal_distribution_file(
                mean_list, variances, num_steps, cur_reward_file)
            if softmax_beta is not None:
                # reorder rewards
                reordered_reward_file = get_reordered_rewards_filename(
                    outfile_directory, num_steps, i)
                reorder_samples_in_rewards.reorder_rewards_by_quartile(
                    cur_reward_file, reordered_reward_file, reordering_fn,
                    softmax_beta)
            else:
                reordered_reward_file = cur_reward_file
            cur_output_file = get_output_filename(outfile_directory, num_steps,
                                                  i)
            models = [
                ng_normal.NGNormal(mu=prior_mean, k=1, alpha=1, beta=1)
                for _ in range(len(mean_list))
            ]
            thompson_ng_policy.calculate_thompson_single_bandit(
                reordered_reward_file,
                num_actions=len(mean_list),
                dest=cur_output_file,
                models=models,
                action_mode=thompson_ng_policy.ActionSelectionMode.prob_is_best,
                relearn=True,
                forced=forced)
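
A hedged usage sketch; parameter values and the output directory are illustrative, and the helper functions called inside run_simulations are assumed to be in scope:

# Illustrative only: 10 simulations of a 2-arm Gaussian bandit at two sample sizes.
run_simulations(num_sims=10,
                mean_list=[0.2, 0.5],          # hypothetical arm means
                variance=1.0,                  # one shared variance for both arms
                step_sizes=[100, 500],
                outfile_directory='sim_out',   # hypothetical directory
                prior_mean=0)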
Example #4
def switch_bandit_thompson(immediate_input,
                           true_input,
                           immediate_output,
                           true_output,
                           time_step,
                           action_mode,
                           relearn=True,
                           use_regression=False,
                           num_actions=3,
                           Lambda=1):
    '''
    Run the algorithm on the immediate-reward input up to the specified time step, then switch to the true-reward input
    and recompute the policy by keeping the previously taken actions and matching them with the true rewards instead.
    :param immediate_input: The immediate-reward input file.
    :param true_input: The true-reward input file.
    :param immediate_output: The result output file from applying the algorithm to the immediate input.
    :param true_output: The result output file from applying the algorithm to the true input.
    :param time_step: The time step to switch bandit.
    :param action_mode: Indicates how to select actions, see ActionSelectionMode.
    :param relearn: At switch time, whether the algorithm will relearn from beginning.
    :param use_regression: Optional, indicate whether to use logistic regression to model reward distribution.
    :param num_actions: The number of actions in this bandit.
    :param Lambda: The prior inverse variance of the regression weights if regression is used.
    '''

    if use_regression:
        models = [
            RLogReg(D=NUM_FEATURES, Lambda=Lambda) for _ in range(num_actions)
        ]
    else:
        # models = [BetaBern(success=1, failure=1) for _ in range(num_actions)]
        models = [
            ng_normal.NGNormal(mu=0, k=1, alpha=1, beta=1)
            for _ in range(num_actions)
        ]

    # Run on the immediate-reward input up to the switch time step
    chosen_actions, models = calculate_thompson_single_bandit(
        immediate_input,
        num_actions,
        immediate_output,
        models,
        action_mode=action_mode,
        forced=forced_actions(time_step))

    # reset model state so that the algorithm forgets what happened
    for a in range(num_actions):
        models[a].reset_state()

    # Switch to true reward input, forcing actions taken previously
    calculate_thompson_single_bandit(true_input,
                                     num_actions,
                                     true_output,
                                     models,
                                     action_mode,
                                     forced_actions(actions=chosen_actions),
                                     relearn=relearn)
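
The effect of forced_actions(actions=chosen_actions) in the second call can be seen in a tiny self-contained sketch: the actions chosen in the first phase are replayed, but the models learn from the true rewards instead (values below are hypothetical):

# Illustrative only.
chosen_actions = [0, 1, 1, 0]                    # actions picked in phase one
true_rewards = [[0.2, 0.9], [0.1, 0.8],          # hypothetical true rewards,
                [0.4, 0.7], [0.3, 0.6]]          # one row per step, one column per arm

replayed = [(a, true_rewards[t][a]) for t, a in enumerate(chosen_actions)]
# The models are then updated with these (action, true reward) pairs rather than
# with the immediate rewards observed in phase one.
print(replayed)   # [(0, 0.2), (1, 0.8), (1, 0.7), (0, 0.3)]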
def non_parametric_confidence_interval(actions_df,
                                       stat_fn,
                                       prior,
                                       is_binary=True,
                                       num_permutations=5,
                                       epsilon=0,
                                       ci_size=.95,
                                       grid_size=.05,
                                       forced_actions=None):
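    # Test inversion: sweep a grid of candidate effects tau_0, subtract tau_0 from
    # the rewards of action 1, and keep every tau_0 whose permutation p-value fails
    # to reject at level (1 - ci_size). Note that num_actions and debug are assumed
    # to be module-level globals here.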
    in_ci = []
    non_offset_tau_0 = 0
    for grid_offset in np.arange(-3, 3.001, grid_size):
        tau_0 = non_offset_tau_0 + grid_offset

        rewards = actions_df.loc[:, H_ALGO_OBSERVED_REWARD]
        original_actions = actions_df.loc[:, H_ALGO_ACTION]
        rewards_mod = rewards.copy()
        rewards_mod.loc[original_actions == 1] -= tau_0
        actual_stat = stat_fn(original_actions, rewards_mod)

        all_stats = []
        more_extreme_count = 0
        for i in range(num_permutations):
            if is_binary:
                models = [
                    beta_bernoulli.BetaBern(prior[0], prior[1])
                    for _ in range(num_actions)
                ]
            else:
                models = [
                    ng_normal.NGNormal(mu=prior[0],
                                       k=prior[1],
                                       alpha=prior[2],
                                       beta=prior[3])
                    for _ in range(num_actions)
                ]

            chosen_actions, models = calculate_thompson_single_bandit_permutation_testing(
                rewards,
                models,
                epsilon=epsilon,
                forced_actions=forced_actions)
            cur_stat = stat_fn(chosen_actions, rewards_mod)
            if cur_stat >= actual_stat:
                more_extreme_count += 1
            all_stats.append(cur_stat)
            if debug and (i % 100) == 0:
                print(i, "/ num_permutations:", more_extreme_count)
        pvalue = more_extreme_count / num_permutations
        if np.isnan(actual_stat):
            pvalue = np.nan
        if (1 - pvalue) <= ci_size:
            in_ci.append(tau_0)

    return in_ci
def run_simulations_empirical_rewards(num_sims,
                                      reward_file,
                                      experiment_id,
                                      reward_header,
                                      is_cost,
                                      outfile_directory,
                                      prior_mean=0,
                                      forceActions=0,
                                      shuffle_data=False):
    '''
    Runs num_sims bandit simulations using the thompson_ng sampling policy. Assumes reward_file is formatted
    like ASSISTments data, where the reward is present under the column reward_header. Runs for as many
    steps as there are samples available.
    '''
    num_actions = 2
    max_steps = -1
    means = []
    variance = []
    for i in range(num_sims):
        arm_1_rewards, arm_2_rewards = get_assistments_rewards.read_assistments_rewards(
            reward_file, reward_header, experiment_id, is_cost)
        if shuffle_data:
            random.shuffle(arm_1_rewards)
            random.shuffle(arm_2_rewards)
        max_steps = len(arm_1_rewards) + len(arm_2_rewards)
        means = [np.mean(arm_1_rewards), np.mean(arm_2_rewards)]
        variance = [np.var(arm_1_rewards), np.var(arm_2_rewards)]
        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = make_forced_actions(
                num_actions,
                len(arm_1_rewards) + len(arm_2_rewards), forceActions)
        else:
            forced = forced_actions()

        cur_output_file = get_output_filename(
            outfile_directory,
            len(arm_1_rewards) + len(arm_2_rewards), i)
        models = [
            ng_normal.NGNormal(mu=prior_mean, k=1, alpha=1, beta=1)
            for _ in range(num_actions)
        ]
        thompson_ng_policy.calculate_thompson_single_bandit_empirical_params(
            arm_1_rewards,
            arm_2_rewards,
            num_actions=num_actions,
            dest=cur_output_file,
            models=models,
            action_mode=thompson_ng_policy.ActionSelectionMode.prob_is_best,
            relearn=True,
            forced=forced)
    return max_steps, means, variance
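
A hedged usage sketch; the file path, experiment id, and reward column name are hypothetical:

# Illustrative only.
max_steps, means, variance = run_simulations_empirical_rewards(
    num_sims=10,
    reward_file='assistments_rewards.csv',   # hypothetical path
    experiment_id='experiment_1',            # hypothetical id
    reward_header='reward',                  # hypothetical column name
    is_cost=False,
    outfile_directory='sim_out_empirical',
    shuffle_data=True)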
def run_simulations_uniform_random(num_sims,
                                   mean_list,
                                   variance,
                                   steps_before_switch,
                                   steps_after_switch,
                                   outfile_directory,
                                   forceActions=0,
                                   switch_to_best_if_nonsignificant=True):
    '''
    Runs num_sims bandit simulations of steps_before_switch + steps_after_switch steps each.
    Samples uniformly at random before switching to a fixed policy.
    '''

    for i in range(num_sims):
        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = make_forced_actions(len(mean_list), steps_before_switch,
                                         forceActions)
        else:
            forced = forced_actions()
        cur_reward_file = get_rewards_filename(
            outfile_directory, steps_before_switch + steps_after_switch, i)
        # Check if they've passed in one variance for everything or multiple variances
        if not hasattr(variance, '__len__'):
            # only one variance - turn into a list
            variances = [variance] * len(mean_list)
        else:
            # multiple variances - pass straight through
            variances = variance
        generate_single_bandit.generate_normal_distribution_file(
            mean_list, variances, steps_before_switch + steps_after_switch,
            cur_reward_file)
        #
        cur_output_file = get_output_filename(
            outfile_directory, steps_before_switch + steps_after_switch, i)
        models = [
            ng_normal.NGNormal(mu=0, k=1, alpha=1, beta=1)
            for _ in range(len(mean_list))
        ]

        thompson_ng_policy.calculate_thompson_switch_to_fixed_policy(
            cur_reward_file,
            num_actions=len(mean_list),
            dest=cur_output_file,
            num_actions_before_switch=steps_before_switch,
            models=models,
            switch_to_best_if_nonsignificant=switch_to_best_if_nonsignificant,
            epsilon=1.0,
            action_mode=thompson_ng_policy.ActionSelectionMode.prob_is_best,
            forced=forced)
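
A hedged usage sketch; values are illustrative: uniform-random sampling for the first 100 steps, then the remaining steps under the fixed policy chosen at the switch:

# Illustrative only.
run_simulations_uniform_random(
    num_sims=10,
    mean_list=[0.2, 0.5],
    variance=[1.0, 1.5],
    steps_before_switch=100,
    steps_after_switch=100,
    outfile_directory='sim_out_uniform',     # hypothetical directory
    switch_to_best_if_nonsignificant=True)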
def get_models_from_simulation(simulation_out_file, is_binary=True):
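    # header=1 skips the group-header line that calculate_thompson_single_bandit
    # prints above the real column header; only the last row (the final posterior
    # parameters) is needed to rebuild the models.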
    df = pd.read_csv(simulation_out_file, header=1)
    last_row = df.iloc[-1, :]
    if is_binary:
        # Action1SuccessCount
        models = [
            beta_bernoulli.BetaBern(
                last_row.loc['Action' + str(i) + 'SuccessCount'],
                last_row.loc['Action' + str(i) + 'FailureCount'])
            for i in range(1, 3)
        ]
    else:
        models = [
            ng_normal.NGNormal(
                mu=last_row.loc['Action' + str(i) + 'EstimatedMu'],
                k=last_row.loc['Action' + str(i) + 'EstimatedVariance'],
                alpha=last_row.loc['Action' + str(i) + 'EstimatedAlpha'],
                beta=last_row.loc['Action' + str(i) + 'EstimatedBeta'])
            for i in range(1, 3)
        ]
    return models
def permutation_test(actions_df,
                     stat_fn,
                     prior,
                     is_binary=True,
                     num_permutations=5,
                     epsilon=0,
                     forced_actions=None):
    rewards = actions_df.loc[:, H_ALGO_OBSERVED_REWARD]  # "ObservedRewardofAction"
    original_actions = actions_df.loc[:, H_ALGO_ACTION]  # "AlgorithmAction"
    actual_stat = stat_fn(original_actions, rewards)

    all_stats = []
    more_extreme_count = 0
    for i in range(num_permutations):
        if is_binary:
            models = [
                beta_bernoulli.BetaBern(prior[0], prior[1])
                for _ in range(num_actions)
            ]
        else:
            models = [
                ng_normal.NGNormal(mu=prior[0],
                                   k=prior[1],
                                   alpha=prior[2],
                                   beta=prior[3]) for _ in range(num_actions)
            ]

        chosen_actions, models = calculate_thompson_single_bandit_permutation_testing(
            rewards, models, epsilon=epsilon, forced_actions=forced_actions)
        cur_stat = stat_fn(chosen_actions, rewards)
        if cur_stat >= actual_stat:
            more_extreme_count += 1
        all_stats.append(cur_stat)
        if debug and (i % 100) == 0:
            print(i, "/ num_permutations:", more_extreme_count)
    pvalue = more_extreme_count / num_permutations
    if np.isnan(actual_stat):
        pvalue = np.nan
    return pvalue, all_stats, actual_stat
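
For orientation, a minimal, self-contained permutation test on a difference-in-means statistic. It mirrors the p-value computation above but replaces the bandit re-simulation with plain shuffling of the action labels:

import numpy as np

def permutation_pvalue(actions, rewards, num_permutations=1000, seed=0):
    # Difference in mean reward between arm 1 and arm 0.
    def stat(a, r):
        return r[a == 1].mean() - r[a == 0].mean()

    rng = np.random.default_rng(seed)
    actions = np.asarray(actions)
    rewards = np.asarray(rewards)
    actual = stat(actions, rewards)
    more_extreme = 0
    for _ in range(num_permutations):
        permuted = rng.permutation(actions)   # break any action-reward association
        if stat(permuted, rewards) >= actual:
            more_extreme += 1
    return more_extreme / num_permutations

# e.g. permutation_pvalue([0, 1, 1, 0, 1], [0.1, 0.9, 0.8, 0.2, 0.7])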
Example #10
def calculate_thompson_single_bandit(
        source,
        num_actions,
        dest,
        models=None,
        action_mode=ActionSelectionMode.prob_is_best,
        forced=forced_actions(),
        relearn=True):
    '''
    Calculates non-contextual Thompson sampling actions and weights.
    :param source: simulated single-bandit data file with default rewards for each action and true probs.
    :param num_actions: number of actions for this bandit
    :param dest: outfile for printing the chosen actions and received rewards.
    :param models: models for each action's probability distribution.
    :param action_mode: Indicates how to select actions, see ActionSelectionMode.
    :param forced: Optional, indicates to process only up to a certain time step or to force-take specified actions.
    :param relearn: Optional, at switch time, whether the algorithm relearns on previous time steps using the actions taken previously.
    '''
    # number of trials used to run Thompson sampling to compute expectation stats;
    # set to a small value when debugging for faster speed
    num_trials_prob_best_action = 100

    if models is None:
        models = [
            ng_normal.NGNormal(mu=0, v=1, alpha=1, beta=1)
            for cond in range(num_actions)
        ]

    with open(source, newline='') as inf, open(dest, 'w', newline='') as outf:
        reader = csv.DictReader(inf)

        # Construct output column header names
        field_names = reader.fieldnames
        field_names_out, group_header = create_headers(field_names,
                                                       num_actions)

        print(','.join(group_header), file=outf)

        writer = csv.DictWriter(outf, fieldnames=field_names_out)
        writer.writeheader()

        sample_number = 0
        cumulative_sample_regret = 0
        cumulative_expected_regret = 0

        chosen_actions = []

        for row in reader:
            sample_number += 1

            # get context features
            context = get_context(row)

            should_update_posterior = True

            if len(forced.actions) == 0 or sample_number > len(forced.actions):
                # first decide which arm we'd pull using Thompson
                # (do the random sampling, the max is the one we'd choose)
                samples = [
                    models[a].draw_expected_value(context)
                    for a in range(num_actions)
                ]

                if action_mode == ActionSelectionMode.prob_is_best:
                    # find the max of samples[i] etc and choose an arm
                    action = np.argmax(samples)
                else:
                    # take action in proportion to expected rewards
                    # draw samples and normalize to use as a discrete distribution
                    # action is taken by sampling from this discrete distribution
                    probs = samples / np.sum(samples)
                    rand = np.random.rand()
                    for a in range(num_actions):
                        if rand <= probs[a]:
                            action = a
                            break
                        rand -= probs[a]

            else:
                samples = [0 for a in range(num_actions)]
                # take forced action if requested
                action = forced.actions[sample_number - 1]

                if not relearn:
                    should_update_posterior = False

            # get reward signals
            observed_rewards = [
                float(row[HEADER_ACTUALREWARD.format(a + 1)])
                for a in range(num_actions)
            ]
            reward = observed_rewards[action]

            if should_update_posterior:
                # update posterior distribution with observed reward
                models[action].update_posterior(context, reward)

            # only return action chosen up to specified time step
            if forced.time_step > 0 and sample_number <= forced.time_step:
                chosen_actions.append(action)
                # save the model state in order so we can restore it
                # after switching to the true reward data.
                if sample_number == forced.time_step:
                    for a in range(num_actions):
                        models[a].save_state()

            # copy the input data to output file
            out_row = {}

            for field in reader.fieldnames:
                out_row[field] = row[field]

            # write performance data (e.g. regret)
            means = [
                float(row[HEADER_TRUEMEAN.format(a + 1)])
                for a in range(num_actions)
            ]

            optimal_action = int(row[HEADER_OPTIMALACTION]) - 1
            optimal_action_reward = means[optimal_action]
            sample_regret = optimal_action_reward - reward
            cumulative_sample_regret += sample_regret

            # true_probs = [float(row[HEADER_TRUEPROB.format(a + 1)]) for a in range(num_actions)]

            # # The oracle always chooses the best arm, thus expected reward
            # # is simply the probability of that arm getting a reward.
            # optimal_expected_reward = true_probs[optimal_action] * num_trials_prob_best_action
            #
            # # Run thompson sampling many times and calculate how much reward it would
            # # have gotten based on the chosen actions.
            chosen_action_counts = run_thompson_trial(
                context, num_trials_prob_best_action, num_actions, models)
            # expected_reward = np.sum(chosen_action_counts[a] * true_probs[a] for a in range(num_actions))

            optimal_expected_reward = means[optimal_action] * num_trials_prob_best_action
            # use the builtin sum: np.sum over a generator is deprecated in NumPy
            expected_reward = sum(chosen_action_counts[a] * means[a]
                                  for a in range(num_actions))

            expected_regret = optimal_expected_reward - expected_reward
            cumulative_expected_regret += expected_regret

            write_performance(out_row, action, optimal_action, reward,
                              sample_regret, cumulative_sample_regret,
                              expected_regret, cumulative_expected_regret)

            write_parameters(out_row, action, samples, models,
                             chosen_action_counts, num_actions,
                             num_trials_prob_best_action)

            writer.writerow(out_row)

        return chosen_actions, models
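
Finally, a minimal, self-contained sketch of Thompson sampling with a Normal-Gamma posterior over each arm's unknown mean and precision. The conjugate update below is the textbook one and is an assumption about what ng_normal.NGNormal implements; it is not taken from that module:

import numpy as np

class NormalGammaArm:
    # Conjugate Normal-Gamma posterior for Gaussian rewards with unknown mean and precision.
    def __init__(self, mu=0.0, k=1.0, alpha=1.0, beta=1.0):
        self.mu, self.k, self.alpha, self.beta = mu, k, alpha, beta

    def update(self, x):
        # Standard single-observation conjugate update (uses the old mu and k first).
        self.beta += self.k * (x - self.mu) ** 2 / (2.0 * (self.k + 1.0))
        self.mu = (self.k * self.mu + x) / (self.k + 1.0)
        self.k += 1.0
        self.alpha += 0.5

    def sample_mean(self, rng):
        # Draw precision from Gamma(alpha, rate=beta), then the mean given that precision.
        tau = rng.gamma(self.alpha, 1.0 / self.beta)
        return rng.normal(self.mu, np.sqrt(1.0 / (self.k * tau)))

rng = np.random.default_rng(0)
true_means, true_sds = [1.0, 1.5], [1.7, 1.4]            # hypothetical arm parameters
arms = [NormalGammaArm() for _ in true_means]
for _ in range(500):
    samples = [arm.sample_mean(rng) for arm in arms]     # prob_is_best selection
    a = int(np.argmax(samples))
    reward = rng.normal(true_means[a], true_sds[a])
    arms[a].update(reward)
print([round(arm.mu, 2) for arm in arms])                # posterior mean estimates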