def main(directory='../../data/test'):
    """
    Generate rewards for 2 bandits from normal (Gaussian) distributions
    given mean and variance pairs, then run Thompson sampling on each file.
    """
    num_actions = 2

    # # init with inverse variance
    # models = [RLogReg(D=NUM_FEATURES, Lambda=1) for cond in range(num_actions)]

    mean_list = [1., 1.5, 3., 2.]
    # v_list = [1., 2., 2., 1.]
    v_list = [3., 2., 2., 2.]

    reward_data_file = directory + '/sim_thompson_1_{0}.csv'
    dest_file = directory + '/sim_thompson_one_bandit_{0}.csv'

    for i in range(1, 6):
        total_steps = (i + i) * 120
        input_thompson_reward_1 = generate_normal_distribution_file(
            mean_list[:num_actions], v_list[:num_actions], total_steps,
            reward_data_file.format(total_steps))
        # input_thompson_reward_2 = generate_normal_distribution_file(
        #     mean_list[num_actions:], v_list[num_actions:], total_steps,
        #     '../../data/test/sim_thompson_2_{0}.csv'.format(total_steps))

        # calculate_thompson_single_bandit('simulated_single_bandit_input.csv', 3, 'simulated_single_bandit_thompson.csv')
        # calculate_thompson_single_bandit('contextual_single_bandit.csv', 3, 'contextual_single_bandit_thompson.csv', models)

        models = [ng_normal.NGNormal(mu=0, k=1, alpha=1, beta=1) for _ in range(num_actions)]
        calculate_thompson_single_bandit(reward_data_file.format(total_steps),
                                         num_actions=num_actions,
                                         dest=dest_file.format(total_steps),
                                         models=models,
                                         action_mode=ActionSelectionMode.prob_is_best,
                                         relearn=True)

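
# Illustrative usage sketch (not part of the original script): a single run of
# the same generate-then-simulate pipeline that main() loops over. The
# directory, file names, means, and variances below are hypothetical
# placeholders; the helpers and defaults come from this file.
def _example_single_run(directory='../../data/test', total_steps=240):
    reward_file = directory + '/example_rewards_{0}.csv'.format(total_steps)
    out_file = directory + '/example_thompson_{0}.csv'.format(total_steps)
    # two arms with Normal rewards: means 1.0 and 1.5, variances 3.0 and 2.0
    generate_normal_distribution_file([1., 1.5], [3., 2.], total_steps, reward_file)
    # leaving models unset lets calculate_thompson_single_bandit build its default NGNormal priors
    return calculate_thompson_single_bandit(reward_file,
                                            num_actions=2,
                                            dest=out_file,
                                            action_mode=ActionSelectionMode.prob_is_best,
                                            relearn=True)
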
def switch_bandit_random_thompson(immediate_input, true_input,
                                  immediate_output, true_output,
                                  time_step, action_mode,
                                  relearn=True,
                                  use_regression=False,
                                  num_actions=3,
                                  Lambda=1):
    '''
    Similar to switch_bandit_thompson except that the Random policy is run on the
    immediate data instead and Thompson sampling takes over once the switch happens.

    :param relearn: At switch time, whether the algorithm will relearn from beginning.
    '''
    if use_regression:
        models = [RLogReg(D=NUM_FEATURES, Lambda=Lambda) for _ in range(num_actions)]
    else:
        # models = [BetaBern(success=1, failure=1) for _ in range(num_actions)]
        models = [ng_normal.NGNormal(mu=0, v=1, alpha=1, beta=1) for _ in range(num_actions)]

    chosen_actions = calculate_random_single_bandit(
        immediate_input,
        num_actions,
        immediate_output,
        forced=forced_actions(time_step))

    # Switch to true reward input, forcing actions taken previously
    calculate_thompson_single_bandit(
        true_input,
        num_actions,
        true_output,
        models,
        action_mode,
        forced_actions(actions=chosen_actions),
        relearn=relearn)

def run_simulations(num_sims, mean_list, variance, step_sizes, outfile_directory,
                    softmax_beta=None, reordering_fn=None, prior_mean=0, forceActions=0):
    '''
    Runs num_sims bandit simulations with several different sample sizes
    (those in the list step_sizes). The bandit uses the thompson_ng sampling policy.
    '''
    for i in range(num_sims):
        for num_steps in step_sizes:
            if forceActions != 0:
                print("Forcing actions:", forceActions)
                forced = make_forced_actions(len(mean_list), num_steps, forceActions)
            else:
                forced = forced_actions()

            cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)

            # Check if they've passed in one variance for everything or multiple variances
            if not hasattr(variance, '__len__'):
                # only one variance - turn into a list
                variances = [variance] * len(mean_list)
            else:
                # multiple variances - pass straight through
                variances = variance

            generate_single_bandit.generate_normal_distribution_file(
                mean_list, variances, num_steps, cur_reward_file)

            if softmax_beta is not None:
                # reorder rewards
                reordered_reward_file = get_reordered_rewards_filename(
                    outfile_directory, num_steps, i)
                reorder_samples_in_rewards.reorder_rewards_by_quartile(
                    cur_reward_file, reordered_reward_file, reordering_fn, softmax_beta)
            else:
                reordered_reward_file = cur_reward_file

            cur_output_file = get_output_filename(outfile_directory, num_steps, i)
            models = [
                ng_normal.NGNormal(mu=prior_mean, k=1, alpha=1, beta=1)
                for _ in range(len(mean_list))
            ]
            thompson_ng_policy.calculate_thompson_single_bandit(
                reordered_reward_file,
                num_actions=len(mean_list),
                dest=cur_output_file,
                models=models,
                action_mode=thompson_ng_policy.ActionSelectionMode.prob_is_best,
                relearn=True,
                forced=forced)

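
# Illustrative usage sketch (hypothetical values and output directory): two
# replications of a two-armed Normal bandit at sample sizes 100 and 200, with a
# single scalar variance that run_simulations expands to one value per arm.
def _example_run_simulations():
    run_simulations(num_sims=2,
                    mean_list=[0.0, 0.3],
                    variance=1.0,
                    step_sizes=[100, 200],
                    outfile_directory='../../data/test/ng_sims',
                    prior_mean=0)
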
def switch_bandit_thompson(immediate_input, true_input, immediate_output, true_output,
                           time_step, action_mode,
                           relearn=True,
                           use_regression=False,
                           num_actions=3,
                           Lambda=1):
    '''
    Run the algorithm on the immediate-reward input up to the specified time step,
    then switch to the true-reward input and recompute the policy by keeping the
    previously taken actions and matching them with the true rewards instead.

    :param immediate_input: The immediate-reward input file.
    :param true_input: The true-reward input file.
    :param immediate_output: The result output file from applying the algorithm to the immediate input.
    :param true_output: The result output file from applying the algorithm to the true input.
    :param time_step: The time step at which to switch bandit.
    :param action_mode: Indicates how to select actions, see ActionSelectionMode.
    :param relearn: At switch time, whether the algorithm will relearn from beginning.
    :param use_regression: Optional, indicates whether to use logistic regression to model the reward distribution.
    :param num_actions: The number of actions in this bandit.
    :param Lambda: The prior inverse variance of the regression weights, if regression is used.
    '''
    if use_regression:
        models = [
            RLogReg(D=NUM_FEATURES, Lambda=Lambda) for _ in range(num_actions)
        ]
    else:
        # models = [BetaBern(success=1, failure=1) for _ in range(num_actions)]
        models = [
            ng_normal.NGNormal(mu=0, k=1, alpha=1, beta=1) for _ in range(num_actions)
        ]

    # Run on the immediate-reward input up to the switch time step
    chosen_actions, models = calculate_thompson_single_bandit(
        immediate_input,
        num_actions,
        immediate_output,
        models,
        action_mode=action_mode,
        forced=forced_actions(time_step))

    # reset model state so that the algorithm forgets what happened
    for a in range(num_actions):
        models[a].reset_state()

    # Switch to true reward input, forcing actions taken previously
    calculate_thompson_single_bandit(true_input,
                                     num_actions,
                                     true_output,
                                     models,
                                     action_mode,
                                     forced_actions(actions=chosen_actions),
                                     relearn=relearn)

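
# Illustrative usage sketch for the two switch_bandit_* helpers above. The file
# names and the switch point of 20 steps are hypothetical; only the keyword
# names come from the function signatures in this file.
def _example_switch_bandit():
    switch_bandit_thompson(immediate_input='immediate_rewards.csv',
                           true_input='true_rewards.csv',
                           immediate_output='immediate_thompson_out.csv',
                           true_output='true_thompson_out.csv',
                           time_step=20,
                           action_mode=ActionSelectionMode.prob_is_best,
                           relearn=True,
                           num_actions=2)
    # Same switch, but with uniform-random action selection before the switch
    switch_bandit_random_thompson(immediate_input='immediate_rewards.csv',
                                  true_input='true_rewards.csv',
                                  immediate_output='immediate_random_out.csv',
                                  true_output='true_random_out.csv',
                                  time_step=20,
                                  action_mode=ActionSelectionMode.prob_is_best,
                                  relearn=True,
                                  num_actions=2)
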
def non_parametric_confidence_interval(actions_df, stat_fn, prior,
                                       is_binary=True,
                                       num_permutations=5,
                                       epsilon=0,
                                       ci_size=.95,
                                       grid_size=.05,
                                       forced_actions=None):
    # Inverts the permutation test over a grid of hypothesized effects tau_0;
    # num_actions and debug are expected to be defined at module level.
    in_ci = []
    non_offset_tau_0 = 0
    for grid_offset in np.arange(-3, 3.001, grid_size):
        tau_0 = non_offset_tau_0 + grid_offset
        rewards = actions_df.loc[:, H_ALGO_OBSERVED_REWARD]
        original_actions = actions_df.loc[:, H_ALGO_ACTION]

        # Shift the rewards of action 1 by the hypothesized effect tau_0
        rewards_mod = rewards.copy()
        rewards_mod.loc[original_actions == 1] = rewards_mod.loc[original_actions == 1] - tau_0

        actual_stat = stat_fn(original_actions, rewards_mod)
        all_stats = []
        more_extreme_count = 0
        for i in range(num_permutations):
            if is_binary:
                models = [
                    beta_bernoulli.BetaBern(prior[0], prior[1])
                    for _ in range(num_actions)
                ]
            else:
                models = [
                    ng_normal.NGNormal(mu=prior[0], k=prior[1], alpha=prior[2], beta=prior[3])
                    for _ in range(num_actions)
                ]
            chosen_actions, models = calculate_thompson_single_bandit_permutation_testing(
                rewards, models, epsilon=epsilon, forced_actions=forced_actions)
            cur_stat = stat_fn(chosen_actions, rewards_mod)
            if cur_stat >= actual_stat:
                more_extreme_count += 1
            all_stats.append(cur_stat)
            if debug and (i % 100) == 0:
                print(i, "/ num_permutations:", more_extreme_count)

        pvalue = more_extreme_count / num_permutations
        if np.isnan(actual_stat):
            pvalue = np.nan
        # Keep tau_0 in the interval if the permutation test fails to reject it
        if (1 - pvalue) <= ci_size:
            in_ci.append(tau_0)
    return in_ci

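
# Illustrative sketch (hypothetical statistic and prior): invert the permutation
# test over the tau_0 grid to get a ~95% confidence interval for the effect of
# action 1. actions_df and stat_fn are as in the permutation_test example in
# this file; the prior [mu, k, alpha, beta] = [0, 1, 1, 1] is an arbitrary
# example choice.
def _example_confidence_interval(actions_df, stat_fn):
    return non_parametric_confidence_interval(actions_df,
                                              stat_fn,
                                              prior=[0, 1, 1, 1],
                                              is_binary=False,
                                              num_permutations=100,
                                              ci_size=.95,
                                              grid_size=.1)
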
def run_simulations_empirical_rewards(num_sims, reward_file, experiment_id,
                                      reward_header, is_cost, outfile_directory,
                                      prior_mean=0, forceActions=0, shuffle_data=False):
    '''
    Runs num_sims bandit simulations on empirical rewards read from reward_file.
    The bandit uses the thompson_ng sampling policy. Assumes reward_file is formatted
    like ASSISTments data, where the reward is present under the column reward_header.
    Runs for as many steps as it is able to gain samples.
    '''
    num_actions = 2
    max_steps = -1
    means = []
    variance = []
    for i in range(num_sims):
        arm_1_rewards, arm_2_rewards = get_assistments_rewards.read_assistments_rewards(
            reward_file, reward_header, experiment_id, is_cost)
        if shuffle_data:
            random.shuffle(arm_1_rewards)
            random.shuffle(arm_2_rewards)
        max_steps = len(arm_1_rewards) + len(arm_2_rewards)
        means = [np.mean(arm_1_rewards), np.mean(arm_2_rewards)]
        variance = [np.var(arm_1_rewards), np.var(arm_2_rewards)]

        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = make_forced_actions(
                num_actions, len(arm_1_rewards) + len(arm_2_rewards), forceActions)
        else:
            forced = forced_actions()

        cur_output_file = get_output_filename(
            outfile_directory, len(arm_1_rewards) + len(arm_2_rewards), i)
        models = [
            ng_normal.NGNormal(mu=prior_mean, k=1, alpha=1, beta=1)
            for _ in range(num_actions)
        ]
        thompson_ng_policy.calculate_thompson_single_bandit_empirical_params(
            arm_1_rewards,
            arm_2_rewards,
            num_actions=num_actions,
            dest=cur_output_file,
            models=models,
            action_mode=thompson_ng_policy.ActionSelectionMode.prob_is_best,
            relearn=True,
            forced=forced)
    return max_steps, means, variance

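
# Illustrative usage sketch (hypothetical file, experiment id, and column name):
# replay an ASSISTments-style reward log through the thompson_ng policy five
# times, shuffling each arm's rewards before every replication.
def _example_empirical_simulations():
    max_steps, means, variances = run_simulations_empirical_rewards(
        num_sims=5,
        reward_file='assistments_rewards.csv',
        experiment_id='experiment_1',
        reward_header='reward',
        is_cost=False,
        outfile_directory='../../data/test/empirical_sims',
        shuffle_data=True)
    return max_steps, means, variances
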
def run_simulations_uniform_random(num_sims, mean_list, variance,
                                   steps_before_switch, steps_after_switch,
                                   outfile_directory, forceActions=0,
                                   switch_to_best_if_nonsignificant=True):
    '''
    Runs num_sims bandit simulations that sample uniformly at random (epsilon=1.0)
    for steps_before_switch steps and then switch to a fixed policy for the
    remaining steps_after_switch steps.
    '''
    for i in range(num_sims):
        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = make_forced_actions(len(mean_list), steps_before_switch, forceActions)
        else:
            forced = forced_actions()

        cur_reward_file = get_rewards_filename(
            outfile_directory, steps_before_switch + steps_after_switch, i)

        # Check if they've passed in one variance for everything or multiple variances
        if not hasattr(variance, '__len__'):
            # only one variance - turn into a list
            variances = [variance] * len(mean_list)
        else:
            # multiple variances - pass straight through
            variances = variance

        generate_single_bandit.generate_normal_distribution_file(
            mean_list, variances, steps_before_switch + steps_after_switch,
            cur_reward_file)

        cur_output_file = get_output_filename(
            outfile_directory, steps_before_switch + steps_after_switch, i)
        models = [
            ng_normal.NGNormal(mu=0, k=1, alpha=1, beta=1)
            for _ in range(len(mean_list))
        ]
        thompson_ng_policy.calculate_thompson_switch_to_fixed_policy(
            cur_reward_file,
            num_actions=len(mean_list),
            dest=cur_output_file,
            num_actions_before_switch=steps_before_switch,
            models=models,
            switch_to_best_if_nonsignificant=switch_to_best_if_nonsignificant,
            epsilon=1.0,
            action_mode=thompson_ng_policy.ActionSelectionMode.prob_is_best,
            forced=forced)

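
# Illustrative usage sketch (hypothetical values): sample uniformly at random
# for the first 50 steps, then let calculate_thompson_switch_to_fixed_policy
# switch to a fixed arm for the remaining 100 steps, across three replications.
def _example_uniform_random_then_fixed():
    run_simulations_uniform_random(num_sims=3,
                                   mean_list=[0.0, 0.5],
                                   variance=[1.0, 1.0],
                                   steps_before_switch=50,
                                   steps_after_switch=100,
                                   outfile_directory='../../data/test/uniform_sims',
                                   switch_to_best_if_nonsignificant=True)
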
def get_models_from_simulation(simulation_out_file, is_binary=True):
    df = pd.read_csv(simulation_out_file, header=1)
    last_row = df.iloc[df.shape[0] - 1, :]
    if is_binary:
        # e.g. Action1SuccessCount / Action1FailureCount columns
        models = [
            beta_bernoulli.BetaBern(
                last_row.loc['Action' + str(i) + 'SuccessCount'],
                last_row.loc['Action' + str(i) + 'FailureCount'])
            for i in range(1, 3)
        ]
    else:
        models = [
            ng_normal.NGNormal(
                mu=last_row.loc['Action' + str(i) + 'EstimatedMu'],
                k=last_row.loc['Action' + str(i) + 'EstimatedVariance'],
                alpha=last_row.loc['Action' + str(i) + 'EstimatedAlpha'],
                beta=last_row.loc['Action' + str(i) + 'EstimatedBeta'])
            for i in range(1, 3)
        ]
    return models

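
# Illustrative sketch (hypothetical file name): recover the posterior models
# from the last row of a finished simulation, e.g. to warm-start a follow-up
# run with the same per-action estimates.
def _example_models_from_previous_run():
    models = get_models_from_simulation('sim_thompson_out.csv', is_binary=False)
    # models[0] and models[1] are the NGNormal posteriors for actions 1 and 2
    return models
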
def permutation_test(actions_df, stat_fn, prior,
                     is_binary=True,
                     num_permutations=5,
                     epsilon=0,
                     forced_actions=None):
    rewards = actions_df.loc[:, H_ALGO_OBSERVED_REWARD]  # "ObservedRewardofAction"
    original_actions = actions_df.loc[:, H_ALGO_ACTION]  # "AlgorithmAction"

    actual_stat = stat_fn(original_actions, rewards)
    all_stats = []
    more_extreme_count = 0
    for i in range(num_permutations):
        if is_binary:
            models = [
                beta_bernoulli.BetaBern(prior[0], prior[1])
                for _ in range(num_actions)
            ]
        else:
            models = [
                ng_normal.NGNormal(mu=prior[0], k=prior[1], alpha=prior[2], beta=prior[3])
                for _ in range(num_actions)
            ]
        chosen_actions, models = calculate_thompson_single_bandit_permutation_testing(
            rewards, models, epsilon=epsilon, forced_actions=forced_actions)
        cur_stat = stat_fn(chosen_actions, rewards)
        if cur_stat >= actual_stat:
            more_extreme_count += 1
        all_stats.append(cur_stat)
        if debug and (i % 100) == 0:
            print(i, "/ num_permutations:", more_extreme_count)
    pvalue = more_extreme_count / num_permutations
    if np.isnan(actual_stat):
        pvalue = np.nan
    return pvalue, all_stats, actual_stat

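
# Illustrative sketch of a permutation test on a tiny hand-made actions table.
# The statistic (difference in mean observed reward between arms 1 and 2), the
# prior [mu, k, alpha, beta] = [0, 1, 1, 1], and the 1/2 action coding are all
# hypothetical choices; H_ALGO_ACTION and H_ALGO_OBSERVED_REWARD are this
# module's column-name constants, and pandas (pd) plus the module-level
# num_actions and debug globals are assumed to be available.
def _example_permutation_test():
    actions_df = pd.DataFrame({
        H_ALGO_ACTION: [1, 2, 1, 2, 1, 2],
        H_ALGO_OBSERVED_REWARD: [1.0, 0.2, 0.8, 0.1, 1.2, 0.3],
    })

    def mean_difference(actions, rewards):
        # difference in mean reward between arm 1 and arm 2
        actions = np.asarray(actions)
        rewards = np.asarray(rewards)
        return rewards[actions == 1].mean() - rewards[actions == 2].mean()

    pvalue, all_stats, actual_stat = permutation_test(actions_df,
                                                      mean_difference,
                                                      prior=[0, 1, 1, 1],
                                                      is_binary=False,
                                                      num_permutations=100)
    return pvalue
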
def calculate_thompson_single_bandit(source, num_actions, dest,
                                     models=None,
                                     action_mode=ActionSelectionMode.prob_is_best,
                                     forced=forced_actions(),
                                     relearn=True):
    '''
    Calculates non-contextual Thompson sampling actions and weights.

    :param source: simulated single-bandit data file with default rewards for each action and true probs.
    :param num_actions: number of actions for this bandit.
    :param dest: outfile for printing the chosen actions and received rewards.
    :param models: models for each action's probability distribution.
    :param action_mode: Indicates how to select actions, see ActionSelectionMode.
    :param forced: Optional, indicates to process only up to a certain time step or to force-take specified actions.
    :param relearn: Optional, at switch time, whether the algorithm relearns on previous time steps using actions taken previously.
    '''
    # number of trials used to run Thompson Sampling to compute expectation stats
    # set to a small value when debugging for faster speed
    num_trials_prob_best_action = int(100)

    if models is None:
        models = [
            ng_normal.NGNormal(mu=0, v=1, alpha=1, beta=1)
            for cond in range(num_actions)
        ]

    with open(source, newline='') as inf, open(dest, 'w', newline='') as outf:
        reader = csv.DictReader(inf)

        # Construct output column header names
        field_names = reader.fieldnames
        field_names_out, group_header = create_headers(field_names, num_actions)
        print(','.join(group_header), file=outf)

        writer = csv.DictWriter(outf, fieldnames=field_names_out)
        writer.writeheader()

        sample_number = 0
        cumulative_sample_regret = 0
        cumulative_expected_regret = 0
        chosen_actions = []

        for row in reader:
            sample_number += 1

            # get context features
            context = get_context(row)

            should_update_posterior = True

            if len(forced.actions) == 0 or sample_number > len(forced.actions):
                # first decide which arm we'd pull using Thompson sampling
                # (do the random sampling; the max is the one we'd choose)
                samples = [
                    models[a].draw_expected_value(context)
                    for a in range(num_actions)
                ]
                if action_mode == ActionSelectionMode.prob_is_best:
                    # find the max of the samples and choose that arm
                    action = np.argmax(samples)
                else:
                    # take action in proportion to expected rewards:
                    # draw samples and normalize to use as a discrete distribution,
                    # then take the action by sampling from this discrete distribution
                    probs = samples / np.sum(samples)
                    rand = np.random.rand()
                    for a in range(num_actions):
                        if rand <= probs[a]:
                            action = a
                            break
                        rand -= probs[a]
            else:
                samples = [0 for a in range(num_actions)]
                # take forced action if requested
                action = forced.actions[sample_number - 1]
                if relearn == False:
                    should_update_posterior = False

            # get reward signals
            observed_rewards = [
                float(row[HEADER_ACTUALREWARD.format(a + 1)])
                for a in range(num_actions)
            ]
            reward = observed_rewards[action]

            if should_update_posterior:
                # update posterior distribution with observed reward
                models[action].update_posterior(context, reward)

            # only return actions chosen up to the specified time step
            if forced.time_step > 0 and sample_number <= forced.time_step:
                chosen_actions.append(action)

                # save the model state so we can restore it
                # after switching to the true reward data
                if sample_number == forced.time_step:
                    for a in range(num_actions):
                        models[a].save_state()

            # copy the input data to the output file
            out_row = {}
            for i in range(len(reader.fieldnames)):
                out_row[reader.fieldnames[i]] = row[reader.fieldnames[i]]

            # write performance data (e.g. regret)
            means = [
                float(row[HEADER_TRUEMEAN.format(a + 1)])
                for a in range(num_actions)
            ]
            optimal_action = int(row[HEADER_OPTIMALACTION]) - 1
            optimal_action_reward = means[optimal_action]
            sample_regret = optimal_action_reward - reward
            cumulative_sample_regret += sample_regret

            # true_probs = [float(row[HEADER_TRUEPROB.format(a + 1)]) for a in range(num_actions)]
            # # The oracle always chooses the best arm, thus expected reward
            # # is simply the probability of that arm getting a reward.
            # optimal_expected_reward = true_probs[optimal_action] * num_trials_prob_best_action
            #
            # # Run thompson sampling many times and calculate how much reward it would
            # # have gotten based on the chosen actions.
            chosen_action_counts = run_thompson_trial(
                context, num_trials_prob_best_action, num_actions, models)
            # expected_reward = np.sum(chosen_action_counts[a] * true_probs[a] for a in range(num_actions))

            optimal_expected_reward = means[optimal_action] * num_trials_prob_best_action
            expected_reward = np.sum(chosen_action_counts[a] * means[a]
                                     for a in range(num_actions))
            expected_regret = optimal_expected_reward - expected_reward
            cumulative_expected_regret += expected_regret

            write_performance(out_row, action, optimal_action, reward,
                              sample_regret, cumulative_sample_regret,
                              expected_regret, cumulative_expected_regret)
            write_parameters(out_row, action, samples, models,
                             chosen_action_counts, num_actions,
                             num_trials_prob_best_action)

            writer.writerow(out_row)

    return chosen_actions, models

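
# Illustrative sketch (hypothetical file names) of the forced-actions mechanism
# documented above: record the first three Thompson choices, then replay the
# same reward file while forcing those choices, as the switch_bandit_* helpers
# do when moving from immediate to true rewards.
def _example_forced_replay():
    chosen, models = calculate_thompson_single_bandit('example_rewards.csv',
                                                      num_actions=2,
                                                      dest='example_out_first.csv',
                                                      forced=forced_actions(3))
    calculate_thompson_single_bandit('example_rewards.csv',
                                     num_actions=2,
                                     dest='example_out_replay.csv',
                                     models=models,
                                     forced=forced_actions(actions=chosen),
                                     relearn=True)
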