def create_models_binary(actions_df, prior, num_actions):
    """Builds the per-step posterior models and cache keys for a two-arm binary bandit."""
    assert num_actions == 2
    all_models = []
    # One cache key per row of the actions file; each key collects the
    # (success, failure) counts of every action at that step.
    cache_keys = [[] for _ in range(actions_df.shape[0])]
    for action in range(num_actions):
        successes_col = actions_df.loc[:, H_ALGO_ACTION_SUCCESS.format(action + 1)]
        failures_col = actions_df.loc[:, H_ALGO_ACTION_FAILURE.format(action + 1)]
        for i, (successes, failures) in enumerate(zip(successes_col, failures_col)):
            cache_keys[i].extend((successes, failures))
        cur_models = [beta_bernoulli.BetaBern(successes, failures)
                      for (successes, failures) in zip(successes_col, failures_col)]
        # Add in the model for the prior (before any data is observed)
        cur_models.insert(0, beta_bernoulli.BetaBern(prior[0], prior[1]))
        all_models.append(cur_models)
    # Add in a cache key for the prior
    cache_keys.insert(0, prior * num_actions)
    return all_models, cache_keys
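# A minimal usage sketch for create_models_binary (illustrative only; it assumes the
# simulation output format read elsewhere in this module, i.e. column headers on the
# second row, and a [successes, failures] Beta prior - the file name and prior values
# below are assumptions, not part of the original code):
#
#   actions_df = pd.read_csv(simulation_out_file, header=1)
#   all_models, cache_keys = create_models_binary(actions_df, prior=[1, 1], num_actions=2)
#   # all_models[a][0] is the prior model for arm a; all_models[a][t + 1] is the model
#   # after row t of actions_df. cache_keys[0] == [1, 1, 1, 1] is the key for the prior.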
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory,
                    successPrior=1, failurePrior=1, softmax_beta=None,
                    reordering_fn=None, forceActions=0, batch_size=1,
                    burn_in_size=1, random_dur=0, random_start=0, mode='',
                    epsilon=0.1, resample=True):
    '''
    Runs num_sims bandit simulations with several different sample sizes
    (those in the list step_sizes). Bandit uses the thompson_ng sampling policy.
    '''
    csv_output_file_names = []
    sim_results_dfs_list = []
    for num_steps in step_sizes:
        sim_results = []
        for i in range(num_sims):
            if forceActions != 0:
                forced = run_effect_size_simulations.make_forced_actions(len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()
            if softmax_beta is not None:
                # reorder rewards
                raise ValueError("softmax_beta is not supported in fast mode.")
            if mode == 'uniform':
                models = [beta_bernoulli.BetaBern(success=1, failure=1) for _ in range(len(prob_per_arm))]
                random_dur = num_steps
            else:
                models = [beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior) for _ in range(len(prob_per_arm))]
            sim_result, column_names, _ = thompson_policy.two_phase_random_thompson_policy(
                prob_per_arm=prob_per_arm,
                users_count=num_steps,
                random_dur=random_dur,
                models=models,
                random_start=random_start,
                action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
                relearn=True,
                forced=forced,
                batch_size=batch_size,
                epsilon=epsilon,
                decreasing_epsilon=1)
            sim_results.extend(sim_result)
        sim_results_df = pd.DataFrame(sim_results, columns=column_names)
        sim_results_df.index = [idx for idx in range(num_steps)] * num_sims
        sim_results_dfs_list.append(sim_results_df)
        cur_output_file = get_output_filename(outfile_directory, num_steps, None, mode)
        csv_output_file_names.append(cur_output_file)
    return sim_results_dfs_list, csv_output_file_names
def run_simulations_uniform_random(num_sims, prob_per_arm, step_sizes, outfile_directory, forceActions=0):
    '''
    Runs num_sims bandit simulations with several different sample sizes
    (those in the list step_sizes). Samples uniformly at random (epsilon = 1.0).
    '''
    for i in range(num_sims):
        for num_steps in step_sizes:
            if forceActions != 0:
                print("Forcing actions:", forceActions)
                forced = run_effect_size_simulations.make_forced_actions(len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()
            cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)
            generate_single_bandit.generate_file(np.array(prob_per_arm), num_steps, cur_reward_file)
            cur_output_file = get_output_filename(outfile_directory, num_steps, i)
            models = [beta_bernoulli.BetaBern(success=1, failure=1) for _ in range(len(prob_per_arm))]
            thompson_policy.calculate_thompson_single_bandit(cur_reward_file,
                                                             num_actions=len(prob_per_arm),
                                                             dest=cur_output_file,
                                                             models=models,
                                                             action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
                                                             epsilon=1.0,
                                                             relearn=True,
                                                             forced=forced)
def non_parametric_confidence_interval(actions_df, stat_fn, prior, is_binary=True,
                                       num_permutations=5, epsilon=0, ci_size=.95,
                                       grid_size=.05, forced_actions=None):
    '''
    Constructs a confidence interval for an additive treatment effect by inverting the
    permutation test: each candidate effect tau_0 on a grid is subtracted from the
    arm-1 rewards, and tau_0 is kept whenever the resulting test fails to reject.
    '''
    in_ci = []
    non_offset_tau_0 = 0
    for grid_offset in np.arange(-3, 3.001, grid_size):
        tau_0 = non_offset_tau_0 + grid_offset
        rewards = actions_df.loc[:, H_ALGO_OBSERVED_REWARD]
        original_actions = actions_df.loc[:, H_ALGO_ACTION]
        rewards_mod = rewards.copy()
        rewards_mod.loc[original_actions == 1] = rewards_mod.loc[original_actions == 1] - tau_0
        actual_stat = stat_fn(original_actions, rewards_mod)
        all_stats = []
        more_extreme_count = 0
        for i in range(num_permutations):
            if is_binary:
                models = [
                    beta_bernoulli.BetaBern(prior[0], prior[1])
                    for _ in range(num_actions)
                ]
            else:
                models = [
                    ng_normal.NGNormal(mu=prior[0], k=prior[1], alpha=prior[2], beta=prior[3])
                    for _ in range(num_actions)
                ]
            chosen_actions, models = calculate_thompson_single_bandit_permutation_testing(
                rewards, models, epsilon=epsilon, forced_actions=forced_actions)
            cur_stat = stat_fn(chosen_actions, rewards_mod)
            if cur_stat >= actual_stat:
                more_extreme_count += 1
            all_stats.append(cur_stat)
            if debug and (i % 100) == 0:
                print(i, "/ num_permutations:", more_extreme_count)
        pvalue = more_extreme_count / num_permutations
        if np.isnan(actual_stat):
            pvalue = np.nan
        if (1 - pvalue) <= ci_size:
            in_ci.append(tau_0)
    return in_ci
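# A minimal sketch of the acceptance rule used above, under the default ci_size = .95
# (illustrative; `pvalue` is the permutation p-value computed for one candidate tau_0):
#
#   alpha = 1 - ci_size             # e.g. 0.05
#   keep_tau_0 = pvalue >= alpha    # equivalent to the (1 - pvalue) <= ci_size check above
#
# That is, tau_0 stays in the interval whenever the permutation test fails to reject it
# at level 1 - ci_size, which is the standard test-inversion construction of a CI.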
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory,
                    successPrior=1, failurePrior=1, softmax_beta=None,
                    reordering_fn=None, forceActions=0, batch_size=1, burn_in_size=1):
    '''
    Runs num_sims bandit simulations with several different sample sizes
    (those in the list step_sizes). Bandit uses the thompson_ng sampling policy.
    '''
    for i in range(num_sims):
        for num_steps in step_sizes:
            if forceActions != 0:
                forced = run_effect_size_simulations.make_forced_actions(len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()
            cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)
            generate_single_bandit.generate_file(np.array(prob_per_arm), num_steps, cur_reward_file)
            if softmax_beta is not None:
                # reorder rewards
                reordered_reward_file = get_reordered_rewards_filename(outfile_directory, num_steps, i)
                reorder_samples_in_rewards.reorder_rewards_by_quartile(cur_reward_file,
                                                                       reordered_reward_file,
                                                                       reordering_fn,
                                                                       softmax_beta)
            else:
                reordered_reward_file = cur_reward_file
            cur_output_file = get_output_filename(outfile_directory, num_steps, i)
            models = [beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior)
                      for _ in range(len(prob_per_arm))]
            thompson_policy.old_two_phase_random_thompson_policy(reordered_reward_file,
                                                                 num_actions=len(prob_per_arm),
                                                                 dest=cur_output_file,
                                                                 random_dur=0,
                                                                 models=models,
                                                                 random_start=0,
                                                                 action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
                                                                 relearn=True,
                                                                 forced=forced,
                                                                 batch_size=batch_size,
                                                                 burn_in_size=burn_in_size)
def run_simulations_empirical_rewards(num_sims, reward_file, experiment_id, reward_header,
                                      is_cost, outfile_directory, successPrior=1,
                                      failurePrior=1, forceActions=0, shuffle_data=False):
    '''
    Runs num_sims bandit simulations using empirical rewards read from reward_file.
    Bandit uses the thompson_ng sampling policy.
    '''
    num_actions = 2
    max_steps = -1
    means = []
    variance = []
    for i in range(num_sims):
        arm_1_rewards, arm_2_rewards = get_assistments_rewards.read_assistments_rewards(
            reward_file, reward_header, experiment_id, is_cost)
        if shuffle_data:
            random.shuffle(arm_1_rewards)
            random.shuffle(arm_2_rewards)
        max_steps = len(arm_1_rewards) + len(arm_2_rewards)
        means = [np.mean(arm_1_rewards), np.mean(arm_2_rewards)]
        variance = [np.var(arm_1_rewards), np.var(arm_2_rewards)]
        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = run_effect_size_simulations.make_forced_actions(
                num_actions, len(arm_1_rewards) + len(arm_2_rewards), forceActions)
        else:
            forced = forced_actions()
        cur_output_file = get_output_filename(
            outfile_directory, len(arm_1_rewards) + len(arm_2_rewards), i)
        models = [
            beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior)
            for _ in range(num_actions)
        ]
        thompson_policy.calculate_thompson_single_bandit_empirical_params(
            arm_1_rewards,
            arm_2_rewards,
            num_actions=num_actions,
            dest=cur_output_file,
            models=models,
            action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
            relearn=True,
            forced=forced)
    return max_steps, means, variance
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory,
                    successPrior=1, failurePrior=1, softmax_beta=None,
                    reordering_fn=None, forceActions=0, batch_size=1,
                    burn_in_size=1, c=0.1, resample=True):
    '''
    Runs num_sims bandit simulations with several different sample sizes
    (those in the list step_sizes). Bandit uses the thompson_ng sampling policy.
    '''
    for i in range(num_sims):
        for num_steps in step_sizes:
            if forceActions != 0:
                forced = run_effect_size_simulations.make_forced_actions(
                    len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()
            cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)
            generate_single_bandit.generate_file(np.array(prob_per_arm), num_steps, cur_reward_file)
            if softmax_beta is not None:
                # reorder rewards
                reordered_reward_file = get_reordered_rewards_filename(
                    outfile_directory, num_steps, i)
                reorder_samples_in_rewards.reorder_rewards_by_quartile(
                    cur_reward_file, reordered_reward_file, reordering_fn, softmax_beta)
            else:
                reordered_reward_file = cur_reward_file
            cur_output_file = get_output_filename(outfile_directory, num_steps, i)
            models = [
                beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior)
                for _ in range(len(prob_per_arm))
            ]
            # If no models are passed, the policy is greedy.
            # Previously tried thresholds: c = 0.03, c = 0.1 (for a small effect, es = 0.1, 0.55 - 0.45 = 0.10)
            ppd.calculate_epsilon_single_bandit(reordered_reward_file,
                                                models=models,
                                                num_actions=len(prob_per_arm),
                                                dest=cur_output_file,
                                                forced=forced,
                                                c=c,
                                                resample=resample)
def run_simulations_uniform_random_binary(num_sims, prob_per_arm, steps_before_switch,
                                          steps_after_switch, outfile_directory,
                                          forceActions=0, switch_to_best_if_nonsignificant=True):
    '''
    Runs num_sims bandit simulations. Samples uniformly at random for the first
    steps_before_switch steps, then switches to a fixed policy for the remaining
    steps_after_switch steps.
    '''
    num_steps = steps_before_switch + steps_after_switch
    for i in range(num_sims):
        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = run_effect_size_simulations.make_forced_actions(
                len(prob_per_arm), num_steps, forceActions)
        else:
            forced = forced_actions()
        cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)
        generate_single_bandit.generate_file(np.array(prob_per_arm), num_steps, cur_reward_file)
        cur_output_file = get_output_filename(outfile_directory, num_steps, i)
        models = [
            beta_bernoulli.BetaBern(success=1, failure=1)
            for _ in range(len(prob_per_arm))
        ]
        thompson_policy.calculate_thompson_switch_to_fixed_policy(
            cur_reward_file,
            num_actions=len(prob_per_arm),
            dest=cur_output_file,
            num_actions_before_switch=steps_before_switch,
            models=models,
            action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
            epsilon=1.0,
            switch_to_best_if_nonsignificant=switch_to_best_if_nonsignificant,
            forced=forced)
def get_models_from_simulation(simulation_out_file, is_binary=True):
    # Column headers are on the second row of the simulation output file (header=1)
    df = pd.read_csv(simulation_out_file, header=1)
    last_row = df.iloc[-1, :]
    if is_binary:
        # e.g. Action1SuccessCount / Action1FailureCount
        models = [
            beta_bernoulli.BetaBern(
                last_row.loc['Action' + str(i) + 'SuccessCount'],
                last_row.loc['Action' + str(i) + 'FailureCount'])
            for i in range(1, 3)
        ]
    else:
        models = [
            ng_normal.NGNormal(
                mu=last_row.loc['Action' + str(i) + 'EstimatedMu'],
                k=last_row.loc['Action' + str(i) + 'EstimatedVariance'],
                alpha=last_row.loc['Action' + str(i) + 'EstimatedAlpha'],
                beta=last_row.loc['Action' + str(i) + 'EstimatedBeta'])
            for i in range(1, 3)
        ]
    return models
def permutation_test(actions_df, stat_fn, prior, is_binary=True, num_permutations=5,
                     epsilon=0, forced_actions=None):
    rewards = actions_df.loc[:, H_ALGO_OBSERVED_REWARD]  # "ObservedRewardofAction"
    original_actions = actions_df.loc[:, H_ALGO_ACTION]  # "AlgorithmAction"
    actual_stat = stat_fn(original_actions, rewards)
    all_stats = []
    more_extreme_count = 0
    for i in range(num_permutations):
        if is_binary:
            models = [
                beta_bernoulli.BetaBern(prior[0], prior[1])
                for _ in range(num_actions)
            ]
        else:
            models = [
                ng_normal.NGNormal(mu=prior[0], k=prior[1], alpha=prior[2], beta=prior[3])
                for _ in range(num_actions)
            ]
        chosen_actions, models = calculate_thompson_single_bandit_permutation_testing(
            rewards, models, epsilon=epsilon, forced_actions=forced_actions)
        cur_stat = stat_fn(chosen_actions, rewards)
        if cur_stat >= actual_stat:
            more_extreme_count += 1
        all_stats.append(cur_stat)
        if debug and (i % 100) == 0:
            print(i, "/ num_permutations:", more_extreme_count)
    pvalue = more_extreme_count / num_permutations
    if np.isnan(actual_stat):
        pvalue = np.nan
    return pvalue, all_stats, actual_stat
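# A hypothetical statistic one might pass as stat_fn (illustrative only; the function
# name and the assumption that one arm is coded as action value 1 are not part of the
# original module):
#
#   def mean_reward_difference(actions, rewards):
#       return abs(np.mean(rewards[actions == 1]) - np.mean(rewards[actions != 1]))
#
#   pvalue, all_stats, actual_stat = permutation_test(
#       actions_df, mean_reward_difference, prior=[1, 1], num_permutations=1000)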
def switch_bandit_queue(immediate_input, true_input, immediate_output, true_output,
                        time_step_switch, total_time_steps, num_actions=3):
    # Reward info for samples
    samples_with_true_reward = read_reward_file(true_input, num_actions)
    samples_with_immediate_reward = read_reward_file(immediate_input, num_actions)

    # Store the samples we have but that haven't yet arrived
    samples = []
    cur_sample_number = 0

    # Keep track of what actions are chosen so we can write out the results at the end
    chosen_actions = []
    sampling_distributions = []

    # Queue algorithm variables
    num_samples = 1
    mixing_weight = .01  # User-defined mixing weight for how much to trust the heuristic policy (alpha in paper)
    queues = [[] for _ in range(num_actions)]  # queues for holding samples
    max_queue_size = 1  # limit on queue size (B in paper)
    queue_sizes = np.zeros(num_actions)
    delays = []  # records how long it is between a sample being selected and arriving (L in paper)

    models_heuristic = [beta_bernoulli.BetaBern(success=1, failure=1) for _ in range(num_actions)]  # Thompson sampling stats for heuristic (immediate); h in paper
    models_base = [beta_bernoulli.BetaBern(success=1, failure=1) for _ in range(num_actions)]  # Thompson sampling stats for base (delayed)

    heuristic_dist = get_thompson_sample_distribution(models_heuristic)  # approx distribution over arms for heuristic; h in paper
    base_dist = get_thompson_sample_distribution(models_base)  # approx distribution over arms for base; p in paper
    arm_choice = get_thompson_arm_choice(models_base)  # Draw first action choice from base distribution (I in paper)

    while cur_sample_number < total_time_steps:
        # Drain any rewards queued for the current arm choice
        while len(queues[arm_choice]) != 0:
            reward = queues[arm_choice].pop(0)  # Get the new reward
            queue_sizes[arm_choice] -= 1
            # Update base model with this reward, converted to range {-1, 1}.
            # This follows standard Thompson sampling conventions, although looking at the
            # BetaBernoulli code it doesn't strictly need the conversion.
            models_base[arm_choice].update_posterior(0, 2 * reward - 1)
            # Resample arm_choice
            arm_choice = get_thompson_arm_choice(models_base)

        # Resample the base arm distribution and the heuristic distribution
        base_dist = get_thompson_sample_distribution(models_base, arm_choice=arm_choice)
        heuristic_dist = get_thompson_sample_distribution(models_heuristic)  # approx distribution over arms for heuristic; h in paper

        sampling_dist = get_sampling_dist(heuristic_dist, base_dist, num_actions, arm_choice,
                                          queue_sizes, max_queue_size, mixing_weight)
        if sum(sampling_dist) < .995:
            print("Sampling dist not a probability distribution")

        # Sample from the environment (per the paper: one sample if doing online updates, or else one batch)
        for _ in range(num_samples):
            i = np.argmax(nprand.multinomial(1, sampling_dist))
            # Record this sample in our samples list.
            # A sample stores when it was selected, the time step it will arrive,
            # the arm choice, the immediate reward, and the final reward.
            sample = (cur_sample_number,
                      max(time_step_switch, cur_sample_number),
                      i,
                      2 * samples_with_immediate_reward[cur_sample_number][i] - 1,
                      2 * samples_with_true_reward[cur_sample_number][i] - 1)
            # Observe the reward for the heuristic bandit if we aren't yet past the switch time
            # (this isn't part of the paper); we always observe it here because we'll always
            # remove it when the sample arrives.
            models_heuristic[i].update_posterior(0, sample[IMMEDIATE_REWARD])
            # Store the sample so we'll be able to check it when it arrives
            samples.append(sample)
            # Store the action so we can write it out to report results
            chosen_actions.append(i)
            sampling_distributions.append(sampling_dist)
            # Increment queue size and sample counts based on the sample
            queue_sizes[i] += 1
            cur_sample_number += 1

        # Now find out which of the samples have arrived - i.e., the delayed reward has come in
        samples_to_remove = []
        for sample in samples:
            # Check if the arrival time is here
            if sample[ARRIVAL_TIME] <= cur_sample_number:
                # This sample has arrived - update the heuristic model, put it in a queue,
                # and mark it for removal
                queues[sample[ARM_INDEX]].append(sample[FINAL_REWARD])
                samples_to_remove.append(sample)
                # Update the heuristic by dropping the immediate reward from the model and adding the final reward
                models_heuristic[sample[ARM_INDEX]].remove_from_model(0, sample[IMMEDIATE_REWARD])
                models_heuristic[sample[ARM_INDEX]].update_posterior(0, sample[FINAL_REWARD])
                # Record the delay of this sample
                delays.append(cur_sample_number - sample[SAMPLE_TIME])
        # Remove any of the samples that arrived this time around
        for sample in samples_to_remove:
            samples.remove(sample)

        # Set max_queue_size to the maximum delay.
        # Problem: can't take the max of an empty list.
        # Bigger semantic problem: what should the maximum delay be if nothing has yet arrived?
        if len(delays) != 0:
            max_queue_size = max(delays)
        else:
            max_queue_size += num_samples  # increment max queue size based on how many we've seen so far

    # At the end, write out the results
    writeOutFile(true_input, true_output, chosen_actions, num_actions, sampling_distributions)
    writeOutFile(immediate_input, immediate_output, chosen_actions, num_actions, sampling_distributions)
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory,
                    successPrior=1, failurePrior=1, softmax_beta=None,
                    reordering_fn=None, forceActions=0, batch_size=1,
                    burn_in_size=1, random_dur=0, random_start=0, mode='',
                    c=0.1, resample=True, ns_stop=0):
    '''
    Runs num_sims bandit simulations with several different sample sizes
    (those in the list step_sizes). Bandit uses the thompson_ng sampling policy.
    '''
    csv_output_file_names = []
    sim_results_dfs_list = []
    for num_steps in step_sizes:
        sim_results = []
        for i in range(num_sims):
            if forceActions != 0:
                forced = run_effect_size_simulations.make_forced_actions(len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()
            if softmax_beta is not None:
                # reorder rewards
                raise ValueError("softmax_beta is not supported in fast mode.")
            if mode == 'uniform':
                models = [beta_bernoulli.BetaBern(success=1, failure=1) for _ in range(len(prob_per_arm))]
                random_dur = num_steps
            else:
                models = [beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior) for _ in range(len(prob_per_arm))]
            sim_result, column_names, _ = thompson_policy.ppd_two_phase_random_thompson_policy(
                prob_per_arm=prob_per_arm,
                users_count=num_steps,
                random_dur=random_dur,
                models=models,
                random_start=random_start,
                action_mode='Greedy',
                relearn=True,
                forced=forced,
                batch_size=batch_size,
                c=c,
                resample=resample,
                ns_stop=ns_stop)
            # IPW could be computed here: sim_result is the equivalent of the old actions
            # file (actions_df), e.g. via calculate_ipw_by_step_size - not used yet.
            sim_results.extend(sim_result)
        sim_results_df = pd.DataFrame(sim_results, columns=column_names)
        sim_results_df.index = [idx for idx in range(num_steps)] * num_sims
        sim_results_dfs_list.append(sim_results_df)
        cur_output_file = get_output_filename(outfile_directory, num_steps, None, mode)
        csv_output_file_names.append(cur_output_file)
    return sim_results_dfs_list, csv_output_file_names