def switch_bandit_epsilon(immediate_input, true_input, immediate_output, true_output,
                          time_step, use_regression=False, num_actions=3,
                          epsilon=0.2, Lambda=1):
    '''
    Run the algorithm on the immediate-reward input up to the specified time step,
    then switch to the true-reward input and recompute the policy by keeping the
    previously taken actions and matching them with true rewards instead.

    :param immediate_input: The immediate-reward input file.
    :param true_input: The true-reward input file.
    :param immediate_output: The result output file from applying the algorithm to the immediate input.
    :param true_output: The result output file from applying the algorithm to the true input.
    :param time_step: The time step at which to switch bandits.
    :param use_regression: Optional, whether to use logistic regression to model the reward distribution.
    :param num_actions: The number of actions in this bandit.
    :param epsilon: Fraction of random exploration.
    :param Lambda: The prior inverse variance of the regression weights if regression is used.
    '''
    if use_regression:
        models = [RLogReg(D=NUM_FEATURES, Lambda=Lambda) for _ in range(num_actions)]
    else:
        models = [Greedy() for _ in range(num_actions)]

    # Run on the immediate-reward input up to the switch time step
    chosen_actions = calculate_epsilon_single_bandit(
        immediate_input, num_actions, immediate_output, epsilon, models,
        forced=forced_actions(time_step))

    for m in models:
        m.reset_state()

    # Switch to true reward input, forcing actions taken previously
    calculate_epsilon_single_bandit(
        true_input, num_actions, true_output, epsilon, models,
        forced=forced_actions(actions=chosen_actions))
def switch_bandit_random_ucb1(immediate_input, true_input, immediate_output, true_output,
                              time_step, num_actions=3, relearn=True,
                              treat_forced_as_historical=False):
    '''
    Similar to switch_bandit_ucb1 except that the Random policy is run on the
    immediate data instead and UCB1 takes over once the switch happens.

    :param relearn: At switch time, whether the algorithm will relearn from past data.
    '''
    chosen_actions = calculate_random_single_bandit(
        immediate_input, num_actions, immediate_output,
        forced=forced_actions(time_step))

    # Switch to true reward input, forcing actions taken previously
    calculate_ucb1_single_bandit(
        true_input, num_actions, true_output,
        forced_actions(actions=chosen_actions),
        seed_rewards=None, relearn=relearn,
        treat_forced_as_historical=treat_forced_as_historical)
def switch_bandit_linucb(immediate_input, true_input, immediate_output, true_output,
                         time_step, num_actions=3, Lambda=1):
    '''
    Run the algorithm on the immediate-reward input up to the specified time step,
    then switch to the true-reward input and recompute the policy by keeping the
    previously taken actions and matching them with true rewards instead.

    :param immediate_input: The immediate-reward input file.
    :param true_input: The true-reward input file.
    :param immediate_output: The result output file from applying the algorithm to the immediate input.
    :param true_output: The result output file from applying the algorithm to the true input.
    :param time_step: The time step at which to switch bandits.
    :param num_actions: The number of actions in this bandit.
    :param Lambda: The prior inverse variance of the regression weights.
    '''
    models = [RLogReg(D=NUM_FEATURES, Lambda=Lambda) for _ in range(num_actions)]

    # Run on the immediate-reward input up to the switch time step
    chosen_actions, models = calculate_linucb_single_bandit(
        immediate_input, num_actions, immediate_output, models,
        forced_actions(time_step))

    # Reset model state so that the algorithm forgets what happened before the switch
    for a in range(num_actions):
        models[a].reset_state()

    # Switch to true reward input, forcing actions taken previously
    calculate_linucb_single_bandit(
        true_input, num_actions, true_output, models,
        forced_actions(actions=chosen_actions))
def switch_bandit_random_thompson(immediate_input, true_input, immediate_output, true_output,
                                  time_step, action_mode, relearn=True, use_regression=False,
                                  num_actions=3, Lambda=1):
    '''
    Similar to switch_bandit_thompson except that the Random policy is run on the
    immediate data instead and Thompson sampling takes over once the switch happens.

    :param relearn: At switch time, whether the algorithm will relearn from the beginning.
    '''
    if use_regression:
        models = [RLogReg(D=NUM_FEATURES, Lambda=Lambda) for _ in range(num_actions)]
    else:
        # models = [BetaBern(success=1, failure=1) for _ in range(num_actions)]
        models = [NIGNormal(mu=0, v=1, alpha=1, beta=1) for _ in range(num_actions)]

    chosen_actions = calculate_random_single_bandit(
        immediate_input, num_actions, immediate_output,
        forced=forced_actions(time_step))

    # Switch to true reward input, forcing actions taken previously
    calculate_thompson_single_bandit(
        true_input, num_actions, true_output, models, action_mode,
        forced_actions(actions=chosen_actions), relearn=relearn)
def switch_bandit_random(immediate_input, true_input, immediate_output, true_output,
                         time_step, num_actions=3):
    '''
    Run the algorithm on the immediate-reward input up to the specified time step,
    then switch to the true-reward input and recompute the policy by keeping the
    previously taken actions and matching them with true rewards instead.

    :param immediate_input: The immediate-reward input file.
    :param true_input: The true-reward input file.
    :param immediate_output: The result output file from applying the algorithm to the immediate input.
    :param true_output: The result output file from applying the algorithm to the true input.
    :param time_step: The time step at which to switch bandits.
    :param num_actions: The number of actions in this bandit.
    '''
    # Run on the immediate-reward input up to the switch time step
    chosen_actions = calculate_random_single_bandit(
        immediate_input, num_actions, immediate_output,
        forced_actions(time_step))

    # Switch to true reward input, forcing actions taken previously
    chosen_actions_2 = calculate_random_single_bandit(
        true_input, num_actions, true_output,
        forced_actions(actions=chosen_actions))

    return chosen_actions + chosen_actions_2
def switch_bandit_thompson(immediate_input, true_input, immediate_output, true_output,
                           time_step, action_mode, relearn=True, use_regression=False,
                           num_actions=3, Lambda=1):
    '''
    Run the algorithm on the immediate-reward input up to the specified time step,
    then switch to the true-reward input and recompute the policy by keeping the
    previously taken actions and matching them with true rewards instead.

    :param immediate_input: The immediate-reward input file.
    :param true_input: The true-reward input file.
    :param immediate_output: The result output file from applying the algorithm to the immediate input.
    :param true_output: The result output file from applying the algorithm to the true input.
    :param time_step: The time step at which to switch bandits.
    :param action_mode: Indicates how to select actions, see ActionSelectionMode.
    :param relearn: At switch time, whether the algorithm will relearn from the beginning.
    :param use_regression: Optional, whether to use logistic regression to model the reward distribution.
    :param num_actions: The number of actions in this bandit.
    :param Lambda: The prior inverse variance of the regression weights if regression is used.
    '''
    if use_regression:
        models = [RLogReg(D=NUM_FEATURES, Lambda=Lambda) for _ in range(num_actions)]
    else:
        # models = [BetaBern(success=1, failure=1) for _ in range(num_actions)]
        models = [ng_normal.NGNormal(mu=0, k=1, alpha=1, beta=1) for _ in range(num_actions)]

    # Run on the immediate-reward input up to the switch time step
    chosen_actions, models = calculate_thompson_single_bandit(
        immediate_input, num_actions, immediate_output, models,
        action_mode=action_mode, forced=forced_actions(time_step))

    # Reset model state so that the algorithm forgets what happened before the switch
    for a in range(num_actions):
        models[a].reset_state()

    # Switch to true reward input, forcing actions taken previously
    calculate_thompson_single_bandit(
        true_input, num_actions, true_output, models, action_mode,
        forced_actions(actions=chosen_actions), relearn=relearn)
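# Hypothetical usage sketch for the switch experiments above: the file names and the
# switch step are illustrative placeholders, not paths from this codebase, and
# `action_mode` should be one of the ActionSelectionMode values used by the Thompson
# policy module.
def _example_switch_bandit_thompson(action_mode):
    switch_bandit_thompson(
        immediate_input='immediate_rewards.csv',   # hypothetical immediate-reward file
        true_input='true_rewards.csv',             # hypothetical true-reward file
        immediate_output='immediate_results.csv',  # hypothetical output file
        true_output='true_results.csv',            # hypothetical output file
        time_step=20,                              # switch bandits after 20 steps
        action_mode=action_mode,
        relearn=True,
        use_regression=False,
        num_actions=3)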
def run_simulations_uniform_random(num_sims, prob_per_arm, step_sizes, outfile_directory,
                                   forceActions=0):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in
    the list step_sizes). Samples uniformly at random (epsilon = 1.0) with
    Beta-Bernoulli models.
    '''
    for i in range(num_sims):
        for num_steps in step_sizes:
            if forceActions != 0:
                print("Forcing actions:", forceActions)
                forced = run_effect_size_simulations.make_forced_actions(
                    len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()

            cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)
            generate_single_bandit.generate_file(np.array(prob_per_arm),
                                                 num_steps,
                                                 cur_reward_file)

            cur_output_file = get_output_filename(outfile_directory, num_steps, i)
            models = [beta_bernoulli.BetaBern(success=1, failure=1)
                      for _ in range(len(prob_per_arm))]
            thompson_policy.calculate_thompson_single_bandit(
                cur_reward_file,
                num_actions=len(prob_per_arm),
                dest=cur_output_file,
                models=models,
                action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
                epsilon=1.0,
                relearn=True,
                forced=forced)
def run_simulations(num_sims, mean_list, variance, step_sizes, outfile_directory,
                    softmax_beta=None, reordering_fn=None, prior_mean=0, forceActions=0):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in
    the list step_sizes). Bandit uses the thompson_ng sampling policy.
    '''
    for i in range(num_sims):
        for num_steps in step_sizes:
            if forceActions != 0:
                print("Forcing actions:", forceActions)
                forced = make_forced_actions(len(mean_list), num_steps, forceActions)
            else:
                forced = forced_actions()

            cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)

            # Check if they've passed in one variance for everything or multiple variances
            if not hasattr(variance, '__len__'):
                # only one variance - turn into a list
                variances = [variance] * len(mean_list)
            else:
                # multiple variances - pass straight through
                variances = variance

            generate_single_bandit.generate_normal_distribution_file(
                mean_list, variances, num_steps, cur_reward_file)

            if softmax_beta is not None:
                # reorder rewards
                reordered_reward_file = get_reordered_rewards_filename(
                    outfile_directory, num_steps, i)
                reorder_samples_in_rewards.reorder_rewards_by_quartile(
                    cur_reward_file, reordered_reward_file, reordering_fn, softmax_beta)
            else:
                reordered_reward_file = cur_reward_file

            cur_output_file = get_output_filename(outfile_directory, num_steps, i)
            models = [ng_normal.NGNormal(mu=prior_mean, k=1, alpha=1, beta=1)
                      for _ in range(len(mean_list))]
            thompson_ng_policy.calculate_thompson_single_bandit(
                reordered_reward_file,
                num_actions=len(mean_list),
                dest=cur_output_file,
                models=models,
                action_mode=thompson_ng_policy.ActionSelectionMode.prob_is_best,
                relearn=True,
                forced=forced)
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory,
                    successPrior=1, failurePrior=1, softmax_beta=None,
                    reordering_fn=None, forceActions=0, batch_size=1, burn_in_size=1):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in
    the list step_sizes). Bandit uses the two-phase random/Thompson sampling policy
    with Beta-Bernoulli models.
    '''
    for i in range(num_sims):
        # num_steps_prev = 0
        for num_steps in step_sizes:
            if forceActions != 0:
                # print("Forcing actions:", forceActions)
                forced = run_effect_size_simulations.make_forced_actions(
                    len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()

            cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)
            generate_single_bandit.generate_file(np.array(prob_per_arm),
                                                 num_steps,
                                                 cur_reward_file)

            if softmax_beta is not None:
                # reorder rewards
                reordered_reward_file = get_reordered_rewards_filename(outfile_directory, num_steps, i)
                reorder_samples_in_rewards.reorder_rewards_by_quartile(
                    cur_reward_file, reordered_reward_file, reordering_fn, softmax_beta)
            else:
                reordered_reward_file = cur_reward_file

            cur_output_file = get_output_filename(outfile_directory, num_steps, i)
            models = [beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior)
                      for _ in range(len(prob_per_arm))]

            '''thompson_policy.calculate_thompson_single_bandit(reordered_reward_file,
                num_actions=len(prob_per_arm),
                dest=cur_output_file,
                models=models,
                action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
                relearn=True,
                forced=forced,
                batch_size=batch_size,
                burn_in_size=burn_in_size)
            '''
            # num_steps_prev = num_steps
            thompson_policy.old_two_phase_random_thompson_policy(
                reordered_reward_file,
                num_actions=len(prob_per_arm),
                dest=cur_output_file,
                random_dur=0,
                models=models,
                random_start=0,
                action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
                relearn=True,
                forced=forced,
                batch_size=batch_size,
                burn_in_size=burn_in_size)
def switch_bandit_ucb1(immediate_input, true_input, immediate_output, true_output,
                       time_step, num_actions=3, relearn=True,
                       treat_forced_as_historical=False, use_sample_variance=False):
    '''
    Run the algorithm on the immediate-reward input up to the specified time step,
    then switch to the true-reward input and recompute the policy by keeping the
    previously taken actions and matching them with true rewards instead.

    :param immediate_input: The immediate-reward input file.
    :param true_input: The true-reward input file.
    :param immediate_output: The result output file from applying the algorithm to the immediate input.
    :param true_output: The result output file from applying the algorithm to the true input.
    :param time_step: The time step at which to switch bandits.
    :param num_actions: The number of actions in this bandit.
    :param relearn: At switch time, whether the algorithm will relearn from past data.
    '''
    # Run on the immediate-reward input up to the switch time step
    chosen_actions, rewards_at_switch = calculate_ucb1_single_bandit(
        immediate_input, num_actions, immediate_output,
        forced_actions(time_step),
        use_sample_variance=use_sample_variance)

    # Switch to true reward input, forcing actions taken previously.
    # Also initializes the model with its state from before the switch time.
    calculate_ucb1_single_bandit(
        true_input, num_actions, true_output,
        forced_actions(actions=chosen_actions),
        seed_rewards=None, relearn=relearn,
        treat_forced_as_historical=treat_forced_as_historical,
        use_sample_variance=use_sample_variance)
def run_simulations_empirical_rewards(num_sims, reward_file, experiment_id, reward_header,
                                      is_cost, outfile_directory, prior_mean=0,
                                      forceActions=0, shuffle_data=False):
    '''
    Runs num_sims bandit simulations on empirical rewards. Bandit uses the
    thompson_ng sampling policy. Assumes reward_file is formatted like ASSISTments
    data, where the reward is present under the column reward_header. Runs for as
    many steps as it's able to gain samples.
    '''
    num_actions = 2
    max_steps = -1
    means = []
    variance = []
    for i in range(num_sims):
        arm_1_rewards, arm_2_rewards = get_assistments_rewards.read_assistments_rewards(
            reward_file, reward_header, experiment_id, is_cost)

        if shuffle_data:
            random.shuffle(arm_1_rewards)
            random.shuffle(arm_2_rewards)

        max_steps = len(arm_1_rewards) + len(arm_2_rewards)
        means = [np.mean(arm_1_rewards), np.mean(arm_2_rewards)]
        variance = [np.var(arm_1_rewards), np.var(arm_2_rewards)]

        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = make_forced_actions(
                num_actions, len(arm_1_rewards) + len(arm_2_rewards), forceActions)
        else:
            forced = forced_actions()

        cur_output_file = get_output_filename(
            outfile_directory, len(arm_1_rewards) + len(arm_2_rewards), i)
        models = [ng_normal.NGNormal(mu=prior_mean, k=1, alpha=1, beta=1)
                  for _ in range(num_actions)]
        thompson_ng_policy.calculate_thompson_single_bandit_empirical_params(
            arm_1_rewards,
            arm_2_rewards,
            num_actions=num_actions,
            dest=cur_output_file,
            models=models,
            action_mode=thompson_ng_policy.ActionSelectionMode.prob_is_best,
            relearn=True,
            forced=forced)
    return max_steps, means, variance
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory,
                    successPrior=1, failurePrior=1, softmax_beta=None, reordering_fn=None,
                    forceActions=0, batch_size=1, burn_in_size=1, random_dur=0,
                    random_start=0, mode='', epsilon=0.1, resample=True):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in
    the list step_sizes). Bandit uses the two-phase random/Thompson sampling policy
    with Beta-Bernoulli models.
    '''
    csv_output_file_names = []
    sim_results_dfs_list = []
    for num_steps in step_sizes:
        sim_results = []
        for i in range(num_sims):
            if forceActions != 0:
                forced = run_effect_size_simulations.make_forced_actions(
                    len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()

            if softmax_beta is not None:
                # reorder rewards
                raise ValueError("softmax_beta is not supported in fast mode.")

            if mode == 'uniform':
                models = [beta_bernoulli.BetaBern(success=1, failure=1)
                          for _ in range(len(prob_per_arm))]
                random_dur = num_steps
            else:
                models = [beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior)
                          for _ in range(len(prob_per_arm))]

            sim_result, column_names, _ = thompson_policy.two_phase_random_thompson_policy(
                prob_per_arm=prob_per_arm,
                users_count=num_steps,
                random_dur=random_dur,  # 100
                models=models,
                random_start=random_start,
                action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
                relearn=True,
                forced=forced,
                batch_size=batch_size,
                epsilon=epsilon,
                decreasing_epsilon=1)
            sim_results.extend(sim_result)

        sim_results_df = pd.DataFrame(sim_results, columns=column_names)
        sim_results_df.index = [idx for idx in range(num_steps)] * num_sims
        sim_results_dfs_list.append(sim_results_df)

        cur_output_file = get_output_filename(outfile_directory, num_steps, None, mode)
        csv_output_file_names.append(cur_output_file)

    return sim_results_dfs_list, csv_output_file_names
def run_simulations_empirical_rewards(num_sims, reward_file, experiment_id, reward_header,
                                      is_cost, outfile_directory, successPrior=1,
                                      failurePrior=1, forceActions=0, shuffle_data=False):
    '''
    Runs num_sims bandit simulations on empirical rewards read from reward_file
    (formatted like ASSISTments data, with the reward under the column reward_header).
    Bandit uses the Thompson sampling policy with Beta-Bernoulli models and runs for
    as many steps as it's able to gain samples.
    '''
    num_actions = 2
    max_steps = -1
    means = []
    variance = []
    for i in range(num_sims):
        arm_1_rewards, arm_2_rewards = get_assistments_rewards.read_assistments_rewards(
            reward_file, reward_header, experiment_id, is_cost)

        if shuffle_data:
            random.shuffle(arm_1_rewards)
            random.shuffle(arm_2_rewards)

        max_steps = len(arm_1_rewards) + len(arm_2_rewards)
        means = [np.mean(arm_1_rewards), np.mean(arm_2_rewards)]
        variance = [np.var(arm_1_rewards), np.var(arm_2_rewards)]

        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = run_effect_size_simulations.make_forced_actions(
                num_actions, len(arm_1_rewards) + len(arm_2_rewards), forceActions)
        else:
            forced = forced_actions()

        cur_output_file = get_output_filename(
            outfile_directory, len(arm_1_rewards) + len(arm_2_rewards), i)
        models = [beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior)
                  for _ in range(num_actions)]
        thompson_policy.calculate_thompson_single_bandit_empirical_params(
            arm_1_rewards,
            arm_2_rewards,
            num_actions=num_actions,
            dest=cur_output_file,
            models=models,
            action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
            relearn=True,
            forced=forced)
    return max_steps, means, variance
def run_simulations_uniform_random(num_sims, mean_list, variance, steps_before_switch,
                                   steps_after_switch, outfile_directory, forceActions=0,
                                   switch_to_best_if_nonsignificant=True):
    '''
    Runs num_sims bandit simulations of steps_before_switch + steps_after_switch
    steps each. Samples uniformly at random before the switch, then switches to a
    fixed policy.
    '''
    for i in range(num_sims):
        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = make_forced_actions(len(mean_list), steps_before_switch, forceActions)
        else:
            forced = forced_actions()

        cur_reward_file = get_rewards_filename(
            outfile_directory, steps_before_switch + steps_after_switch, i)

        # Check if they've passed in one variance for everything or multiple variances
        if not hasattr(variance, '__len__'):
            # only one variance - turn into a list
            variances = [variance] * len(mean_list)
        else:
            # multiple variances - pass straight through
            variances = variance

        generate_single_bandit.generate_normal_distribution_file(
            mean_list, variances, steps_before_switch + steps_after_switch, cur_reward_file)

        cur_output_file = get_output_filename(
            outfile_directory, steps_before_switch + steps_after_switch, i)
        models = [ng_normal.NGNormal(mu=0, k=1, alpha=1, beta=1)
                  for _ in range(len(mean_list))]
        thompson_ng_policy.calculate_thompson_switch_to_fixed_policy(
            cur_reward_file,
            num_actions=len(mean_list),
            dest=cur_output_file,
            num_actions_before_switch=steps_before_switch,
            models=models,
            switch_to_best_if_nonsignificant=switch_to_best_if_nonsignificant,
            epsilon=1.0,
            action_mode=thompson_ng_policy.ActionSelectionMode.prob_is_best,
            forced=forced)
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory,
                    successPrior=1, failurePrior=1, softmax_beta=None,
                    reordering_fn=None, forceActions=0, batch_size=1, burn_in_size=1,
                    c=0.1, resample=True):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in
    the list step_sizes). Bandit uses ppd.calculate_epsilon_single_bandit with
    Beta-Bernoulli models.
    '''
    for i in range(num_sims):
        # num_steps_prev = 0
        for num_steps in step_sizes:
            if forceActions != 0:
                # print("Forcing actions:", forceActions)
                forced = run_effect_size_simulations.make_forced_actions(
                    len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()

            cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)
            generate_single_bandit.generate_file(np.array(prob_per_arm),
                                                 num_steps,
                                                 cur_reward_file)

            if softmax_beta is not None:
                # reorder rewards
                reordered_reward_file = get_reordered_rewards_filename(
                    outfile_directory, num_steps, i)
                reorder_samples_in_rewards.reorder_rewards_by_quartile(
                    cur_reward_file, reordered_reward_file, reordering_fn, softmax_beta)
            else:
                reordered_reward_file = cur_reward_file

            cur_output_file = get_output_filename(outfile_directory, num_steps, i)
            models = [beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior)
                      for _ in range(len(prob_per_arm))]

            # if we don't pass models, the policy falls back to Greedy
            # thresh = 0.03
            # thresh = 0.1  # for small effect, es = 0.1, 0.55 - 0.45 = 0.10
            ppd.calculate_epsilon_single_bandit(
                reordered_reward_file,
                models=models,
                num_actions=len(prob_per_arm),
                dest=cur_output_file,
                forced=forced,
                c=c,
                resample=resample)
def run_simulations_uniform_random_binary(num_sims, prob_per_arm, steps_before_switch,
                                          steps_after_switch, outfile_directory,
                                          forceActions=0,
                                          switch_to_best_if_nonsignificant=True):
    '''
    Runs num_sims bandit simulations of steps_before_switch + steps_after_switch
    steps each. Samples uniformly at random before the switch, then switches to a
    fixed policy.
    '''
    num_steps = steps_before_switch + steps_after_switch
    for i in range(num_sims):
        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = run_effect_size_simulations.make_forced_actions(
                len(prob_per_arm), num_steps, forceActions)
        else:
            forced = forced_actions()

        cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)
        generate_single_bandit.generate_file(np.array(prob_per_arm),
                                             num_steps,
                                             cur_reward_file)

        cur_output_file = get_output_filename(outfile_directory, num_steps, i)
        models = [beta_bernoulli.BetaBern(success=1, failure=1)
                  for _ in range(len(prob_per_arm))]
        thompson_policy.calculate_thompson_switch_to_fixed_policy(
            cur_reward_file,
            num_actions=len(prob_per_arm),
            dest=cur_output_file,
            num_actions_before_switch=steps_before_switch,
            models=models,
            action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
            epsilon=1.0,
            switch_to_best_if_nonsignificant=switch_to_best_if_nonsignificant,
            forced=forced)
def make_forced_actions(num_actions, num_steps, num_actions_to_force=5):
    '''
    Returns a forced actions object that forces an equal number of each action and
    where the number of forced actions may be based on the total number of steps.
    If num_actions_to_force is < 1, treats it as a proportion of the total number
    of steps. If a proportion is used, rounds up to the next full trial. (E.g., the
    fewest number of forced actions of each type you'll ever have with a proportion
    is 1.)
    '''
    # print("num_actions:", num_actions)
    # print("num_steps:", num_steps)
    # print("num_actions_to_force:", num_actions_to_force)
    if num_actions_to_force < 1:
        num_actions_to_force = int(math.ceil(num_steps * num_actions_to_force))
    else:
        num_actions_to_force = int(math.ceil(num_actions_to_force))
    forced_action_counts = [num_actions_to_force for _ in range(num_actions)]
    action_list = [i for i in range(len(forced_action_counts))
                   for _ in range(forced_action_counts[i])]
    forced = forced_actions(actions=action_list)
    return forced
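# Illustrative check of the rounding rule documented above (values are hypothetical):
# with a fractional num_actions_to_force, the count is a proportion of num_steps
# rounded up, so 0.04 of 100 steps forces ceil(4.0) = 4 plays of each action.
def _example_make_forced_actions():
    forced = make_forced_actions(num_actions=3, num_steps=100, num_actions_to_force=0.04)
    assert len(forced.actions) == 3 * 4          # 4 forced plays of each of the 3 actions
    assert list(forced.actions).count(0) == 4    # forced plays are balanced across actions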
def calculate_linucb_single_bandit(source, num_actions, dest, models=None, forced=forced_actions()):
    '''
    Calculates LinUCB.

    :param source: simulated single-bandit data file with default rewards for each action and true probs.
    :param num_actions: number of actions for this bandit
    :param dest: outfile for printing the chosen actions and received rewards.
    :param forced: Optional, indicates to process only up to a certain time step or force take specified actions.
    '''
    if models is None:
        models = [RLogReg(D=NUM_FEATURES, Lambda=1) for _ in range(num_actions)]

    with open(source, newline='') as inf, open(dest, 'w', newline='') as outf:
        reader = csv.DictReader(inf)

        # Construct output column header names
        field_names_out = create_headers(reader.fieldnames, num_actions, outf)
        writer = csv.DictWriter(outf, fieldnames=field_names_out)
        writer.writeheader()

        cumulative_sample_regret = 0
        cumulative_expected_regret = 0
        chosen_actions = []
        alpha = 2

        # TODO: compute expected regret for LinUCB
        expected_regret = 0

        # number of trials used to compute expectation stats
        # set to small value when debugging for faster speed
        num_trials_prob_best_action = int(1e4)

        for sample_number, row in enumerate(reader, start=1):
            # get context features
            context = get_context(row)

            if len(forced.actions) == 0 or sample_number > len(forced.actions):
                # take action which maximizes the LinUCB bound based on current
                # model parameters (i.e. mean and variance of weight values)
                action = np.argmax([compute_linucb_bound(models[a], context, alpha)
                                    for a in range(num_actions)])
            else:
                samples = [0 for a in range(num_actions)]
                # take forced action if requested
                action = forced.actions[sample_number - 1]

            # only return action chosen up to specified time step
            if forced.time_step > 0 and sample_number <= forced.time_step:
                chosen_actions.append(action)

            # get reward signals
            observed_rewards = [int(row[HEADER_ACTUALREWARD.format(a + 1)]) for a in range(num_actions)]
            reward = observed_rewards[action]

            # update posterior distribution with observed reward
            # converted to range {-1,1}
            models[action].update_posterior(context, 2 * reward - 1)

            # copy the input data to output file
            out_row = {fieldname: row[fieldname] for fieldname in reader.fieldnames}

            ''' write performance data (e.g. regret) '''
            optimal_action = int(row[HEADER_OPTIMALACTION]) - 1
            optimal_action_reward = observed_rewards[optimal_action]
            sample_regret = optimal_action_reward - reward
            cumulative_sample_regret += sample_regret

            out_row[H_ALGO_ACTION] = action + 1
            out_row[H_ALGO_OBSERVED_REWARD] = reward
            out_row[H_ALGO_MATCH_OPTIMAL] = 1 if optimal_action == action else 0
            out_row[H_ALGO_SAMPLE_REGRET] = sample_regret
            out_row[H_ALGO_SAMPLE_REGRET_CUMULATIVE] = cumulative_sample_regret

            true_probs = [float(row[HEADER_TRUEPROB.format(a + 1)]) for a in range(num_actions)]

            # The oracle always chooses the best arm, thus expected reward
            # is simply the probability of that arm getting a reward.
            optimal_expected_reward = true_probs[optimal_action] * num_trials_prob_best_action

            cumulative_expected_regret += expected_regret
            out_row[H_ALGO_REGRET_EXPECTED] = expected_regret
            out_row[H_ALGO_REGRET_EXPECTED_CUMULATIVE] = cumulative_expected_regret

            writer.writerow(out_row)

    return chosen_actions, models
def calculate_thompson_single_bandit(source, num_actions, dest, models=None,
                                     action_mode=ActionSelectionMode.prob_is_best,
                                     forced=forced_actions(), relearn=True):
    '''
    Calculates non-contextual Thompson sampling actions and weights.

    :param source: simulated single-bandit data file with default rewards for each action and true probs.
    :param num_actions: number of actions for this bandit
    :param dest: outfile for printing the chosen actions and received rewards.
    :param models: models for each action's probability distribution.
    :param action_mode: Indicates how to select actions, see ActionSelectionMode.
    :param forced: Optional, indicates to process only up to a certain time step or force take specified actions.
    :param relearn: Optional, at switch time, whether algorithm relearns on previous time steps
        using actions taken previously.
    '''
    # number of trials used to run Thompson Sampling to compute expectation stats
    # set to small value when debugging for faster speed
    num_trials_prob_best_action = int(100)

    if models is None:
        models = [NIGNormal(mu=0, v=1, alpha=1, beta=1) for cond in range(num_actions)]

    with open(source, newline='') as inf, open(dest, 'w', newline='') as outf:
        reader = csv.DictReader(inf)

        # Construct output column header names
        field_names = reader.fieldnames
        field_names_out, group_header = create_headers(field_names, num_actions)
        print(','.join(group_header), file=outf)

        writer = csv.DictWriter(outf, fieldnames=field_names_out)
        writer.writeheader()

        sample_number = 0
        cumulative_sample_regret = 0
        cumulative_expected_regret = 0
        chosen_actions = []

        for row in reader:
            sample_number += 1

            # get context features
            context = get_context(row)
            should_update_posterior = True

            if len(forced.actions) == 0 or sample_number > len(forced.actions):
                # first decide which arm we'd pull using Thompson
                # (do the random sampling, the max is the one we'd choose)
                samples = [models[a].draw_expected_value(context) for a in range(num_actions)]

                if action_mode == ActionSelectionMode.prob_is_best:
                    # find the max of samples[i] etc and choose an arm
                    action = np.argmax(samples)
                else:
                    # take action in proportion to expected rewards:
                    # draw samples and normalize to use as a discrete distribution,
                    # then take the action by sampling from this discrete distribution
                    probs = samples / np.sum(samples)
                    rand = np.random.rand()
                    for a in range(num_actions):
                        if rand <= probs[a]:
                            action = a
                            break
                        rand -= probs[a]
            else:
                samples = [0 for a in range(num_actions)]
                # take forced action if requested
                action = forced.actions[sample_number - 1]
                if not relearn:
                    should_update_posterior = False

            # get reward signals
            observed_rewards = [float(row[HEADER_ACTUALREWARD.format(a + 1)]) for a in range(num_actions)]
            reward = observed_rewards[action]

            if should_update_posterior:
                # update posterior distribution with observed reward
                models[action].update_posterior(context, reward)

            # only return action chosen up to specified time step
            if forced.time_step > 0 and sample_number <= forced.time_step:
                chosen_actions.append(action)

            # save the model state so that we can restore it
            # after switching to the true reward data.
            if sample_number == forced.time_step:
                for a in range(num_actions):
                    models[a].save_state()

            # copy the input data to output file
            out_row = {}
            for i in range(len(reader.fieldnames)):
                out_row[reader.fieldnames[i]] = row[reader.fieldnames[i]]

            ''' write performance data (e.g. regret) '''
            means = [float(row[HEADER_TRUEMEAN.format(a + 1)]) for a in range(num_actions)]
            optimal_action = int(row[HEADER_OPTIMALACTION]) - 1
            optimal_action_reward = means[optimal_action]
            sample_regret = optimal_action_reward - reward
            cumulative_sample_regret += sample_regret

            # true_probs = [float(row[HEADER_TRUEPROB.format(a + 1)]) for a in range(num_actions)]
            # # The oracle always chooses the best arm, thus expected reward
            # # is simply the probability of that arm getting a reward.
            # optimal_expected_reward = true_probs[optimal_action] * num_trials_prob_best_action
            #
            # Run thompson sampling many times and calculate how much reward it would
            # have gotten based on the chosen actions.
            chosen_action_counts = run_thompson_trial(context, num_trials_prob_best_action,
                                                      num_actions, models)
            # expected_reward = np.sum(chosen_action_counts[a] * true_probs[a] for a in range(num_actions))

            optimal_expected_reward = means[optimal_action] * num_trials_prob_best_action
            expected_reward = sum(chosen_action_counts[a] * means[a] for a in range(num_actions))
            expected_regret = optimal_expected_reward - expected_reward
            cumulative_expected_regret += expected_regret

            write_performance(out_row, action, optimal_action, reward, sample_regret,
                              cumulative_sample_regret, expected_regret,
                              cumulative_expected_regret)
            write_parameters(out_row, action, samples, models, chosen_action_counts,
                             num_actions, num_trials_prob_best_action)

            writer.writerow(out_row)

    return chosen_actions, models
def calculate_epsilon_single_bandit(source, num_actions, dest, models=None,
                                    forced=forced_actions(), c=0.0, resample=True):
    '''
    Calculates the contextual epsilon-greedy algorithm.

    :param source: simulated single-bandit data file with default rewards for each action and true probs.
    :param num_actions: number of actions for this bandit
    :param dest: outfile for printing the chosen actions and received rewards.
    :param models: models for each action's probability distribution.
    :param forced: Optional, indicates to process only up to a certain time step or force take specified actions.
    :param c: threshold on the absolute difference between the two arms' sampled values;
        below it a uniformly random action is taken.
    :param resample: whether to redraw the arms' expected values before acting greedily.
    '''
    if models is None:
        models = [Greedy() for _ in range(num_actions)]
        print("USING GREEDY MODEL")
    # print("in ts_ppd using c", c)

    # number of trials used to run Thompson Sampling to compute expectation stats
    # set to small value when debugging for faster speed
    num_trials_prob_best_action = int(1e4)

    with open(source, newline='') as inf, open(dest, 'w', newline='') as outf:
        reader = csv.DictReader(inf)

        # Construct output column header names
        field_names_out = create_headers(reader.fieldnames, num_actions, outf)
        writer = csv.DictWriter(outf, fieldnames=field_names_out)
        writer.writeheader()

        sample_number = 0
        cumulative_sample_regret = 0
        cumulative_expected_regret = 0
        chosen_actions = []
        is_exploring = None

        for row in reader:
            sample_number += 1

            # get context features
            context = get_context(row)
            action_values = [-99, -99]

            if len(forced.actions) == 0 or sample_number > len(forced.actions):
                action_values = np.array([models[a].draw_expected_value(context)
                                          for a in range(num_actions)])
                diff = np.abs(action_values[0] - action_values[1])
                rand = np.random.rand()
                if diff < c:
                    # print("exploring, diff, thresh", diff, thresh)
                    # take a random action
                    is_exploring = 1
                    action = np.random.randint(0, num_actions)
                else:
                    is_exploring = 0
                    # print("TS, diff, thresh", diff, thresh)
                    if resample:
                        action_values = np.array([models[a].draw_expected_value(context)
                                                  for a in range(num_actions)])
                    action = np.random.choice(
                        np.where(action_values == np.max(action_values))[0])
            else:
                # take forced action if requested
                action = forced.actions[sample_number - 1]

            # only return action chosen up to specified time step
            if forced.time_step > 0 and sample_number <= forced.time_step:
                chosen_actions.append(action)

            # get reward signals
            observed_rewards = [int(row[HEADER_ACTUALREWARD.format(a + 1)])
                                for a in range(num_actions)]
            reward = observed_rewards[action]

            # update model state; reward is converted to {-1,1} to be compatible with all models
            models[action].update_posterior(context, 2 * reward - 1)

            # copy the input data to output file
            out_row = {}
            for i in range(len(reader.fieldnames)):
                out_row[reader.fieldnames[i]] = row[reader.fieldnames[i]]

            ''' write performance data (e.g. regret) '''
            optimal_action = int(row[HEADER_OPTIMALACTION]) - 1
            optimal_action_reward = observed_rewards[optimal_action]
            sample_regret = optimal_action_reward - reward
            cumulative_sample_regret += sample_regret

            out_row[H_ALGO_ACTION] = action + 1
            out_row[H_ALGO_OBSERVED_REWARD] = reward
            out_row[H_ALGO_MATCH_OPTIMAL] = 1 if optimal_action == action else 0
            out_row[H_ALGO_SAMPLE_REGRET] = sample_regret
            out_row[H_ALGO_SAMPLE_REGRET_CUMULATIVE] = cumulative_sample_regret
            out_row[H_ALGO_EXPLORING] = is_exploring
            out_row[H_ALGO_MEAN_1] = action_values[0]
            out_row[H_ALGO_MEAN_2] = action_values[1]

            # TODO: compute expected regret
            # true_probs = [float(row[H_DATA_TRUE_PROB.format(a + 1)]) for a in range(num_actions)]
            # The oracle always chooses the best arm, thus expected reward
            # is simply the probability of that arm getting a reward.
            # optimal_expected_reward = true_probs[optimal_action] * num_trials_prob_best_action
            # Run random sampling many times and calculate how much reward it would
            # have gotten based on the chosen actions.
            # chosen_action_counts = np.bincount(np.random.randint(0, num_actions, num_trials_prob_best_action))
            # expected_reward = np.sum(chosen_action_counts[a] * true_probs[a] for a in range(num_actions))
            # expected_regret = optimal_expected_reward - expected_reward
            # cumulative_expected_regret += expected_regret
            # out_row[H_ALGO_REGRET_EXPECTED] = expected_regret
            # out_row[H_ALGO_REGRET_EXPECTED_CUMULATIVE] = cumulative_expected_regret

            writer.writerow(out_row)

    return chosen_actions
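# The exploration rule above pulls an arm uniformly at random whenever the two arms'
# sampled values are within `c` of each other, and acts greedily otherwise. A minimal
# standalone sketch of just that decision (names are illustrative, two arms assumed):
def _ppd_style_choice(sample_0, sample_1, c):
    if abs(sample_0 - sample_1) < c:
        return np.random.randint(0, 2)            # arms look too close to call: explore
    return int(np.argmax([sample_0, sample_1]))   # otherwise take the higher-valued arm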
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory,
                    successPrior=1, failurePrior=1, softmax_beta=None, reordering_fn=None,
                    forceActions=0, batch_size=1, burn_in_size=1, random_dur=0,
                    random_start=0, mode='', c=0.1, resample=True, ns_stop=0):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in
    the list step_sizes). Bandit uses the PPD two-phase random/Thompson sampling
    policy with Beta-Bernoulli models.
    '''
    csv_output_file_names = []
    sim_results_dfs_list = []
    for num_steps in step_sizes:
        sim_results = []
        for i in range(num_sims):
            if forceActions != 0:
                forced = run_effect_size_simulations.make_forced_actions(
                    len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()

            if softmax_beta is not None:
                # reorder rewards
                raise ValueError("softmax_beta is not supported in fast mode.")

            if mode == 'uniform':
                models = [beta_bernoulli.BetaBern(success=1, failure=1)
                          for _ in range(len(prob_per_arm))]
                random_dur = num_steps
            else:
                models = [beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior)
                          for _ in range(len(prob_per_arm))]

            sim_result, column_names, _ = thompson_policy.ppd_two_phase_random_thompson_policy(
                prob_per_arm=prob_per_arm,
                users_count=num_steps,
                random_dur=random_dur,  # 100
                models=models,
                random_start=random_start,
                action_mode='Greedy',
                relearn=True,
                forced=forced,
                batch_size=batch_size,
                c=c,
                resample=resample,
                ns_stop=ns_stop)

            # do ipw here? This is the equivalent of the old actions file (actions_df)
            # sim_result_df = pd.DataFrame(sim_result, columns=column_names)  # Not used yet
            # calculate_ipw_by_step_size(actions_root=sim_result_df, num_samples=1000, num_actions=2,
            #                            cached_probs={}, prior=prior, binary_rewards=is_binary,
            #                            config=config, n=n, num_sims=num_sims, batch_size=bs)
            # print("sim_result_df", sim_result_df)
            # print("shape", sim_result_df.shape)
            # print("shape cols", sim_result_df.columns)
            # print(sim_result.columns())
            sim_results.extend(sim_result)

        sim_results_df = pd.DataFrame(sim_results, columns=column_names)
        sim_results_df.index = [idx for idx in range(num_steps)] * num_sims
        sim_results_dfs_list.append(sim_results_df)

        cur_output_file = get_output_filename(outfile_directory, num_steps, None, mode)
        csv_output_file_names.append(cur_output_file)

    return sim_results_dfs_list, csv_output_file_names
def calculate_ucb1_single_bandit(source, num_actions, dest, forced=forced_actions(),
                                 seed_rewards=None, relearn=True,
                                 treat_forced_as_historical=False, use_sample_variance=False):
    '''
    Calculates non-contextual UCB1.

    :param source: simulated single-bandit data file with default rewards for each action and true probs.
    :param num_actions: number of actions for this bandit
    :param dest: outfile for printing the chosen actions and received rewards.
    :param forced: Optional, indicates to process only up to a certain time step or force take specified actions.
    :param seed_rewards: Optional, the initialized state of the model to start with
        (i.e. rewards received for each action).
    :param relearn: Optional, at switch time, whether algorithm relearns on previous time steps
        using actions taken previously.
    '''
    # number of trials used to compute expectation stats
    # set to small value when debugging for faster speed
    num_trials_prob_best_action = int(1e4)

    # constant header names for easy indexing
    # algorithm performance
    H_ALGO_ACTION = "AlgorithmAction"
    H_ALGO_OBSERVED_REWARD = "ObservedRewardofAction"
    H_ALGO_MATCH_OPTIMAL = "MatchesOptimalExpectedAction"
    H_ALGO_SAMPLE_REGRET = "SampleRegret"
    H_ALGO_SAMPLE_REGRET_CUMULATIVE = "CumulativeSampleRegret"
    H_ALGO_REGRET_EXPECTED = "ExpectedRegret"
    H_ALGO_REGRET_EXPECTED_CUMULATIVE = "CumulativeExpectedRegret"

    # if we're treating the past actions (from forced) as historical, then
    # we need to record how many forced actions there were of each type
    if treat_forced_as_historical:
        arm_counts_from_history = [0] * num_actions

    with open(source, newline='') as inf, open(dest, 'w', newline='') as outf:
        reader = csv.DictReader(inf)

        # Construct output column header names
        field_names = reader.fieldnames
        field_names_out = field_names[:]
        field_names_out.extend([
            H_ALGO_ACTION, H_ALGO_OBSERVED_REWARD, H_ALGO_MATCH_OPTIMAL,
            H_ALGO_SAMPLE_REGRET, H_ALGO_SAMPLE_REGRET_CUMULATIVE,
            H_ALGO_REGRET_EXPECTED, H_ALGO_REGRET_EXPECTED_CUMULATIVE
        ])

        # print group-level headers for readability
        group_header = ['' for i in range(len(field_names_out))]
        group_header[0] = "Input Data"
        group_header[len(field_names)] = "Algorithm's Performance"
        print(','.join(group_header), file=outf)

        writer = csv.DictWriter(outf, fieldnames=field_names_out)
        writer.writeheader()

        sample_number = 0
        cumulative_sample_regret = 0
        cumulative_expected_regret = 0
        chosen_actions = []

        # list of rewards gotten for each action
        if seed_rewards is not None:
            rewards = seed_rewards
        else:
            rewards = [[] for _ in range(num_actions)]
        rewards_at_switch = []
        num_ucb_pulls = 0

        for row in reader:
            sample_number += 1
            should_update_rewards = True

            if len(forced.actions) == 0 or sample_number > len(forced.actions):
                num_ucb_pulls += 1
                if len(forced.actions) == 0 and num_ucb_pulls <= num_actions:
                    # initially play every action once
                    action = num_ucb_pulls - 1
                else:
                    action = -1
                    # This forces playing every action once; it seems like the branch above
                    # isn't necessary and may cause problems
                    for a in range(len(rewards)):
                        if len(rewards[a]) == 0:
                            action = a
                            break
                    if action == -1:
                        if treat_forced_as_historical:
                            # take action with max (avg reward + sqrt(2*log(# non-historical arm choices
                            # + # of historical pulls of this arm) / (# times chosen + # historical pulls of this arm)))
                            # note that the number of times chosen plus the number of historical times chosen
                            # is exactly the total number of rewards recorded
                            # (that is, the only change with the historical version is to change the numerator)
                            # print("Historical: " + str([
                            #     np.mean(rewards_a) + \
                            #     np.sqrt(2.0 * np.log(num_ucb_pulls + historical_count) / len(rewards_a))
                            #     for rewards_a, historical_count in zip(rewards, arm_counts_from_history)]))
                            # print("Non-Historical: " + str([
                            #     np.mean(rewards_a) + \
                            #     np.sqrt(2.0 * np.log(sample_number) / len(rewards_a))
                            #     for rewards_a in rewards]))
                            # print("Variance: " + str([
                            #     np.mean(rewards_a) + \
                            #     np.sqrt(2.0 * theta * np.var(rewards_a) * np.log(num_ucb_pulls + historical_count) / len(rewards_a)) + \
                            #     3 * theta * np.log(num_ucb_pulls + historical_count) / len(rewards_a)
                            #     for rewards_a, historical_count in zip(rewards, arm_counts_from_history)]))
                            if use_sample_variance:
                                # theta is an exploration-scaling constant assumed to be defined at module level
                                conf_bounds = [np.mean(rewards_a) +
                                               np.sqrt(2.0 * theta * np.var(rewards_a) * np.log(num_ucb_pulls + historical_count) / len(rewards_a)) +
                                               3 * theta * np.log(num_ucb_pulls + historical_count) / len(rewards_a)
                                               for rewards_a, historical_count in zip(rewards, arm_counts_from_history)]
                            else:
                                conf_bounds = [np.mean(rewards_a) +
                                               np.sqrt(2.0 * np.log(num_ucb_pulls + historical_count) / len(rewards_a))
                                               for rewards_a, historical_count in zip(rewards, arm_counts_from_history)]
                        else:
                            if use_sample_variance:
                                conf_bounds = [np.mean(rewards_a) +
                                               np.sqrt(2.0 * theta * np.var(rewards_a) * np.log(sample_number) / len(rewards_a)) +
                                               3 * theta * np.log(sample_number) / len(rewards_a)
                                               for rewards_a in rewards]
                            else:
                                # take action with max (avg reward + sqrt(2*log(t) / # times chosen))
                                conf_bounds = [np.mean(rewards_a) +
                                               np.sqrt(2.0 * np.log(sample_number) / len(rewards_a))
                                               for rewards_a in rewards]
                        action = np.argmax(conf_bounds)
            else:
                samples = [0 for a in range(num_actions)]
                # take forced action if requested
                action = forced.actions[sample_number - 1]
                if not relearn:
                    should_update_rewards = False

            # get reward signals
            observed_rewards = [int(row[HEADER_ACTUALREWARD.format(a + 1)])
                                for a in range(num_actions)]
            reward = observed_rewards[action]

            if should_update_rewards:
                rewards[action].append(reward)

            # only return action chosen up to specified time step
            if forced.time_step > 0 and sample_number <= forced.time_step:
                chosen_actions.append(action)
            if sample_number == forced.time_step:
                rewards_at_switch = copy.deepcopy(rewards)

            # update history counts if necessary
            if treat_forced_as_historical and sample_number <= len(forced.actions):
                arm_counts_from_history[action] += 1

            # copy the input data to output file
            out_row = {}
            for i in range(len(reader.fieldnames)):
                out_row[reader.fieldnames[i]] = row[reader.fieldnames[i]]

            ''' write performance data (e.g. regret) '''
            optimal_action = int(row[HEADER_OPTIMALACTION]) - 1
            optimal_action_reward = observed_rewards[optimal_action]
            sample_regret = optimal_action_reward - reward
            cumulative_sample_regret += sample_regret

            out_row[H_ALGO_ACTION] = action + 1
            out_row[H_ALGO_OBSERVED_REWARD] = reward
            out_row[H_ALGO_MATCH_OPTIMAL] = 1 if optimal_action == action else 0
            out_row[H_ALGO_SAMPLE_REGRET] = sample_regret
            out_row[H_ALGO_SAMPLE_REGRET_CUMULATIVE] = cumulative_sample_regret

            true_probs = [float(row[HEADER_TRUEPROB.format(a + 1)])
                          for a in range(num_actions)]

            # The oracle always chooses the best arm, thus expected reward
            # is simply the probability of that arm getting a reward.
            optimal_expected_reward = true_probs[optimal_action] * num_trials_prob_best_action

            # TODO: compute expected regret for UCB1
            expected_regret = 0
            cumulative_expected_regret += expected_regret

            out_row[H_ALGO_REGRET_EXPECTED] = expected_regret
            out_row[H_ALGO_REGRET_EXPECTED_CUMULATIVE] = cumulative_expected_regret

            writer.writerow(out_row)

    return chosen_actions, rewards_at_switch
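# The per-arm index maximized in the loop above (without the historical or
# sample-variance variants) is the standard UCB1 bound: mean reward plus
# sqrt(2 * ln(t) / n_a). A minimal standalone sketch, not used by the code above:
def _ucb1_bound_sketch(rewards_a, t):
    n_a = len(rewards_a)
    return np.mean(rewards_a) + np.sqrt(2.0 * np.log(t) / n_a)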
def calculate_random_single_bandit(source, num_actions, dest, forced=forced_actions()):
    '''
    Calculates the non-contextual random policy.

    :param source: simulated single-bandit data file with default rewards for each action and true probs.
    :param num_actions: number of actions for this bandit
    :param dest: outfile for printing the chosen actions and received rewards.
    :param forced: Optional, indicates to process only up to a certain time step or force take specified actions.
    '''
    # number of trials used to run random sampling to compute expectation stats
    # set to small value when debugging for faster speed
    num_trials_prob_best_action = int(1e4)

    # constant header names for easy indexing
    # data group from input file
    H_DATA_SAMPLE_NUMBER = "SampleNumber"
    H_DATA_AGE_GROUP = "agequartilesUSER"
    H_DATA_DAYS_ACTIVE = "ndaysactUSER"
    H_DATA_ACTUAL_REWARD = "Action{}OracleActualReward"
    H_DATA_TRUE_PROB = "Action{}OracleProbReward"
    H_DATA_OPTIMAL_ACTION = "ExpectedOptimalAction"

    # algorithm performance
    H_ALGO_ACTION = "AlgorithmAction"
    H_ALGO_OBSERVED_REWARD = "ObservedRewardofAction"
    H_ALGO_MATCH_OPTIMAL = "MatchesOptimalExpectedAction"
    H_ALGO_SAMPLE_REGRET = "SampleRegret"
    H_ALGO_SAMPLE_REGRET_CUMULATIVE = "CumulativeSampleRegret"
    H_ALGO_REGRET_EXPECTED = "ExpectedRegret"
    H_ALGO_REGRET_EXPECTED_CUMULATIVE = "CumulativeExpectedRegret"

    with open(source, newline='') as inf, open(dest, 'w', newline='') as outf:
        reader = csv.DictReader(inf)

        # Construct output column header names
        field_names = reader.fieldnames
        field_names_out = field_names[:]
        field_names_out.extend([
            H_ALGO_ACTION, H_ALGO_OBSERVED_REWARD, H_ALGO_MATCH_OPTIMAL,
            H_ALGO_SAMPLE_REGRET, H_ALGO_SAMPLE_REGRET_CUMULATIVE,
            H_ALGO_REGRET_EXPECTED, H_ALGO_REGRET_EXPECTED_CUMULATIVE
        ])

        # not important, store the position to write high level header to output file
        group_header_parameters_index = len(field_names_out)

        # print group-level headers for readability
        group_header = ['' for i in range(len(field_names_out))]
        group_header[0] = "Input Data"
        group_header[len(field_names)] = "Algorithm's Performance"
        print(','.join(group_header), file=outf)

        writer = csv.DictWriter(outf, fieldnames=field_names_out)
        writer.writeheader()

        sample_number = 0
        cumulative_sample_regret = 0
        cumulative_expected_regret = 0
        chosen_actions = []

        for row in reader:
            sample_number += 1

            if len(forced.actions) == 0 or sample_number > len(forced.actions):
                # take a random action
                action = np.random.randint(0, num_actions)
            else:
                # take forced action if requested
                action = forced.actions[sample_number - 1]

            # only return action chosen up to specified time step
            if forced.time_step > 0 and sample_number <= forced.time_step:
                chosen_actions.append(action)

            # get reward signals
            observed_rewards = [int(row[H_DATA_ACTUAL_REWARD.format(a + 1)])
                                for a in range(num_actions)]
            reward = observed_rewards[action]

            # copy the input data to output file
            out_row = {}
            for i in range(len(reader.fieldnames)):
                out_row[reader.fieldnames[i]] = row[reader.fieldnames[i]]

            ''' write performance data (e.g. regret) '''
            optimal_action = int(row[H_DATA_OPTIMAL_ACTION]) - 1
            optimal_action_reward = observed_rewards[optimal_action]
            sample_regret = optimal_action_reward - reward
            cumulative_sample_regret += sample_regret

            out_row[H_ALGO_ACTION] = action + 1
            out_row[H_ALGO_OBSERVED_REWARD] = reward
            out_row[H_ALGO_MATCH_OPTIMAL] = 1 if optimal_action == action else 0
            out_row[H_ALGO_SAMPLE_REGRET] = sample_regret
            out_row[H_ALGO_SAMPLE_REGRET_CUMULATIVE] = cumulative_sample_regret

            true_probs = [float(row[H_DATA_TRUE_PROB.format(a + 1)])
                          for a in range(num_actions)]

            # The oracle always chooses the best arm, thus expected reward
            # is simply the probability of that arm getting a reward.
            optimal_expected_reward = true_probs[optimal_action] * num_trials_prob_best_action

            # Run random sampling many times and calculate how much reward it would
            # have gotten based on the chosen actions.
            chosen_action_counts = np.bincount(
                np.random.randint(0, num_actions, num_trials_prob_best_action))
            expected_reward = sum(chosen_action_counts[a] * true_probs[a]
                                  for a in range(num_actions))
            expected_regret = optimal_expected_reward - expected_reward
            cumulative_expected_regret += expected_regret

            out_row[H_ALGO_REGRET_EXPECTED] = expected_regret
            out_row[H_ALGO_REGRET_EXPECTED_CUMULATIVE] = cumulative_expected_regret

            writer.writerow(out_row)

    return chosen_actions
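# All of the policies in this file depend on the `forced_actions` container defined
# elsewhere in this codebase. A minimal sketch of the interface they assume (not the
# real definition): a switch time step plus an optional list of pre-chosen actions,
# as accessed via forced.time_step and forced.actions above.
class _forced_actions_interface_sketch:
    def __init__(self, time_step=0, actions=None):
        # Chosen actions are returned by the policies only up to this time step.
        self.time_step = time_step
        # Actions to force, one per row; an empty list means the policy chooses freely.
        self.actions = actions if actions is not None else []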