def switch_bandit_epsilon(immediate_input, true_input, \
    immediate_output, true_output, time_step, use_regression = False, num_actions = 3, epsilon = 0.2, Lambda = 1):
    '''
    Run the algorithm on the immediate-reward input up to the specified time step, then switch to the true-reward input and
    recompute the policy by replaying the previously taken actions against the true rewards instead.
    :param immediate_input: The immediate-reward input file.
    :param true_input: The true-reward input file.
    :param immediate_output: The result output file from applying the algorithm to the immediate input.
    :param true_output: The result output file from applying the algorithm to the true input.
    :param time_step: The time step to switch bandit.
    :param use_regression: Optional, indicate whether to use logistic regression to model reward distribution.
    :param num_actions: The number of actions in this bandit.
    :param epsilon: Fraction of random exploration.
    :param Lambda: The prior inverse variance of the regression weights if regression is used.
    '''

    if use_regression:
        models = [
            RLogReg(D=NUM_FEATURES, Lambda=Lambda) for _ in range(num_actions)
        ]
    else:
        models = [Greedy() for _ in range(num_actions)]

    # Run on the immediate-reward input up to the switch time step
    chosen_actions = calculate_epsilon_single_bandit(immediate_input, \
        num_actions, immediate_output, epsilon, models, forced = forced_actions(time_step))

    for m in models:
        m.reset_state()

    # Switch to true reward input, forcing actions taken previously
    calculate_epsilon_single_bandit(true_input, num_actions, true_output, \
        epsilon, models, forced = forced_actions(actions = chosen_actions))
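# Hypothetical usage of switch_bandit_epsilon (the file names and the switch
# point below are illustrative placeholders, not values from this project):
# run epsilon-greedy on the immediate-reward file for the first 20 steps, then
# replay the same actions against the true-reward file.
#
# switch_bandit_epsilon('immediate_rewards.csv', 'true_rewards.csv',
#                       'immediate_results.csv', 'true_results.csv',
#                       time_step=20, use_regression=False,
#                       num_actions=3, epsilon=0.2)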
Example #2
def switch_bandit_random_ucb1(immediate_input,
                              true_input,
                              immediate_output,
                              true_output,
                              time_step,
                              num_actions=3,
                              relearn=True,
                              treat_forced_as_historical=False):
    '''
    Similar to switch_bandit_ucb1 except that a random policy is run on the immediate data
    and UCB1 takes over once the switch happens.
    :param relearn: At switch time, whether the algorithm will relearn from the past data.
    '''

    chosen_actions = calculate_random_single_bandit(
        immediate_input,
        num_actions,
        immediate_output,
        forced=forced_actions(time_step))

    # Switch to true reward input, forcing actions taken previously
    calculate_ucb1_single_bandit(
        true_input,
        num_actions,
        true_output,
        forced_actions(actions=chosen_actions),
        seed_rewards=None,
        relearn=relearn,
        treat_forced_as_historical=treat_forced_as_historical)
Example #3
def switch_bandit_linucb(immediate_input, true_input, immediate_output,
                         true_output, time_step,
                         num_actions = 3, Lambda = 1):
    '''
    Run the algorithm on the immediate-reward input up to the specified time step, then switch to the true-reward input and
    recompute the policy by replaying the previously taken actions against the true rewards instead.
    :param immediate_input: The immediate-reward input file.
    :param true_input: The true-reward input file.
    :param immediate_output: The result output file from applying the algorithm to the immediate input.
    :param true_output: The result output file from applying the algorithm to the true input.
    :param time_step: The time step to switch bandit.
    :param num_actions: The number of actions in this bandit.
    :param Lambda: The prior inverse variance of the regression weights if regression is used.
    '''

    models = [RLogReg(D = NUM_FEATURES, Lambda = Lambda) for _ in range(num_actions)]

    # Run on the immediate-reward input up to the switch time step
    chosen_actions, models = calculate_linucb_single_bandit(
        immediate_input, num_actions,
        immediate_output, models, forced_actions(time_step))

    # reset model state so that the algorithm forgets what happened before the switch
    for a in range(num_actions):
        models[a].reset_state()

    # Switch to true reward input, forcing actions taken previously
    calculate_linucb_single_bandit(
        true_input, num_actions, true_output,
        models, forced_actions(actions = chosen_actions))
Example #4
def switch_bandit_random_thompson(immediate_input, true_input, immediate_output,
                                  true_output, time_step, action_mode,
                                  relearn=True, use_regression=False,
                                  num_actions=3, Lambda=1):
    '''
    Similar to switch_bandit_thompson except that a random policy is run on the immediate data
    and Thompson sampling takes over once the switch happens.
    :param relearn: At switch time, whether the algorithm will relearn from the beginning.
    '''

    if use_regression:
        models = [RLogReg(D=NUM_FEATURES, Lambda=Lambda) for _ in range(num_actions)]
    else:
        # models = [BetaBern(success=1, failure=1) for _ in range(num_actions)]
        models = [NIGNormal(mu=0, v=1, alpha=1, beta=1) for _ in range(num_actions)]

    chosen_actions = calculate_random_single_bandit(
        immediate_input,
        num_actions,
        immediate_output,
        forced=forced_actions(time_step))

    # Switch to true reward input, forcing actions taken previously
    calculate_thompson_single_bandit(
        true_input,
        num_actions,
        true_output,
        models,
        action_mode,
        forced_actions(actions=chosen_actions),
        relearn=relearn)
Example #5
def switch_bandit_random(immediate_input,
                         true_input,
                         immediate_output,
                         true_output,
                         time_step,
                         num_actions=3):
    '''
    Run the algorithm on the immediate-reward input up to the specified time step, then switch to the true-reward input and
    recompute the policy by replaying the previously taken actions against the true rewards instead.
    :param immediate_input: The immediate-reward input file.
    :param true_input: The true-reward input file.
    :param immediate_output: The result output file from applying the algorithm to the immediate input.
    :param true_output: The result output file from applying the algorithm to the true input.
    :param time_step: The time step to switch bandit.
    :param num_actions: The number of actions in this bandit.
    '''

    # Run on the immediate-reward input up to the switch time step
    chosen_actions = calculate_random_single_bandit(immediate_input,
                                                    num_actions,
                                                    immediate_output,
                                                    forced_actions(time_step))

    # Switch to true reward input, forcing actions taken previously
    chosen_actions_2 = calculate_random_single_bandit(
        true_input, num_actions, true_output,
        forced_actions(actions=chosen_actions))

    return chosen_actions + chosen_actions_2
Example #6
def switch_bandit_thompson(immediate_input,
                           true_input,
                           immediate_output,
                           true_output,
                           time_step,
                           action_mode,
                           relearn=True,
                           use_regression=False,
                           num_actions=3,
                           Lambda=1):
    '''
    Run the algorithm on the immediate-reward input up to the specified time step, then switch to the true-reward input and
    recompute the policy by replaying the previously taken actions against the true rewards instead.
    :param immediate_input: The immediate-reward input file.
    :param true_input: The true-reward input file.
    :param immediate_output: The result output file from applying the algorithm to the immediate input.
    :param true_output: The result output file from applying the algorithm to the true input.
    :param time_step: The time step to switch bandit.
    :param action_mode: Indicates how to select actions, see ActionSelectionMode.
    :param relearn: At switch time, whether the algorithm will relearn from the beginning.
    :param use_regression: Optional, indicate whether to use logistic regression to model reward distribution.
    :param num_actions: The number of actions in this bandit.
    :param Lambda: The prior inverse variance of the regression weights if regression is used.
    '''

    if use_regression:
        models = [
            RLogReg(D=NUM_FEATURES, Lambda=Lambda) for _ in range(num_actions)
        ]
    else:
        # models = [BetaBern(success=1, failure=1) for _ in range(num_actions)]
        models = [
            ng_normal.NGNormal(mu=0, k=1, alpha=1, beta=1)
            for _ in range(num_actions)
        ]

    # Run on the immediate-reward input up to the switch time step
    chosen_actions, models = calculate_thompson_single_bandit(
        immediate_input,
        num_actions,
        immediate_output,
        models,
        action_mode=action_mode,
        forced=forced_actions(time_step))

    # reset model state so that the algorithm forgets what happened before the switch
    for a in range(num_actions):
        models[a].reset_state()

    # Switch to true reward input, forcing actions taken previously
    calculate_thompson_single_bandit(true_input,
                                     num_actions,
                                     true_output,
                                     models,
                                     action_mode,
                                     forced_actions(actions=chosen_actions),
                                     relearn=relearn)
def run_simulations_uniform_random(num_sims, prob_per_arm, step_sizes, outfile_directory, forceActions = 0):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in the list step_sizes).
    Samples uniformly at random (Beta-Bernoulli Thompson sampling with epsilon = 1.0).
    '''

    for i in range(num_sims):
        for num_steps in step_sizes:
            if forceActions != 0:
                print("Forcing actions:", forceActions)
                forced = run_effect_size_simulations.make_forced_actions(len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()
            cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)
            generate_single_bandit.generate_file(np.array(prob_per_arm),
                                                 num_steps,        
                                                 cur_reward_file)
            cur_output_file = get_output_filename(outfile_directory, num_steps, i)
            models = [beta_bernoulli.BetaBern(success=1, failure=1) for _ in range(len(prob_per_arm))]
            thompson_policy.calculate_thompson_single_bandit(cur_reward_file, 
                                         num_actions=len(prob_per_arm), 
                                         dest= cur_output_file, 
                                         models=models, 
                                         action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
                                         epsilon = 1.0, 
                                         relearn=True,
                                         forced = forced)
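# Sketch of how epsilon = 1.0 above yields uniform-random sampling, assuming
# (not verified here) that in thompson_policy the epsilon parameter is the
# probability of overriding the Thompson draw with a uniformly random arm.
# Illustration only, not part of the original module.
import numpy as np

def epsilon_override(thompson_action, num_actions, epsilon):
    '''With probability epsilon, replace the Thompson choice with a random arm.'''
    if np.random.rand() < epsilon:
        return np.random.randint(0, num_actions)   # epsilon = 1.0: always random
    return thompson_action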
def run_simulations(num_sims,
                    mean_list,
                    variance,
                    step_sizes,
                    outfile_directory,
                    softmax_beta=None,
                    reordering_fn=None,
                    prior_mean=0,
                    forceActions=0):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in the list step_sizes). 
    Bandit uses the thompson_ng sampling policy.
    '''

    for i in range(num_sims):
        for num_steps in step_sizes:
            if forceActions != 0:
                print("Forcing actions:", forceActions)
                forced = make_forced_actions(len(mean_list), num_steps,
                                             forceActions)
            else:
                forced = forced_actions()
            cur_reward_file = get_rewards_filename(outfile_directory,
                                                   num_steps, i)
            # Check if they've passed in one variance for everything or multiple variances
            if not hasattr(variance, '__len__'):
                # only one variance - turn into a list
                variances = [variance] * len(mean_list)
            else:
                # multiple variances - pass straight through
                variances = variance

            generate_single_bandit.generate_normal_distribution_file(
                mean_list, variances, num_steps, cur_reward_file)
            if softmax_beta != None:
                # reorder rewards
                reordered_reward_file = get_reordered_rewards_filename(
                    outfile_directory, num_steps, i)
                reorder_samples_in_rewards.reorder_rewards_by_quartile(
                    cur_reward_file, reordered_reward_file, reordering_fn,
                    softmax_beta)
            else:
                reordered_reward_file = cur_reward_file
            cur_output_file = get_output_filename(outfile_directory, num_steps,
                                                  i)
            models = [
                ng_normal.NGNormal(mu=prior_mean, k=1, alpha=1, beta=1)
                for _ in range(len(mean_list))
            ]
            thompson_ng_policy.calculate_thompson_single_bandit(
                reordered_reward_file,
                num_actions=len(mean_list),
                dest=cur_output_file,
                models=models,
                action_mode=thompson_ng_policy.ActionSelectionMode.prob_is_best,
                relearn=True,
                forced=forced)
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory, successPrior = 1, failurePrior = 1, softmax_beta = None, \
    reordering_fn = None, forceActions = 0, batch_size = 1, burn_in_size = 1):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in the list step_sizes).
    Bandit uses a two-phase random/Thompson sampling policy with Beta-Bernoulli models.
    '''

    for i in range(num_sims):
      #  num_steps_prev = 0
        for num_steps in step_sizes:
            if forceActions != 0:
#                 print("Forcing actions:", forceActions)
                forced = run_effect_size_simulations.make_forced_actions(len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()
            cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)
            generate_single_bandit.generate_file(np.array(prob_per_arm),
                                                 num_steps,        
                                                 cur_reward_file)
            if softmax_beta != None:
                # reorder rewards
                reordered_reward_file = get_reordered_rewards_filename(outfile_directory, num_steps, i)
                reorder_samples_in_rewards.reorder_rewards_by_quartile(cur_reward_file, 
                                                                       reordered_reward_file, 
                                                                       reordering_fn, 
                                                                       softmax_beta)
            else:
                reordered_reward_file = cur_reward_file
            cur_output_file = get_output_filename(outfile_directory, num_steps, i)
            models = [beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior) for _ in range(len(prob_per_arm))]


            '''thompson_policy.calculate_thompson_single_bandit(reordered_reward_file, 
                                         num_actions=len(prob_per_arm), 
                                         dest= cur_output_file, 
                                         models=models, 
                                         action_mode=thompson_policy.ActionSelectionMode.prob_is_best, 
                                         relearn=True,
                                         forced = forced,
                                         batch_size = batch_size, 
                                         burn_in_size = burn_in_size)
            '''
            # num_steps_prev = num_steps
            thompson_policy.old_two_phase_random_thompson_policy(reordered_reward_file, 
                                         num_actions=len(prob_per_arm), 
                                         dest= cur_output_file, 
                                         random_dur=0,
                                         models=models,
                                         random_start=0,
                                         action_mode=thompson_policy.ActionSelectionMode.prob_is_best, 
                                         relearn=True,
                                         forced = forced,
                                         batch_size = batch_size, 
                                         burn_in_size = burn_in_size)
Example #10
def switch_bandit_ucb1(immediate_input,
                       true_input,
                       immediate_output,
                       true_output,
                       time_step,
                       num_actions=3,
                       relearn=True,
                       treat_forced_as_historical=False,
                       use_sample_variance=False):
    '''
    Run the algorithm on the immediate-reward input up to the specified time step, then switch to the true-reward input and
    recompute the policy by replaying the previously taken actions against the true rewards instead.
    :param immediate_input: The immediate-reward input file.
    :param true_input: The true-reward input file.
    :param immediate_output: The result output file from applying the algorithm to the immediate input.
    :param true_output: The result output file from applying the algorithm to the true input.
    :param time_step: The time step to switch bandit.
    :param num_actions: The number of actions in this bandit.
    :param relearn: At switch time, whether the algorithm will relearn from the past data.
    :param treat_forced_as_historical: Whether forced actions are counted as historical pulls when computing the confidence bounds.
    :param use_sample_variance: Whether to include a sample-variance term in the confidence bounds.
    '''

    # Run on the immediate-reward input up to the switch time step
    chosen_actions, rewards_at_switch = calculate_ucb1_single_bandit(
        immediate_input,
        num_actions,
        immediate_output,
        forced_actions(time_step),
        use_sample_variance=use_sample_variance)

    # Switch to true reward input, forcing actions taken previously.
    # If relearn is True, UCB1 relearns from those forced actions.
    calculate_ucb1_single_bandit(
        true_input,
        num_actions,
        true_output,
        forced_actions(actions=chosen_actions),
        seed_rewards=None,
        relearn=relearn,
        treat_forced_as_historical=treat_forced_as_historical,
        use_sample_variance=use_sample_variance)
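# A minimal, self-contained sketch (illustration only, not part of the original
# module) of the UCB1 index that calculate_ucb1_single_bandit maximizes each
# round: mean reward plus sqrt(2 * ln(t) / n_a). When forced actions are
# treated as historical, their per-arm pull counts are added to the argument
# of the log, which is the only change the historical variant makes.
import numpy as np

def ucb1_indices(rewards_per_arm, t, historical_counts=None):
    '''Return the UCB1 upper confidence bound for each arm.

    rewards_per_arm: one list of observed rewards per arm (each must be non-empty).
    t: number of non-historical pulls made so far.
    historical_counts: optional per-arm historical pull counts, mirroring
        the treat_forced_as_historical option.
    '''
    if historical_counts is None:
        historical_counts = [0] * len(rewards_per_arm)
    return [np.mean(r) + np.sqrt(2.0 * np.log(t + h) / len(r))
            for r, h in zip(rewards_per_arm, historical_counts)]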
def run_simulations_empirical_rewards(num_sims,
                                      reward_file,
                                      experiment_id,
                                      reward_header,
                                      is_cost,
                                      outfile_directory,
                                      prior_mean=0,
                                      forceActions=0,
                                      shuffle_data=False):
    '''
    Runs num_sims bandit simulations on empirical rewards read from reward_file.
    Bandit uses the thompson_ng sampling policy. Assumes reward_file is formatted like ASSISTments data,
    where the reward is present under the column reward_header. Runs for as many steps as it is able
    to gain samples.
    '''
    num_actions = 2
    max_steps = -1
    means = []
    variance = []
    for i in range(num_sims):
        arm_1_rewards, arm_2_rewards = get_assistments_rewards.read_assistments_rewards(
            reward_file, reward_header, experiment_id, is_cost)
        if shuffle_data:
            random.shuffle(arm_1_rewards)
            random.shuffle(arm_2_rewards)
        max_steps = len(arm_1_rewards) + len(arm_2_rewards)
        means = [np.mean(arm_1_rewards), np.mean(arm_2_rewards)]
        variance = [np.var(arm_1_rewards), np.var(arm_2_rewards)]
        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = make_forced_actions(
                num_actions,
                len(arm_1_rewards) + len(arm_2_rewards), forceActions)
        else:
            forced = forced_actions()

        cur_output_file = get_output_filename(
            outfile_directory,
            len(arm_1_rewards) + len(arm_2_rewards), i)
        models = [
            ng_normal.NGNormal(mu=prior_mean, k=1, alpha=1, beta=1)
            for _ in range(num_actions)
        ]
        thompson_ng_policy.calculate_thompson_single_bandit_empirical_params(
            arm_1_rewards,
            arm_2_rewards,
            num_actions=num_actions,
            dest=cur_output_file,
            models=models,
            action_mode=thompson_ng_policy.ActionSelectionMode.prob_is_best,
            relearn=True,
            forced=forced)
    return max_steps, means, variance
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory,
    successPrior = 1, failurePrior = 1, softmax_beta = None,
    reordering_fn = None, forceActions = 0, batch_size = 1, burn_in_size = 1,
    random_dur=0, random_start=0, mode='', epsilon = 0.1, resample = True):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in the list step_sizes).
    Bandit uses a two-phase random/Thompson sampling policy with Beta-Bernoulli models.
    '''
    csv_output_file_names = []
    sim_results_dfs_list = []

    for num_steps in step_sizes:
        sim_results = []
        for i in range(num_sims):
            if forceActions != 0:
                forced = run_effect_size_simulations.make_forced_actions(len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()

            if softmax_beta != None:
                # reorder rewards
                raise ValueError("softmax_beta is not supported in fast mode.")

            if mode=='uniform':
                models = [beta_bernoulli.BetaBern(success=1, failure=1) for _ in range(len(prob_per_arm))]
                random_dur = num_steps
            else:
                models = [beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior) for _ in range(len(prob_per_arm))]


            sim_result, column_names,_ = \
                thompson_policy.two_phase_random_thompson_policy(
                            prob_per_arm=prob_per_arm,
                            users_count=num_steps,
                            random_dur=random_dur,#100,
                            models=models,
                            random_start=random_start,
                            action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
                            relearn=True,
                            forced = forced,
                            batch_size = batch_size, epsilon=epsilon,
                            decreasing_epsilon=1)

            sim_results.extend(sim_result)

        sim_results_df = pd.DataFrame(sim_results, columns=column_names)
        sim_results_df.index = [idx for idx in range(num_steps)]*num_sims
        sim_results_dfs_list.append(sim_results_df)

        cur_output_file = get_output_filename(outfile_directory, num_steps, None, mode)
        csv_output_file_names.append(cur_output_file)

    return sim_results_dfs_list, csv_output_file_names
Example #13
def run_simulations_empirical_rewards(num_sims,
                                      reward_file,
                                      experiment_id,
                                      reward_header,
                                      is_cost,
                                      outfile_directory,
                                      successPrior=1,
                                      failurePrior=1,
                                      forceActions=0,
                                      shuffle_data=False):
    '''
    Runs num_sims bandit simulations on empirical rewards read from reward_file.
    Bandit uses Thompson sampling with Beta-Bernoulli models. Assumes reward_file is formatted like
    ASSISTments data, where the reward is present under the column reward_header. Runs for as many
    steps as it is able to gain samples.
    '''
    num_actions = 2
    max_steps = -1
    means = []
    variance = []
    for i in range(num_sims):
        arm_1_rewards, arm_2_rewards = get_assistments_rewards.read_assistments_rewards(
            reward_file, reward_header, experiment_id, is_cost)
        if shuffle_data:
            random.shuffle(arm_1_rewards)
            random.shuffle(arm_2_rewards)
        max_steps = len(arm_1_rewards) + len(arm_2_rewards)
        means = [np.mean(arm_1_rewards), np.mean(arm_2_rewards)]
        variance = [np.var(arm_1_rewards), np.var(arm_2_rewards)]
        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = run_effect_size_simulations.make_forced_actions(
                num_actions,
                len(arm_1_rewards) + len(arm_2_rewards), forceActions)
        else:
            forced = forced_actions()

        cur_output_file = get_output_filename(
            outfile_directory,
            len(arm_1_rewards) + len(arm_2_rewards), i)
        models = [
            beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior)
            for _ in range(num_actions)
        ]
        thompson_policy.calculate_thompson_single_bandit_empirical_params(
            arm_1_rewards,
            arm_2_rewards,
            num_actions=num_actions,
            dest=cur_output_file,
            models=models,
            action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
            relearn=True,
            forced=forced)
    return max_steps, means, variance
def run_simulations_uniform_random(num_sims,
                                   mean_list,
                                   variance,
                                   steps_before_switch,
                                   steps_after_switch,
                                   outfile_directory,
                                   forceActions=0,
                                   switch_to_best_if_nonsignificant=True):
    '''
    Runs num_sims bandit simulations with steps_before_switch + steps_after_switch samples each.
    Samples uniformly at random (epsilon = 1.0) before the switch, then switches to a fixed policy.
    '''

    for i in range(num_sims):
        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = make_forced_actions(len(mean_list), steps_before_switch,
                                         forceActions)
        else:
            forced = forced_actions()
        cur_reward_file = get_rewards_filename(
            outfile_directory, steps_before_switch + steps_after_switch, i)
        # Check if they've passed in one variance for everything or multiple variances
        if not hasattr(variance, '__len__'):
            # only one variance - turn into a list
            variances = [variance] * len(mean_list)
        else:
            # multiple variances - pass straight through
            variances = variance
        generate_single_bandit.generate_normal_distribution_file(
            mean_list, variances, steps_before_switch + steps_after_switch,
            cur_reward_file)
        cur_output_file = get_output_filename(
            outfile_directory, steps_before_switch + steps_after_switch, i)
        models = [
            ng_normal.NGNormal(mu=0, k=1, alpha=1, beta=1)
            for _ in range(len(mean_list))
        ]

        thompson_ng_policy.calculate_thompson_switch_to_fixed_policy(
            cur_reward_file,
            num_actions=len(mean_list),
            dest=cur_output_file,
            num_actions_before_switch=steps_before_switch,
            models=models,
            switch_to_best_if_nonsignificant=switch_to_best_if_nonsignificant,
            epsilon=1.0,
            action_mode=thompson_ng_policy.ActionSelectionMode.prob_is_best,
            forced=forced)
Example #15
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory, successPrior = 1, failurePrior = 1, softmax_beta = None, \
    reordering_fn = None, forceActions = 0, batch_size = 1, burn_in_size = 1, c = 0.1, resample = True):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in the list step_sizes).
    Bandit uses the ppd policy (Thompson/greedy sampling with a difference-threshold exploration rule)
    with Beta-Bernoulli models.
    '''

    for i in range(num_sims):
        #  num_steps_prev = 0
        for num_steps in step_sizes:
            if forceActions != 0:
                #                 print("Forcing actions:", forceActions)
                forced = run_effect_size_simulations.make_forced_actions(
                    len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()
            cur_reward_file = get_rewards_filename(outfile_directory,
                                                   num_steps, i)
            generate_single_bandit.generate_file(np.array(prob_per_arm),
                                                 num_steps, cur_reward_file)
            if softmax_beta != None:
                # reorder rewards
                reordered_reward_file = get_reordered_rewards_filename(
                    outfile_directory, num_steps, i)
                reorder_samples_in_rewards.reorder_rewards_by_quartile(
                    cur_reward_file, reordered_reward_file, reordering_fn,
                    softmax_beta)
            else:
                reordered_reward_file = cur_reward_file
            cur_output_file = get_output_filename(outfile_directory, num_steps,
                                                  i)
            models = [
                beta_bernoulli.BetaBern(success=successPrior,
                                        failure=failurePrior)
                for _ in range(len(prob_per_arm))
            ]

            # if no models are passed, Greedy models are used by default
            #thresh = 0.03
            #        thresh = 0.1 # for small effect, es = 0.1, 0.55 - 0.45 = 0.10
            ppd.calculate_epsilon_single_bandit(reordered_reward_file,
                                                models=models,
                                                num_actions=len(prob_per_arm),
                                                dest=cur_output_file,
                                                forced=forced,
                                                c=c,
                                                resample=resample)
def run_simulations_uniform_random_binary(
        num_sims,
        prob_per_arm,
        steps_before_switch,
        steps_after_switch,
        outfile_directory,
        forceActions=0,
        switch_to_best_if_nonsignificant=True):
    '''
    Runs num_sims bandit simulations with steps_before_switch + steps_after_switch samples each.
    Samples uniformly at random (epsilon = 1.0) before the switch, then switches to a fixed policy.
    '''
    num_steps = steps_before_switch + steps_after_switch

    for i in range(num_sims):
        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = run_effect_size_simulations.make_forced_actions(
                len(prob_per_arm), num_steps, forceActions)
        else:
            forced = forced_actions()

        cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)
        generate_single_bandit.generate_file(np.array(prob_per_arm), num_steps,
                                             cur_reward_file)
        cur_output_file = get_output_filename(outfile_directory, num_steps, i)
        models = [
            beta_bernoulli.BetaBern(success=1, failure=1)
            for _ in range(len(prob_per_arm))
        ]
        thompson_policy.calculate_thompson_switch_to_fixed_policy(
            cur_reward_file,
            num_actions=len(prob_per_arm),
            dest=cur_output_file,
            num_actions_before_switch=steps_before_switch,
            models=models,
            action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
            epsilon=1.0,
            switch_to_best_if_nonsignificant=switch_to_best_if_nonsignificant,
            forced=forced)
def make_forced_actions(num_actions, num_steps, num_actions_to_force = 5):
    '''
    Returns a forced actions object that forces an equal number of
    each action and where the number of forced actions may be based
    on the total number of steps. If num_actions_to_force is < 1,
    treats it as a proportion of the total number of steps. If a proportion
    is used, rounds up to next full trial. (E.g., the fewest number of forced actions
    of each type you'll ever have with a proportion is 1.)
    '''
#     print("num_actions:",num_actions)
#     print("num_steps:", num_steps)
#     print("num_actions_to_force:", num_actions_to_force)

    if num_actions_to_force < 1:
        num_actions_to_force = int(math.ceil(num_steps*num_actions_to_force))
    else:
        num_actions_to_force = int(math.ceil(num_actions_to_force))
    forced_action_counts = [num_actions_to_force for _ in range(num_actions)]
    action_list = [i for i in range(len(forced_action_counts)) for _ in range(forced_action_counts[i])]
    forced = forced_actions(actions=action_list)
    return forced
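# Usage sketch for make_forced_actions (illustrative values): a fractional
# num_actions_to_force is treated as a proportion of num_steps, rounded up.
#   make_forced_actions(2, 100, 0.1)  forces ceil(100 * 0.1) = 10 plays of each
#       action, i.e. an action list of [0]*10 + [1]*10
#   make_forced_actions(3, 50, 5)     forces 5 plays of each of the 3 actions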
Example #18
def calculate_linucb_single_bandit(source, num_actions, dest, models = None, forced = forced_actions()):
    '''
    Calculates LinUCB.
    :param source: simulated single-bandit data file with default rewards for each action and true probs.
    :param num_actions: number of actions for this bandit
    :param dest: outfile for printing the chosen actions and received rewards.
    :param models: Optional, regularized logistic regression models for each action; fresh models are created if None.
    :param forced: Optional, indicates to process only up to a certain time step or force take specified actions.
    '''
    if models is None:
        models = [RLogReg(D = NUM_FEATURES, Lambda = 1) for _ in range(num_actions)]

    with open(source, newline='') as inf, open(dest, 'w', newline='') as outf:
        reader = csv.DictReader(inf)

        # Construct output column header names
        field_names_out = create_headers(reader.fieldnames, num_actions, outf)

        writer = csv.DictWriter(outf, fieldnames=field_names_out)
        writer.writeheader()

        cumulative_sample_regret = 0
        cumulative_expected_regret = 0

        chosen_actions = []

        alpha = 2

        # TODO: compute expected regret for LinUCB
        expected_regret = 0
        # number of trials used to compute expectation stats
        # set to small value when debugging for faster speed
        num_trials_prob_best_action = int(1e4)

        for sample_number, row in enumerate(reader, start=1):
            # get context features
            context = get_context(row)

            if len(forced.actions) == 0 or sample_number > len(forced.actions):
                # take action which maximizes the LinUCB bound based on current
                # model parameters (i.e. mean and variance of weight values)
                action = np.argmax([compute_linucb_bound(models[a], context, alpha) \
                    for a in range(num_actions)])
            else:
                samples = [0 for a in range(num_actions)]
                # take forced action if requested
                action = forced.actions[sample_number - 1]


            # only return action chosen up to specified time step
            if forced.time_step > 0 and sample_number <= forced.time_step:
                chosen_actions.append(action)

            # get reward signals
            observed_rewards = [int(row[HEADER_ACTUALREWARD.format(a + 1)]) for a in range(num_actions)]
            reward = observed_rewards[action]

            # update posterior distribution with observed reward
            # converted to range {-1,1}
            models[action].update_posterior(context, 2 * reward - 1)

            # copy the input data to output file
            out_row = {fieldname: row[fieldname] for fieldname in reader.fieldnames}

            ''' write performance data (e.g. regret) '''
            optimal_action = int(row[HEADER_OPTIMALACTION]) - 1
            optimal_action_reward = observed_rewards[optimal_action]
            sample_regret = optimal_action_reward - reward
            cumulative_sample_regret += sample_regret

            out_row[H_ALGO_ACTION] = action + 1
            out_row[H_ALGO_OBSERVED_REWARD] = reward
            out_row[H_ALGO_MATCH_OPTIMAL] = 1 if optimal_action == action else 0
            out_row[H_ALGO_SAMPLE_REGRET] = sample_regret
            out_row[H_ALGO_SAMPLE_REGRET_CUMULATIVE] = cumulative_sample_regret

            true_probs = [float(row[HEADER_TRUEPROB.format(a + 1)]) for a in range(num_actions)]

            # The oracle always chooses the best arm, thus expected reward
            # is simply the probability of that arm getting a reward.
            # (Currently unused; see the expected-regret TODO above.)
            optimal_expected_reward = true_probs[optimal_action] * num_trials_prob_best_action

            cumulative_expected_regret += expected_regret

            out_row[H_ALGO_REGRET_EXPECTED] = expected_regret
            out_row[H_ALGO_REGRET_EXPECTED_CUMULATIVE] = cumulative_expected_regret

            writer.writerow(out_row)

        return chosen_actions, models
Example #19
def calculate_thompson_single_bandit(source, num_actions, dest, models=None,
                                     action_mode=ActionSelectionMode.prob_is_best, forced=forced_actions(),
                                     relearn=True):
    '''
    Calculates non-contextual thompson sampling actions and weights.
    :param source: simulated single-bandit data file with default rewards for each action and true probs.
    :param num_actions: number of actions for this bandit
    :param dest: outfile for printing the chosen actions and received rewards.
    :param models: models for each action's probability distribution.
    :param action_mode: Indicates how to select actions, see ActionSelectionMode.
    :param forced: Optional, indicates to process only up to a certain time step or force take specified actions.
    :param relearn: Optional, at switch time, whether algorithm relearns on previous time steps using actions taken previously.
    '''
    # number of trials used to run Thompson Sampling to compute expectation stats
    # set to small value when debugging for faster speed
    num_trials_prob_best_action = int(100)

    if models == None:
        models = [NIGNormal(mu=0, v=1, alpha=1, beta=1) for cond in range(num_actions)]

    with open(source, newline='') as inf, open(dest, 'w', newline='') as outf:
        reader = csv.DictReader(inf)

        # Construct output column header names
        field_names = reader.fieldnames
        field_names_out, group_header = create_headers(field_names, num_actions)

        print(','.join(group_header), file=outf)

        writer = csv.DictWriter(outf, fieldnames=field_names_out)
        writer.writeheader()

        sample_number = 0
        cumulative_sample_regret = 0
        cumulative_expected_regret = 0

        chosen_actions = []

        for row in reader:
            sample_number += 1

            # get context features
            context = get_context(row)

            should_update_posterior = True

            if len(forced.actions) == 0 or sample_number > len(forced.actions):
                # first decide which arm we'd pull using Thompson
                # (do the random sampling, the max is the one we'd choose)
                samples = [models[a].draw_expected_value(context) for a in range(num_actions)]

                if action_mode == ActionSelectionMode.prob_is_best:
                    # find the max of samples[i] etc and choose an arm
                    action = np.argmax(samples)
                else:
                    # take action in proportion to expected rewards
                    # draw samples and normalize to use as a discrete distribution
                    # action is taken by sampling from this discrete distribution
                    probs = samples / np.sum(samples)
                    rand = np.random.rand()
                    for a in range(num_actions):
                        if rand <= probs[a]:
                            action = a
                            break
                        rand -= probs[a]

            else:
                samples = [0 for a in range(num_actions)]
                # take forced action if requested
                action = forced.actions[sample_number - 1]

                if relearn == False:
                    should_update_posterior = False

            # get reward signals
            observed_rewards = [float(row[HEADER_ACTUALREWARD.format(a + 1)]) for a in range(num_actions)]
            reward = observed_rewards[action]

            if should_update_posterior:
                # update posterior distribution with observed reward
                models[action].update_posterior(context, reward)

            # only return action chosen up to specified time step
            if forced.time_step > 0 and sample_number <= forced.time_step:
                chosen_actions.append(action)
                # save the model state in order so we can restore it
                # after switching to the true reward data.
                if sample_number == forced.time_step:
                    for a in range(num_actions):
                        models[a].save_state()

            # copy the input data to output file
            out_row = {}

            for i in range(len(reader.fieldnames)):
                out_row[reader.fieldnames[i]] = row[reader.fieldnames[i]]

            ''' write performance data (e.g. regret) '''
            means = [float(row[HEADER_TRUEMEAN.format(a + 1)]) for a in range(num_actions)]

            optimal_action = int(row[HEADER_OPTIMALACTION]) - 1
            optimal_action_reward = means[optimal_action]
            sample_regret = optimal_action_reward - reward
            cumulative_sample_regret += sample_regret

            # true_probs = [float(row[HEADER_TRUEPROB.format(a + 1)]) for a in range(num_actions)]

            # # The oracle always chooses the best arm, thus expected reward
            # # is simply the probability of that arm getting a reward.
            # optimal_expected_reward = true_probs[optimal_action] * num_trials_prob_best_action
            #
            # # Run thompson sampling many times and calculate how much reward it would
            # # have gotten based on the chosen actions.
            chosen_action_counts = run_thompson_trial(context, num_trials_prob_best_action, num_actions, models)
            # expected_reward = np.sum(chosen_action_counts[a] * true_probs[a] for a in range(num_actions))

            optimal_expected_reward = means[optimal_action] * num_trials_prob_best_action
            expected_reward = sum(chosen_action_counts[a] * means[a] for a in range(num_actions))

            expected_regret = optimal_expected_reward - expected_reward
            cumulative_expected_regret += expected_regret

            write_performance(out_row, action, optimal_action, reward,
                              sample_regret, cumulative_sample_regret,
                              expected_regret, cumulative_expected_regret)

            write_parameters(out_row, action, samples, models,
                             chosen_action_counts, num_actions, num_trials_prob_best_action)

            writer.writerow(out_row)

        return chosen_actions, models
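# A minimal sketch (illustration only) of the proportional action-selection
# branch above: when action_mode is not prob_is_best, the inverse-CDF loop is
# equivalent to drawing an arm from a discrete distribution whose weights are
# the sampled expected values, assuming those samples are non-negative.
import numpy as np

def select_action_proportional(samples):
    '''Pick an arm with probability proportional to its sampled expected value.'''
    samples = np.asarray(samples, dtype=float)
    probs = samples / samples.sum()      # normalize into a discrete distribution
    return int(np.random.choice(len(samples), p=probs))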
def calculate_epsilon_single_bandit(source,
                                    num_actions,
                                    dest,
                                    models=None,
                                    forced=forced_actions(),
                                    c=0.0,
                                    resample=True):
    '''
    Calculates a difference-thresholded (ppd) algorithm for two actions: explores uniformly at random when
    the two arms' drawn expected values differ by less than c, and otherwise exploits the arm with the
    larger drawn value.
    :param source: simulated single-bandit data file with default rewards for each action and true probs.
    :param num_actions: number of actions for this bandit (the current implementation assumes 2)
    :param dest: outfile for printing the chosen actions and received rewards.
    :param models: models for each action's probability distribution; defaults to Greedy models.
    :param forced: Optional, indicates to process only up to a certain time step or force take specified actions.
    :param c: exploration threshold on the absolute difference between the two arms' drawn expected values.
    :param resample: Optional, whether to redraw the expected values before selecting the exploiting action.
    '''

    if models == None:
        models = [Greedy() for _ in range(num_actions)]
        print("USING GREEDY MODEL")


    # print("in ts_ppd using c", c)
    # number of trials used to run Thompson Sampling to compute expectation stats
    # set to small value when debugging for faster speed
    num_trials_prob_best_action = int(1e4)

    with open(source, newline='') as inf, open(dest, 'w', newline='') as outf:
        reader = csv.DictReader(inf)

        # Construct output column header names
        field_names_out = create_headers(reader.fieldnames, num_actions, outf)

        writer = csv.DictWriter(outf, fieldnames=field_names_out)
        writer.writeheader()

        sample_number = 0
        cumulative_sample_regret = 0
        cumulative_expected_regret = 0

        chosen_actions = []
        is_exploring = None

        for row in reader:
            sample_number += 1

            # get context features
            context = get_context(row)
            action_values = [-99, -99]
            if len(forced.actions) == 0 or sample_number > len(forced.actions):

                action_values = np.array([models[a].draw_expected_value(context) \
                    for a in range(num_actions)])

                diff = np.abs(action_values[0] - action_values[1])
                rand = np.random.rand()

                if diff < c:
                    #        print("exploring, diff, thresh", diff, thresh)
                    # take a random action
                    is_exploring = 1
                    action = np.random.randint(0, num_actions)
                else:
                    is_exploring = 0
                    #       print("TS, diff, thresh", diff, thresh)
                    #                    action_values = np.array([models[a].draw_expected_value(context) \
                    #                       for a in range(num_actions)])
                    if resample == True:
                        action_values = np.array([models[a].draw_expected_value(context) \
                            for a in range(num_actions)])
                    action = np.random.choice(
                        np.where(action_values == np.max(action_values))[0])
            else:
                # take forced action if requested
                action = forced.actions[sample_number - 1]

            # only return action chosen up to specified time step
            if forced.time_step > 0 and sample_number <= forced.time_step:
                chosen_actions.append(action)

            # get reward signals
            observed_rewards = [
                int(row[HEADER_ACTUALREWARD.format(a + 1)])
                for a in range(num_actions)
            ]
            reward = observed_rewards[action]

            # update model state, reward is converted to {-1,1} to be compatible with all models
            models[action].update_posterior(context, 2 * reward - 1)

            # copy the input data to output file
            out_row = {}

            for i in range(len(reader.fieldnames)):
                out_row[reader.fieldnames[i]] = row[reader.fieldnames[i]]
            ''' write performance data (e.g. regret) '''
            optimal_action = int(row[HEADER_OPTIMALACTION]) - 1
            optimal_action_reward = observed_rewards[optimal_action]
            sample_regret = optimal_action_reward - reward
            cumulative_sample_regret += sample_regret

            out_row[H_ALGO_ACTION] = action + 1
            out_row[H_ALGO_OBSERVED_REWARD] = reward
            out_row[H_ALGO_MATCH_OPTIMAL] = 1 if optimal_action == action else 0
            out_row[H_ALGO_SAMPLE_REGRET] = sample_regret
            out_row[H_ALGO_SAMPLE_REGRET_CUMULATIVE] = cumulative_sample_regret
            out_row[H_ALGO_EXPLORING] = is_exploring
            out_row[H_ALGO_MEAN_1] = action_values[0]
            out_row[H_ALGO_MEAN_2] = action_values[1]

            # TODO: compute expected regret
            #true_probs = [float(row[H_DATA_TRUE_PROB.format(a + 1)]) for a in range(num_actions)]

            # The oracle always chooses the best arm, thus expected reward
            # is simply the probability of that arm getting a reward.
            #optimal_expected_reward = true_probs[optimal_action] * num_trials_prob_best_action

            # Run random sampling many times and calculate how much reward it would
            # have gotten based on the chosen actions.
            #chosen_action_counts = np.bincount(np.random.randint(0, num_actions, num_trials_prob_best_action))
            #expected_reward = np.sum(chosen_action_counts[a] * true_probs[a] for a in range(num_actions))

            #expected_regret = optimal_expected_reward - expected_reward
            #cumulative_expected_regret += expected_regret

            #out_row[H_ALGO_REGRET_EXPECTED] = expected_regret
            #out_row[H_ALGO_REGRET_EXPECTED_CUMULATIVE] = cumulative_expected_regret

            writer.writerow(out_row)

        return chosen_actions
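# A standalone sketch (illustration only, not part of the original module) of
# the two-arm decision rule used above: explore uniformly at random when the
# drawn expected values are within c of each other, otherwise exploit the arm
# with the larger value, optionally redrawing the values before exploiting.
import numpy as np

def ppd_choose_action(draw_values, c, resample=True):
    '''draw_values: callable returning one expected-value draw per arm (two arms).'''
    values = np.asarray(draw_values())
    if np.abs(values[0] - values[1]) < c:
        return np.random.randint(0, len(values))              # explore
    if resample:
        values = np.asarray(draw_values())                    # fresh draws for exploitation
    return int(np.random.choice(np.where(values == values.max())[0]))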
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory,
    successPrior = 1, failurePrior = 1, softmax_beta = None,
    reordering_fn = None, forceActions = 0, batch_size = 1, burn_in_size = 1,
    random_dur=0, random_start=0, mode='', c = 0.1, resample = True, ns_stop = 0):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in the list step_sizes).
    Bandit uses the ppd two-phase random/Thompson sampling policy with Beta-Bernoulli models.
    '''
    csv_output_file_names = []
    sim_results_dfs_list = []

    for num_steps in step_sizes:
        sim_results = []
        for i in range(num_sims):
            if forceActions != 0:
                forced = run_effect_size_simulations.make_forced_actions(len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()

            if softmax_beta != None:
                # reorder rewards
                raise ValueError("softmax_beta is not supported in fast mode.")

            if mode=='uniform':
                models = [beta_bernoulli.BetaBern(success=1, failure=1) for _ in range(len(prob_per_arm))]
                random_dur = num_steps
            else:
                models = [beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior) for _ in range(len(prob_per_arm))]


            
            sim_result, column_names,_ = \
                thompson_policy.ppd_two_phase_random_thompson_policy(
                            prob_per_arm=prob_per_arm,
                            users_count=num_steps,
                            random_dur=random_dur,#100,
                            models=models,
                            random_start=random_start,
                            action_mode='Greedy',
                            relearn=True,
                            forced = forced,
                            batch_size = batch_size, c=c, resample = resample, ns_stop = ns_stop)

            # do ipw here? This is the equivalent of the old actions file (actions_df)
            # sim_result_df = pd.DataFrame(sim_result, columns=column_names)  # not used yet
            # calculate_ipw_by_step_size(actions_root=sim_result_df, num_samples=1000, num_actions=2, cached_probs={},
            #                            prior=prior, binary_rewards=is_binary, config=config, n=n, num_sims=num_sims, batch_size=bs)
            # print("sim_result_df", sim_result_df)
            # print("shape", sim_result_df.shape)
            # print("shape cols", sim_result_df.columns)
            # print(sim_result.columns())
            sim_results.extend(sim_result)

        sim_results_df = pd.DataFrame(sim_results, columns=column_names)
        sim_results_df.index = [idx for idx in range(num_steps)]*num_sims
        sim_results_dfs_list.append(sim_results_df)

        cur_output_file = get_output_filename(outfile_directory, num_steps, None, mode)
        csv_output_file_names.append(cur_output_file)

    return sim_results_dfs_list, csv_output_file_names
Example #22
def calculate_ucb1_single_bandit(source,
                                 num_actions,
                                 dest,
                                 forced=forced_actions(),
                                 seed_rewards=None,
                                 relearn=True,
                                 treat_forced_as_historical=False,
                                 use_sample_variance=False):
    '''
    Calculates non-contextual UCB1.
    :param source: simulated single-bandit data file with default rewards for each action and true probs.
    :param num_actions: number of actions for this bandit
    :param dest: outfile for printing the chosen actions and received rewards.
    :param forced: Optional, indicates to process only up to a certain time step or force take specified actions.
    :param seed_rewards: Optional, the initialized state of the model to start with (i.e. rewards received for each action).
    :param relearn: Optional, at switch time, whether algorithm relearns on previous time steps using actions taken previously.
    :param treat_forced_as_historical: Optional, whether forced actions are counted as historical pulls when computing the confidence bounds.
    :param use_sample_variance: Optional, whether to include a sample-variance term in the confidence bounds.
    '''
    # number of trials used to compute expectation stats
    # set to small value when debugging for faster speed
    num_trials_prob_best_action = int(1e4)

    # constant header names for easy indexing

    # algorithm performance
    H_ALGO_ACTION = "AlgorithmAction"
    H_ALGO_OBSERVED_REWARD = "ObservedRewardofAction"
    H_ALGO_MATCH_OPTIMAL = "MatchesOptimalExpectedAction"
    H_ALGO_SAMPLE_REGRET = "SampleRegret"
    H_ALGO_SAMPLE_REGRET_CUMULATIVE = "CumulativeSampleRegret"
    H_ALGO_REGRET_EXPECTED = "ExpectedRegret"
    H_ALGO_REGRET_EXPECTED_CUMULATIVE = "CumulativeExpectedRegret"

    # if we're treating the past actions (from forced) as historical, then
    # need to record how many forced actions there were of each type
    if treat_forced_as_historical:
        arm_counts_from_history = [0] * num_actions

    with open(source, newline='') as inf, open(dest, 'w', newline='') as outf:
        reader = csv.DictReader(inf)

        # Construct output column header names
        field_names = reader.fieldnames
        field_names_out = field_names[:]
        field_names_out.extend([
            H_ALGO_ACTION, H_ALGO_OBSERVED_REWARD, H_ALGO_MATCH_OPTIMAL,
            H_ALGO_SAMPLE_REGRET, H_ALGO_SAMPLE_REGRET_CUMULATIVE,
            H_ALGO_REGRET_EXPECTED, H_ALGO_REGRET_EXPECTED_CUMULATIVE
        ])

        # print group-level headers for readability
        group_header = ['' for i in range(len(field_names_out))]
        group_header[0] = "Input Data"
        group_header[len(field_names)] = "Algorithm's Performance"
        print(','.join(group_header), file=outf)

        writer = csv.DictWriter(outf, fieldnames=field_names_out)
        writer.writeheader()

        sample_number = 0
        cumulative_sample_regret = 0
        cumulative_expected_regret = 0

        chosen_actions = []

        # list of rewards gotten for each action
        if seed_rewards != None:
            rewards = seed_rewards
        else:
            rewards = [[] for _ in range(num_actions)]
        rewards_at_switch = []

        num_ucb_pulls = 0

        for row in reader:
            sample_number += 1

            should_update_rewards = True

            if len(forced.actions) == 0 or sample_number > len(forced.actions):
                num_ucb_pulls += 1
                if len(forced.actions) == 0 and num_ucb_pulls <= num_actions:
                    # initially play every action once
                    action = num_ucb_pulls - 1
                else:
                    action = -1
                    # This forces playing every action once; seems like the branch above isn't necessary and may cause problems
                    for a in range(len(rewards)):
                        if len(rewards[a]) == 0:
                            action = a
                            break
                    if action == -1:
                        if treat_forced_as_historical:
                            # Take the action maximizing:
                            #   avg reward + sqrt(2 * log(# algorithm pulls + # historical pulls of this arm)
                            #                     / (# times the algorithm chose this arm + # historical pulls of this arm))
                            # Note that the times-chosen count plus the historical count is exactly the number of
                            # rewards recorded for the arm, so the only change from the non-historical bound is
                            # inside the log.
                            if use_sample_variance:
                                conf_bounds = [np.mean(rewards_a) + \
                                np.sqrt(2.0 * theta * np.var(rewards_a) * np.log(num_ucb_pulls + historical_count) / len(rewards_a)) +\
                                3 * theta * np.log(num_ucb_pulls + historical_count) / len(rewards_a)
                                for rewards_a, historical_count in zip(rewards,arm_counts_from_history)]

                            else:
                                conf_bounds = [np.mean(rewards_a) + \
                                               np.sqrt(2.0 * np.log(num_ucb_pulls + historical_count) / len(rewards_a))
                                               for rewards_a, historical_count in zip(rewards,arm_counts_from_history)]
                        else:
                            if use_sample_variance:
                                conf_bounds = [np.mean(rewards_a) + \
                                               np.sqrt(2.0 * theta * np.var(rewards_a) * np.log(sample_number) / len(rewards_a)) + \
                                               3 * theta * np.log(sample_number) / len(rewards_a)
                                               for rewards_a in rewards]
                            else:
                                # take action with max (avg reward + sqrt(2*log(t) / # times chosen))
                                conf_bounds = [np.mean(rewards_a) + \
                                               np.sqrt(2.0 * np.log(sample_number) / len(rewards_a))
                                               for rewards_a in rewards]
                        action = np.argmax(conf_bounds)
            else:
                # take forced action if requested
                action = forced.actions[sample_number - 1]

                if not relearn:
                    should_update_rewards = False

            # get reward signals
            observed_rewards = [
                int(row[HEADER_ACTUALREWARD.format(a + 1)])
                for a in range(num_actions)
            ]
            reward = observed_rewards[action]

            if should_update_rewards:
                rewards[action].append(reward)

            # only record (and later return) the actions chosen up to the specified time step
            if forced.time_step > 0 and sample_number <= forced.time_step:
                chosen_actions.append(action)

                if sample_number == forced.time_step:
                    rewards_at_switch = copy.deepcopy(rewards)

            # update history counts if necessary
            if treat_forced_as_historical and sample_number <= len(
                    forced.actions):
                arm_counts_from_history[action] += 1

            # copy the input data to the output file
            out_row = {}
            for field in reader.fieldnames:
                out_row[field] = row[field]

            # write performance data (e.g. regret)
            optimal_action = int(row[HEADER_OPTIMALACTION]) - 1
            optimal_action_reward = observed_rewards[optimal_action]
            sample_regret = optimal_action_reward - reward
            cumulative_sample_regret += sample_regret

            out_row[H_ALGO_ACTION] = action + 1
            out_row[H_ALGO_OBSERVED_REWARD] = reward
            out_row[
                H_ALGO_MATCH_OPTIMAL] = 1 if optimal_action == action else 0
            out_row[H_ALGO_SAMPLE_REGRET] = sample_regret
            out_row[H_ALGO_SAMPLE_REGRET_CUMULATIVE] = cumulative_sample_regret

            true_probs = [
                float(row[HEADER_TRUEPROB.format(a + 1)])
                for a in range(num_actions)
            ]

            # The oracle always chooses the best arm, so its expected reward is
            # that arm's reward probability scaled by the number of trials.
            optimal_expected_reward = true_probs[
                optimal_action] * num_trials_prob_best_action

            # TODO: compute expected regret for UCB1
            # (optimal_expected_reward is unused until this is implemented)
            expected_regret = 0
            cumulative_expected_regret += expected_regret

            out_row[H_ALGO_REGRET_EXPECTED] = expected_regret
            out_row[
                H_ALGO_REGRET_EXPECTED_CUMULATIVE] = cumulative_expected_regret

            writer.writerow(out_row)

        return chosen_actions, rewards_at_switch
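The conf_bounds list comprehensions above pack the whole UCB1 index into single expressions. The standalone sketch below (an illustration only; ucb1_indices is a hypothetical helper, not part of this module) spells out the same index, the empirical mean plus sqrt(2 * log(t) / n_a), where the historical variant replaces t inside the log with the algorithm's own pull count plus that arm's historical pulls.

import numpy as np

def ucb1_indices(rewards, historical_counts=None, num_algorithm_pulls=None):
    # rewards[a] is the list of rewards observed for arm a, as in the function above.
    indices = []
    for a, rewards_a in enumerate(rewards):
        n_a = len(rewards_a)
        if n_a == 0:
            # unplayed arms get an infinite index so they are pulled first
            indices.append(np.inf)
            continue
        if historical_counts is not None:
            # historical variant: algorithm pulls plus this arm's historical pulls
            t = num_algorithm_pulls + historical_counts[a]
        else:
            # standard UCB1: t is the total number of pulls recorded so far
            t = sum(len(r) for r in rewards)
        indices.append(np.mean(rewards_a) + np.sqrt(2.0 * np.log(t) / n_a))
    return indices

# Example: arm 1 has the highest empirical mean and the same pull count as arm 0,
# so np.argmax(ucb1_indices([[1, 0, 1], [1, 1, 1], [0, 0]])) returns 1.
# The use_sample_variance branch above swaps in a variance-scaled bonus using the
# module-level constant theta; it is omitted here because theta is defined elsewhere.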
Beispiel #23
0
def calculate_random_single_bandit(source,
                                   num_actions,
                                   dest,
                                   forced=forced_actions()):
    '''
    Calculates non-contextual random policy.
    :param source: simulated single-bandit data file with default rewards for each action and true probs.
    :param num_actions: number of actions for this bandit
    :param dest: outfile for printing the chosen actions and received rewards.
    :param forced: Optional, indicates to process only up to a certain time step or force take specified actions.
    '''
    # number of trials used to simulate the random policy when computing expectation stats
    # set to small value when debugging for faster speed
    num_trials_prob_best_action = int(1e4)

    # constant header names for easy indexing

    # data group from input file
    H_DATA_SAMPLE_NUMBER = "SampleNumber"
    H_DATA_AGE_GROUP = "agequartilesUSER"
    H_DATA_DAYS_ACTIVE = "ndaysactUSER"
    H_DATA_ACTUAL_REWARD = "Action{}OracleActualReward"
    H_DATA_TRUE_PROB = "Action{}OracleProbReward"
    H_DATA_OPTIMAL_ACTION = "ExpectedOptimalAction"

    # algorithm performance
    H_ALGO_ACTION = "AlgorithmAction"
    H_ALGO_OBSERVED_REWARD = "ObservedRewardofAction"
    H_ALGO_MATCH_OPTIMAL = "MatchesOptimalExpectedAction"
    H_ALGO_SAMPLE_REGRET = "SampleRegret"
    H_ALGO_SAMPLE_REGRET_CUMULATIVE = "CumulativeSampleRegret"
    H_ALGO_REGRET_EXPECTED = "ExpectedRegret"
    H_ALGO_REGRET_EXPECTED_CUMULATIVE = "CumulativeExpectedRegret"

    with open(source, newline='') as inf, open(dest, 'w', newline='') as outf:
        reader = csv.DictReader(inf)

        # Construct output column header names
        field_names = reader.fieldnames
        field_names_out = field_names[:]
        field_names_out.extend([
            H_ALGO_ACTION, H_ALGO_OBSERVED_REWARD, H_ALGO_MATCH_OPTIMAL,
            H_ALGO_SAMPLE_REGRET, H_ALGO_SAMPLE_REGRET_CUMULATIVE,
            H_ALGO_REGRET_EXPECTED, H_ALGO_REGRET_EXPECTED_CUMULATIVE
        ])

        # position where a parameter group header would start (not used by this policy)
        group_header_parameters_index = len(field_names_out)

        # print group-level headers for readability
        group_header = ['' for i in range(len(field_names_out))]
        group_header[0] = "Input Data"
        group_header[len(field_names)] = "Algorithm's Performance"
        print(','.join(group_header), file=outf)

        writer = csv.DictWriter(outf, fieldnames=field_names_out)
        writer.writeheader()

        sample_number = 0
        cumulative_sample_regret = 0
        cumulative_expected_regret = 0

        chosen_actions = []

        for row in reader:
            sample_number += 1

            if len(forced.actions) == 0 or sample_number > len(forced.actions):
                # take a random action
                action = np.random.randint(0, num_actions)
            else:
                # take forced action if requested
                action = forced.actions[sample_number - 1]

            # only record (and later return) the actions chosen up to the specified time step
            if forced.time_step > 0 and sample_number <= forced.time_step:
                chosen_actions.append(action)

            # get reward signals
            observed_rewards = [
                int(row[H_DATA_ACTUAL_REWARD.format(a + 1)])
                for a in range(num_actions)
            ]
            reward = observed_rewards[action]

            # copy the input data to the output file
            out_row = {}
            for field in reader.fieldnames:
                out_row[field] = row[field]

            # write performance data (e.g. regret)
            optimal_action = int(row[H_DATA_OPTIMAL_ACTION]) - 1
            optimal_action_reward = observed_rewards[optimal_action]
            sample_regret = optimal_action_reward - reward
            cumulative_sample_regret += sample_regret

            out_row[H_ALGO_ACTION] = action + 1
            out_row[H_ALGO_OBSERVED_REWARD] = reward
            out_row[
                H_ALGO_MATCH_OPTIMAL] = 1 if optimal_action == action else 0
            out_row[H_ALGO_SAMPLE_REGRET] = sample_regret
            out_row[H_ALGO_SAMPLE_REGRET_CUMULATIVE] = cumulative_sample_regret

            true_probs = [
                float(row[H_DATA_TRUE_PROB.format(a + 1)])
                for a in range(num_actions)
            ]

            # The oracle always chooses the best arm, so its expected reward is
            # that arm's reward probability scaled by the number of trials.
            optimal_expected_reward = true_probs[
                optimal_action] * num_trials_prob_best_action

            # Simulate the random policy many times and calculate how much reward
            # it would have received based on the sampled actions.
            chosen_action_counts = np.bincount(
                np.random.randint(0, num_actions, num_trials_prob_best_action),
                minlength=num_actions)
            expected_reward = sum(chosen_action_counts[a] * true_probs[a]
                                  for a in range(num_actions))

            expected_regret = optimal_expected_reward - expected_reward
            cumulative_expected_regret += expected_regret

            out_row[H_ALGO_REGRET_EXPECTED] = expected_regret
            out_row[
                H_ALGO_REGRET_EXPECTED_CUMULATIVE] = cumulative_expected_regret

            writer.writerow(out_row)

        return chosen_actions
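The Monte Carlo block above also has a simple closed form: a uniform-random policy plays each arm with probability 1/num_actions, so its expected per-row reward is the mean of the true arm probabilities. The sketch below (an illustration only, not part of this module) computes the quantity that the sampled chosen_action_counts estimate, using the same num_trials scaling applied to both the oracle and the policy.

import numpy as np

def expected_regret_uniform_random(true_probs, num_trials=int(1e4)):
    # Oracle: always plays the best arm, earning max(true_probs) per trial on average.
    # Uniform random: earns mean(true_probs) per trial on average.
    true_probs = np.asarray(true_probs, dtype=float)
    optimal_expected_reward = true_probs.max() * num_trials
    expected_reward = true_probs.mean() * num_trials
    return optimal_expected_reward - expected_reward

# Example: with true_probs = [0.2, 0.5, 0.8] the expected regret is
# (0.8 - 0.5) * num_trials, which the sampled estimate matches up to Monte Carlo noise.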