Python NormalCB.step Examples

Programming Language: Python

Namespace/Package Name: src.environments.Bandit

Class/Type: NormalCB

Method/Function: step

Examples at hotexamples.com: 4

Python NormalCB.step - 4 examples found. These are the top-rated real-world Python examples of src.environments.Bandit.NormalCB.step, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

NormalCB(7)

step(4)

expected_reward(2)

generate_mc_samples(2)

regret(2)

reset(2)

sample_from_posterior(1)

Example #1

Show file

File: rollout.py Project: lwu9/bayesRL

def mHealth_rollout(tuning_function_parameter, policy, time_horizon, estimated_context_mean,
                    tuning_function, estimated_context_variance, env, nPatients, monte_carlo_reps):
  """Score a tuning parameter by Monte Carlo rollouts in a simulated NormalCB.

  A simulation environment is built from ``env.beta_hat_list`` /
  ``env.sigma_hat_list`` and the supplied context mean/variance. Each
  replicate resets it, performs 10 rounds of 5 pulls of action 0 followed by
  5 pulls of action 1, and then runs ``time_horizon`` steps of ``nPatients``
  policy decisions.

  Returns:
    The running average, over ``monte_carlo_reps`` replicates, of the summed
    per-decision differences ``expected_reward(action) - max_a expected_reward(a)``
    (so each term is at most zero). Returns 0 when ``monte_carlo_reps`` is 0.
  """
  simulator = NormalCB(list_of_reward_betas=env.beta_hat_list, list_of_reward_vars=env.sigma_hat_list,
                       context_mean=estimated_context_mean, context_var=estimated_context_variance)
  score = 0

  for rep in range(monte_carlo_reps):
    simulator.reset()

    # Warm-up: 10 rounds, each pulling arm 0 five times and then arm 1 five times.
    for _ in range(10):
      for arm in (0, 1):
        for _ in range(5):
          simulator.step(arm)

    episode_score = 0
    for step_index in range(time_horizon):
      # Estimates are read once per time step and shared by all patients in it.
      coefficient_estimates = simulator.beta_hat_list
      covariance_estimates = simulator.sampling_cov_list
      for _ in range(nPatients):
        # Note the policy receives the *real* env, not the simulator.
        action = policy(coefficient_estimates, covariance_estimates, simulator.curr_context,
                        tuning_function, tuning_function_parameter, time_horizon, step_index, env)
        reward_at_action = simulator.expected_reward(action, simulator.curr_context)
        best_reward = np.max([simulator.expected_reward(candidate, simulator.curr_context)
                              for candidate in range(simulator.number_of_actions)])
        simulator.step(action)

        # Accumulate the (non-positive) gap between chosen and optimal reward.
        episode_score += (reward_at_action - best_reward)

    print(rep)
    # Incremental update of the mean episode score across replicates.
    score += (episode_score - score) / (rep + 1)

  return score

Example #2

Show file

File: mHealth.py Project: lwu9/bayesRL

def episode(policy_name,
            label,
            save=False,
            points_per_grid_dimension=50,
            monte_carlo_reps=100):
    """Run one mHealth contextual-bandit episode and return its cumulative regret.

    Args:
        policy_name: One of 'eps', 'eps-decay-fixed', 'eps-decay', 'greedy',
            'worst' or 'ts'; selects the policy and its tuning function.
        label: Seed passed to ``np.random.seed``; also used in the results
            file name when ``save`` is true.
        save: If true, dump ``{'t', 'regret'}`` to a timestamped YAML file
            after every time step (the file is reopened in 'w' mode, so only
            the latest step is retained).
        points_per_grid_dimension: Forwarded to ``opt.bayesopt`` when tuning.
        monte_carlo_reps: Forwarded to ``opt.bayesopt`` when tuning.

    Returns:
        The cumulative regret accumulated over 10 time steps of 10 patients.

    Raises:
        ValueError: If ``policy_name`` is not recognised.
    """
    if save:
        base_name = 'mhealth-{}-{}'.format(label, policy_name)
        prefix = os.path.join(project_dir, 'src', 'run', 'results', base_name)
        suffix = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
        filename = '{}_{}.yml'.format(prefix, suffix)

    np.random.seed(label)
    T = 10

    # ToDo: Create policy class that encapsulates this behavior
    if policy_name == 'eps':
        tuning_function = lambda a, b, c: 0.05  # Constant epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'eps-decay-fixed':
        tuning_function = lambda a, t, c: 0.5 / (t + 1)
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'eps-decay':
        tuning_function = tuned_bandit.stepwise_linear_epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = True
        tuning_function_parameter = np.ones(10) * 0.025
    elif policy_name == 'greedy':
        tuning_function = lambda a, b, c: 0.00  # Constant epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'worst':
        tuning_function = lambda a, b, c: 0.00
        policy = ref.linear_cb_worst_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'ts':
        tuning_function = lambda a, b, c: 1.0  # No shrinkage
        policy = tuned_bandit.linear_cb_thompson_sampling_policy
        tune = False
        tuning_function_parameter = None
    else:
        raise ValueError('Incorrect policy name')

    env = NormalCB(
        list_of_reward_betas=[np.array([1.0, 1.0]),
                              np.array([2.0, -2.0])])
    cumulative_regret = 0.0
    nPatients = 10
    env.reset()

    # Initial assignments: 10 rounds of 5 pulls of each of the two actions.
    for _ in range(10):
        for _ in range(5):
            env.step(0)
        for _ in range(5):
            env.step(1)

    for t in range(T):
        X = env.X
        estimated_context_mean = np.mean(X, axis=0)
        estimated_context_variance = np.cov(X, rowvar=False)
        if tune:
            tuning_function_parameter = opt.bayesopt(
                rollout.mHealth_rollout, policy, tuning_function,
                tuning_function_parameter, T, estimated_context_mean,
                estimated_context_variance, env, nPatients,
                points_per_grid_dimension, monte_carlo_reps)
        for _ in range(nPatients):
            # Snapshot the context the decision is made for, before stepping.
            x = copy.copy(env.curr_context)

            beta_hat = env.beta_hat_list
            action = policy(beta_hat, env.sampling_cov_list, x,
                            tuning_function, tuning_function_parameter, T, t,
                            env)
            env.step(action)

            # Compute regret at x, the context the action was chosen for.
            # Reading env.curr_context here (as before) would score the
            # action against whatever context the env holds after step();
            # presumably that is the next patient's -- confirm against
            # NormalCB.step. If step() leaves the context alone this is a no-op.
            expected_rewards = [
                env.expected_reward(a, x)
                for a in range(env.number_of_actions)
            ]
            expected_reward_at_action = expected_rewards[action]
            optimal_expected_reward = np.max(expected_rewards)
            regret = optimal_expected_reward - expected_reward_at_action
            cumulative_regret += regret

        # Save results
        if save:
            results = {'t': float(t), 'regret': float(cumulative_regret)}
            with open(filename, 'w') as outfile:
                yaml.dump(results, outfile)

    return cumulative_regret

Example #3

Show file

File: normal_contextual_bandit.py Project: lwu9/bayesRL

def episode(policy_name,
            label,
            n_patients=15,
            list_of_reward_betas=[[-10, 0.4, 0.4, -0.4],
                                  [-9.8, 0.6, 0.6, -0.4]],
            context_mean=np.array([0.0, 0.0, 0.0]),
            context_var=np.array([[1.0, 0, 0], [0, 1., 0], [0, 0, 1.]]),
            list_of_reward_vars=[1, 1],
            T=50,
            mc_replicates=1000,
            pre_simulate=True):
    """Run a NormalCB episode under the named policy, optionally tuning it.

    Args:
        policy_name: Selects policy, tuning function and whether to tune; see
            the if/elif chain below for accepted names.
        label: Seed passed to ``np.random.seed``.
        n_patients: Decisions taken per time step.
        list_of_reward_betas, context_mean, context_var, list_of_reward_vars:
            Forwarded to ``NormalCB`` for both the real and the simulation
            environment. NOTE(review): these defaults are mutable objects
            shared across calls; this function does not modify them, but
            confirm that ``NormalCB`` does not either.
        T: Horizon passed to the policy/tuner and loop bound.
        mc_replicates: Number of posterior draws / simulated replicates used
            when tuning with pre-simulated data.
        pre_simulate: When tuning, use ``opt.bayesopt`` over pre-simulated
            data (true) or ``tuned_bandit.random_search`` (false).

    Returns:
        dict with keys 'cumulative_regret', 'zeta_sequence', 'rewards' and
        'actions'.

    Raises:
        ValueError: Unknown ``policy_name``, or a tuned policy that defines no
            ``bounds``/``explore_`` for the optimiser.
        NotImplementedError: Tuning requested with bootstrap posterior draws.
    """
    np.random.seed(label)

    # ToDo: Create policy class that encapsulates this behavior
    posterior_sample = True
    bootstrap_posterior = False
    positive_zeta = False
    # Only some policies define optimiser bounds / exploration points. Default
    # them so a missing definition is reported clearly instead of surfacing
    # as UnboundLocalError at the opt.bayesopt call.
    bounds = None
    explore_ = None
    if policy_name == 'eps':
        tuning_function = lambda a, b, c: 0.1  # Constant epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'random':
        tuning_function = lambda a, b, c: 1.0  # Constant epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'eps-decay-fixed':
        tuning_function = tuned_bandit.expit_epsilon_decay
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = np.array([0.8, 46.38, 1.857])
    elif policy_name == 'eps-decay':
        tuning_function = tuned_bandit.expit_epsilon_decay
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = True
        explore_ = {
            'zeta0': [1.0, 0.05, 1.0, 0.1],
            'zeta1': [30.0, 0.0, 1.0, 0.0],
            'zeta2': [0.1, 1.0, 0.01, 1.0]
        }
        bounds = {
            'zeta0': (0.025, 2.0),
            'zeta1': (0.0, 30.0),
            'zeta2': (0.01, 2)
        }
        tuning_function_parameter = np.array([0.05, 1.0, 0.01])
        posterior_sample = True
    elif policy_name == 'greedy':
        tuning_function = lambda a, b, c: 0.00  # Constant epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'worst':
        tuning_function = lambda a, b, c: 0.00
        policy = ref.linear_cb_worst_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'ts':
        tuning_function = lambda a, b, c: 1.0  # No shrinkage
        policy = tuned_bandit.linear_cb_thompson_sampling_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'ts-decay-posterior-sample':
        tuning_function = tuned_bandit.stepwise_linear_epsilon
        policy = tuned_bandit.linear_cb_thompson_sampling_policy
        tune = True
        tuning_function_parameter = np.ones(10) * 0.1
        posterior_sample = True
    elif policy_name == 'ts-decay-bootstrap-sample':
        tuning_function = tuned_bandit.stepwise_linear_epsilon
        policy = tuned_bandit.linear_cb_thompson_sampling_policy
        tune = True
        tuning_function_parameter = np.ones(10) * 0.1
        posterior_sample = True
        bootstrap_posterior = True
    elif policy_name == 'ts-decay':
        tuning_function = tuned_bandit.stepwise_linear_epsilon
        policy = tuned_bandit.linear_cb_thompson_sampling_policy
        tune = True
        tuning_function_parameter = np.ones(10) * 0.1
    elif policy_name == 'ucb-tune-posterior-sample':
        tuning_function = tuned_bandit.stepwise_linear_epsilon
        policy = tuned_bandit.linear_cb_ucb_policy
        tune = True
        tuning_function_parameter = np.ones(10) * 0.025
        posterior_sample = True
    else:
        raise ValueError('Incorrect policy name')

    env = NormalCB(list_of_reward_betas=list_of_reward_betas,
                   context_mean=context_mean,
                   context_var=context_var,
                   list_of_reward_vars=list_of_reward_vars)
    cumulative_regret = 0.0
    tuning_parameter_sequence = []
    rewards = []
    actions = []

    for t in range(T):
        X = env.X
        estimated_context_mean = np.mean(X, axis=0)
        estimated_context_variance = np.cov(X, rowvar=False)

        if tune:
            if pre_simulate:
                if bounds is None or explore_ is None:
                    # Previously an UnboundLocalError raised only after the
                    # expensive simulation below had already run.
                    raise ValueError(
                        "Policy '{}' is tuned with opt.bayesopt but defines "
                        "no 'bounds'/'explore_'".format(policy_name))
                if posterior_sample:
                    if bootstrap_posterior:
                        # The bootstrap branch was an empty `pass`, leaving
                        # `draws` unassigned; fail explicitly instead.
                        raise NotImplementedError(
                            'Bootstrap posterior sampling is not implemented')
                    gen_model_parameters = []
                    for rep in range(mc_replicates):
                        draws = env.sample_from_posterior()
                        action_range = range(env.number_of_actions)
                        # draws is indexed by action for the reward model and
                        # by string key for the context model.
                        param_dict = {
                            'reward_betas':
                            [draws[a]['beta_draw'] for a in action_range],
                            'reward_vars':
                            [draws[a]['var_draw'] for a in action_range],
                            'context_mean': draws['context_mu_draw'],
                            'context_var': draws['context_var_draw']
                        }
                        gen_model_parameters.append(param_dict)
                else:
                    gen_model_parameters = None

                sim_env = NormalCB(list_of_reward_betas=list_of_reward_betas,
                                   context_mean=context_mean,
                                   context_var=context_var,
                                   list_of_reward_vars=list_of_reward_vars)
                pre_simulated_data = sim_env.generate_mc_samples(
                    mc_replicates,
                    T,
                    n_patients=n_patients,
                    gen_model_params=gen_model_parameters)
                tuning_function_parameter = opt.bayesopt(
                    rollout.normal_cb_rollout_with_fixed_simulations,
                    policy,
                    tuning_function,
                    tuning_function_parameter,
                    T,
                    sim_env,
                    mc_replicates, {'pre_simulated_data': pre_simulated_data},
                    bounds,
                    explore_,
                    positive_zeta=positive_zeta)
                tuning_parameter_sequence.append(
                    [float(z) for z in tuning_function_parameter])
            else:
                # NOTE(review): linear_model_results is not defined in this
                # function; unless it is a module-level name this branch
                # raises NameError -- confirm.
                tuning_function_parameter = tuned_bandit.random_search(
                    tuned_bandit.oracle_rollout, policy, tuning_function,
                    tuning_function_parameter, linear_model_results, T, t,
                    estimated_context_mean, estimated_context_variance, env)

        for patient in range(n_patients):
            # Snapshot the context so regret is scored at the decision point.
            x = copy.copy(env.curr_context)
            beta_hat = np.array([
                env.posterior_params_dict[a]['beta_post']
                for a in range(env.number_of_actions)
            ])
            action = policy(beta_hat, env.sampling_cov_list, x,
                            tuning_function, tuning_function_parameter, T, t,
                            env)
            res = env.step(action)
            cumulative_regret += -env.regret(action, x)
            actions.append(action)
            u = res['Utility']
            rewards.append(u)
        print(beta_hat)

        # NOTE(review): this stops after the first time step regardless of T;
        # it looks like a debugging leftover -- confirm before removing.
        if t == 0:
            break
    return {
        'cumulative_regret': cumulative_regret,
        'zeta_sequence': tuning_parameter_sequence,
        'rewards': rewards,
        'actions': actions
    }

Example #4

Show file

def episode(label,
            tuning_function_parameter,
            n_patients=1,
            list_of_reward_betas=[[-10, 0.4, 0.4, -0.4],
                                  [-9.8, 0.6, 0.6, -0.4]],
            context_mean=np.array([0.0, 0.0, 0.0]),
            context_var=np.array([[1.0, 0, 0], [0, 1., 0], [0, 0, 1.]]),
            list_of_reward_vars=[1, 1],
            T=30):
    """Run an epsilon-greedy NormalCB episode with a fixed tuning parameter.

    The policy is ``tuned_bandit.linear_cb_epsilon_greedy_policy`` driven by
    ``tuned_bandit.expit_epsilon_decay``; no tuning is performed. The value
    of the tuning function at time 0 is printed once before the episode.

    Args:
        label: Accepted but not used inside this function.
        tuning_function_parameter: Passed unchanged to the tuning function
            and the policy.
        n_patients: Decisions taken per time step.
        list_of_reward_betas, context_mean, context_var, list_of_reward_vars:
            Forwarded to ``NormalCB`` (after a leading positional ``1``).
        T: Number of time steps.

    Returns:
        dict with 'cumulative_regret' (float), 'rewards' (list of float) and
        'actions' (list of int).
    """
    epsilon_schedule = tuned_bandit.expit_epsilon_decay
    choose_action = tuned_bandit.linear_cb_epsilon_greedy_policy

    bandit = NormalCB(1,
                      list_of_reward_betas=list_of_reward_betas,
                      context_mean=context_mean,
                      context_var=context_var,
                      list_of_reward_vars=list_of_reward_vars)
    print('epsilon', epsilon_schedule(T, 0, tuning_function_parameter))

    total_regret = 0.0
    reward_log = []
    action_log = []

    for step_index in range(T):
        # The original reads env.X every step without using it; the access
        # is retained in case it has side effects.
        _ = bandit.X

        for _patient in range(n_patients):
            # Copy the context so regret is scored against the pre-step value.
            context = copy.copy(bandit.curr_context)
            posterior_means = np.array([
                bandit.posterior_params_dict[arm]['beta_post']
                for arm in range(bandit.number_of_actions)
            ])
            chosen = choose_action(posterior_means, bandit.sampling_cov_list,
                                   context, epsilon_schedule,
                                   tuning_function_parameter, T, step_index,
                                   bandit)
            outcome = bandit.step(chosen)
            total_regret -= bandit.regret(chosen, context)
            action_log.append(int(chosen))
            reward_log.append(float(outcome['Utility']))

    return {
        'cumulative_regret': float(total_regret),
        'rewards': reward_log,
        'actions': action_log
    }