# Standard-library / third-party imports used by the functions below; the
# project-specific names (NormalCB, NormalUniformCB, NormalMAB, policies,
# rollout, tuned_bandit, ref, opt, BayesianOptimization, project_dir) are
# assumed to be importable from the surrounding package.
import copy
import datetime
import os

import numpy as np
import yaml


def fixed_pulls(num_initial_pulls):
  np.random.seed(num_initial_pulls)
  env = NormalCB(num_initial_pulls,
                 list_of_reward_betas=[[-10, 0.4, 0.4, -0.4], [-9.8, 0.6, 0.6, -0.4]],
                 context_mean=np.array([0.0, 0.0, 0.0]),
                 context_var=np.array([[1.0, 0, 0], [0, 1., 0], [0, 0, 1.]]),
                 list_of_reward_vars=[1, 1])
  p = bayes_optimize_zeta(num_initial_pulls, T=30, mc_rep=1000,
                          list_of_reward_betas=env.beta_hat_list,
                          context_mean=env.estimated_context_mean[1:],
                          context_var=env.estimated_context_cov[1:, 1:],
                          list_of_reward_vars=np.array(env.sigma_hat_list) ** 2)
  return {'num_initial_pulls': num_initial_pulls,
          'theta_opt': [float(param) for param in p]}
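
# Hypothetical driver (illustrative only): one way to sweep fixed_pulls over
# several initial-pull counts and collect the tuned zeta parameters keyed by
# pull count. The helper name and the serial loop are assumptions, not project
# conventions.
def sweep_fixed_pulls(pull_counts=(5, 10, 15, 20, 25)):
  results = {}
  for n in pull_counts:
    res = fixed_pulls(n)
    results[res['num_initial_pulls']] = res['theta_opt']
  return results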

def bayes_optimize_zeta(seed, num_initial_pulls=5, mc_rep=1000, T=50,
                        list_of_reward_betas=[[-10, 0.4, 0.4, -0.4], [-9.8, 0.6, 0.6, -0.4]],
                        context_mean=np.array([0.0, 0.0, 0.0]), context_var=np.eye(3),
                        list_of_reward_vars=[1.0, 1.0]):
  np.random.seed(seed)
  sim_env = NormalCB(num_initial_pulls, list_of_reward_betas=list_of_reward_betas, context_mean=context_mean,
                     context_var=context_var, list_of_reward_vars=list_of_reward_vars)
  # Alternative simulation environments explored previously:
  # env = NormalMAB(list_of_reward_mus=[0, 1], list_of_reward_vars=[1, 140])
  # env = NormalMAB(list_of_reward_mus=[0.3, 0.6], list_of_reward_vars=[1**2, 1**2])
  # X = env.X
  # estimated_context_mean = np.mean(X, axis=0)
  # estimated_context_variance = np.cov(X, rowvar=False)
  # estimated_context_bounds = (np.min(X), np.max(X))
  # sim_env = NormalUniformCB(list_of_reward_betas=env.list_of_reward_betas, list_of_reward_vars=env.list_of_reward_vars,
  #                           context_bounds=env.context_bounds)
  # sim_env = NormalCB(list_of_reward_betas=[[-10, 0.4, 0.4, -0.4], [-9.8, 0.6, 0.6, -0.4]],
  #                    context_mean=np.array([0.0, 0.0, 0.0]),
  #                    context_var=np.array([[1.0, 0, 0], [0, 1., 0], [0, 0, 1.]]), list_of_reward_vars=[1, 1])
  # sim_env = NormalMAB(list_of_reward_mus=env.list_of_reward_mus, list_of_reward_vars=env.list_of_reward_vars)

  pre_simulated_data = sim_env.generate_mc_samples(mc_rep, T)
  rollout_function_kwargs = {'pre_simulated_data': pre_simulated_data}

  # def objective(zeta0, zeta1, zeta2, zeta3, zeta4, zeta5, zeta6, zeta7, zeta8, zeta9):
  #   zeta = np.array([zeta0, zeta1, zeta2, zeta3, zeta4, zeta5, zeta6, zeta7, zeta8, zeta9])
  def objective(zeta0, zeta1, zeta2):
    zeta = np.array([zeta0, zeta1, zeta2])
    # return rollout.mab_rollout_with_fixed_simulations(zeta, policies.mab_frequentist_ts_policy, T,
    #                                                   policies.expit_epsilon_decay, sim_env,
    #                                                   **rollout_function_kwargs)
    return rollout.normal_cb_rollout_with_fixed_simulations(zeta, policies.linear_cb_epsilon_greedy_policy, T,
                                                            policies.expit_epsilon_decay, sim_env,
                                                            **rollout_function_kwargs)

  # bounds = {'zeta{}'.format(i): (0.0, 1.0) for i in range(10)}
  # explore_ = {'zeta{}'.format(i): [0.0] for i in range(10)}
  explore_ = {'zeta0': [1.0, 0.05, 1.0, 0.1], 'zeta1': [50.0, 49.0, 1.0, 49.0], 'zeta2': [0.1, 2.5, 1.0, 2.5]}
  bounds = {'zeta0': (0.8, 2.0), 'zeta1': (1.0, 49.0), 'zeta2': (0.01, 2.5)}
  bo = BayesianOptimization(objective, bounds)
  bo.explore(explore_)
  bo.maximize(init_points=10, n_iter=20, alpha=1e-4)
  best_param = bo.res['max']['max_params']
  best_param = np.array([best_param['zeta{}'.format(i)] for i in range(3)])
  return best_param
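
# Intuition sketch (assumption): the three zetas tuned above parameterize
# policies.expit_epsilon_decay, an expit-shaped exploration schedule. A common
# parameterization treats zeta0 as the maximum exploration probability, zeta1
# as the time offset at which the schedule is centered, and zeta2 as the decay
# rate; the project's actual function may differ in detail.
from scipy.special import expit  # logistic sigmoid


def example_expit_epsilon_decay(T, t, zeta):
  zeta0, zeta1, zeta2 = zeta
  return zeta0 * expit(zeta2 * (T - t - zeta1))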

def mHealth_rollout(tuning_function_parameter, policy, time_horizon, estimated_context_mean, tuning_function,
                    estimated_context_variance, env, nPatients, monte_carlo_reps):
  score = 0
  rollout_env = NormalCB(list_of_reward_betas=env.beta_hat_list, list_of_reward_vars=env.sigma_hat_list,
                         context_mean=estimated_context_mean, context_var=estimated_context_variance)
  for rep in range(monte_carlo_reps):
    rollout_env.reset()
    episode_score = 0

    # Initial assignments
    for t in range(10):
      for j in range(5):
        rollout_env.step(0)
      for j in range(5):
        rollout_env.step(1)

    for time in range(time_horizon):
      beta_hat = rollout_env.beta_hat_list
      sampling_cov_list = rollout_env.sampling_cov_list
      for j in range(nPatients):
        # Draw context and take action
        # context = context_sequence[time - current_time][j]
        action = policy(beta_hat, sampling_cov_list, rollout_env.curr_context, tuning_function,
                        tuning_function_parameter, time_horizon, time, env)
        expected_reward = rollout_env.expected_reward(action, rollout_env.curr_context)
        optimal_expected_reward = np.max([rollout_env.expected_reward(a, rollout_env.curr_context)
                                          for a in range(rollout_env.number_of_actions)])
        rollout_env.step(action)

        # Update regret
        regret = (expected_reward - optimal_expected_reward)
        episode_score += regret

    print(rep)
    score += (episode_score - score) / (rep + 1)
  return score
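
# mHealth_rollout averages Monte Carlo replicates with an incremental mean,
# score += (episode_score - score) / (rep + 1). A small standalone check of
# that update rule (hypothetical helper, for illustration only):
def _running_mean_check(values=(1.0, 4.0, 7.0)):
  mean = 0.0
  for i, v in enumerate(values):
    mean += (v - mean) / (i + 1)
  assert abs(mean - sum(values) / len(values)) < 1e-12
  return mean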

def episode(policy_name, label, n_patients=15,
            list_of_reward_betas=[[-10, 0.4, 0.4, -0.4], [-9.8, 0.6, 0.6, -0.4]],
            context_mean=np.array([0.0, 0.0, 0.0]),
            context_var=np.array([[1.0, 0, 0], [0, 1., 0], [0, 0, 1.]]),
            list_of_reward_vars=[1, 1], T=50, mc_replicates=1000, pre_simulate=True):
  np.random.seed(label)

  # ToDo: Create policy class that encapsulates this behavior
  posterior_sample = True
  bootstrap_posterior = False
  positive_zeta = False
  if policy_name == 'eps':
    tuning_function = lambda a, b, c: 0.1  # Constant epsilon
    policy = tuned_bandit.linear_cb_epsilon_greedy_policy
    tune = False
    tuning_function_parameter = None
  elif policy_name == 'random':
    tuning_function = lambda a, b, c: 1.0  # Constant epsilon
    policy = tuned_bandit.linear_cb_epsilon_greedy_policy
    tune = False
    tuning_function_parameter = None
  elif policy_name == 'eps-decay-fixed':
    tuning_function = tuned_bandit.expit_epsilon_decay
    policy = tuned_bandit.linear_cb_epsilon_greedy_policy
    tune = False
    tuning_function_parameter = np.array([0.8, 46.38, 1.857])
  elif policy_name == 'eps-decay':
    tuning_function = tuned_bandit.expit_epsilon_decay
    policy = tuned_bandit.linear_cb_epsilon_greedy_policy
    tune = True
    explore_ = {'zeta0': [1.0, 0.05, 1.0, 0.1], 'zeta1': [30.0, 0.0, 1.0, 0.0], 'zeta2': [0.1, 1.0, 0.01, 1.0]}
    bounds = {'zeta0': (0.025, 2.0), 'zeta1': (0.0, 30.0), 'zeta2': (0.01, 2)}
    tuning_function_parameter = np.array([0.05, 1.0, 0.01])
    posterior_sample = True
  elif policy_name == 'greedy':
    tuning_function = lambda a, b, c: 0.00  # Constant epsilon
    policy = tuned_bandit.linear_cb_epsilon_greedy_policy
    tune = False
    tuning_function_parameter = None
  elif policy_name == 'worst':
    tuning_function = lambda a, b, c: 0.00
    policy = ref.linear_cb_worst_policy
    tune = False
    tuning_function_parameter = None
  elif policy_name == 'ts':
    tuning_function = lambda a, b, c: 1.0  # No shrinkage
    policy = tuned_bandit.linear_cb_thompson_sampling_policy
    tune = False
    tuning_function_parameter = None
  elif policy_name == 'ts-decay-posterior-sample':
    tuning_function = tuned_bandit.stepwise_linear_epsilon
    policy = tuned_bandit.linear_cb_thompson_sampling_policy
    tune = True
    tuning_function_parameter = np.ones(10) * 0.1
    posterior_sample = True
  elif policy_name == 'ts-decay-bootstrap-sample':
    tuning_function = tuned_bandit.stepwise_linear_epsilon
    policy = tuned_bandit.linear_cb_thompson_sampling_policy
    tune = True
    tuning_function_parameter = np.ones(10) * 0.1
    posterior_sample = True
    bootstrap_posterior = True
  elif policy_name == 'ts-decay':
    tuning_function = tuned_bandit.stepwise_linear_epsilon
    policy = tuned_bandit.linear_cb_thompson_sampling_policy
    tune = True
    tuning_function_parameter = np.ones(10) * 0.1
  elif policy_name == 'ucb-tune-posterior-sample':
    tuning_function = tuned_bandit.stepwise_linear_epsilon
    policy = tuned_bandit.linear_cb_ucb_policy
    tune = True
    tuning_function_parameter = np.ones(10) * 0.025
    posterior_sample = True
  # elif policy_name == 'ts-shrink':
  #   tuning_function = tuned_bandit.expit_truncate
  #   policy = tuned_bandit.thompson_sampling_policy
  #   tune = True
  #   tuning_function_parameter = np.array([-2, 1])
  else:
    raise ValueError('Incorrect policy name')

  env = NormalCB(list_of_reward_betas=list_of_reward_betas, context_mean=context_mean, context_var=context_var,
                 list_of_reward_vars=list_of_reward_vars)
  # env = NormalUniformCB(list_of_reward_betas=[np.ones(10) + 0.05, np.ones(10)], list_of_reward_vars=[0.01, 25])
  cumulative_regret = 0.0
  # env.reset()
  tuning_parameter_sequence = []
  rewards = []
  actions = []

  # Using pre-simulated data
  # data_for_episode = env.generate_mc_samples(1, T)
  # rep_dict = data_for_episode[0]
  # initial_linear_model = rep_dict['initial_linear_model']
  # beta_hat_list = initial_linear_model['beta_hat_list']
  # Xprime_X_list = initial_linear_model['Xprime_X_list']
  # Xprime_X_inv_list = initial_linear_model['Xprime_X_inv_list']
  # X_list = initial_linear_model['X_list']
  # y_list = initial_linear_model['y_list']
  # X_dot_y_list = initial_linear_model['X_dot_y_list']
  # sampling_cov_list = initial_linear_model['sampling_cov_list']
  # sigma_hat_list = initial_linear_model['sigma_hat_list']
  # context_sequence = rep_dict['contexts']
  # regrets_sequence = rep_dict['regrets']
  # rewards_sequence = rep_dict['rewards']

  for t in range(T):
    X = env.X
    estimated_context_mean = np.mean(X, axis=0)
    estimated_context_variance = np.cov(X, rowvar=False)
    estimated_context_bounds = (np.min(X), np.max(X[:, 1:]))

    if tune:
      if pre_simulate:
        if posterior_sample:
          # Draw generative-model parameters from the posterior for the tuning rollouts
          gen_model_parameters = []
          for rep in range(mc_replicates):
            if bootstrap_posterior:
              pass
            else:
              draws = env.sample_from_posterior()
              # draws = env.sample_from_sampling_dist()
            betas_for_each_action = []
            vars_for_each_action = []
            for a in range(env.number_of_actions):
              beta_a = draws[a]['beta_draw']
              var_a = draws[a]['var_draw']
              betas_for_each_action.append(beta_a)
              vars_for_each_action.append(var_a)
            param_dict = {'reward_betas': betas_for_each_action,
                          'reward_vars': vars_for_each_action,
                          'context_mean': draws['context_mu_draw'],
                          'context_var': draws['context_var_draw']}
            # 'context_max': draws['context_max']}
            gen_model_parameters.append(param_dict)
        else:
          gen_model_parameters = None

        # sim_env = NormalUniformCB(list_of_reward_betas=env.beta_hat_list, list_of_reward_vars=env.sigma_hat_list,
        #                           context_bounds=estimated_context_bounds)
        sim_env = NormalCB(list_of_reward_betas=list_of_reward_betas, context_mean=context_mean,
                           context_var=context_var, list_of_reward_vars=list_of_reward_vars)
        pre_simulated_data = sim_env.generate_mc_samples(mc_replicates, T, n_patients=n_patients,
                                                         gen_model_params=gen_model_parameters)
        tuning_function_parameter = opt.bayesopt(rollout.normal_cb_rollout_with_fixed_simulations, policy,
                                                 tuning_function, tuning_function_parameter, T, sim_env,
                                                 mc_replicates, {'pre_simulated_data': pre_simulated_data},
                                                 bounds, explore_, positive_zeta=positive_zeta)
        tuning_parameter_sequence.append([float(z) for z in tuning_function_parameter])
      else:
        tuning_function_parameter = tuned_bandit.random_search(tuned_bandit.oracle_rollout, policy, tuning_function,
                                                               tuning_function_parameter, linear_model_results, T, t,
                                                               estimated_context_mean, estimated_context_variance,
                                                               env)

    for patient in range(n_patients):
      x = copy.copy(env.curr_context)
      beta_hat = np.array([env.posterior_params_dict[a]['beta_post'] for a in range(env.number_of_actions)])
      # print(env.posterior_params_dict)
      action = policy(beta_hat, env.sampling_cov_list, x, tuning_function, tuning_function_parameter, T, t, env)
      res = env.step(action)
      cumulative_regret += -env.regret(action, x)
      actions.append(action)
      u = res['Utility']
      rewards.append(u)

    print(beta_hat)
    if t == 0:  # stop after the first time step
      break

  return {'cumulative_regret': cumulative_regret, 'zeta_sequence': tuning_parameter_sequence,
          'rewards': rewards, 'actions': actions}
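
# Hypothetical replication harness (illustrative only): run the tuning episode
# above over several random-seed labels for one policy and report the mean
# cumulative regret along with each replicate's zeta sequence. The helper name
# and defaults are assumptions.
def replicate_episode(policy_name='eps-decay', n_replicates=4, T=50):
  regrets, zeta_sequences = [], []
  for label in range(n_replicates):
    res = episode(policy_name, label, T=T)
    regrets.append(res['cumulative_regret'])
    zeta_sequences.append(res['zeta_sequence'])
  return {'mean_cumulative_regret': float(np.mean(regrets)),
          'zeta_sequences': zeta_sequences}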

def episode(policy_name, label, save=False, points_per_grid_dimension=50, monte_carlo_reps=100):
  if save:
    base_name = 'mhealth-{}-{}'.format(label, policy_name)
    prefix = os.path.join(project_dir, 'src', 'run', 'results', base_name)
    suffix = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
    filename = '{}_{}.yml'.format(prefix, suffix)

  np.random.seed(label)
  T = 10

  # ToDo: Create policy class that encapsulates this behavior
  if policy_name == 'eps':
    tuning_function = lambda a, b, c: 0.05  # Constant epsilon
    policy = tuned_bandit.linear_cb_epsilon_greedy_policy
    tune = False
    tuning_function_parameter = None
  elif policy_name == 'eps-decay-fixed':
    tuning_function = lambda a, t, c: 0.5 / (t + 1)
    policy = tuned_bandit.linear_cb_epsilon_greedy_policy
    tune = False
    tuning_function_parameter = None
  elif policy_name == 'eps-decay':
    tuning_function = tuned_bandit.stepwise_linear_epsilon
    policy = tuned_bandit.linear_cb_epsilon_greedy_policy
    tune = True
    tuning_function_parameter = np.ones(10) * 0.025
  elif policy_name == 'greedy':
    tuning_function = lambda a, b, c: 0.00  # Constant epsilon
    policy = tuned_bandit.linear_cb_epsilon_greedy_policy
    tune = False
    tuning_function_parameter = None
  elif policy_name == 'worst':
    tuning_function = lambda a, b, c: 0.00
    policy = ref.linear_cb_worst_policy
    tune = False
    tuning_function_parameter = None
  elif policy_name == 'ts':
    tuning_function = lambda a, b, c: 1.0  # No shrinkage
    policy = tuned_bandit.linear_cb_thompson_sampling_policy
    tune = False
    tuning_function_parameter = None
  # elif policy_name == 'ts-shrink':
  #   tuning_function = tuned_bandit.expit_truncate
  #   policy = tuned_bandit.thompson_sampling_policy
  #   tune = True
  #   tuning_function_parameter = np.array([-2, 1])
  else:
    raise ValueError('Incorrect policy name')

  env = NormalCB(list_of_reward_betas=[np.array([1.0, 1.0]), np.array([2.0, -2.0])])
  cumulative_regret = 0.0
  nPatients = 10
  env.reset()

  # Initial assignments
  for t in range(10):
    for j in range(5):
      env.step(0)
    for j in range(5):
      env.step(1)

  for t in range(T):
    X = env.X
    estimated_context_mean = np.mean(X, axis=0)
    estimated_context_variance = np.cov(X, rowvar=False)

    if tune:
      tuning_function_parameter = opt.bayesopt(rollout.mHealth_rollout, policy, tuning_function,
                                               tuning_function_parameter, T, estimated_context_mean,
                                               estimated_context_variance, env, nPatients,
                                               points_per_grid_dimension, monte_carlo_reps)
    # print('time {} epsilon {}'.format(t, tuning_function(T, t, tuning_function_parameter)))

    for j in range(nPatients):
      x = copy.copy(env.curr_context)
      beta_hat = env.beta_hat_list
      action = policy(beta_hat, env.sampling_cov_list, x, tuning_function, tuning_function_parameter, T, t, env)
      env.step(action)

      # Compute regret
      expected_rewards = [env.expected_reward(a, env.curr_context) for a in range(env.number_of_actions)]
      expected_reward_at_action = expected_rewards[action]
      optimal_expected_reward = np.max(expected_rewards)
      regret = optimal_expected_reward - expected_reward_at_action
      cumulative_regret += regret

    # Save results
    if save:
      results = {'t': float(t), 'regret': float(cumulative_regret)}
      with open(filename, 'w') as outfile:
        yaml.dump(results, outfile)

  return cumulative_regret
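
# Hypothetical helper (illustrative only): read back the yaml files written by
# the mHealth episode above and collect the saved cumulative regrets. The
# 'mhealth-*' filename pattern mirrors the save logic above; the results
# directory argument and the helper name are assumptions.
import glob


def load_mhealth_results(results_dir):
  regrets = []
  for fname in glob.glob(os.path.join(results_dir, 'mhealth-*.yml')):
    with open(fname) as infile:
      res = yaml.safe_load(infile)
    regrets.append(res['regret'])
  return regrets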

def episode(label, tuning_function_parameter, n_patients=1,
            list_of_reward_betas=[[-10, 0.4, 0.4, -0.4], [-9.8, 0.6, 0.6, -0.4]],
            context_mean=np.array([0.0, 0.0, 0.0]),
            context_var=np.array([[1.0, 0, 0], [0, 1., 0], [0, 0, 1.]]),
            list_of_reward_vars=[1, 1], T=30):
  tuning_function = tuned_bandit.expit_epsilon_decay
  policy = tuned_bandit.linear_cb_epsilon_greedy_policy
  env = NormalCB(1, list_of_reward_betas=list_of_reward_betas, context_mean=context_mean, context_var=context_var,
                 list_of_reward_vars=list_of_reward_vars)
  # env = NormalUniformCB(list_of_reward_betas=[np.ones(10) + 0.05, np.ones(10)], list_of_reward_vars=[0.01, 25])
  cumulative_regret = 0.0
  # env.reset()
  print('epsilon', tuning_function(T, 0, tuning_function_parameter))
  tuning_parameter_sequence = []
  rewards = []
  actions = []

  # Using pre-simulated data
  # data_for_episode = env.generate_mc_samples(1, T)
  # rep_dict = data_for_episode[0]
  # initial_linear_model = rep_dict['initial_linear_model']
  # beta_hat_list = initial_linear_model['beta_hat_list']
  # Xprime_X_list = initial_linear_model['Xprime_X_list']
  # Xprime_X_inv_list = initial_linear_model['Xprime_X_inv_list']
  # X_list = initial_linear_model['X_list']
  # y_list = initial_linear_model['y_list']
  # X_dot_y_list = initial_linear_model['X_dot_y_list']
  # sampling_cov_list = initial_linear_model['sampling_cov_list']
  # sigma_hat_list = initial_linear_model['sigma_hat_list']
  # context_sequence = rep_dict['contexts']
  # regrets_sequence = rep_dict['regrets']
  # rewards_sequence = rep_dict['rewards']

  for t in range(T):
    X = env.X
    for patient in range(n_patients):
      x = copy.copy(env.curr_context)
      beta_hat = np.array([env.posterior_params_dict[a]['beta_post'] for a in range(env.number_of_actions)])
      # print(env.posterior_params_dict)
      action = policy(beta_hat, env.sampling_cov_list, x, tuning_function, tuning_function_parameter, T, t, env)
      res = env.step(action)
      cumulative_regret += -env.regret(action, x)
      actions.append(int(action))
      u = res['Utility']
      rewards.append(float(u))

  return {'cumulative_regret': float(cumulative_regret), 'rewards': rewards, 'actions': actions}
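
# Hypothetical evaluator (illustrative only), in the spirit of the commented-out
# evaluation loop below: average the cumulative regret of a fixed zeta over
# independent replicates of the episode above, with a simple standard-error
# estimate. The helper name and defaults are assumptions.
def evaluate_fixed_zeta(zeta, n_replicates=96, T=30):
  regrets = [episode(label, zeta, T=T)['cumulative_regret'] for label in range(n_replicates)]
  return {'mean_regret': float(np.mean(regrets)),
          'se_regret': float(np.std(regrets) / np.sqrt(n_replicates))}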

# for i in range(96):
#   tuning_function_parameter = doc['zeta_sequences'][i][t]
#   # print(i, t, tuning_function_parameter)
#   res = episode('eps-decay-fixed', 0, tuning_function_parameter=tuning_function_parameter, T=50)
#   # print(i, t, res['cumulative_regret'])
#   cumulative_regret += (res['cumulative_regret'] - cumulative_regret) / (i + 1)
#   cumulative_regret_se = np.append(cumulative_regret_se, cumulative_regret)
# cumulative_regret_different_t = np.append(cumulative_regret_different_t, cumulative_regret)
# cumulative_regret_se_different_t = np.append(cumulative_regret_se_different_t,
#                                              np.std(cumulative_regret_se) / np.sqrt(96))
# print(t, cumulative_regret_different_t, cumulative_regret_se_different_t)

best_para = dict()
for N in [5, 10, 15, 20, 25]:
  env = NormalCB(num_initial_pulls=N,
                 list_of_reward_betas=[[-10, 0.4, 0.4, -0.4], [-9.8, 0.6, 0.6, -0.4]],
                 context_mean=np.array([0.0, 0.0, 0.0]),
                 context_var=np.array([[1.0, 0, 0], [0, 1., 0], [0, 0, 1.]]),
                 list_of_reward_vars=[1, 1])
  sigma_sq_hat_list = [env.sigma_hat_list[a] ** 2 for a in range(2)]
  p = bayes_optimize_zeta(0, num_initial_pulls=N, list_of_reward_betas=env.beta_hat_list,
                          context_mean=np.mean(env.X[:, -3:], axis=0),
                          context_var=np.cov(env.X[:, -3:], rowvar=False),
                          list_of_reward_vars=sigma_sq_hat_list, mc_rep=1000, T=50)
  print(p)
  best_para[str(N)] = p
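
# Hypothetical persistence step (illustrative only): dump the tuned parameters
# to yaml so they can be reloaded later, e.g. as 'eps-decay-fixed' settings.
# The output filename is arbitrary.
with open('best_zeta_by_num_initial_pulls.yml', 'w') as outfile:
  yaml.dump({k: [float(z) for z in v] for k, v in best_para.items()}, outfile)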