import copy
import datetime
import os

import numpy as np
import yaml

# NormalCB (the contextual-bandit environment), tuned_bandit, ref, opt,
# rollout, and project_dir are defined elsewhere in this repo; their import
# paths are omitted here.


def mHealth_rollout(tuning_function_parameter, policy, time_horizon, estimated_context_mean,
                    tuning_function, estimated_context_variance, env, nPatients, monte_carlo_reps):
    """Monte Carlo rollout of `policy` in an environment fit to the observed data.

    Returns the running mean over reps of the episode score, where each episode
    score accumulates (expected reward - optimal expected reward), i.e. the
    negative regret; higher is better.
    """
    score = 0
    rollout_env = NormalCB(list_of_reward_betas=env.beta_hat_list,
                           list_of_reward_vars=env.sigma_hat_list,
                           context_mean=estimated_context_mean,
                           context_var=estimated_context_variance)

    for rep in range(monte_carlo_reps):
        rollout_env.reset()
        episode_score = 0

        # Initial assignments: alternate blocks of each action to seed the model fits.
        for t in range(10):
            for j in range(5):
                rollout_env.step(0)
            for j in range(5):
                rollout_env.step(1)

        for time in range(time_horizon):
            beta_hat = rollout_env.beta_hat_list
            sampling_cov_list = rollout_env.sampling_cov_list
            for j in range(nPatients):
                # Take an action at the current context.
                action = policy(beta_hat, sampling_cov_list, rollout_env.curr_context,
                                tuning_function, tuning_function_parameter, time_horizon,
                                time, env)
                expected_reward = rollout_env.expected_reward(action, rollout_env.curr_context)
                optimal_expected_reward = np.max([rollout_env.expected_reward(a, rollout_env.curr_context)
                                                  for a in range(rollout_env.number_of_actions)])
                rollout_env.step(action)

                # Accumulate negative regret (<= 0; equals 0 iff the action was optimal).
                regret = expected_reward - optimal_expected_reward
                episode_score += regret

        print(rep)  # Progress logging
        # Incremental update of the mean score across Monte Carlo reps.
        score += (episode_score - score) / (rep + 1)

    return score
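# Both functions in this file assume a NormalCB environment object. The
# Protocol below is a hypothetical summary of that interface, inferred from
# the attribute and method accesses in this file; it is not the repo's actual
# class definition and is included only as documentation.
from typing import List, Protocol


class _BanditEnv(Protocol):
    beta_hat_list: List[np.ndarray]      # per-action regression coefficient estimates
    sigma_hat_list: List[float]          # per-action reward-variance estimates
    sampling_cov_list: List[np.ndarray]  # per-action sampling covariances of beta_hat
    curr_context: np.ndarray             # context for the current decision
    X: np.ndarray                        # observed context design matrix
    number_of_actions: int

    def reset(self) -> None: ...
    def step(self, action: int) -> None: ...
    def expected_reward(self, action: int, context: np.ndarray) -> float: ...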
def episode(policy_name, label, save=False, points_per_grid_dimension=50, monte_carlo_reps=100):
    if save:
        base_name = 'mhealth-{}-{}'.format(label, policy_name)
        prefix = os.path.join(project_dir, 'src', 'run', 'results', base_name)
        suffix = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
        filename = '{}_{}.yml'.format(prefix, suffix)

    np.random.seed(label)
    T = 10

    # ToDo: Create policy class that encapsulates this behavior
    if policy_name == 'eps':
        tuning_function = lambda a, b, c: 0.05  # Constant epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'eps-decay-fixed':
        tuning_function = lambda a, t, c: 0.5 / (t + 1)  # Fixed epsilon-decay schedule
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'eps-decay':
        tuning_function = tuned_bandit.stepwise_linear_epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = True
        tuning_function_parameter = np.ones(10) * 0.025
    elif policy_name == 'greedy':
        tuning_function = lambda a, b, c: 0.00  # Constant (zero) epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'worst':
        tuning_function = lambda a, b, c: 0.00
        policy = ref.linear_cb_worst_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'ts':
        tuning_function = lambda a, b, c: 1.0  # No shrinkage
        policy = tuned_bandit.linear_cb_thompson_sampling_policy
        tune = False
        tuning_function_parameter = None
    # elif policy_name == 'ts-shrink':
    #     tuning_function = tuned_bandit.expit_truncate
    #     policy = tuned_bandit.thompson_sampling_policy
    #     tune = True
    #     tuning_function_parameter = np.array([-2, 1])
    else:
        raise ValueError('Incorrect policy name')

    env = NormalCB(list_of_reward_betas=[np.array([1.0, 1.0]), np.array([2.0, -2.0])])
    cumulative_regret = 0.0
    nPatients = 10
    env.reset()

    # Initial assignments: alternate blocks of each action to seed the model fits.
    for t in range(10):
        for j in range(5):
            env.step(0)
        for j in range(5):
            env.step(1)

    for t in range(T):
        X = env.X
        estimated_context_mean = np.mean(X, axis=0)
        estimated_context_variance = np.cov(X, rowvar=False)

        if tune:
            tuning_function_parameter = opt.bayesopt(rollout.mHealth_rollout, policy, tuning_function,
                                                     tuning_function_parameter, T, estimated_context_mean,
                                                     estimated_context_variance, env, nPatients,
                                                     points_per_grid_dimension, monte_carlo_reps)
        # print('time {} epsilon {}'.format(t, tuning_function(T, t, tuning_function_parameter)))

        for j in range(nPatients):
            x = copy.copy(env.curr_context)
            beta_hat = env.beta_hat_list
            action = policy(beta_hat, env.sampling_cov_list, x, tuning_function,
                            tuning_function_parameter, T, t, env)
            env.step(action)

            # Compute regret at the context x the action was taken in;
            # env.curr_context has already advanced after env.step, so use the
            # copy made above (this mirrors the ordering in mHealth_rollout).
            expected_rewards = [env.expected_reward(a, x) for a in range(env.number_of_actions)]
            expected_reward_at_action = expected_rewards[action]
            optimal_expected_reward = np.max(expected_rewards)
            regret = optimal_expected_reward - expected_reward_at_action
            cumulative_regret += regret

        # Save results (overwrites the checkpoint file at each time step).
        if save:
            results = {'t': float(t), 'regret': float(cumulative_regret)}
            with open(filename, 'w') as outfile:
                yaml.dump(results, outfile)

    return cumulative_regret
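# A minimal usage sketch (assumes the project modules referenced above are
# importable): run one episode with constant epsilon-greedy and print the
# cumulative regret. The label doubles as the random seed.
if __name__ == '__main__':
    cumulative_regret = episode('eps', label=0, save=False, monte_carlo_reps=10)
    print('cumulative regret: {}'.format(cumulative_regret))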