Example #1
0
def mHealth_rollout(tuning_function_parameter, policy, time_horizon, estimated_context_mean,
                    tuning_function, estimated_context_variance, env, nPatients, monte_carlo_reps):
  """
  Monte-Carlo evaluation of a contextual-bandit exploration policy.

  Simulates `monte_carlo_reps` episodes in a NormalCB environment built from
  the current parameter estimates of `env`, and returns the running mean of
  the per-episode score.

  :param tuning_function_parameter: parameter vector passed through to `tuning_function`
  :param policy: callable(beta_hat, sampling_cov_list, context, tuning_function,
                 tuning_function_parameter, time_horizon, time, env) -> action
  :param time_horizon: number of decision times per episode
  :param estimated_context_mean: mean of the context distribution for the rollout env
  :param tuning_function: exploration schedule, e.g. epsilon(T, t, param)
  :param estimated_context_variance: covariance of the context distribution
  :param env: real environment supplying beta_hat_list / sigma_hat_list estimates
  :param nPatients: number of patients treated at each decision time
  :param monte_carlo_reps: number of simulated episodes to average over
  :return: mean episode score; each step contributes (expected - optimal) reward,
           so the score is non-positive and HIGHER is better (negative regret).
  """
  score = 0
  # Rollout environment parameterized by the current estimates from `env`.
  rollout_env = NormalCB(list_of_reward_betas=env.beta_hat_list, list_of_reward_vars=env.sigma_hat_list,
                         context_mean=estimated_context_mean, context_var=estimated_context_variance)
  for rep in range(monte_carlo_reps):
    rollout_env.reset()
    episode_score = 0

    # Burn-in: 10 rounds of 5 pulls per arm so both arms have observations
    # before the policy is consulted.
    for _ in range(10):
      for _ in range(5):
        rollout_env.step(0)
      for _ in range(5):
        rollout_env.step(1)

    for time in range(time_horizon):
      beta_hat = rollout_env.beta_hat_list
      sampling_cov_list = rollout_env.sampling_cov_list
      for _ in range(nPatients):
        # Choose an action at the current context, then score it BEFORE
        # stepping (step draws a fresh context and updates estimates).
        action = policy(beta_hat, sampling_cov_list, rollout_env.curr_context, tuning_function,
                        tuning_function_parameter, time_horizon, time, env)
        expected_reward = rollout_env.expected_reward(action, rollout_env.curr_context)
        optimal_expected_reward = np.max([rollout_env.expected_reward(a, rollout_env.curr_context)
                                          for a in range(rollout_env.number_of_actions)])
        rollout_env.step(action)

        # Accumulate negative regret (<= 0 per step).
        episode_score += (expected_reward - optimal_expected_reward)

    # Incremental (running) mean over episodes; avoids storing all scores.
    score += (episode_score - score) / (rep + 1)
  return score
Example #2
0
def episode(policy_name,
            label,
            save=False,
            points_per_grid_dimension=50,
            monte_carlo_reps=100):
    """
    Run one mHealth contextual-bandit episode under the named policy and
    return the cumulative regret.

    :param policy_name: one of 'eps', 'eps-decay-fixed', 'eps-decay',
                        'greedy', 'worst', 'ts'
    :param label: integer replicate label; also used as the numpy RNG seed
    :param save: if True, dump {'t', 'regret'} to a timestamped YAML file
                 after each decision time (file is overwritten each time,
                 so it ends holding the final cumulative regret)
    :param points_per_grid_dimension: grid resolution passed to the tuner
    :param monte_carlo_reps: rollout replicates passed to the tuner
    :return: cumulative regret over the episode (float, >= 0)
    :raises ValueError: if policy_name is not recognized
    """
    if save:
        base_name = 'mhealth-{}-{}'.format(label, policy_name)
        prefix = os.path.join(project_dir, 'src', 'run', 'results', base_name)
        suffix = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
        filename = '{}_{}.yml'.format(prefix, suffix)

    np.random.seed(label)
    T = 10

    # ToDo: Create policy class that encapsulates this behavior
    if policy_name == 'eps':
        tuning_function = lambda a, b, c: 0.05  # Constant epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'eps-decay-fixed':
        tuning_function = lambda a, t, c: 0.5 / (t + 1)  # 1/t epsilon decay
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'eps-decay':
        tuning_function = tuned_bandit.stepwise_linear_epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = True
        tuning_function_parameter = np.ones(10) * 0.025
    elif policy_name == 'greedy':
        tuning_function = lambda a, b, c: 0.00  # Constant epsilon
        policy = tuned_bandit.linear_cb_epsilon_greedy_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'worst':
        tuning_function = lambda a, b, c: 0.00
        policy = ref.linear_cb_worst_policy
        tune = False
        tuning_function_parameter = None
    elif policy_name == 'ts':
        tuning_function = lambda a, b, c: 1.0  # No shrinkage
        policy = tuned_bandit.linear_cb_thompson_sampling_policy
        tune = False
        tuning_function_parameter = None
    # elif policy_name == 'ts-shrink':
    #   tuning_function = tuned_bandit.expit_truncate
    #   policy = tuned_bandit.thompson_sampling_policy
    #   tune = True
    #   tuning_function_parameter = np.array([-2, 1])
    else:
        raise ValueError('Incorrect policy name')

    env = NormalCB(
        list_of_reward_betas=[np.array([1.0, 1.0]),
                              np.array([2.0, -2.0])])
    cumulative_regret = 0.0
    nPatients = 10
    env.reset()

    # Burn-in: 10 rounds of 5 pulls per arm so both arms have observations
    # before the policy is consulted.
    for t in range(10):
        for j in range(5):
            env.step(0)
        for j in range(5):
            env.step(1)

    for t in range(T):
        X = env.X
        estimated_context_mean = np.mean(X, axis=0)
        estimated_context_variance = np.cov(X, rowvar=False)
        if tune:
            tuning_function_parameter = opt.bayesopt(
                rollout.mHealth_rollout, policy, tuning_function,
                tuning_function_parameter, T, estimated_context_mean,
                estimated_context_variance, env, nPatients,
                points_per_grid_dimension, monte_carlo_reps)
        # print('time {} epsilon {}'.format(t, tuning_function(T,t,tuning_function_parameter)))
        for j in range(nPatients):
            # Snapshot the context the action will be chosen for; env.step
            # advances curr_context, so all scoring below must use x.
            x = copy.copy(env.curr_context)

            beta_hat = env.beta_hat_list
            action = policy(beta_hat, env.sampling_cov_list, x,
                            tuning_function, tuning_function_parameter, T, t,
                            env)

            # Compute regret at the context x that the action was chosen for.
            # BUG FIX: previously evaluated env.curr_context AFTER env.step,
            # i.e. at the next patient's context, not the one acted upon.
            expected_rewards = [
                env.expected_reward(a, x)
                for a in range(env.number_of_actions)
            ]
            expected_reward_at_action = expected_rewards[action]
            optimal_expected_reward = np.max(expected_rewards)
            regret = optimal_expected_reward - expected_reward_at_action
            cumulative_regret += regret

            env.step(action)

        # Save results (overwrites the same file each decision time).
        if save:
            results = {'t': float(t), 'regret': float(cumulative_regret)}
            with open(filename, 'w') as outfile:
                yaml.dump(results, outfile)

    return cumulative_regret