def process(i: int):
    """Run one off-policy-evaluation trial on synthetic data seeded with ``i``.

    The trial builds a ``SyntheticBanditDataset`` with ``random_state=i``,
    fits an ``IPWLearner`` on a training batch, cross-fits a
    ``RegressionModel`` on the test batch, and scores the OPE estimators
    against the ground-truth policy value computed from the test batch's
    expected rewards.

    Settings (``n_actions``, ``dim_context``, ``n_rounds``,
    ``base_model_dict``, ``hyperparams``, ``random_state``,
    ``ope_estimators``, ...) are read from the enclosing scope.

    Returns
    -------
    The value returned by ``ope.evaluate_performance_of_estimators``
    (described as the relative estimation error, relative-ee, by the
    original comments).
    """
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )

    # evaluation policy: an IPWLearner wrapping the configured base classifier
    policy_model_name = base_model_for_evaluation_policy
    base_classifier = base_model_dict[policy_model_name](
        **hyperparams[policy_model_name]
    )
    evaluation_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_classifier,
    )

    # draw the training batch first, then the test batch
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)

    # train the evaluation policy on the training batch
    train_inputs = {
        key: bandit_feedback_train[key]
        for key in ("context", "action", "reward", "pscore")
    }
    evaluation_policy.fit(**train_inputs)

    # action decisions of the trained policy on the test batch
    action_dist = evaluation_policy.predict(
        context=bandit_feedback_test["context"],
    )

    # estimate the mean reward function of the test batch with an ML model
    reg_model_name = base_model_for_reg_model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[reg_model_name](**hyperparams[reg_model_name]),
    )
    test_inputs = {
        key: bandit_feedback_test[key] for key in ("context", "action", "reward")
    }
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        n_folds=3,  # 3-fold cross-fitting
        random_state=random_state,
        **test_inputs,
    )

    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback_test,
        ope_estimators=ope_estimators,
    )
    ground_truth = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=action_dist,
    )
    return ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
def test_synthetic_obtain_batch_bandit_feedback():
    """Check ``n_rounds`` validation and the layout of the generated feedback.

    The first part expects ``obtain_batch_bandit_feedback`` to raise
    ``ValueError`` for ``n_rounds=0`` and for ``n_rounds="3"``.  The second
    part samples 10 rounds with 5 actions and checks the size of every entry
    of the returned dictionary.
    """
    # n_rounds
    # The dataset is created outside ``pytest.raises`` so that only the call
    # under test can satisfy the expected exception; an error raised by the
    # constructor now fails the test instead of being mistaken for a pass.
    dataset = SyntheticBanditDataset(n_actions=2)
    with pytest.raises(ValueError):
        dataset.obtain_batch_bandit_feedback(n_rounds=0)

    dataset = SyntheticBanditDataset(n_actions=2)
    with pytest.raises(ValueError):
        dataset.obtain_batch_bandit_feedback(n_rounds="3")

    # bandit feedback
    n_rounds = 10
    n_actions = 5
    dataset = SyntheticBanditDataset(n_actions=n_actions)
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)

    assert bandit_feedback["n_rounds"] == n_rounds
    assert bandit_feedback["n_actions"] == n_actions

    # One condition per assert so a failure names the exact mismatch.
    assert bandit_feedback["context"].shape[0] == n_rounds  # n_rounds
    assert bandit_feedback["context"].shape[1] == 1  # default dim_context

    assert bandit_feedback["action_context"].shape[0] == n_actions
    assert bandit_feedback["action_context"].shape[1] == n_actions

    # per-round vectors: one-dimensional with one entry per round
    for key in ("action", "position", "reward"):
        assert bandit_feedback[key].ndim == 1, key
        assert len(bandit_feedback[key]) == n_rounds, key

    assert bandit_feedback["expected_reward"].shape[0] == n_rounds
    assert bandit_feedback["expected_reward"].shape[1] == n_actions

    assert bandit_feedback["pscore"].ndim == 1
    assert len(bandit_feedback["pscore"]) == n_rounds
def process(i: int):
    """Run one OPE trial on data logged by a uniformly random behaviour policy.

    A ``SyntheticBanditDataset`` seeded with ``random_state=i`` and
    ``behavior_policy_function=None`` produces one batch of logged feedback.
    ``evaluation_policy`` (taken from the enclosing scope) is replayed on that
    batch with ``run_bandit_simulation``, its ground-truth value is obtained
    from ``calc_ground_truth_policy_value`` using ``dataset.sample_reward``
    and ``n_sim`` simulations, and the OPE estimators are scored against it.

    Returns
    -------
    The value returned by ``ope.evaluate_performance_of_estimators``.
    """
    # synthetic data generator with uniformly random policy
    generator_settings = dict(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=None,  # uniformly random
        random_state=i,
    )
    dataset = SyntheticBanditDataset(**generator_settings)

    # sample new data of synthetic logged bandit feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)

    # simulate the evaluation policy
    action_dist = run_bandit_simulation(
        bandit_feedback=bandit_feedback,
        policy=evaluation_policy,
    )

    # ground-truth policy value of the evaluation policy, estimated by
    # Monte-Carlo simulation using p(r|x,a), the reward distribution
    ground_truth_policy_value = calc_ground_truth_policy_value(
        bandit_feedback=bandit_feedback,
        reward_sampler=dataset.sample_reward,  # p(r|x,a)
        policy=evaluation_policy,
        n_sim=n_sim,  # the number of simulations
    )

    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        ope_estimators=ope_estimators,
    )
    return ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
    )
def test_synthetic_obtain_batch_bandit_feedback():
    """Check ``n_rounds`` validation and the layout of the generated feedback.

    The first part expects ``obtain_batch_bandit_feedback`` to raise
    ``ValueError`` for ``n_rounds=0`` and ``TypeError`` for ``n_rounds="3"``.
    The second part samples 10 rounds with 5 actions and ``beta=0``, once
    with no deficient actions and once with two, and checks the size of every
    entry of the returned dictionary together with basic properties of the
    behaviour policy ``pi_b``.
    """
    # n_rounds
    # The dataset is created outside ``pytest.raises`` so that only the call
    # under test can satisfy the expected exception; an error raised by the
    # constructor now fails the test instead of being mistaken for a pass.
    dataset = SyntheticBanditDataset(n_actions=2)
    with pytest.raises(ValueError):
        dataset.obtain_batch_bandit_feedback(n_rounds=0)

    dataset = SyntheticBanditDataset(n_actions=2)
    with pytest.raises(TypeError):
        dataset.obtain_batch_bandit_feedback(n_rounds="3")

    # bandit feedback
    n_rounds = 10
    n_actions = 5
    for n_deficient_actions in [0, 2]:
        dataset = SyntheticBanditDataset(
            n_actions=n_actions,
            beta=0,
            n_deficient_actions=n_deficient_actions,
        )
        bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)

        assert bandit_feedback["n_rounds"] == n_rounds
        assert bandit_feedback["n_actions"] == n_actions

        # One condition per assert so a failure names the exact mismatch.
        assert bandit_feedback["context"].shape[0] == n_rounds  # n_rounds
        assert bandit_feedback["context"].shape[1] == 1  # default dim_context

        assert bandit_feedback["action_context"].shape[0] == n_actions
        assert bandit_feedback["action_context"].shape[1] == n_actions

        assert bandit_feedback["action"].ndim == 1
        assert len(bandit_feedback["action"]) == n_rounds

        assert bandit_feedback["position"] is None

        assert bandit_feedback["reward"].ndim == 1
        assert len(bandit_feedback["reward"]) == n_rounds

        assert bandit_feedback["expected_reward"].shape[0] == n_rounds
        assert bandit_feedback["expected_reward"].shape[1] == n_actions

        pi_b = bandit_feedback["pi_b"]
        assert pi_b.shape[0] == n_rounds
        assert pi_b.shape[1] == n_actions

        # when `beta=0`, behavior_policy should be uniform
        if n_deficient_actions == 0:
            uniform_policy = np.ones_like(pi_b) / n_actions
            assert np.allclose(pi_b, uniform_policy)

        # the action probabilities of every round sum to one
        assert np.allclose(pi_b[:, :, 0].sum(1), np.ones(n_rounds))
        # exactly `n_deficient_actions` zero-probability entries per round
        assert (pi_b == 0).sum() == n_deficient_actions * n_rounds

        assert bandit_feedback["pscore"].ndim == 1
        assert len(bandit_feedback["pscore"]) == n_rounds
def synthetic_bandit_feedback() -> BanditFeedback:
    """Return one batch of synthetic logged bandit feedback.

    The generator is a ``SyntheticBanditDataset`` with 10 actions, a
    5-dimensional context, ``logistic_reward_function`` as the reward
    function, ``linear_behavior_policy`` as the behaviour policy and
    ``random_state=12345``; the batch contains 10000 rounds.
    """
    generator = SyntheticBanditDataset(
        n_actions=10,
        dim_context=5,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=12345,
    )
    return generator.obtain_batch_bandit_feedback(n_rounds=10000)
def process(i: int):
    """Compare an IPW learner with two baselines on synthetic data seeded with ``i``.

    An ``IPWLearner`` and a ``UniformSampleWeightLearner`` are fitted on a
    training batch; together with a ``RandomPolicy`` they predict actions for
    a separate test batch, and the ground-truth policy value of each is
    computed from the test batch's expected rewards.

    Settings (``n_actions``, ``dim_context``, ``n_rounds``,
    ``base_model_dict``, ``hyperparams``, ...) are read from the enclosing
    scope.

    Returns
    -------
    tuple
        ``(gt_ipw_learner, gt_random_policy, gt_uniform_sample_weight_learner)``.
    """
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    model_name = base_model_for_evaluation_policy

    # evaluation policy: IPWLearner
    ipw_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[model_name](**hyperparams[model_name]),
    )
    # baseline method 1. RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2. UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[model_name](**hyperparams[model_name]),
    )

    # draw the training batch first, then the test batch
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)

    # train both learners on the training batch
    train_inputs = {
        key: bandit_feedback_train[key]
        for key in ("context", "action", "reward", "pscore")
    }
    for learner in (ipw_policy, uniform_sample_weight_policy):
        learner.fit(**train_inputs)

    # predict the action decisions for the test batch
    policies = (ipw_policy, random_policy, uniform_sample_weight_policy)
    action_dists = [
        policy.predict(context=bandit_feedback_test["context"])
        for policy in policies
    ]

    # ground-truth policy value of each policy, in the same order
    gt_ipw_learner, gt_random_policy, gt_uniform_sample_weight_learner = [
        dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=action_dist,
        )
        for action_dist in action_dists
    ]
    return gt_ipw_learner, gt_random_policy, gt_uniform_sample_weight_learner
InverseProbabilityWeighting(), SelfNormalizedInverseProbabilityWeighting(), DoublyRobust(), SelfNormalizedDoublyRobust(), SwitchDoublyRobust(), ] # a base ML model for regression model used in Direct Method and Doubly Robust base_model = CalibratedClassifierCV(RandomForest(**hyperparams)) evaluation_of_ope_results = { est.estimator_name: np.zeros(n_runs) for est in ope_estimators } for i in np.arange(n_runs): # sample a new set of logged bandit feedback bandit_feedback = dataset.obtain_batch_bandit_feedback( n_rounds=n_rounds) # run a counterfactual bandit algorithm on logged bandit feedback data selected_actions = run_bandit_simulation( bandit_feedback=bandit_feedback, policy=policy) # estimate the ground-truth policy values of the counterfactual policy # using the full expected reward contained in the bandit feedback dictionary ground_truth_policy_value = bandit_feedback["expected_reward"][ np.arange(n_rounds), selected_actions.flatten()].mean() # evaluate the estimation performance of OPE estimators ope = OffPolicyEvaluation( bandit_feedback=bandit_feedback, action_context=dataset.action_context, regression_model=RegressionModel(base_model=base_model), ope_estimators=ope_estimators, )
def process(i: int):
    """Compare an NN policy learner with two baselines on data seeded with ``i``.

    A ``RegressionModel`` is cross-fitted on the training batch and its
    reward estimates are passed to ``NNPolicyLearner.fit`` (whose objective is
    ``DoublyRobust().estimate_policy_value_tensor``).  A
    ``UniformSampleWeightLearner`` is fitted on the same batch, a
    ``RandomPolicy`` needs no fitting, and the ground-truth policy value of
    each policy is computed on a separate test batch.

    Settings (``n_actions``, ``dim_context``, ``n_rounds``,
    ``base_model_dict``, ``hyperparams``, ...) are read from the enclosing
    scope.

    Returns
    -------
    tuple
        ``(gt_nn_policy_learner, gt_random_policy,
        gt_uniform_sample_weight_learner)``.
    """
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )

    # ML model used to estimate the mean reward function on the train set
    reg_model_name = base_model_for_reg_model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[reg_model_name](**hyperparams[reg_model_name]),
    )
    ope_estimator = DoublyRobust()

    # evaluation policy: NNPolicyLearner
    nn_policy = NNPolicyLearner(
        n_actions=dataset.n_actions,
        dim_context=dim_context,
        off_policy_objective=ope_estimator.estimate_policy_value_tensor,
    )
    # baseline method 1. RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2. UniformSampleWeightLearner
    policy_model_name = base_model_for_evaluation_policy
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[policy_model_name](
            **hyperparams[policy_model_name]
        ),
    )

    # draw the training batch first, then the test batch
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)

    # cross-fitted reward estimates on the training batch
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        n_folds=3,  # 3-fold cross-fitting
        random_state=12345,
        **{
            key: bandit_feedback_train[key]
            for key in ("context", "action", "reward")
        },
    )

    # train the learners on the training batch
    train_inputs = {
        key: bandit_feedback_train[key]
        for key in ("context", "action", "reward", "pscore")
    }
    nn_policy.fit(
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        **train_inputs,
    )
    uniform_sample_weight_policy.fit(**train_inputs)

    # predict the action decisions for the test batch
    policies = (nn_policy, random_policy, uniform_sample_weight_policy)
    action_dists = [
        policy.predict(context=bandit_feedback_test["context"])
        for policy in policies
    ]

    # ground-truth policy value of each policy, in the same order
    (
        gt_nn_policy_learner,
        gt_random_policy,
        gt_uniform_sample_weight_learner,
    ) = [
        dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=action_dist,
        )
        for action_dist in action_dists
    ]
    return gt_nn_policy_learner, gt_random_policy, gt_uniform_sample_weight_learner
activation = args.activation solver = args.solver batch_size = args.batch_size if args.batch_size else "auto" early_stopping = args.early_stopping random_state = args.random_state # synthetic data generator dataset = SyntheticBanditDataset( n_actions=n_actions, dim_context=dim_context, reward_function=logistic_reward_function, behavior_policy_function=linear_behavior_policy, random_state=random_state, ) # sample new training and test sets of synthetic logged bandit feedback bandit_feedback_train = dataset.obtain_batch_bandit_feedback( n_rounds=n_rounds) bandit_feedback_test = dataset.obtain_batch_bandit_feedback( n_rounds=n_rounds) # estimate the mean reward function of the train set of synthetic bandit feedback with ML model regression_model = RegressionModel( n_actions=dataset.n_actions, action_context=dataset.action_context, base_model=base_model_dict[base_model_for_reg_model]( **hyperparams[base_model_for_reg_model]), ) estimated_rewards_by_reg_model = regression_model.fit_predict( context=bandit_feedback_train["context"], action=bandit_feedback_train["action"], reward=bandit_feedback_train["reward"], n_folds=3, # 3-fold cross-fitting random_state=random_state,
def process(i: int):
    """Compare three policy learners and two baselines on data seeded with ``i``.

    An ``IPWLearner``, a ``QLearner``, an ``NNPolicyLearner`` (with
    ``off_policy_objective="ipw"``) and a ``UniformSampleWeightLearner`` are
    fitted on a training batch.  Together with a ``RandomPolicy`` they predict
    actions for a separate test batch, and the ground-truth policy value of
    each is computed from the test batch's expected rewards.

    Settings (``n_actions``, ``dim_context``, ``n_rounds``,
    ``base_model_dict``, ``hyperparams``, ...) are read from the enclosing
    scope.

    Returns
    -------
    tuple
        ``(gt_ipw_learner, gt_q_learner, gt_nn_learner, gt_random_policy,
        gt_uniform_sample_weight_learner)``.
    """
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )

    # draw the training batch first, then the test batch
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)

    # defining policy learners
    model_name = base_model_for_evaluation_policy
    ipw_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[model_name](**hyperparams[model_name]),
    )
    q_policy = QLearner(
        n_actions=dataset.n_actions,
        base_model=base_model_dict[model_name](**hyperparams[model_name]),
    )
    nn_policy = NNPolicyLearner(
        n_actions=dataset.n_actions,
        dim_context=dim_context,
        off_policy_objective="ipw",
    )
    # baseline method 1. RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2. UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[model_name](**hyperparams[model_name]),
    )

    # policy training (RandomPolicy is not fitted)
    train_inputs = {
        key: bandit_feedback_train[key]
        for key in ("context", "action", "reward", "pscore")
    }
    trainable = (ipw_policy, q_policy, nn_policy, uniform_sample_weight_policy)
    for learner in trainable:
        learner.fit(**train_inputs)

    # prediction/making decisions
    policies = (
        ipw_policy,
        q_policy,
        nn_policy,
        random_policy,
        uniform_sample_weight_policy,
    )
    action_dists = [
        policy.predict(context=bandit_feedback_test["context"])
        for policy in policies
    ]

    # evaluation: one ground-truth value per policy, in the order listed above
    return tuple(
        dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=action_dist,
        )
        for action_dist in action_dists
    )