def process(b: int):
    # sample bootstrap from batch logged bandit feedback
    bandit_feedback = obd.sample_bootstrap_bandit_feedback(random_state=b)
    # estimate the mean reward function with an ML model
    regression_model = RegressionModel(
        n_actions=obd.n_actions,
        len_list=obd.len_list,
        action_context=obd.action_context,
        base_model=base_model_dict[base_model](**hyperparams[base_model]),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback["context"],
        action=bandit_feedback["action"],
        reward=bandit_feedback["reward"],
        position=bandit_feedback["position"],
        pscore=bandit_feedback["pscore"],
        n_folds=3,  # 3-fold cross-fitting
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        ope_estimators=ope_estimators,
    )
    action_dist = np.tile(
        action_dist_single_round, (bandit_feedback["n_rounds"], 1, 1)
    )
    relative_ee_b = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    return relative_ee_b
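# The per-bootstrap results returned by `process` can be aggregated in parallel.
# A minimal sketch (not part of the original script), assuming joblib is installed and
# that `n_runs`, `n_jobs`, and `ope_estimators` are defined in the surrounding script
# (these names are assumptions):
from joblib import Parallel, delayed
import pandas as pd

processed = Parallel(n_jobs=n_jobs, verbose=50)(
    [delayed(process)(b) for b in np.arange(n_runs)]
)
metric_dict = {est.estimator_name: {} for est in ope_estimators}
for b, relative_ee_b in enumerate(processed):
    for estimator_name, relative_ee_ in relative_ee_b.items():
        metric_dict[estimator_name][b] = relative_ee_
# summarize mean/std of relative-ee over the bootstrap runs
print(pd.DataFrame(metric_dict).describe().T.round(6))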
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # define evaluation policy using IPWLearner
    evaluation_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    evaluation_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    action_dist = evaluation_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # estimate the mean reward function of the test set of synthetic bandit feedback with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_test["context"],
        action=bandit_feedback_test["action"],
        reward=bandit_feedback_test["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=random_state,
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback_test,
        ope_estimators=ope_estimators,
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=action_dist,
        ),
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    return relative_ee_i
def test_meta_estimation_format(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the response format of OffPolicyEvaluation"""
    # single ope estimator
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[dm]
    )
    assert ope_.estimate_policy_values(random_action_dist) == {
        "dm": mock_policy_value
    }, "OffPolicyEvaluation.estimate_policy_values ([DirectMethod]) returns a wrong value"
    assert ope_.estimate_intervals(random_action_dist) == {
        "dm": mock_confidence_interval
    }, "OffPolicyEvaluation.estimate_intervals ([DirectMethod]) returns a wrong value"
    with pytest.raises(AssertionError, match=r"action_dist must be 3-dimensional.*"):
        ope_.estimate_policy_values(
            random_action_dist[:, :, 0]
        ), "action_dist must be 3-dimensional when using OffPolicyEvaluation"
    # multiple ope estimators
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[dm, ipw]
    )
    assert ope_.estimate_policy_values(random_action_dist) == {
        "dm": mock_policy_value,
        "ipw": mock_policy_value + ipw.eps,
    }, "OffPolicyEvaluation.estimate_policy_values ([DirectMethod, IPW]) returns a wrong value"
    assert ope_.estimate_intervals(random_action_dist) == {
        "dm": mock_confidence_interval,
        "ipw": {k: v + ipw.eps for k, v in mock_confidence_interval.items()},
    }, "OffPolicyEvaluation.estimate_intervals ([DirectMethod, IPW]) returns a wrong value"
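# For reference, a valid `action_dist` is a 3-dimensional array of shape
# (n_rounds, n_actions, len_list). A minimal sketch of a uniformly random
# evaluation policy (the sizes below are hypothetical, not taken from the fixtures):
n_rounds_, n_actions_, len_list_ = 100, 5, 1
uniform_action_dist = np.ones((n_rounds_, n_actions_, len_list_)) / n_actions_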
def process(i: int):
    # sample new data of synthetic logged bandit feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # simulate the evaluation policy
    action_dist = run_bandit_simulation(
        bandit_feedback=bandit_feedback, policy=evaluation_policy
    )
    # estimate the ground-truth policy values of the evaluation policy
    # by Monte-Carlo simulation using p(r|x,a), the reward distribution
    ground_truth_policy_value = calc_ground_truth_policy_value(
        bandit_feedback=bandit_feedback,
        reward_sampler=dataset.sample_reward,  # p(r|x,a)
        policy=evaluation_policy,
        n_sim=n_sim,  # the number of simulations
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        ope_estimators=ope_estimators,
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
    )
    return relative_ee_i
def test_meta_evaluate_performance_of_estimators_using_invalid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description_1: str,
    metric,
    ground_truth_policy_value,
    description_2: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """Test the response of evaluate_performance_of_estimators using invalid data"""
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[dm]
    )
    with pytest.raises(ValueError, match=f"{description_2}*"):
        _ = ope_.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            action_dist=action_dist,
            metric=metric,
        )
    # evaluate_performance_of_estimators is called in summarize_estimators_comparison
    with pytest.raises(ValueError, match=f"{description_2}*"):
        _ = ope_.summarize_estimators_comparison(
            ground_truth_policy_value=ground_truth_policy_value,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            action_dist=action_dist,
            metric=metric,
        )
def process(i: int):
    # synthetic data generator with uniformly random policy
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=None,  # uniformly random
        random_state=i,
    )
    # sample new data of synthetic logged bandit feedback
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # simulate the evaluation policy
    action_dist = run_bandit_simulation(
        bandit_feedback=bandit_feedback, policy=evaluation_policy
    )
    # estimate the ground-truth policy values of the evaluation policy
    # by Monte-Carlo simulation using p(r|x,a), the reward distribution
    ground_truth_policy_value = calc_ground_truth_policy_value(
        bandit_feedback=bandit_feedback,
        reward_sampler=dataset.sample_reward,  # p(r|x,a)
        policy=evaluation_policy,
        n_sim=n_sim,  # the number of simulations
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        ope_estimators=ope_estimators,
    )
    metric_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
    )
    return metric_i
def test_meta_estimate_intervals_using_invalid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description_1: str,
    alpha,
    n_bootstrap_samples,
    random_state,
    description_2: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """Test the response of estimate_intervals using invalid data"""
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[dm]
    )
    with pytest.raises(ValueError, match=f"{description_2}*"):
        _ = ope_.estimate_intervals(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            alpha=alpha,
            n_bootstrap_samples=n_bootstrap_samples,
            random_state=random_state,
        )
    # estimate_intervals is called in summarize_off_policy_estimates
    with pytest.raises(ValueError, match=f"{description_2}*"):
        _ = ope_.summarize_off_policy_estimates(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            alpha=alpha,
            n_bootstrap_samples=n_bootstrap_samples,
            random_state=random_state,
        )
def test_meta_post_init(synthetic_bandit_feedback: BanditFeedback) -> None:
    """Test the __post_init__ function"""
    # __post_init__ saves the latter estimator when the same estimator name is used
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw, ipw2]
    )
    assert ope_.ope_estimators_ == {
        "ipw": ipw2
    }, "__post_init__ returns a wrong value"
    # __post_init__ can handle the same estimator if the estimator names are different
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw, ipw3]
    )
    assert ope_.ope_estimators_ == {
        "ipw": ipw,
        "ipw3": ipw3,
    }, "__post_init__ returns a wrong value"
    # __post_init__ raises RuntimeError when necessary_keys are not included in the bandit_feedback
    necessary_keys = ["action", "position", "reward", "pscore"]
    for i in range(len(necessary_keys)):
        for deleted_keys in itertools.combinations(necessary_keys, i + 1):
            invalid_bandit_feedback_dict = {key: "_" for key in necessary_keys}
            # delete the selected keys
            for k in deleted_keys:
                del invalid_bandit_feedback_dict[k]
            with pytest.raises(RuntimeError, match=r"Missing key*"):
                _ = OffPolicyEvaluation(
                    bandit_feedback=invalid_bandit_feedback_dict,
                    ope_estimators=[ipw],
                )
def test_meta_evaluate_performance_of_estimators_using_valid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description_1: str,
    metric,
    ground_truth_policy_value,
    description_2: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """Test the response of evaluate_performance_of_estimators using valid data"""
    if metric == "relative-ee":
        # calculate relative-ee
        eval_metric_ope_dict = {
            "ipw": np.abs(
                (mock_policy_value + ipw.eps - ground_truth_policy_value)
                / ground_truth_policy_value
            ),
            "ipw3": np.abs(
                (mock_policy_value + ipw3.eps - ground_truth_policy_value)
                / ground_truth_policy_value
            ),
        }
    else:
        # calculate se
        eval_metric_ope_dict = {
            "ipw": (mock_policy_value + ipw.eps - ground_truth_policy_value) ** 2,
            "ipw3": (mock_policy_value + ipw3.eps - ground_truth_policy_value) ** 2,
        }
    # check the performance of estimators
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw, ipw3]
    )
    performance = ope_.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        action_dist=action_dist,
        metric=metric,
    )
    for k, v in performance.items():
        assert k in eval_metric_ope_dict, "Invalid key of performance response"
        assert v == eval_metric_ope_dict[k], "Invalid value of performance response"
    performance_df = ope_.summarize_estimators_comparison(
        ground_truth_policy_value=ground_truth_policy_value,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        action_dist=action_dist,
        metric=metric,
    )
    assert_frame_equal(
        performance_df, pd.DataFrame(eval_metric_ope_dict, index=[metric]).T
    ), "Invalid summarization (performance)"
def process(i: int):
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    evaluation_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    action_dist = evaluation_policy.predict_proba(
        context=bandit_feedback_test["context"],
        tau=0.1,  # temperature hyperparameter
    )
    # estimate the ground-truth policy values of the evaluation policy
    # using the full expected reward contained in the test set of synthetic bandit feedback
    ground_truth_policy_value = np.average(
        bandit_feedback_test["expected_reward"],
        weights=action_dist[:, :, 0],
        axis=1,
    ).mean()
    # estimate the mean reward function of the test set of synthetic bandit feedback with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        len_list=dataset.len_list,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_test["context"],
        action=bandit_feedback_test["action"],
        reward=bandit_feedback_test["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=random_state,
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback_test,
        ope_estimators=ope_estimators,
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    return relative_ee_i
def test_meta_estimate_policy_values_using_valid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """Test the response of estimate_policy_values using valid data"""
    # single ope estimator
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[dm]
    )
    ope_.is_model_dependent = True
    assert ope_.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    ) == {
        "dm": mock_policy_value
    }, "OffPolicyEvaluation.estimate_policy_values ([DirectMethod]) returns a wrong value"
    # multiple ope estimators
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[dm, ipw]
    )
    ope_.is_model_dependent = True
    assert ope_.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    ) == {
        "dm": mock_policy_value,
        "ipw": mock_policy_value + ipw.eps,
    }, "OffPolicyEvaluation.estimate_policy_values ([DirectMethod, IPW]) returns a wrong value"
def process(b: int):
    # load the pre-trained regression models
    with open(reg_model_path / f"reg_model_{b}.pkl", "rb") as f:
        reg_model = pickle.load(f)
    with open(reg_model_path / f"reg_model_mrdr_{b}.pkl", "rb") as f:
        reg_model_mrdr = pickle.load(f)
    with open(reg_model_path / f"is_for_reg_model_{b}.pkl", "rb") as f:
        is_for_reg_model = pickle.load(f)
    # sample bootstrap samples from batch logged bandit feedback
    if is_timeseries_split:
        bandit_feedback = obd.sample_bootstrap_bandit_feedback(
            test_size=test_size,
            is_timeseries_split=is_timeseries_split,
            random_state=b,
        )
    else:
        bandit_feedback = obd.sample_bootstrap_bandit_feedback(
            test_size=test_size,
            is_timeseries_split=is_timeseries_split,
            random_state=b,
        )
        # exclude the rounds that were used to train the regression models
        bandit_feedback["n_rounds"] = (~is_for_reg_model).sum()
        for key_ in ["context", "action", "reward", "pscore", "position"]:
            bandit_feedback[key_] = bandit_feedback[key_][~is_for_reg_model]
    # estimate the mean reward function using the pre-trained reg_model
    estimated_rewards_by_reg_model_default = reg_model.predict(
        context=bandit_feedback["context"],
    )
    estimated_rewards_by_reg_model_mrdr = reg_model_mrdr.predict(
        context=bandit_feedback["context"],
    )
    estimated_rewards_by_reg_model = {
        estimator.estimator_name: estimated_rewards_by_reg_model_mrdr
        if estimator.estimator_name == "mrdr"
        else estimated_rewards_by_reg_model_default
        for estimator in ope_estimators
    }
    # evaluate the estimation performance of OPE estimators
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        ope_estimators=ope_estimators,
    )
    action_dist = np.tile(
        action_dist_single_round, (bandit_feedback["n_rounds"], 1, 1)
    )
    relative_ee_b = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    return relative_ee_b
def test_response_format_of_ope_estimators_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the response format of ope estimators using synthetic bandit data and random evaluation policy"""
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis]
    action_dist = random_action_dist
    # test all estimators
    all_estimators = ope.__all_estimators__
    estimators = [
        getattr(ope.estimators, estimator_name)() for estimator_name in all_estimators
    ]
    # conduct OPE
    ope_instance = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=estimators
    )
    estimated_policy_value = ope_instance.estimate_policy_values(
        action_dist=action_dist, estimated_rewards_by_reg_model=expected_reward
    )
    estimated_intervals = ope_instance.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward,
        random_state=12345,
    )
    # check the format of OPE
    for key in estimated_policy_value:
        # check the keys of the output dictionary of the estimate_intervals method
        assert set(estimated_intervals[key].keys()) == set(
            ["mean", "95.0% CI (lower)", "95.0% CI (upper)"]
        ), f"Confidence interval of {key} has invalid keys"
        # check the relationship between the means and the confidence bounds estimated by OPE estimators
        assert (
            estimated_intervals[key]["95.0% CI (lower)"] <= estimated_policy_value[key]
        ) and (
            estimated_intervals[key]["95.0% CI (upper)"] >= estimated_policy_value[key]
        ), f"Estimated policy value of {key} is not included in estimated intervals of that estimator"
        assert (
            estimated_intervals[key]["mean"]
            >= estimated_intervals[key]["95.0% CI (lower)"]
        ), f"Invalid confidence interval of {key}: lower bound > mean"
        assert (
            estimated_intervals[key]["mean"]
            <= estimated_intervals[key]["95.0% CI (upper)"]
        ), f"Invalid confidence interval of {key}: upper bound < mean"
def test_meta_create_estimator_inputs_using_valid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """Test _create_estimator_inputs using valid data"""
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw]
    )
    estimator_inputs = ope_._create_estimator_inputs(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    assert set(estimator_inputs.keys()) == set(["ipw"])
    assert set(estimator_inputs["ipw"].keys()) == set(
        [
            "reward",
            "action",
            "pscore",
            "position",
            "action_dist",
            "estimated_rewards_by_reg_model",
            "estimated_pscore",
            "estimated_importance_weights",
            "p_e_a",
            "pi_b",
            "context",
            "action_embed",
        ]
    ), f"Invalid response of _create_estimator_inputs (test case: {description})"
    # _create_estimator_inputs is called in the following functions
    _ = ope_.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.summarize_off_policy_estimates(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.evaluate_performance_of_estimators(
        ground_truth_policy_value=0.1,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.summarize_estimators_comparison(
        ground_truth_policy_value=0.1,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
def test_meta_post_init_format(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the post init format of OffPolicyEvaluation"""
    # __post_init__ saves the latter estimator when the same estimator name is used
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw, ipw2]
    )
    assert ope_.ope_estimators_ == {
        "ipw": ipw2
    }, "__post_init__ returns a wrong value"
    # __post_init__ can handle the same estimator if the estimator names are different
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw, ipw3]
    )
    assert ope_.ope_estimators_ == {
        "ipw": ipw,
        "ipw3": ipw3,
    }, "__post_init__ returns a wrong value"
def test_meta_create_estimator_inputs_format(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the _create_estimator_inputs format of OffPolicyEvaluation"""
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw]
    )
    # _create_estimator_inputs returns a dict of the inputs shared by all estimators
    inputs = ope_._create_estimator_inputs(
        action_dist=None, estimated_rewards_by_reg_model=None
    )
    assert set(inputs.keys()) == set(
        [
            "reward",
            "action",
            "pscore",
            "position",
            "action_dist",
            "estimated_rewards_by_reg_model",
        ]
    ), "Invalid response format of _create_estimator_inputs"
def test_performance_of_ope_estimators_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the performance of ope estimators using synthetic bandit data and random evaluation policy"""
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis]
    action_dist = random_action_dist
    # compute ground truth policy value using expected reward
    q_pi_e = np.average(
        expected_reward[:, :, 0], weights=action_dist[:, :, 0], axis=1
    )
    # compute statistics of ground truth policy value
    gt_mean = q_pi_e.mean()
    gt_std = q_pi_e.std(ddof=1)
    # test most of the estimators (ReplayMethod is not tested because it is out of scope)
    all_estimators = ope.__all_estimators__
    estimators = [
        getattr(ope.estimators, estimator_name)()
        for estimator_name in all_estimators
        if estimator_name not in ["ReplayMethod"]
    ]
    # conduct OPE
    ope_instance = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=estimators
    )
    estimated_policy_value = ope_instance.estimate_policy_values(
        action_dist=action_dist, estimated_rewards_by_reg_model=expected_reward
    )
    # check the performance of OPE
    ci_bound = gt_std * 3 / np.sqrt(q_pi_e.shape[0])
    print(f"gt_mean: {gt_mean}, 3 * gt_std / sqrt(n): {ci_bound}")
    for key in estimated_policy_value:
        print(
            f"estimated_value: {estimated_policy_value[key]} ------ estimator: {key}, "
        )
        # test the performance of each estimator
        assert (
            np.abs(gt_mean - estimated_policy_value[key]) <= ci_bound
        ), f"OPE of {key} did not work well (absolute error is greater than 3*sigma)"
def process(i: int):
    # split the original data into training and evaluation sets
    dataset.split_train_eval(eval_size=eval_size, random_state=i)
    # obtain logged bandit feedback generated by the behavior policy
    bandit_feedback = dataset.obtain_batch_bandit_feedback(random_state=i)
    # obtain action choice probabilities by an evaluation policy
    action_dist = dataset.obtain_action_dist_by_eval_policy(
        base_classifier_e=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
        alpha_e=alpha_e,
    )
    # calculate the ground-truth performance of the evaluation policy
    ground_truth_policy_value = dataset.calc_ground_truth_policy_value(
        action_dist=action_dist
    )
    # estimate the mean reward function of the evaluation set of multi-class classification data with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback["context"],
        action=bandit_feedback["action"],
        reward=bandit_feedback["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=random_state,
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        ope_estimators=ope_estimators,
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    return relative_ee_i
def test_meta_evaluate_performance_of_estimators(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    gt = 0.5
    # calculate relative-ee
    eval_metric_ope_dict = {
        "ipw": np.abs((mock_policy_value + ipw.eps - gt) / gt),
        "ipw3": np.abs((mock_policy_value + ipw3.eps - gt) / gt),
    }
    # check the performance of estimators
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw, ipw3]
    )
    performance = ope_.evaluate_performance_of_estimators(
        ground_truth_policy_value=gt,
        action_dist=random_action_dist,
        metric="relative-ee",
    )
    for k, v in performance.items():
        assert k in eval_metric_ope_dict, "Invalid key of performance response"
        assert v == eval_metric_ope_dict[k], "Invalid value of performance response"
    # zero division error when using relative-ee
    with pytest.raises(ZeroDivisionError, match=r"float division by zero"):
        _ = ope_.evaluate_performance_of_estimators(
            ground_truth_policy_value=0.0,
            action_dist=random_action_dist,
            metric="relative-ee",
        )
    # check summarization
    performance_df = ope_.summarize_estimators_comparison(
        ground_truth_policy_value=gt,
        action_dist=random_action_dist,
        metric="relative-ee",
    )
    assert_frame_equal(
        performance_df, pd.DataFrame(eval_metric_ope_dict, index=["relative-ee"]).T
    ), "Invalid summarization (performance)"
def test_meta_summarize_off_policy_estimates(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw, ipw3]
    )
    value, interval = ope_.summarize_off_policy_estimates(random_action_dist)
    expected_value = pd.DataFrame(
        {
            "ipw": mock_policy_value + ipw.eps,
            "ipw3": mock_policy_value + ipw3.eps,
        },
        index=["estimated_policy_value"],
    ).T
    expected_interval = pd.DataFrame(
        {
            "ipw": {k: v + ipw.eps for k, v in mock_confidence_interval.items()},
            "ipw3": {k: v + ipw3.eps for k, v in mock_confidence_interval.items()},
        }
    ).T
    assert_frame_equal(value, expected_value), "Invalid summarization (policy value)"
    assert_frame_equal(interval, expected_interval), "Invalid summarization (interval)"
def test_meta_create_estimator_inputs_using_invalid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """Test _create_estimator_inputs using invalid data"""
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw]
    )
    # raise ValueError when the shapes of the two arrays are different
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_._create_estimator_inputs(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    # _create_estimator_inputs is called in the following functions
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.estimate_policy_values(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.estimate_intervals(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.summarize_off_policy_estimates(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.evaluate_performance_of_estimators(
            ground_truth_policy_value=0.1,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.summarize_estimators_comparison(
            ground_truth_policy_value=0.1,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
def test_meta_estimate_intervals_using_valid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description_1: str,
    alpha: float,
    n_bootstrap_samples: int,
    random_state: int,
    description_2: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """Test the response of estimate_intervals using valid data"""
    # single ope estimator
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[dm]
    )
    assert ope_.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        alpha=alpha,
        n_bootstrap_samples=n_bootstrap_samples,
        random_state=random_state,
    ) == {
        "dm": mock_confidence_interval
    }, "OffPolicyEvaluation.estimate_intervals ([DirectMethod]) returns a wrong value"
    # multiple ope estimators
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[dm, ipw]
    )
    assert ope_.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        alpha=alpha,
        n_bootstrap_samples=n_bootstrap_samples,
        random_state=random_state,
    ) == {
        "dm": mock_confidence_interval,
        "ipw": {k: v + ipw.eps for k, v in mock_confidence_interval.items()},
    }, "OffPolicyEvaluation.estimate_intervals ([DirectMethod, IPW]) returns a wrong value"
def test_meta_estimated_rewards_by_reg_model_inputs(
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """Test the estimate_policy_values/estimate_intervals functions wrt estimated_rewards_by_reg_model"""
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[DirectMethod()]
    )
    action_dist = np.zeros(
        (synthetic_bandit_feedback["n_rounds"], synthetic_bandit_feedback["n_actions"])
    )
    with pytest.raises(ValueError):
        ope_.estimate_policy_values(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=None,
        )
    with pytest.raises(ValueError):
        ope_.estimate_intervals(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=None,
        )
def test_performance_of_ope_estimators_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the performance of ope estimators using synthetic bandit data and random evaluation policy"""
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis]
    action_dist = random_action_dist
    # compute ground truth policy value using expected reward
    q_pi_e = np.average(
        expected_reward[:, :, 0], weights=action_dist[:, :, 0], axis=1
    )
    # compute statistics of ground truth policy value
    gt_mean = q_pi_e.mean()
    # test most of the estimators (ReplayMethod is not tested because it is out of scope)
    all_estimators = ope.__all_estimators__
    estimators_standard = [
        getattr(ope.estimators, estimator_name)()
        for estimator_name in all_estimators
        if estimator_name not in ["ReplayMethod"]
    ]
    all_estimators_tuning = ope.__all_estimators_tuning__
    estimators_tuning = [
        getattr(ope.estimators_tuning, estimator_name)(
            lambdas=[1, 100, 10000, np.inf],
            tuning_method=tuning_method,
        )
        for estimator_name in all_estimators_tuning
        for tuning_method in ["slope", "mse"]
    ]
    all_estimators_tuning_sg = ope.__all_estimators_tuning_sg__
    estimators_tuning_sg = [
        getattr(ope.estimators_tuning, estimator_name)(
            lambdas=[0.001, 0.01, 0.1, 1.0],
            tuning_method=tuning_method,
        )
        for estimator_name in all_estimators_tuning_sg
        for tuning_method in ["slope", "mse"]
    ]
    estimators = estimators_standard + estimators_tuning + estimators_tuning_sg
    # skip estimation
    estimated_pscore = None
    estimated_importance_weights = (
        random_action_dist[
            np.arange(synthetic_bandit_feedback["action"].shape[0]),
            synthetic_bandit_feedback["action"],
            np.zeros(
                synthetic_bandit_feedback["action"].shape[0], dtype=int
            ),  # position is None
        ]
        / synthetic_bandit_feedback["pscore"]
    )
    # conduct OPE
    ope_instance = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=estimators
    )
    estimated_policy_value = ope_instance.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward,
        estimated_pscore=estimated_pscore,
        estimated_importance_weights=estimated_importance_weights,
    )
    # check the performance of OPE
    print(f"gt_mean: {gt_mean}")
    for key in estimated_policy_value:
        print(
            f"estimated_value: {estimated_policy_value[key]} ------ estimator: {key}, "
        )
        # test the performance of each estimator
        assert (
            np.abs(gt_mean - estimated_policy_value[key]) / gt_mean <= 0.1
        ), f"OPE of {key} did not work well (relative absolute error is greater than 10%)"
# ground-truth policy value of the random policy,
# which is the empirical mean of the factual (observed) rewards (on-policy estimation)
ground_truth = bandit_feedback["reward"].mean()
# a base ML model for the regression model used in Direct Method and Doubly Robust
base_model = CalibratedClassifierCV(HistGradientBoostingClassifier(**hyperparams))
# run a counterfactual bandit algorithm on logged bandit feedback data
selected_actions = run_bandit_simulation(
    bandit_feedback=bandit_feedback, policy=policy
)
# estimate the policy value of a given counterfactual algorithm by the three OPE estimators
ope = OffPolicyEvaluation(
    bandit_feedback=bandit_feedback,
    regression_model=RegressionModel(base_model=base_model),
    action_context=obd.action_context,
    ope_estimators=[InverseProbabilityWeighting(), DirectMethod(), DoublyRobust()],
)
estimated_policy_value, estimated_interval = ope.summarize_off_policy_estimates(
    selected_actions=selected_actions
)
# calculate the estimated policy value relative to that of the behavior policy
print("=" * 70)
print(f"random_state={random_state}: counterfactual policy={policy_name}")
print("-" * 70)
estimated_policy_value["relative_estimated_policy_value"] = (
    estimated_policy_value.estimated_policy_value / ground_truth
)
print(estimated_policy_value)
print("=" * 70)
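# For context, the objects referenced above (`obd`, `bandit_feedback`, `policy`) are
# prepared earlier in the script. A minimal sketch of one possible setup (not the
# original script's code), assuming obp's OpenBanditDataset and BernoulliTS;
# `random_state` and `policy_name` are placeholder names from the surrounding script:
from obp.dataset import OpenBanditDataset
from obp.policy import BernoulliTS

obd = OpenBanditDataset(behavior_policy="random", campaign="all")
bandit_feedback = obd.obtain_batch_bandit_feedback()
policy = BernoulliTS(
    n_actions=obd.n_actions,
    len_list=obd.len_list,
    random_state=random_state,
)
policy_name = "bernoulli_ts"  # label used only for logging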
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDatasetWithActionEmbeds(
        n_actions=n_actions,
        dim_context=dim_context,
        beta=3.0,
        n_cat_dim=3,
        n_cat_per_dim=5,
        reward_function=logistic_reward_function,
        random_state=i,
    )
    # define evaluation policy using IPWLearner
    evaluation_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_iw_estimator](
            **hyperparams[base_model_for_iw_estimator]
        ),
    )
    # sample new training and test sets of synthetic logged bandit data
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit data
    evaluation_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit data
    action_dist = evaluation_policy.predict_proba(
        context=bandit_feedback_test["context"],
    )
    # estimate the reward function of the test set of synthetic bandit feedback with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_test["context"],
        action=bandit_feedback_test["action"],
        reward=bandit_feedback_test["reward"],
        n_folds=2,
        random_state=12345,
    )
    # fit the propensity score estimator
    pscore_estimator = PropensityScoreEstimator(
        len_list=1,
        n_actions=n_actions,
        base_model=base_model_dict[base_model_for_pscore_estimator](
            **hyperparams[base_model_for_pscore_estimator]
        ),
        calibration_cv=3,
    )
    estimated_pscore = pscore_estimator.fit_predict(
        action=bandit_feedback_test["action"],
        position=bandit_feedback_test["position"],
        context=bandit_feedback_test["context"],
        n_folds=3,
        random_state=12345,
    )
    # fit importance weight estimators
    estimated_importance_weights_dict = {}
    for clf_name, clf_arguments in bipw_model_configurations.items():
        clf = ImportanceWeightEstimator(
            len_list=1,
            n_actions=n_actions,
            fitting_method=clf_arguments["fitting_method"],
            base_model=clf_arguments["base_model"],
        )
        estimated_importance_weights_dict[clf_name] = clf.fit_predict(
            action=bandit_feedback_test["action"],
            context=bandit_feedback_test["context"],
            action_dist=action_dist,
            position=bandit_feedback_test["position"],
            n_folds=2,
            evaluate_model_performance=False,
            random_state=12345,
        )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback_test,
        ope_estimators=ope_estimators
        + [
            MarginalizedInverseProbabilityWeighting(
                n_actions=n_actions, estimator_name="mipw"
            ),
            MarginalizedInverseProbabilityWeighting(
                n_actions=n_actions,
                embedding_selection_method="greedy",
                estimator_name="mipw (greedy selection)",
            ),
            SelfNormalizedMarginalizedInverseProbabilityWeighting(
                n_actions=n_actions, estimator_name="snmipw"
            ),
        ],
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=action_dist,
        ),
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        estimated_pscore=estimated_pscore,
        estimated_importance_weights=estimated_importance_weights_dict,
        action_embed=bandit_feedback_test["action_embed"],
        pi_b=bandit_feedback_test["pi_b"],
        metric="relative-ee",
    )
    return relative_ee_i
)
evaluation_of_ope_results = {
    est.estimator_name: np.zeros(n_boot_samples) for est in ope_estimators
}
for b in np.arange(n_boot_samples):
    # sample bootstrap from batch logged bandit feedback
    boot_bandit_feedback = obd.sample_bootstrap_bandit_feedback(random_state=b)
    # run a counterfactual bandit algorithm on logged bandit feedback data
    selected_actions = run_bandit_simulation(
        bandit_feedback=boot_bandit_feedback, policy=policy
    )
    # evaluate the estimation performance of OPE estimators
    ope = OffPolicyEvaluation(
        bandit_feedback=boot_bandit_feedback,
        action_context=obd.action_context,
        regression_model=RegressionModel(base_model=base_model),
        ope_estimators=ope_estimators,
    )
    relative_estimation_errors = ope.evaluate_performance_of_estimators(
        selected_actions=selected_actions,
        ground_truth_policy_value=ground_truth_policy_value,
    )
    policy.initialize()
    # store relative estimation errors of OPE estimators at each split
    for (
        estimator_name,
        relative_estimation_error,
    ) in relative_estimation_errors.items():
        evaluation_of_ope_results[estimator_name][b] = relative_estimation_error
# estimate confidence intervals of relative estimation by nonparametric bootstrap method
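# A minimal sketch of that confidence-interval step (not the original script's code),
# assuming obp.utils provides estimate_confidence_interval_by_bootstrap and that
# `random_state` is defined in the surrounding script:
import pandas as pd
from obp.utils import estimate_confidence_interval_by_bootstrap

evaluation_of_ope_results_with_ci = {
    est.estimator_name: estimate_confidence_interval_by_bootstrap(
        samples=evaluation_of_ope_results[est.estimator_name],
        random_state=random_state,
    )
    for est in ope_estimators
}
# one row per estimator: mean and bootstrap confidence bounds of its relative-ee
print(pd.DataFrame(evaluation_of_ope_results_with_ci).T.round(6))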
def test_response_format_of_ope_estimators_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the response format of ope estimators using synthetic bandit data and random evaluation policy"""
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis]
    action_dist = random_action_dist
    # test all estimators
    all_estimators = ope.__all_estimators__
    estimators_standard = [
        getattr(ope.estimators, estimator_name)() for estimator_name in all_estimators
    ]
    all_estimators_tuning = ope.__all_estimators_tuning__
    estimators_tuning = [
        getattr(ope.estimators_tuning, estimator_name)(
            lambdas=[1, 100, 10000, np.inf],
            tuning_method=tuning_method,
        )
        for estimator_name in all_estimators_tuning
        for tuning_method in ["slope", "mse"]
    ]
    all_estimators_tuning_sg = ope.__all_estimators_tuning_sg__
    estimators_tuning_sg = [
        getattr(ope.estimators_tuning, estimator_name)(
            lambdas=[0.001, 0.01, 0.1, 1.0],
            tuning_method=tuning_method,
        )
        for estimator_name in all_estimators_tuning_sg
        for tuning_method in ["slope", "mse"]
    ]
    estimators = estimators_standard + estimators_tuning + estimators_tuning_sg
    # skip estimation
    estimated_pscore = None
    estimated_importance_weights = (
        random_action_dist[
            np.arange(synthetic_bandit_feedback["action"].shape[0]),
            synthetic_bandit_feedback["action"],
            np.zeros(
                synthetic_bandit_feedback["action"].shape[0], dtype=int
            ),  # position is None
        ]
        / synthetic_bandit_feedback["pscore"]
    )
    # conduct OPE
    ope_instance = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=estimators
    )
    estimated_policy_value = ope_instance.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward,
        estimated_pscore=estimated_pscore,
        estimated_importance_weights=estimated_importance_weights,
    )
    estimated_intervals = ope_instance.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward,
        estimated_pscore=estimated_pscore,
        estimated_importance_weights=estimated_importance_weights,
        random_state=12345,
    )
    # check the format of OPE
    for key in estimated_policy_value:
        # check the keys of the output dictionary of the estimate_intervals method
        assert set(estimated_intervals[key].keys()) == set(
            ["mean", "95.0% CI (lower)", "95.0% CI (upper)"]
        ), f"Confidence interval of {key} has invalid keys"
        # check the relationship between the means and the confidence bounds estimated by OPE estimators
        assert (
            estimated_intervals[key]["95.0% CI (lower)"] <= estimated_policy_value[key]
        ) and (
            estimated_intervals[key]["95.0% CI (upper)"] >= estimated_policy_value[key]
        ), f"Estimated policy value of {key} is not included in estimated intervals of that estimator"
        assert (
            estimated_intervals[key]["mean"]
            >= estimated_intervals[key]["95.0% CI (lower)"]
        ), f"Invalid confidence interval of {key}: lower bound > mean"
        assert (
            estimated_intervals[key]["mean"]
            <= estimated_intervals[key]["95.0% CI (upper)"]
        ), f"Invalid confidence interval of {key}: upper bound < mean"
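# In a context like the test above, the estimates and intervals can also be plotted.
# A minimal sketch (not part of the test suite), assuming the installed obp version
# provides OffPolicyEvaluation.visualize_off_policy_estimates and reusing the names
# `ope_instance`, `action_dist`, and `expected_reward` from such a context:
ope_instance.visualize_off_policy_estimates(
    action_dist=action_dist,
    estimated_rewards_by_reg_model=expected_reward,
    n_bootstrap_samples=100,
    random_state=12345,
)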
def test_meta_summarize_off_policy_estimates(
    action_dist,
    estimated_rewards_by_reg_model,
    description_1: str,
    alpha: float,
    n_bootstrap_samples: int,
    random_state: int,
    description_2: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """Test the response of summarize_off_policy_estimates using valid data"""
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw, ipw3]
    )
    value, interval = ope_.summarize_off_policy_estimates(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        alpha=alpha,
        n_bootstrap_samples=n_bootstrap_samples,
        random_state=random_state,
    )
    expected_value = pd.DataFrame(
        {
            "ipw": mock_policy_value + ipw.eps,
            "ipw3": mock_policy_value + ipw3.eps,
        },
        index=["estimated_policy_value"],
    ).T
    expected_value["relative_estimated_policy_value"] = (
        expected_value["estimated_policy_value"]
        / synthetic_bandit_feedback["reward"].mean()
    )
    expected_interval = pd.DataFrame(
        {
            "ipw": {k: v + ipw.eps for k, v in mock_confidence_interval.items()},
            "ipw3": {k: v + ipw3.eps for k, v in mock_confidence_interval.items()},
        }
    ).T
    assert_frame_equal(value, expected_value), "Invalid summarization (policy value)"
    assert_frame_equal(interval, expected_interval), "Invalid summarization (interval)"
    # check relative estimated policy value when the average of bandit_feedback["reward"] is zero
    zero_reward_bandit_feedback = deepcopy(synthetic_bandit_feedback)
    zero_reward_bandit_feedback["reward"] = np.zeros(
        zero_reward_bandit_feedback["reward"].shape[0]
    )
    ope_ = OffPolicyEvaluation(
        bandit_feedback=zero_reward_bandit_feedback, ope_estimators=[ipw, ipw3]
    )
    value, _ = ope_.summarize_off_policy_estimates(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        alpha=alpha,
        n_bootstrap_samples=n_bootstrap_samples,
        random_state=random_state,
    )
    expected_value = pd.DataFrame(
        {
            "ipw": mock_policy_value + ipw.eps,
            "ipw3": mock_policy_value + ipw3.eps,
        },
        index=["estimated_policy_value"],
    ).T
    expected_value["relative_estimated_policy_value"] = np.nan
    assert_frame_equal(value, expected_value), "Invalid summarization (policy value)"
        action_dist = policy.compute_batch_action_dist(
            n_sim=100000, n_rounds=boot_bandit_feedback["n_rounds"]
        )
    else:
        policy = Random(
            n_actions=obd.n_actions,
            len_list=obd.len_list,
            random_state=random_state,
        )
        action_dist = policy.compute_batch_action_dist(
            n_sim=100000, n_rounds=boot_bandit_feedback["n_rounds"]
        )
    # estimate the mean reward function using the pre-trained reg_model
    estimated_rewards_by_reg_model = reg_model.predict(
        context=boot_bandit_feedback["context"],
    )
    # evaluate the estimation performance of OPE estimators
    ope = OffPolicyEvaluation(
        bandit_feedback=boot_bandit_feedback,
        ope_estimators=ope_estimators,
    )
    relative_estimation_errors = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    # store relative estimation errors of OPE estimators at each bootstrap
    for (
        estimator_name,
        relative_estimation_error,
    ) in relative_estimation_errors.items():
        relative_ee[estimator_name][b] = relative_estimation_error
    print(f"{b+1}th iteration: {np.round((time.time() - start) / 60, 2)}min")