Example 1
    def process(i: int):
        # sample a new batch of synthetic logged bandit feedback
        bandit_feedback = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        # simulate the evaluation policy
        action_dist = run_bandit_simulation(bandit_feedback=bandit_feedback,
                                            policy=evaluation_policy)
        # estimate the ground-truth policy values of the evaluation policy
        # by Monte-Carlo Simulation using p(r|x,a), the reward distribution
        ground_truth_policy_value = calc_ground_truth_policy_value(
            bandit_feedback=bandit_feedback,
            reward_sampler=dataset.sample_reward,  # p(r|x,a)
            policy=evaluation_policy,
            n_sim=n_sim,  # the number of simulations
        )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback,
            ope_estimators=ope_estimators,
        )
        relative_ee_i = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            action_dist=action_dist,
        )

        return relative_ee_i
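
The `process` function above closes over names defined in an enclosing scope (`dataset`, `n_rounds`, `n_sim`, `evaluation_policy`, `ope_estimators`) and over obp's simulator helpers; `i` is simply the index of the replication. The following is a minimal setup sketch, not the original script's configuration: the dataset size, the epsilon-greedy evaluation policy, the choice of estimators, and the import path of the simulator helpers (which moved between obp versions) are all assumptions.

from joblib import Parallel, delayed

from obp.dataset import SyntheticBanditDataset, logistic_reward_function
from obp.ope import (
    InverseProbabilityWeighting,
    OffPolicyEvaluation,
    SelfNormalizedInverseProbabilityWeighting,
)
from obp.policy import EpsilonGreedy
# module path of the simulator helpers may differ across obp versions
from obp.simulator import calc_ground_truth_policy_value, run_bandit_simulation

# synthetic data generator shared by all replications
dataset = SyntheticBanditDataset(
    n_actions=10,
    dim_context=5,
    reward_function=logistic_reward_function,
    random_state=12345,
)
n_rounds = 10000  # size of each logged dataset
n_sim = 1000  # number of Monte-Carlo simulations for the ground-truth value
evaluation_policy = EpsilonGreedy(n_actions=dataset.n_actions, epsilon=0.1)
# IPW-style estimators only, since no regression model is passed to the OPE step
ope_estimators = [
    InverseProbabilityWeighting(),
    SelfNormalizedInverseProbabilityWeighting(),
]

# run independent replications of `process`, e.g. in parallel with joblib
relative_ee_list = Parallel(n_jobs=-1)(delayed(process)(i) for i in range(10))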
Example 2
def test_meta_evaluate_performance_of_estimators(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    gt = 0.5
    # calculate relative-ee
    eval_metric_ope_dict = {
        "ipw": np.abs((mock_policy_value + ipw.eps - gt) / gt),
        "ipw3": np.abs((mock_policy_value + ipw3.eps - gt) / gt),
    }
    # check performance estimators
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[ipw, ipw3])
    performance = ope_.evaluate_performance_of_estimators(
        ground_truth_policy_value=gt,
        action_dist=random_action_dist,
        metric="relative-ee",
    )
    for k, v in performance.items():
        assert k in eval_metric_ope_dict, "Invalid key of performance response"
        assert v == eval_metric_ope_dict[k], "Invalid value of performance response"
    # zero division error when using relative-ee
    with pytest.raises(ZeroDivisionError, match=r"float division by zero"):
        _ = ope_.evaluate_performance_of_estimators(
            ground_truth_policy_value=0.0,
            action_dist=random_action_dist,
            metric="relative-ee",
        )
    # check summarization
    performance_df = ope_.summarize_estimators_comparison(
        ground_truth_policy_value=gt,
        action_dist=random_action_dist,
        metric="relative-ee",
    )
    # assert_frame_equal raises its own AssertionError on mismatch
    assert_frame_equal(
        performance_df,
        pd.DataFrame(eval_metric_ope_dict, index=["relative-ee"]).T,
    )
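
This test comes from obp's test suite and relies on pytest fixtures (`synthetic_bandit_feedback`, `random_action_dist`) and mocked estimators (`mock_policy_value`, `ipw`, `ipw3` with an `eps` attribute) defined elsewhere in that suite. The sketch below is a rough, hypothetical reconstruction of those pieces; the eps values, fixture sizes, and mock structure are assumptions and may differ from the library's actual test code.

from dataclasses import dataclass
from typing import Dict

import numpy as np
import pytest
from obp.dataset import SyntheticBanditDataset
from obp.ope import BaseOffPolicyEstimator
from obp.types import BanditFeedback

mock_policy_value = 0.5  # constant value returned by the mocked estimators


@dataclass
class InverseProbabilityWeightingMock(BaseOffPolicyEstimator):
    """Mock estimator returning mock_policy_value shifted by eps."""

    eps: float = 0.1
    estimator_name: str = "ipw"

    def _estimate_round_rewards(self, **kwargs) -> np.ndarray:
        return np.zeros(1)

    def estimate_policy_value(self, **kwargs) -> float:
        return mock_policy_value + self.eps

    def estimate_interval(self, **kwargs) -> Dict[str, float]:
        return {"mean": mock_policy_value + self.eps}


ipw = InverseProbabilityWeightingMock(eps=0.02, estimator_name="ipw")
ipw3 = InverseProbabilityWeightingMock(eps=0.05, estimator_name="ipw3")


@pytest.fixture(scope="session")
def synthetic_bandit_feedback() -> BanditFeedback:
    # small synthetic log generated by a behavior policy
    dataset = SyntheticBanditDataset(n_actions=10, dim_context=5, random_state=12345)
    return dataset.obtain_batch_bandit_feedback(n_rounds=10000)


@pytest.fixture(scope="session")
def random_action_dist(synthetic_bandit_feedback: BanditFeedback) -> np.ndarray:
    # uniformly random evaluation policy, shape (n_rounds, n_actions, len_list)
    n_rounds = synthetic_bandit_feedback["n_rounds"]
    n_actions = synthetic_bandit_feedback["n_actions"]
    return np.ones((n_rounds, n_actions, 1)) / n_actions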
Example 3
    def process(i: int):
        # split the original data into training and evaluation sets
        dataset.split_train_eval(eval_size=eval_size, random_state=i)
        # obtain logged bandit feedback generated by behavior policy
        bandit_feedback = dataset.obtain_batch_bandit_feedback(random_state=i)
        # obtain action choice probabilities by an evaluation policy
        action_dist = dataset.obtain_action_dist_by_eval_policy(
            base_classifier_e=base_model_dict[base_model_for_evaluation_policy]
            (**hyperparams[base_model_for_evaluation_policy]),
            alpha_e=alpha_e,
        )
        # calculate the ground-truth performance of the evaluation policy
        ground_truth_policy_value = dataset.calc_ground_truth_policy_value(
            action_dist=action_dist)
        # estimate the mean reward function of the evaluation set with an ML model
        regression_model = RegressionModel(
            n_actions=dataset.n_actions,
            base_model=base_model_dict[base_model_for_reg_model](
                **hyperparams[base_model_for_reg_model]),
        )
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            context=bandit_feedback["context"],
            action=bandit_feedback["action"],
            reward=bandit_feedback["reward"],
            n_folds=3,  # 3-fold cross-fitting
            random_state=random_state,
        )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback,
            ope_estimators=ope_estimators,
        )
        relative_ee_i = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )

        return relative_ee_i
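
As in the first example, this `process` function closes over objects created outside it: `dataset` is an `obp.dataset.MultiClassToBanditReduction` instance, and `eval_size`, `alpha_e`, `random_state`, `base_model_dict`, `hyperparams`, the base-model keys, and `ope_estimators` come from the surrounding script. Below is a minimal setup sketch under assumed values; the digits dataset, the classifiers, and the hyperparameters are illustrative choices, not the original configuration.

from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from obp.dataset import MultiClassToBanditReduction
from obp.ope import (
    DirectMethod,
    DoublyRobust,
    InverseProbabilityWeighting,
    OffPolicyEvaluation,
    RegressionModel,
)

# turn a multi-class classification dataset into logged bandit feedback
X, y = load_digits(return_X_y=True)
dataset = MultiClassToBanditReduction(
    X=X,
    y=y,
    base_classifier_b=LogisticRegression(max_iter=10000, random_state=12345),
    alpha_b=0.8,  # quality of the behavior policy
)

eval_size = 0.7  # fraction of the data used as the evaluation set
alpha_e = 0.9  # quality of the evaluation policy
random_state = 12345
base_model_dict = {
    "logistic_regression": LogisticRegression,
    "random_forest": RandomForestClassifier,
}
hyperparams = {
    "logistic_regression": {"max_iter": 10000, "random_state": 12345},
    "random_forest": {"n_estimators": 100, "random_state": 12345},
}
base_model_for_evaluation_policy = "logistic_regression"
base_model_for_reg_model = "random_forest"
# DM and DR are usable here because estimated_rewards_by_reg_model is supplied
ope_estimators = [InverseProbabilityWeighting(), DirectMethod(), DoublyRobust()]

# average relative-ee over several independent train/evaluation splits
relative_ee_list = [process(i) for i in range(5)]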