def process(b: int):
        # sample bootstrap from batch logged bandit feedback
        bandit_feedback = obd.sample_bootstrap_bandit_feedback(random_state=b)
        # estimate the mean reward function with an ML model
        regression_model = RegressionModel(
            n_actions=obd.n_actions,
            len_list=obd.len_list,
            action_context=obd.action_context,
            base_model=base_model_dict[base_model](**hyperparams[base_model]),
        )
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            context=bandit_feedback["context"],
            action=bandit_feedback["action"],
            reward=bandit_feedback["reward"],
            position=bandit_feedback["position"],
            pscore=bandit_feedback["pscore"],
            n_folds=3,  # 3-fold cross-fitting
        )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback,
            ope_estimators=ope_estimators,
        )
        action_dist = np.tile(action_dist_single_round,
                              (bandit_feedback["n_rounds"], 1, 1))
        relative_ee_b = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )

        return relative_ee_b
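Note: the relative estimation error (relative-ee) returned by `evaluate_performance_of_estimators` above is the absolute estimation error normalized by the ground-truth policy value; Example #19 below checks exactly this formula and also shows that it is undefined (ZeroDivisionError) when the ground truth is zero. A minimal sketch of the metric, assuming scalar inputs:

import numpy as np

def relative_estimation_error(estimated_policy_value: float,
                              ground_truth_policy_value: float) -> float:
    # relative-ee = |(V_hat - V) / V|
    return float(
        np.abs(
            (estimated_policy_value - ground_truth_policy_value)
            / ground_truth_policy_value
        )
    )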
Example #2
    def process(i: int):
        # synthetic data generator
        dataset = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=linear_behavior_policy,
            random_state=i,
        )
        # define evaluation policy using IPWLearner
        evaluation_policy = IPWLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]
            ),
        )
        # sample new training and test sets of synthetic logged bandit feedback
        bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        # train the evaluation policy on the training set of the synthetic logged bandit feedback
        evaluation_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )
        # predict the action decisions for the test set of the synthetic logged bandit feedback
        action_dist = evaluation_policy.predict(
            context=bandit_feedback_test["context"],
        )
        # estimate the mean reward function of the test set of synthetic bandit feedback with ML model
        regression_model = RegressionModel(
            n_actions=dataset.n_actions,
            action_context=dataset.action_context,
            base_model=base_model_dict[base_model_for_reg_model](
                **hyperparams[base_model_for_reg_model]
            ),
        )
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            context=bandit_feedback_test["context"],
            action=bandit_feedback_test["action"],
            reward=bandit_feedback_test["reward"],
            n_folds=3,  # 3-fold cross-fitting
            random_state=random_state,
        )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback_test,
            ope_estimators=ope_estimators,
        )
        relative_ee_i = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
                expected_reward=bandit_feedback_test["expected_reward"],
                action_dist=action_dist,
            ),
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )

        return relative_ee_i
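Note: many snippets on this page refer to `base_model_dict`, `hyperparams`, and `ope_estimators` without defining them. The sketch below shows one plausible setup; the concrete scikit-learn models, hyperparameter values, and estimator choices are illustrative assumptions, not taken from the original scripts.

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from obp.ope import DirectMethod, DoublyRobust, InverseProbabilityWeighting

# hypothetical mapping from a model name (e.g. a CLI argument) to a scikit-learn class
base_model_dict = {
    "logistic_regression": LogisticRegression,
    "random_forest": RandomForestClassifier,
}
# hypothetical hyperparameters for each base model
hyperparams = {
    "logistic_regression": {"C": 100, "max_iter": 10000, "random_state": 12345},
    "random_forest": {"n_estimators": 100, "max_depth": 5, "random_state": 12345},
}
# OPE estimators to compare in the experiment
ope_estimators = [DirectMethod(), InverseProbabilityWeighting(), DoublyRobust()]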
Example #3
def test_meta_estimation_format(synthetic_bandit_feedback: BanditFeedback,
                                random_action_dist: np.ndarray) -> None:
    """
    Test the response format of OffPolicyEvaluation
    """
    # single ope estimator
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[dm])
    assert ope_.estimate_policy_values(random_action_dist) == {
        "dm": mock_policy_value
    }, "OffPolicyEvaluation.estimate_policy_values ([DirectMethod]) returns a wrong value"
    assert ope_.estimate_intervals(random_action_dist) == {
        "dm": mock_confidence_interval
    }, "OffPolicyEvaluation.estimate_intervals ([DirectMethod]) returns a wrong value"
    with pytest.raises(AssertionError,
                       match=r"action_dist must be 3-dimensional.*"):
        # action_dist must be 3-dimensional when using OffPolicyEvaluation
        ope_.estimate_policy_values(random_action_dist[:, :, 0])
    # multiple ope estimators
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[dm, ipw])
    assert ope_.estimate_policy_values(random_action_dist) == {
        "dm": mock_policy_value,
        "ipw": mock_policy_value + ipw.eps,
    }, "OffPolicyEvaluation.estimate_policy_values ([DirectMethod, IPW]) returns a wrong value"
    assert ope_.estimate_intervals(random_action_dist) == {
        "dm": mock_confidence_interval,
        "ipw": {k: v + ipw.eps
                for k, v in mock_confidence_interval.items()},
    }, "OffPolicyEvaluation.estimate_intervals ([DirectMethod]) returns a wrong value"
Example #4
    def process(i: int):
        # sample new data of synthetic logged bandit feedback
        bandit_feedback = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        # simulate the evaluation policy
        action_dist = run_bandit_simulation(bandit_feedback=bandit_feedback,
                                            policy=evaluation_policy)
        # estimate the ground-truth policy values of the evaluation policy
        # by Monte-Carlo Simulation using p(r|x,a), the reward distribution
        ground_truth_policy_value = calc_ground_truth_policy_value(
            bandit_feedback=bandit_feedback,
            reward_sampler=dataset.sample_reward,  # p(r|x,a)
            policy=evaluation_policy,
            n_sim=n_sim,  # the number of simulations
        )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback,
            ope_estimators=ope_estimators,
        )
        relative_ee_i = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            action_dist=action_dist,
        )

        return relative_ee_i
Example #5
def test_meta_evaluate_performance_of_estimators_using_invalid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description_1: str,
    metric,
    ground_truth_policy_value,
    description_2: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the response of evaluate_performance_of_estimators using invalid data
    """
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[dm])
    with pytest.raises(ValueError, match=f"{description_2}*"):
        _ = ope_.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            action_dist=action_dist,
            metric=metric,
        )
    # estimate_intervals function is called in summarize_off_policy_estimates
    with pytest.raises(ValueError, match=f"{description_2}*"):
        _ = ope_.summarize_estimators_comparison(
            ground_truth_policy_value=ground_truth_policy_value,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            action_dist=action_dist,
            metric=metric,
        )
Example #6
    def process(i: int):
        # synthetic data generator with uniformly random policy
        dataset = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=None,  # uniformly random
            random_state=i,
        )
        # sample new data of synthetic logged bandit feedback
        bandit_feedback = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        # simulate the evaluation policy
        action_dist = run_bandit_simulation(bandit_feedback=bandit_feedback,
                                            policy=evaluation_policy)
        # estimate the ground-truth policy values of the evaluation policy
        # by Monte-Carlo Simulation using p(r|x,a), the reward distribution
        ground_truth_policy_value = calc_ground_truth_policy_value(
            bandit_feedback=bandit_feedback,
            reward_sampler=dataset.sample_reward,  # p(r|x,a)
            policy=evaluation_policy,
            n_sim=n_sim,  # the number of simulations
        )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback,
            ope_estimators=ope_estimators,
        )
        metric_i = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            action_dist=action_dist,
        )

        return metric_i
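Note: each `process(i)` above returns one dictionary of per-estimator metrics for a single random seed. A typical driver, sketched here under the assumption that joblib handles the parallelism and pandas the aggregation (the original experiment scripts may differ), runs it over many seeds and summarizes the results:

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

n_runs = 100  # number of simulation runs (assumed)
n_jobs = -1   # use all available cores (assumed)

processed = Parallel(n_jobs=n_jobs, verbose=10)(
    delayed(process)(i) for i in np.arange(n_runs)
)
# collect the per-run dictionaries into {estimator_name: [metric per run]}
metric_dict = {name: [] for name in processed[0]}
for metric_i in processed:
    for name, value in metric_i.items():
        metric_dict[name].append(value)
# report the mean and std of the metric over runs for each estimator
results_df = pd.DataFrame(metric_dict).describe().T.round(6)
print(results_df[["mean", "std"]])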
Example #7
def test_meta_estimate_intervals_using_invalid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description_1: str,
    alpha,
    n_bootstrap_samples,
    random_state,
    description_2: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the response of estimate_intervals using invalid data
    """
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[dm])
    with pytest.raises(ValueError, match=f"{description_2}*"):
        _ = ope_.estimate_intervals(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            alpha=alpha,
            n_bootstrap_samples=n_bootstrap_samples,
            random_state=random_state,
        )
    # estimate_intervals function is called in summarize_off_policy_estimates
    with pytest.raises(ValueError, match=f"{description_2}*"):
        _ = ope_.summarize_off_policy_estimates(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            alpha=alpha,
            n_bootstrap_samples=n_bootstrap_samples,
            random_state=random_state,
        )
Example #8
def test_meta_post_init(synthetic_bandit_feedback: BanditFeedback) -> None:
    """
    Test the __post_init__ function
    """
    # __post_init__ saves the latter estimator when the same estimator name is used
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[ipw, ipw2])
    assert ope_.ope_estimators_ == {
        "ipw": ipw2
    }, "__post_init__ returns a wrong value"
    # __post_init__ can handle the same estimator if the estimator names are different
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[ipw, ipw3])
    assert ope_.ope_estimators_ == {
        "ipw": ipw,
        "ipw3": ipw3,
    }, "__post_init__ returns a wrong value"
    # __post__init__ raises RuntimeError when necessary_keys are not included in the bandit_feedback
    necessary_keys = ["action", "position", "reward", "pscore"]
    for i in range(len(necessary_keys)):
        for deleted_keys in itertools.combinations(necessary_keys, i + 1):
            invalid_bandit_feedback_dict = {key: "_" for key in necessary_keys}
            # delete
            for k in deleted_keys:
                del invalid_bandit_feedback_dict[k]
            with pytest.raises(RuntimeError, match=r"Missing key*"):
                _ = OffPolicyEvaluation(
                    bandit_feedback=invalid_bandit_feedback_dict,
                    ope_estimators=[ipw])
Example #9
def test_meta_evaluate_performance_of_estimators_using_valid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description_1: str,
    metric,
    ground_truth_policy_value,
    description_2: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the response of evaluate_performance_of_estimators using valid data
    """
    if metric == "relative-ee":
        # calculate relative-ee
        eval_metric_ope_dict = {
            "ipw":
            np.abs((mock_policy_value + ipw.eps - ground_truth_policy_value) /
                   ground_truth_policy_value),
            "ipw3":
            np.abs((mock_policy_value + ipw3.eps - ground_truth_policy_value) /
                   ground_truth_policy_value),
        }
    else:
        # calculate se
        eval_metric_ope_dict = {
            "ipw":
            (mock_policy_value + ipw.eps - ground_truth_policy_value)**2,
            "ipw3":
            (mock_policy_value + ipw3.eps - ground_truth_policy_value)**2,
        }
    # check performance estimators
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[ipw, ipw3])
    performance = ope_.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        action_dist=action_dist,
        metric=metric,
    )
    for k, v in performance.items():
        assert k in eval_metric_ope_dict, "Invalid key of performance response"
        assert v == eval_metric_ope_dict[
            k], "Invalid value of performance response"
    performance_df = ope_.summarize_estimators_comparison(
        ground_truth_policy_value=ground_truth_policy_value,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        action_dist=action_dist,
        metric=metric,
    )
    assert_frame_equal(
        performance_df,
        pd.DataFrame(eval_metric_ope_dict,
                     index=[metric]).T), "Invalid summarization (performance)"
Example #10
    def process(i: int):
        # sample new training and test sets of synthetic logged bandit feedback
        bandit_feedback_train = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        bandit_feedback_test = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        # train the evaluation policy on the training set of the synthetic logged bandit feedback
        evaluation_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )
        # predict the action decisions for the test set of the synthetic logged bandit feedback
        action_dist = evaluation_policy.predict_proba(
            context=bandit_feedback_test["context"],
            tau=0.1,  # temperature hyperparameter
        )
        # estimate the ground-truth policy values of the evaluation policy
        # using the full expected reward contained in the test set of synthetic bandit feedback
        ground_truth_policy_value = np.average(
            bandit_feedback_test["expected_reward"],
            weights=action_dist[:, :, 0],
            axis=1,
        ).mean()
        # estimate the mean reward function of the test set of synthetic bandit feedback with ML model
        regression_model = RegressionModel(
            n_actions=dataset.n_actions,
            len_list=dataset.len_list,
            action_context=dataset.action_context,
            base_model=base_model_dict[base_model_for_reg_model](
                **hyperparams[base_model_for_reg_model]),
        )
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            context=bandit_feedback_test["context"],
            action=bandit_feedback_test["action"],
            reward=bandit_feedback_test["reward"],
            n_folds=3,  # 3-fold cross-fitting
            random_state=random_state,
        )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback_test,
            ope_estimators=ope_estimators,
        )
        relative_ee_i = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )

        return relative_ee_i
Example #11
def test_meta_estimate_policy_values_using_valid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the response of estimate_policy_values using valid data
    """
    # single ope estimator
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[dm]
    )
    ope_.is_model_dependent = True
    assert ope_.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    ) == {
        "dm": mock_policy_value
    }, "OffPolicyEvaluation.estimate_policy_values ([DirectMethod]) returns a wrong value"
    # multiple ope estimators
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[dm, ipw]
    )
    ope_.is_model_dependent = True
    assert ope_.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    ) == {
        "dm": mock_policy_value,
        "ipw": mock_policy_value + ipw.eps,
    }, "OffPolicyEvaluation.estimate_policy_values ([DirectMethod, IPW]) returns a wrong value"
Example #12
    def process(b: int):
        # load the pre-trained regression model
        with open(reg_model_path / f"reg_model_{b}.pkl", "rb") as f:
            reg_model = pickle.load(f)
        with open(reg_model_path / f"reg_model_mrdr_{b}.pkl", "rb") as f:
            reg_model_mrdr = pickle.load(f)
        with open(reg_model_path / f"is_for_reg_model_{b}.pkl", "rb") as f:
            is_for_reg_model = pickle.load(f)
        # sample bootstrap samples from batch logged bandit feedback
        bandit_feedback = obd.sample_bootstrap_bandit_feedback(
            test_size=test_size,
            is_timeseries_split=is_timeseries_split,
            random_state=b,
        )
        if not is_timeseries_split:
            bandit_feedback["n_rounds"] = (~is_for_reg_model).sum()
            for key_ in ["context", "action", "reward", "pscore", "position"]:
                bandit_feedback[key_] = bandit_feedback[key_][~is_for_reg_model]
        # estimate the mean reward function using the pre-trained reg_model
        estimated_rewards_by_reg_model_default = reg_model.predict(
            context=bandit_feedback["context"], )
        estimated_rewards_by_reg_model_mrdr = reg_model_mrdr.predict(
            context=bandit_feedback["context"], )
        estimated_rewards_by_reg_model = {
            estimator.estimator_name:
            estimated_rewards_by_reg_model_mrdr if estimator.estimator_name
            == "mrdr" else estimated_rewards_by_reg_model_default
            for estimator in ope_estimators
        }
        # evaluate the estimation performance of OPE estimators
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback,
            ope_estimators=ope_estimators,
        )
        action_dist = np.tile(action_dist_single_round,
                              (bandit_feedback["n_rounds"], 1, 1))
        relative_ee_b = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )

        return relative_ee_b
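Note: Example #12 above (like the first snippet on this page) broadcasts a single-round action distribution over all rounds with `np.tile`; obp expects `action_dist` to have shape `(n_rounds, n_actions, len_list)`. A tiny shape check with made-up sizes:

import numpy as np

n_actions, len_list = 80, 3
# hypothetical per-position action-choice probabilities of the evaluation policy
action_dist_single_round = np.full((n_actions, len_list), 1.0 / n_actions)
n_rounds = 1000
action_dist = np.tile(action_dist_single_round, (n_rounds, 1, 1))
assert action_dist.shape == (n_rounds, n_actions, len_list)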
Example #13
def test_response_format_of_ope_estimators_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the response format of ope estimators using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :,
                                                                   np.newaxis]
    action_dist = random_action_dist
    # test all estimators
    all_estimators = ope.__all_estimators__
    estimators = [
        getattr(ope.estimators, estimator_name)()
        for estimator_name in all_estimators
    ]
    # conduct OPE
    ope_instance = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=estimators)
    estimated_policy_value = ope_instance.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward)
    estimated_intervals = ope_instance.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward,
        random_state=12345,
    )
    # check the format of OPE
    for key in estimated_policy_value:
        # check the keys of the output dictionary of the estimate_intervals method
        assert set(estimated_intervals[key].keys()) == set([
            "mean", "95.0% CI (lower)", "95.0% CI (upper)"
        ]), f"Confidence interval of {key} has invalid keys"
        # check the relationship between the means and the confidence bounds estimated by OPE estimators
        assert (
            estimated_intervals[key]["95.0% CI (lower)"] <=
            estimated_policy_value[key]
        ) and (
            estimated_intervals[key]["95.0% CI (upper)"] >=
            estimated_policy_value[key]
        ), f"Estimated policy value of {key} is not included in estimated intervals of that estimator"
        assert (estimated_intervals[key]["mean"] >=
                estimated_intervals[key]["95.0% CI (lower)"]
                ), f"Invalid confidence interval of {key}: lower bound > mean"
        assert (estimated_intervals[key]["mean"] <=
                estimated_intervals[key]["95.0% CI (upper)"]
                ), f"Invalid confidence interval of {key}: upper bound < mean"
Example #14
def test_meta_create_estimator_inputs_using_valid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the _create_estimator_inputs using valid data
    """
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw]
    )
    estimator_inputs = ope_._create_estimator_inputs(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    assert set(estimator_inputs.keys()) == set(["ipw"])
    assert set(estimator_inputs["ipw"].keys()) == set(
        [
            "reward",
            "action",
            "pscore",
            "position",
            "action_dist",
            "estimated_rewards_by_reg_model",
            "estimated_pscore",
            "estimated_importance_weights",
            "p_e_a",
            "pi_b",
            "context",
            "action_embed",
        ]
    ), f"Invalid response of _create_estimator_inputs (test case: {description})"
    # _create_estimator_inputs function is called in the following functions
    _ = ope_.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.summarize_off_policy_estimates(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.evaluate_performance_of_estimators(
        ground_truth_policy_value=0.1,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    _ = ope_.summarize_estimators_comparison(
        ground_truth_policy_value=0.1,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
Example #15
def test_meta_post_init_format(synthetic_bandit_feedback: BanditFeedback,
                               random_action_dist: np.ndarray) -> None:
    """
    Test the post init format of OffPolicyEvaluation
    """
    # __post_init__ saves the latter estimator when the same estimator name is used
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[ipw, ipw2])
    assert ope_.ope_estimators_ == {
        "ipw": ipw2
    }, "__post_init__ returns a wrong value"
    # __post_init__ can handle the same estimator if the estimator names are different
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[ipw, ipw3])
    assert ope_.ope_estimators_ == {
        "ipw": ipw,
        "ipw3": ipw3,
    }, "__post_init__ returns a wrong value"
Example #16
def test_meta_create_estimator_inputs_format(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the _create_estimator_inputs format of OffPolicyEvaluation
    """
    # __post_init__ saves the latter estimator when the same estimator name is used
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[ipw])
    inputs = ope_._create_estimator_inputs(action_dist=None,
                                           estimated_rewards_by_reg_model=None)
    assert set(inputs.keys()) == set([
        "reward",
        "action",
        "pscore",
        "position",
        "action_dist",
        "estimated_rewards_by_reg_model",
    ]), "Invalid response format of _create_estimator_inputs"
Example #17
def test_performance_of_ope_estimators_using_random_evaluation_policy(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    """
    Test the performance of ope estimators using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :,
                                                                   np.newaxis]
    action_dist = random_action_dist
    # compute ground truth policy value using expected reward
    q_pi_e = np.average(expected_reward[:, :, 0],
                        weights=action_dist[:, :, 0],
                        axis=1)
    # compute statistics of ground truth policy value
    gt_mean = q_pi_e.mean()
    gt_std = q_pi_e.std(ddof=1)
    # test most of the estimators (ReplayMethod is not tested because it is out of scope)
    all_estimators = ope.__all_estimators__
    estimators = [
        getattr(ope.estimators, estimator_name)()
        for estimator_name in all_estimators
        if estimator_name not in ["ReplayMethod"]
    ]
    # conduct OPE
    ope_instance = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=estimators)
    estimated_policy_value = ope_instance.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward)
    # check the performance of OPE
    ci_bound = gt_std * 3 / np.sqrt(q_pi_e.shape[0])
    print(f"gt_mean: {gt_mean}, 3 * gt_std / sqrt(n): {ci_bound}")
    for key in estimated_policy_value:
        print(
            f"estimated_value: {estimated_policy_value[key]} ------ estimator: {key}, "
        )
        # test the performance of each estimator
        assert (
            np.abs(gt_mean - estimated_policy_value[key]) <= ci_bound
        ), f"OPE of {key} did not work well (absolute error is greater than 3*sigma)"
Example #18
    def process(i: int):
        # split the original data into training and evaluation sets
        dataset.split_train_eval(eval_size=eval_size, random_state=i)
        # obtain logged bandit feedback generated by behavior policy
        bandit_feedback = dataset.obtain_batch_bandit_feedback(random_state=i)
        # obtain action choice probabilities by an evaluation policy
        action_dist = dataset.obtain_action_dist_by_eval_policy(
            base_classifier_e=base_model_dict[base_model_for_evaluation_policy]
            (**hyperparams[base_model_for_evaluation_policy]),
            alpha_e=alpha_e,
        )
        # calculate the ground-truth performance of the evaluation policy
        ground_truth_policy_value = dataset.calc_ground_truth_policy_value(
            action_dist=action_dist)
        # estimate the mean reward function of the evaluation set of multi-class classification data with ML model
        regression_model = RegressionModel(
            n_actions=dataset.n_actions,
            base_model=base_model_dict[base_model_for_reg_model](
                **hyperparams[base_model_for_reg_model]),
        )
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            context=bandit_feedback["context"],
            action=bandit_feedback["action"],
            reward=bandit_feedback["reward"],
            n_folds=3,  # 3-fold cross-fitting
            random_state=random_state,
        )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback,
            ope_estimators=ope_estimators,
        )
        relative_ee_i = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )

        return relative_ee_i
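Note: Example #18 assumes a `dataset` object exposing `split_train_eval`, `obtain_batch_bandit_feedback`, `obtain_action_dist_by_eval_policy`, and `calc_ground_truth_policy_value`. In obp this interface is provided by `MultiClassToBanditReduction`; a minimal construction is sketched below, where the classification data and behavior-policy settings (the digits dataset, `alpha_b`) are chosen purely for illustration.

from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression

from obp.dataset import MultiClassToBanditReduction

# turn a standard classification dataset into logged bandit feedback
X, y = load_digits(return_X_y=True)
dataset = MultiClassToBanditReduction(
    X=X,
    y=y,
    base_classifier_b=LogisticRegression(max_iter=10000, random_state=12345),
    alpha_b=0.8,  # quality of the behavior policy (assumed value)
    dataset_name="digits",
)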
Example #19
def test_meta_evaluate_performance_of_estimators(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    gt = 0.5
    # calculate relative-ee
    eval_metric_ope_dict = {
        "ipw": np.abs((mock_policy_value + ipw.eps - gt) / gt),
        "ipw3": np.abs((mock_policy_value + ipw3.eps - gt) / gt),
    }
    # check performance estimators
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[ipw, ipw3])
    performance = ope_.evaluate_performance_of_estimators(
        ground_truth_policy_value=gt,
        action_dist=random_action_dist,
        metric="relative-ee",
    )
    for k, v in performance.items():
        assert k in eval_metric_ope_dict, "Invalid key of performance response"
        assert v == eval_metric_ope_dict[
            k], "Invalid value of performance response"
    # zero division error when using relative-ee
    with pytest.raises(ZeroDivisionError, match=r"float division by zero"):
        _ = ope_.evaluate_performance_of_estimators(
            ground_truth_policy_value=0.0,
            action_dist=random_action_dist,
            metric="relative-ee",
        )
    # check summarization
    performance_df = ope_.summarize_estimators_comparison(
        ground_truth_policy_value=gt,
        action_dist=random_action_dist,
        metric="relative-ee",
    )
    assert_frame_equal(
        performance_df,
        pd.DataFrame(eval_metric_ope_dict,
                     index=["relative-ee"
                            ]).T), "Invalid summarization (performance)"
Example #20
def test_meta_summarize_off_policy_estimates(
        synthetic_bandit_feedback: BanditFeedback,
        random_action_dist: np.ndarray) -> None:
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[ipw, ipw3])
    value, interval = ope_.summarize_off_policy_estimates(random_action_dist)
    expected_value = pd.DataFrame(
        {
            "ipw": mock_policy_value + ipw.eps,
            "ipw3": mock_policy_value + ipw3.eps,
        },
        index=["estimated_policy_value"],
    ).T
    expected_interval = pd.DataFrame({
        "ipw": {k: v + ipw.eps
                for k, v in mock_confidence_interval.items()},
        "ipw3": {k: v + ipw3.eps
                 for k, v in mock_confidence_interval.items()},
    }).T
    assert_frame_equal(value,
                       expected_value), "Invalid summarization (policy value)"
    assert_frame_equal(interval,
                       expected_interval), "Invalid summarization (interval)"
Example #21
def test_meta_create_estimator_inputs_using_invalid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the _create_estimator_inputs using invalid data
    """
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[ipw]
    )
    # raise ValueError when the shape of two arrays are different
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_._create_estimator_inputs(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    # _create_estimator_inputs function is called in the following functions
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.estimate_policy_values(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.estimate_intervals(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.summarize_off_policy_estimates(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.evaluate_performance_of_estimators(
            ground_truth_policy_value=0.1,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.summarize_estimators_comparison(
            ground_truth_policy_value=0.1,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
Example #22
def test_meta_estimate_intervals_using_valid_input_data(
    action_dist,
    estimated_rewards_by_reg_model,
    description_1: str,
    alpha: float,
    n_bootstrap_samples: int,
    random_state: int,
    description_2: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the response of estimate_intervals using valid data
    """
    # single ope estimator
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[dm]
    )
    assert ope_.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        alpha=alpha,
        n_bootstrap_samples=n_bootstrap_samples,
        random_state=random_state,
    ) == {
        "dm": mock_confidence_interval
    }, "OffPolicyEvaluation.estimate_intervals ([DirectMethod]) returns a wrong value"
    # multiple ope estimators
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[dm, ipw]
    )
    assert ope_.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        alpha=alpha,
        n_bootstrap_samples=n_bootstrap_samples,
        random_state=random_state,
    ) == {
        "dm": mock_confidence_interval,
        "ipw": {k: v + ipw.eps for k, v in mock_confidence_interval.items()},
    }, "OffPolicyEvaluation.estimate_intervals ([DirectMethod, IPW]) returns a wrong value"
Example #23
def test_meta_estimated_rewards_by_reg_model_inputs(
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the estimate_policy_values/estimate_intervals functions wrt estimated_rewards_by_reg_model
    """
    ope_ = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=[DirectMethod()]
    )

    action_dist = np.zeros(
        (synthetic_bandit_feedback["n_rounds"], synthetic_bandit_feedback["n_actions"])
    )
    with pytest.raises(ValueError):
        ope_.estimate_policy_values(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=None,
        )

    with pytest.raises(ValueError):
        ope_.estimate_intervals(
            action_dist=action_dist,
            estimated_rewards_by_reg_model=None,
        )
Example #24
def test_performance_of_ope_estimators_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the performance of ope estimators using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis]
    action_dist = random_action_dist
    # compute ground truth policy value using expected reward
    q_pi_e = np.average(expected_reward[:, :, 0], weights=action_dist[:, :, 0], axis=1)
    # compute statistics of ground truth policy value
    gt_mean = q_pi_e.mean()
    # test most of the estimators (ReplayMethod is not tested because it is out of scope)
    all_estimators = ope.__all_estimators__
    estimators_standard = [
        getattr(ope.estimators, estimator_name)()
        for estimator_name in all_estimators
        if estimator_name not in ["ReplayMethod"]
    ]
    all_estimators_tuning = ope.__all_estimators_tuning__
    estimators_tuning = [
        getattr(ope.estimators_tuning, estimator_name)(
            lambdas=[1, 100, 10000, np.inf],
            tuning_method=tuning_method,
        )
        for estimator_name in all_estimators_tuning
        for tuning_method in ["slope", "mse"]
    ]
    all_estimators_tuning_sg = ope.__all_estimators_tuning_sg__
    estimators_tuning_sg = [
        getattr(ope.estimators_tuning, estimator_name)(
            lambdas=[0.001, 0.01, 0.1, 1.0],
            tuning_method=tuning_method,
        )
        for estimator_name in all_estimators_tuning_sg
        for tuning_method in ["slope", "mse"]
    ]
    estimators = estimators_standard + estimators_tuning + estimators_tuning_sg
    # skip estimation
    estimated_pscore = None
    estimated_importance_weights = (
        random_action_dist[
            np.arange(synthetic_bandit_feedback["action"].shape[0]),
            synthetic_bandit_feedback["action"],
            np.zeros(
                synthetic_bandit_feedback["action"].shape[0], dtype=int
            ),  # position is None
        ]
        / synthetic_bandit_feedback["pscore"]
    )
    # conduct OPE
    ope_instance = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=estimators
    )
    estimated_policy_value = ope_instance.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward,
        estimated_pscore=estimated_pscore,
        estimated_importance_weights=estimated_importance_weights,
    )
    # check the performance of OPE
    print(f"gt_mean: {gt_mean}")
    for key in estimated_policy_value:
        print(
            f"estimated_value: {estimated_policy_value[key]} ------ estimator: {key}, "
        )
        # test the performance of each estimator
        assert (
            np.abs(gt_mean - estimated_policy_value[key]) / gt_mean <= 0.1
        ), f"OPE of {key} did not work well (relative absolute error is greater than 10%)"
Example #25
    # ground-truth policy value of the random policy, i.e., the empirical mean
    # of the factual (observed) rewards (on-policy estimation)
    ground_truth = bandit_feedback["reward"].mean()

    # a base ML model for regression model used in Direct Method and Doubly Robust
    base_model = CalibratedClassifierCV(
        HistGradientBoostingClassifier(**hyperparams))
    # run a counterfactual bandit algorithm on logged bandit feedback data
    selected_actions = run_bandit_simulation(bandit_feedback=bandit_feedback,
                                             policy=policy)
    # estimate the policy value of a given counterfactual algorithm by the three OPE estimators.
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        regression_model=RegressionModel(base_model=base_model),
        action_context=obd.action_context,
        ope_estimators=[
            InverseProbabilityWeighting(),
            DirectMethod(),
            DoublyRobust()
        ],
    )
    estimated_policy_value, estimated_interval = ope.summarize_off_policy_estimates(
        selected_actions=selected_actions)

    # calculate estimated policy value relative to that of the behavior policy
    print("=" * 70)
    print(f"random_state={random_state}: counterfactual policy={policy_name}")
    print("-" * 70)
    estimated_policy_value["relative_estimated_policy_value"] = (
        estimated_policy_value.estimated_policy_value / ground_truth)
    print(estimated_policy_value)
    print("=" * 70)
Example #26
    def process(i: int):
        # synthetic data generator
        dataset = SyntheticBanditDatasetWithActionEmbeds(
            n_actions=n_actions,
            dim_context=dim_context,
            beta=3.0,
            n_cat_dim=3,
            n_cat_per_dim=5,
            reward_function=logistic_reward_function,
            random_state=i,
        )
        # define evaluation policy using IPWLearner
        evaluation_policy = IPWLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_iw_estimator](
                **hyperparams[base_model_for_iw_estimator]),
        )
        # sample new training and test sets of synthetic logged bandit data
        bandit_feedback_train = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        bandit_feedback_test = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        # train the evaluation policy on the training set of the synthetic logged bandit data
        evaluation_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )
        # predict the action decisions for the test set of the synthetic logged bandit data
        action_dist = evaluation_policy.predict_proba(
            context=bandit_feedback_test["context"], )
        # estimate the reward function of the test set of synthetic bandit feedback with ML model
        regression_model = RegressionModel(
            n_actions=dataset.n_actions,
            action_context=dataset.action_context,
            base_model=base_model_dict[base_model_for_reg_model](
                **hyperparams[base_model_for_reg_model]),
        )
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            context=bandit_feedback_test["context"],
            action=bandit_feedback_test["action"],
            reward=bandit_feedback_test["reward"],
            n_folds=2,
            random_state=12345,
        )
        # fit propensity score estimators
        pscore_estimator = PropensityScoreEstimator(
            len_list=1,
            n_actions=n_actions,
            base_model=base_model_dict[base_model_for_pscore_estimator](
                **hyperparams[base_model_for_pscore_estimator]),
            calibration_cv=3,
        )
        estimated_pscore = pscore_estimator.fit_predict(
            action=bandit_feedback_test["action"],
            position=bandit_feedback_test["position"],
            context=bandit_feedback_test["context"],
            n_folds=3,
            random_state=12345,
        )
        # fit importance weight estimators
        estimated_importance_weights_dict = {}
        for clf_name, clf_arguments in bipw_model_configurations.items():
            clf = ImportanceWeightEstimator(
                len_list=1,
                n_actions=n_actions,
                fitting_method=clf_arguments["fitting_method"],
                base_model=clf_arguments["base_model"],
            )
            estimated_importance_weights_dict[clf_name] = clf.fit_predict(
                action=bandit_feedback_test["action"],
                context=bandit_feedback_test["context"],
                action_dist=action_dist,
                position=bandit_feedback_test["position"],
                n_folds=2,
                evaluate_model_performance=False,
                random_state=12345,
            )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback_test,
            ope_estimators=ope_estimators + [
                MarginalizedInverseProbabilityWeighting(n_actions=n_actions,
                                                        estimator_name="mipw"),
                MarginalizedInverseProbabilityWeighting(
                    n_actions=n_actions,
                    embedding_selection_method="greedy",
                    estimator_name="mipw (greedy selection)",
                ),
                SelfNormalizedMarginalizedInverseProbabilityWeighting(
                    n_actions=n_actions, estimator_name="snmipw"),
            ],
        )
        relative_ee_i = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
                expected_reward=bandit_feedback_test["expected_reward"],
                action_dist=action_dist,
            ),
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            estimated_pscore=estimated_pscore,
            estimated_importance_weights=estimated_importance_weights_dict,
            action_embed=bandit_feedback_test["action_embed"],
            pi_b=bandit_feedback_test["pi_b"],
            metric="relative-ee",
        )

        return relative_ee_i
Example #27
    )

    evaluation_of_ope_results = {
        est.estimator_name: np.zeros(n_boot_samples) for est in ope_estimators
    }
    for b in np.arange(n_boot_samples):
        # sample bootstrap from batch logged bandit feedback
        boot_bandit_feedback = obd.sample_bootstrap_bandit_feedback(random_state=b)
        # run a counterfactual bandit algorithm on logged bandit feedback data
        selected_actions = run_bandit_simulation(
            bandit_feedback=boot_bandit_feedback, policy=policy
        )
        # evaluate the estimation performance of OPE estimators
        ope = OffPolicyEvaluation(
            bandit_feedback=boot_bandit_feedback,
            action_context=obd.action_context,
            regression_model=RegressionModel(base_model=base_model),
            ope_estimators=ope_estimators,
        )
        relative_estimation_errors = ope.evaluate_performance_of_estimators(
            selected_actions=selected_actions,
            ground_truth_policy_value=ground_truth_policy_value,
        )
        policy.initialize()
        # store relative estimation errors of OPE estimators at each split
        for (
            estimator_name,
            relative_estimation_error,
        ) in relative_estimation_errors.items():
            evaluation_of_ope_results[estimator_name][b] = relative_estimation_error

    # estimate confidence intervals of relative estimation by nonparametric bootstrap method
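Note: the snippet cuts off before the confidence-interval step announced in its last comment. One way to finish it, sketched here with a plain NumPy percentile bootstrap rather than any particular obp helper, is:

import numpy as np

def bootstrap_ci(samples: np.ndarray, alpha: float = 0.05,
                 n_bootstrap_samples: int = 10000, random_state: int = 12345) -> dict:
    # nonparametric bootstrap confidence interval for the mean of `samples`
    rng = np.random.default_rng(random_state)
    boot_means = np.array([
        rng.choice(samples, size=samples.shape[0], replace=True).mean()
        for _ in range(n_bootstrap_samples)
    ])
    # keys mirror the interval format used elsewhere on this page, e.g. "95.0% CI (lower)"
    return {
        "mean": samples.mean(),
        f"{100 * (1 - alpha)}% CI (lower)": np.percentile(boot_means, 100 * (alpha / 2)),
        f"{100 * (1 - alpha)}% CI (upper)": np.percentile(boot_means, 100 * (1 - alpha / 2)),
    }

relative_ee_intervals = {
    name: bootstrap_ci(np.asarray(errors))
    for name, errors in evaluation_of_ope_results.items()
}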
Example #28
def test_response_format_of_ope_estimators_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the response format of ope estimators using synthetic bandit data and random evaluation policy
    """
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis]
    action_dist = random_action_dist
    # test all estimators
    all_estimators = ope.__all_estimators__
    estimators_standard = [
        getattr(ope.estimators, estimator_name)() for estimator_name in all_estimators
    ]
    all_estimators_tuning = ope.__all_estimators_tuning__
    estimators_tuning = [
        getattr(ope.estimators_tuning, estimator_name)(
            lambdas=[1, 100, 10000, np.inf],
            tuning_method=tuning_method,
        )
        for estimator_name in all_estimators_tuning
        for tuning_method in ["slope", "mse"]
    ]
    all_estimators_tuning_sg = ope.__all_estimators_tuning_sg__
    estimators_tuning_sg = [
        getattr(ope.estimators_tuning, estimator_name)(
            lambdas=[0.001, 0.01, 0.1, 1.0],
            tuning_method=tuning_method,
        )
        for estimator_name in all_estimators_tuning_sg
        for tuning_method in ["slope", "mse"]
    ]
    estimators = estimators_standard + estimators_tuning + estimators_tuning_sg
    # skip estimation
    estimated_pscore = None
    estimated_importance_weights = (
        random_action_dist[
            np.arange(synthetic_bandit_feedback["action"].shape[0]),
            synthetic_bandit_feedback["action"],
            np.zeros(
                synthetic_bandit_feedback["action"].shape[0], dtype=int
            ),  # position is None
        ]
        / synthetic_bandit_feedback["pscore"]
    )
    # conduct OPE
    ope_instance = OffPolicyEvaluation(
        bandit_feedback=synthetic_bandit_feedback, ope_estimators=estimators
    )
    estimated_policy_value = ope_instance.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward,
        estimated_pscore=estimated_pscore,
        estimated_importance_weights=estimated_importance_weights,
    )
    estimated_intervals = ope_instance.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=expected_reward,
        estimated_pscore=estimated_pscore,
        estimated_importance_weights=estimated_importance_weights,
        random_state=12345,
    )
    # check the format of OPE
    for key in estimated_policy_value:
        # check the keys of the output dictionary of the estimate_intervals method
        assert set(estimated_intervals[key].keys()) == set(
            ["mean", "95.0% CI (lower)", "95.0% CI (upper)"]
        ), f"Confidence interval of {key} has invalid keys"
        # check the relationship between the means and the confidence bounds estimated by OPE estimators
        assert (
            estimated_intervals[key]["95.0% CI (lower)"] <= estimated_policy_value[key]
        ) and (
            estimated_intervals[key]["95.0% CI (upper)"] >= estimated_policy_value[key]
        ), f"Estimated policy value of {key} is not included in estimated intervals of that estimator"
        assert (
            estimated_intervals[key]["mean"]
            >= estimated_intervals[key]["95.0% CI (lower)"]
        ), f"Invalid confidence interval of {key}: lower bound > mean"
        assert (
            estimated_intervals[key]["mean"]
            <= estimated_intervals[key]["95.0% CI (upper)"]
        ), f"Invalid confidence interval of {key}: upper bound < mean"
Example #29
def test_meta_summarize_off_policy_estimates(
    action_dist,
    estimated_rewards_by_reg_model,
    description_1: str,
    alpha: float,
    n_bootstrap_samples: int,
    random_state: int,
    description_2: str,
    synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the response of summarize_off_policy_estimates using valid data
    """
    ope_ = OffPolicyEvaluation(bandit_feedback=synthetic_bandit_feedback,
                               ope_estimators=[ipw, ipw3])
    value, interval = ope_.summarize_off_policy_estimates(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        alpha=alpha,
        n_bootstrap_samples=n_bootstrap_samples,
        random_state=random_state,
    )
    expected_value = pd.DataFrame(
        {
            "ipw": mock_policy_value + ipw.eps,
            "ipw3": mock_policy_value + ipw3.eps,
        },
        index=["estimated_policy_value"],
    ).T
    expected_value["relative_estimated_policy_value"] = (
        expected_value["estimated_policy_value"] /
        synthetic_bandit_feedback["reward"].mean())
    expected_interval = pd.DataFrame({
        "ipw": {k: v + ipw.eps
                for k, v in mock_confidence_interval.items()},
        "ipw3": {k: v + ipw3.eps
                 for k, v in mock_confidence_interval.items()},
    }).T
    assert_frame_equal(value,
                       expected_value), "Invalid summarization (policy value)"
    assert_frame_equal(interval,
                       expected_interval), "Invalid summarization (interval)"
    # check relative estimated policy value when the average of bandit_feedback["reward"] is zero
    zero_reward_bandit_feedback = deepcopy(synthetic_bandit_feedback)
    zero_reward_bandit_feedback["reward"] = np.zeros(
        zero_reward_bandit_feedback["reward"].shape[0])
    ope_ = OffPolicyEvaluation(bandit_feedback=zero_reward_bandit_feedback,
                               ope_estimators=[ipw, ipw3])
    value, _ = ope_.summarize_off_policy_estimates(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        alpha=alpha,
        n_bootstrap_samples=n_bootstrap_samples,
        random_state=random_state,
    )
    expected_value = pd.DataFrame(
        {
            "ipw": mock_policy_value + ipw.eps,
            "ipw3": mock_policy_value + ipw3.eps,
        },
        index=["estimated_policy_value"],
    ).T
    expected_value["relative_estimated_policy_value"] = np.nan
    assert_frame_equal(value,
                       expected_value), "Invalid summarization (policy value)"
            action_dist = policy.compute_batch_action_dist(
                n_sim=100000, n_rounds=boot_bandit_feedback["n_rounds"])
        else:
            policy = Random(
                n_actions=obd.n_actions,
                len_list=obd.len_list,
                random_state=random_state,
            )
            action_dist = policy.compute_batch_action_dist(
                n_sim=100000, n_rounds=boot_bandit_feedback["n_rounds"])
        # estimate the mean reward function using the pre-trained reg_model
        estimated_rewards_by_reg_model = reg_model.predict(
            context=boot_bandit_feedback["context"], )
        # evaluate the estimation performance of OPE estimators
        ope = OffPolicyEvaluation(
            bandit_feedback=boot_bandit_feedback,
            ope_estimators=ope_estimators,
        )
        relative_estimation_errors = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
        # store relative estimation errors of OPE estimators at each bootstrap
        for (
                estimator_name,
                relative_estimation_error,
        ) in relative_estimation_errors.items():
            relative_ee[estimator_name][b] = relative_estimation_error

        print(
            f"{b+1}th iteration: {np.round((time.time() - start) / 60, 2)}min")