Example #1
def test_meta_evaluate_performance_of_estimators_using_invalid_input_data(
    evaluation_policy_pscore,
    evaluation_policy_pscore_item_position,
    evaluation_policy_pscore_cascade,
    description_1: str,
    metric,
    ground_truth_policy_value,
    description_2: str,
    synthetic_slate_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the response of evaluate_performance_of_estimators using invalid data
    """
    ope_ = SlateOffPolicyEvaluation(
        bandit_feedback=synthetic_slate_bandit_feedback, ope_estimators=[iips])
    with pytest.raises(ValueError, match=f"{description_2}*"):
        _ = ope_.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            evaluation_policy_pscore_item_position=
            evaluation_policy_pscore_item_position,
            metric=metric,
        )
    # evaluate_performance_of_estimators is called in summarize_estimators_comparison
    with pytest.raises(ValueError, match=f"{description_2}*"):
        _ = ope_.summarize_estimators_comparison(
            ground_truth_policy_value=ground_truth_policy_value,
            evaluation_policy_pscore_item_position=
            evaluation_policy_pscore_item_position,
            metric=metric,
        )
Example #2
def test_meta_estimate_intervals_using_invalid_input_data(
    evaluation_policy_pscore,
    evaluation_policy_pscore_item_position,
    evaluation_policy_pscore_cascade,
    description_1: str,
    alpha,
    n_bootstrap_samples,
    random_state,
    description_2: str,
    synthetic_slate_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the response of estimate_intervals using invalid data
    """
    ope_ = SlateOffPolicyEvaluation(
        bandit_feedback=synthetic_slate_bandit_feedback, ope_estimators=[iips])
    with pytest.raises(ValueError, match=f"{description_2}*"):
        _ = ope_.estimate_intervals(
            evaluation_policy_pscore_item_position=
            evaluation_policy_pscore_item_position,
            alpha=alpha,
            n_bootstrap_samples=n_bootstrap_samples,
            random_state=random_state,
        )
    # estimate_intervals function is called in summarize_off_policy_estimates
    with pytest.raises(ValueError, match=f"{description_2}*"):
        _ = ope_.summarize_off_policy_estimates(
            evaluation_policy_pscore_item_position=
            evaluation_policy_pscore_item_position,
            alpha=alpha,
            n_bootstrap_samples=n_bootstrap_samples,
            random_state=random_state,
        )
Example #3
def test_meta_post_init(
        synthetic_slate_bandit_feedback: BanditFeedback) -> None:
    """
    Test the __post_init__ function
    """
    # __post_init__ saves the latter estimator when the same estimator name is used
    ope_ = SlateOffPolicyEvaluation(
        bandit_feedback=synthetic_slate_bandit_feedback,
        ope_estimators=[sips, sips2])
    assert ope_.ope_estimators_ == {
        "sips": sips2
    }, "__post_init__ returns a wrong value"
    # __post_init__ can handle the same estimator if the estimator names are different
    ope_ = SlateOffPolicyEvaluation(
        bandit_feedback=synthetic_slate_bandit_feedback,
        ope_estimators=[sips, sips3])
    assert ope_.ope_estimators_ == {
        "sips": sips,
        "sips3": sips3,
    }, "__post_init__ returns a wrong value"
    # __post_init__ raises RuntimeError when necessary keys are missing from bandit_feedback
    necessary_keys = ["slate_id", "position", "reward"]
    for i in range(len(necessary_keys)):
        for deleted_keys in itertools.combinations(necessary_keys, i + 1):
            invalid_bandit_feedback_dict = {key: "_" for key in necessary_keys}
            # delete
            for k in deleted_keys:
                del invalid_bandit_feedback_dict[k]
            with pytest.raises(RuntimeError, match=r"Missing key*"):
                _ = SlateOffPolicyEvaluation(
                    bandit_feedback=invalid_bandit_feedback_dict,
                    ope_estimators=[sips])
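Note: the tests above rely on mock estimators (sips, sips2, sips3, iips, rips, cascade_dr) and mock return values (mock_policy_value, mock_confidence_interval) defined elsewhere in the test module. The following is a minimal, hypothetical sketch of what such a mock might look like; the class name, the eps offsets, and the constant 0.5 are illustrative assumptions, not the actual fixtures.

# Hypothetical sketch of the mock fixtures assumed above (the real test module
# defines its own mocks); estimator_name is the key used by
# SlateOffPolicyEvaluation.__post_init__ to build ope_estimators_.
from dataclasses import dataclass

mock_policy_value = 0.5  # illustrative constant returned by every mock


@dataclass
class SlateStandardIPSMock:
    """Mock estimator returning a fixed policy value shifted by `eps`."""

    estimator_name: str = "sips"
    eps: float = 0.0

    def estimate_policy_value(self, **kwargs) -> float:
        return mock_policy_value + self.eps


# sips and sips2 share the name "sips", so __post_init__ keeps only sips2;
# sips3 has a distinct name and is kept alongside sips.
sips = SlateStandardIPSMock(estimator_name="sips", eps=0.01)
sips2 = SlateStandardIPSMock(estimator_name="sips", eps=0.02)
sips3 = SlateStandardIPSMock(estimator_name="sips3", eps=0.03)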
Example #4
def test_meta_evaluate_performance_of_estimators_using_valid_input_data(
    evaluation_policy_pscore,
    evaluation_policy_pscore_item_position,
    evaluation_policy_pscore_cascade,
    evaluation_policy_action_dist,
    q_hat,
    description_1: str,
    metric,
    ground_truth_policy_value,
    description_2: str,
    synthetic_slate_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the response of evaluate_performance_of_estimators using valid data
    """
    if metric == "relative-ee":
        # calculate relative-ee
        eval_metric_ope_dict = {
            "sips":
            np.abs((mock_policy_value + sips.eps - ground_truth_policy_value) /
                   ground_truth_policy_value),
            "sips3":
            np.abs(
                (mock_policy_value + sips3.eps - ground_truth_policy_value) /
                ground_truth_policy_value),
        }
    else:
        # calculate se
        eval_metric_ope_dict = {
            "sips":
            (mock_policy_value + sips.eps - ground_truth_policy_value)**2,
            "sips3":
            (mock_policy_value + sips3.eps - ground_truth_policy_value)**2,
        }
    # check performance estimators
    ope_ = SlateOffPolicyEvaluation(
        bandit_feedback=synthetic_slate_bandit_feedback,
        ope_estimators=[sips, sips3])
    performance = ope_.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        evaluation_policy_pscore=evaluation_policy_pscore,
        metric=metric,
    )
    for k, v in performance.items():
        assert k in eval_metric_ope_dict, "Invalid key of performance response"
        assert v == eval_metric_ope_dict[
            k], "Invalid value of performance response"
    performance_df = ope_.summarize_estimators_comparison(
        ground_truth_policy_value=ground_truth_policy_value,
        evaluation_policy_pscore=evaluation_policy_pscore,
        metric=metric,
    )
    assert_frame_equal(
        performance_df,
        pd.DataFrame(eval_metric_ope_dict,
                     index=[metric]).T), "Invalid summarization (performance)"
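For reference, the metric branch above implements two standard accuracy measures for OPE estimators: the relative estimation error |V_hat - V| / V and the squared error (V_hat - V)^2. A stand-alone sketch of both, with illustrative function names:

import numpy as np


def relative_estimation_error(estimated_value: float, ground_truth: float) -> float:
    # |V_hat - V| / V, the quantity computed when metric == "relative-ee"
    return np.abs((estimated_value - ground_truth) / ground_truth)


def squared_error(estimated_value: float, ground_truth: float) -> float:
    # (V_hat - V) ** 2, the quantity computed in the "se" branch
    return (estimated_value - ground_truth) ** 2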
Example #5
def test_meta_create_estimator_inputs_using_invalid_input_data(
    evaluation_policy_pscore,
    description: str,
    synthetic_slate_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the _create_estimator_inputs using invalid data and a sips estimator
    """
    ope_ = SlateOffPolicyEvaluation(
        bandit_feedback=synthetic_slate_bandit_feedback, ope_estimators=[sips])
    # raise ValueError when the shapes of the two arrays differ
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_._create_estimator_inputs(
            evaluation_policy_pscore=evaluation_policy_pscore)
    # _create_estimator_inputs function is called in the following functions
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.estimate_policy_values(
            evaluation_policy_pscore=evaluation_policy_pscore)
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.estimate_intervals(
            evaluation_policy_pscore=evaluation_policy_pscore)
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.summarize_off_policy_estimates(
            evaluation_policy_pscore=evaluation_policy_pscore)
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.evaluate_performance_of_estimators(
            ground_truth_policy_value=0.1,
            evaluation_policy_pscore=evaluation_policy_pscore,
        )
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = ope_.summarize_estimators_comparison(
            ground_truth_policy_value=0.1,
            evaluation_policy_pscore=evaluation_policy_pscore,
        )
Example #6
def test_meta_estimate_policy_values_using_valid_input_data(
    evaluation_policy_pscore,
    evaluation_policy_pscore_item_position,
    evaluation_policy_pscore_cascade,
    description: str,
    synthetic_slate_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the response of estimate_policy_values using valid data
    """
    # single ope estimator (iips)
    ope_ = SlateOffPolicyEvaluation(
        bandit_feedback=synthetic_slate_bandit_feedback, ope_estimators=[iips])
    assert ope_.estimate_policy_values(
        evaluation_policy_pscore_item_position=
        evaluation_policy_pscore_item_position
    ) == {
        "iips": mock_policy_value
    }, "SlateOffPolicyEvaluation.estimate_policy_values ([IIPS]) returns a wrong value"
    # multiple ope estimators
    ope_ = SlateOffPolicyEvaluation(
        bandit_feedback=synthetic_slate_bandit_feedback,
        ope_estimators=[iips, sips, rips],
    )
    assert ope_.estimate_policy_values(
        evaluation_policy_pscore=evaluation_policy_pscore,
        evaluation_policy_pscore_item_position=
        evaluation_policy_pscore_item_position,
        evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade,
    ) == {
        "iips": mock_policy_value,
        "sips": mock_policy_value + sips.eps,
        "rips": mock_policy_value,
    }, "SlateOffPolicyEvaluation.estimate_policy_values ([IIPS, SIPS, RIPS]) returns a wrong value"
Example #7
def test_meta_estimate_policy_values_using_various_pscores(
    evaluation_policy_pscore,
    evaluation_policy_pscore_item_position,
    evaluation_policy_pscore_cascade,
    description: str,
    synthetic_slate_bandit_feedback: BanditFeedback,
) -> None:
    necessary_keys = [
        "reward",
        "position",
        "evaluation_policy_pscore",
        "evaluation_policy_pscore_item_position",
        "evaluation_policy_pscore_cascade"
        "slate_id",
    ]
    pscore_keys = [
        "pscore",
        "pscore_item_position",
        "pscore_cascade",
    ]
    # TypeError must be raised when required positional arguments are missing
    for i in range(len(pscore_keys)):
        for deleted_keys in itertools.combinations(pscore_keys, i + 1):
            copied_feedback = deepcopy(synthetic_slate_bandit_feedback)
            # delete
            for k in deleted_keys:
                del copied_feedback[k]
            with pytest.raises(
                    TypeError,
                    match=re.escape("estimate_policy_value() missing"),
            ):
                ope_ = SlateOffPolicyEvaluation(
                    bandit_feedback=copied_feedback,
                    ope_estimators=[sips, iips, rips],
                )
                _ = ope_.estimate_policy_values(
                    evaluation_policy_pscore=evaluation_policy_pscore,
                    evaluation_policy_pscore_item_position=
                    evaluation_policy_pscore_item_position,
                    evaluation_policy_pscore_cascade=
                    evaluation_policy_pscore_cascade,
                )
    # pscore_item_position and evaluation_policy_pscore_item_position are not necessary when iips is not evaluated
    copied_feedback = deepcopy(synthetic_slate_bandit_feedback)
    del copied_feedback["pscore_item_position"]
    ope_ = SlateOffPolicyEvaluation(
        bandit_feedback=copied_feedback,
        ope_estimators=[sips, rips],
    )
    _ = ope_.estimate_policy_values(
        evaluation_policy_pscore=evaluation_policy_pscore,
        evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade,
    )
Example #8
def test_meta_create_estimator_inputs_using_valid_input_data(
    evaluation_policy_pscore,
    evaluation_policy_pscore_item_position,
    evaluation_policy_pscore_cascade,
    description: str,
    synthetic_slate_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the _create_estimator_inputs using valid data
    """
    ope_ = SlateOffPolicyEvaluation(
        bandit_feedback=synthetic_slate_bandit_feedback, ope_estimators=[sips])
    estimator_inputs = ope_._create_estimator_inputs(
        evaluation_policy_pscore=evaluation_policy_pscore)
    assert set(estimator_inputs.keys()) == set(
        [
            "reward",
            "pscore",
            "pscore_item_position",
            "pscore_cascade",
            "position",
            "evaluation_policy_pscore",
            "evaluation_policy_pscore_item_position",
            "evaluation_policy_pscore_cascade",
            "slate_id",
        ]
    ), f"Invalid response of _create_estimator_inputs (test case: {description})"
    # _create_estimator_inputs function is called in the following functions
    _ = ope_.estimate_policy_values(
        evaluation_policy_pscore=evaluation_policy_pscore)
    _ = ope_.estimate_intervals(
        evaluation_policy_pscore=evaluation_policy_pscore)
    _ = ope_.summarize_off_policy_estimates(
        evaluation_policy_pscore=evaluation_policy_pscore)
    _ = ope_.evaluate_performance_of_estimators(
        ground_truth_policy_value=0.1,
        evaluation_policy_pscore=evaluation_policy_pscore)
    _ = ope_.summarize_estimators_comparison(
        ground_truth_policy_value=0.1,
        evaluation_policy_pscore=evaluation_policy_pscore)
Example #9
def test_meta_estimate_intervals_using_valid_input_data(
    evaluation_policy_pscore,
    evaluation_policy_pscore_item_position,
    evaluation_policy_pscore_cascade,
    evaluation_policy_action_dist,
    q_hat,
    description_1: str,
    alpha: float,
    n_bootstrap_samples: int,
    random_state: int,
    description_2: str,
    synthetic_slate_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the response of estimate_intervals using valid data
    """
    # single ope estimator
    ope_ = SlateOffPolicyEvaluation(
        bandit_feedback=synthetic_slate_bandit_feedback, ope_estimators=[iips])
    assert ope_.estimate_intervals(
        evaluation_policy_pscore_item_position=
        evaluation_policy_pscore_item_position,
        alpha=alpha,
        n_bootstrap_samples=n_bootstrap_samples,
        random_state=random_state,
    ) == {
        "iips": mock_confidence_interval
    }, "SlateOffPolicyEvaluation.estimate_intervals ([IIPS]) returns a wrong value"
    # multiple ope estimators
    ope_ = SlateOffPolicyEvaluation(
        bandit_feedback=synthetic_slate_bandit_feedback,
        ope_estimators=[iips, rips, cascade_dr, sips],
    )
    assert ope_.estimate_intervals(
        evaluation_policy_pscore=evaluation_policy_pscore,
        evaluation_policy_pscore_cascade=evaluation_policy_pscore_cascade,
        evaluation_policy_pscore_item_position=
        evaluation_policy_pscore_item_position,
        evaluation_policy_action_dist=evaluation_policy_action_dist,
        q_hat=q_hat,
        alpha=alpha,
        n_bootstrap_samples=n_bootstrap_samples,
        random_state=random_state,
    ) == {
        "iips": mock_confidence_interval,
        "rips": mock_confidence_interval,
        "cascade-dr": mock_confidence_interval,
        "sips": {k: v + sips.eps
                 for k, v in mock_confidence_interval.items()},
    }, "SlateOffPolicyEvaluation.estimate_intervals ([IIPS, SIPS]) returns a wrong value"
Example #10
def test_meta_summarize_off_policy_estimates(
    evaluation_policy_pscore,
    evaluation_policy_pscore_item_position,
    evaluation_policy_pscore_cascade,
    description_1: str,
    alpha: float,
    n_bootstrap_samples: int,
    random_state: int,
    description_2: str,
    synthetic_slate_bandit_feedback: BanditFeedback,
) -> None:
    """
    Test the response of summarize_off_policy_estimates using valid data
    """
    ope_ = SlateOffPolicyEvaluation(
        bandit_feedback=synthetic_slate_bandit_feedback,
        ope_estimators=[sips, sips3])
    value, interval = ope_.summarize_off_policy_estimates(
        evaluation_policy_pscore=evaluation_policy_pscore,
        alpha=alpha,
        n_bootstrap_samples=n_bootstrap_samples,
        random_state=random_state,
    )
    expected_value = pd.DataFrame(
        {
            "sips": mock_policy_value + sips.eps,
            "sips3": mock_policy_value + sips3.eps,
        },
        index=["estimated_policy_value"],
    ).T
    expected_value["relative_estimated_policy_value"] = expected_value[
        "estimated_policy_value"] / (
            synthetic_slate_bandit_feedback["reward"].sum() /
            np.unique(synthetic_slate_bandit_feedback["slate_id"]).shape[0])
    expected_interval = pd.DataFrame({
        "sips": {k: v + sips.eps
                 for k, v in mock_confidence_interval.items()},
        "sips3":
        {k: v + sips3.eps
         for k, v in mock_confidence_interval.items()},
    }).T
    assert_frame_equal(value,
                       expected_value), "Invalid summarization (policy value)"
    assert_frame_equal(interval,
                       expected_interval), "Invalid summarization (interval)"
    # check relative estimated policy value when the average of bandit_feedback["reward"] is zero
    zero_reward_bandit_feedback = deepcopy(synthetic_slate_bandit_feedback)
    zero_reward_bandit_feedback["reward"] = np.zeros(
        zero_reward_bandit_feedback["reward"].shape[0])
    ope_ = SlateOffPolicyEvaluation(
        bandit_feedback=zero_reward_bandit_feedback,
        ope_estimators=[sips, sips3])
    value, _ = ope_.summarize_off_policy_estimates(
        evaluation_policy_pscore=evaluation_policy_pscore,
        alpha=alpha,
        n_bootstrap_samples=n_bootstrap_samples,
        random_state=random_state,
    )
    expected_value = pd.DataFrame(
        {
            "sips": mock_policy_value + sips.eps,
            "sips3": mock_policy_value + sips3.eps,
        },
        index=["estimated_policy_value"],
    ).T
    expected_value["relative_estimated_policy_value"] = np.nan
    assert_frame_equal(value,
                       expected_value), "Invalid summarization (policy value)"
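The relative_estimated_policy_value column checked above is the estimate divided by the on-policy value of the logged data, i.e. total reward divided by the number of unique slates; with all-zero rewards that ratio is undefined, hence the np.nan expectation. A stand-alone sketch of the computation (the function name and the nan handling are illustrative assumptions):

import numpy as np


def relative_estimated_policy_value(
    estimated_policy_value: float,
    reward: np.ndarray,
    slate_id: np.ndarray,
) -> float:
    # On-policy value of the logged data: total reward per unique slate.
    on_policy_value = reward.sum() / np.unique(slate_id).shape[0]
    # Undefined (nan) when the logged rewards are all zero, as in the test above.
    return estimated_policy_value / on_policy_value if on_policy_value != 0 else np.nan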