Code example #1: sample_reward with invalid inputs
# NOTE: imports (numpy as np, pytest, SyntheticBanditDatasetWithActionEmbeds) and the
# @pytest.mark.parametrize decorator that supplies `context`, `action`, and
# `description` are elided by the snippet extractor; the same applies to the other
# parametrized tests on this page.
def test_synthetic_sample_reward_using_invalid_inputs(context, action,
                                                      description):
    n_actions = 10
    dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=n_actions)

    with pytest.raises(ValueError, match=f"{description}*"):
        _ = dataset.sample_reward(context=context, action=action)
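For readers unfamiliar with the pattern, below is a purely hypothetical sketch of the kind of @pytest.mark.parametrize table that drives a test like the one above; the concrete invalid cases and error messages used in zr-obp are not shown on this page and may differ.

import numpy as np
import pytest

from obp.dataset import SyntheticBanditDatasetWithActionEmbeds

# hypothetical invalid-input cases; the expected messages are illustrative only
invalid_sample_reward_inputs = [
    ("not an array", np.zeros(3, dtype=int), "context must be 2D array"),
    (np.ones((3, 2)), "not an array", "action must be 1D array"),
]


@pytest.mark.parametrize("context, action, description",
                         invalid_sample_reward_inputs)
def test_sample_reward_invalid_inputs_sketch(context, action, description):
    dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=10)
    # the real tests also match the error message via `match=f"{description}*"`
    with pytest.raises(ValueError):
        _ = dataset.sample_reward(context=context, action=action)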
Code example #2: sample_reward with valid inputs
def test_synthetic_sample_reward_using_valid_inputs(context, action,
                                                    description):
    n_actions = 10
    dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=n_actions,
                                                     dim_context=3)

    reward = dataset.sample_reward(context=context, action=action)
    assert isinstance(reward, np.ndarray), "Invalid response of sample_reward"
    assert reward.shape == action.shape, "Invalid response of sample_reward"
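As a standalone illustration (not part of the test suite), the following sketch shows inputs consistent with the assertions above: a 2D context of shape (n_rounds, dim_context) and a 1D integer action array, with the sampled reward matching action.shape.

import numpy as np

from obp.dataset import SyntheticBanditDatasetWithActionEmbeds

dataset = SyntheticBanditDatasetWithActionEmbeds(
    n_actions=10, dim_context=3, random_state=12345
)
context = np.random.normal(size=(5, 3))    # (n_rounds, dim_context)
action = np.random.randint(10, size=5)     # actions in [0, n_actions)
reward = dataset.sample_reward(context=context, action=action)
assert isinstance(reward, np.ndarray) and reward.shape == action.shape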
Code example #3: calc_ground_truth_policy_value with invalid inputs
def test_synthetic_calc_policy_value_using_invalid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=n_actions)

    with pytest.raises(ValueError, match=f"{description}*"):
        _ = dataset.calc_ground_truth_policy_value(
            expected_reward=expected_reward, action_dist=action_dist)
Code example #4: calc_ground_truth_policy_value with valid inputs
def test_synthetic_calc_policy_value_using_valid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=n_actions)

    policy_value = dataset.calc_ground_truth_policy_value(
        expected_reward=expected_reward, action_dist=action_dist)
    assert isinstance(
        policy_value,
        float), "Invalid response of calc_ground_truth_policy_value"
Code example #5: constructor validation with invalid inputs
def test_synthetic_init_using_invalid_inputs(
    n_actions,
    dim_context,
    reward_type,
    reward_std,
    beta,
    n_cat_per_dim,
    latent_param_mat_dim,
    n_cat_dim,
    p_e_a_param_std,
    n_unobserved_cat_dim,
    n_irrelevant_cat_dim,
    n_deficient_actions,
    action_context,
    random_state,
    err,
    description,
):
    with pytest.raises(err, match=f"{description}*"):
        _ = SyntheticBanditDatasetWithActionEmbeds(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_type=reward_type,
            reward_std=reward_std,
            beta=beta,
            n_deficient_actions=n_deficient_actions,
            n_cat_per_dim=n_cat_per_dim,
            latent_param_mat_dim=latent_param_mat_dim,
            n_cat_dim=n_cat_dim,
            p_e_a_param_std=p_e_a_param_std,
            n_unobserved_cat_dim=n_unobserved_cat_dim,
            n_irrelevant_cat_dim=n_irrelevant_cat_dim,
            action_context=action_context,
            random_state=random_state,
        )
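As one concrete, hypothetical example of the kind of case exercised above: obp's synthetic datasets validate their scalar arguments, so an out-of-range n_actions should be rejected at construction time (the assumption here is that n_actions must be an integer of at least 2).

import pytest

from obp.dataset import SyntheticBanditDatasetWithActionEmbeds

# hypothetical single case; the parametrized tests above cover many more
with pytest.raises((ValueError, TypeError)):
    _ = SyntheticBanditDatasetWithActionEmbeds(n_actions=1)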
Code example #6: pytest fixture providing logged bandit feedback with action embeddings
File: conftest.py  Project: aiueola/zr-obp
@pytest.fixture  # fixture decorator assumed; it is truncated from this snippet
def synthetic_bandit_feedback_with_embed() -> BanditFeedback:
    n_actions = 10
    dim_context = 5
    n_cat_dim = 3
    n_cat_per_dim = 5
    random_state = 12345
    n_rounds = 10000
    dataset = SyntheticBanditDatasetWithActionEmbeds(
        n_actions=n_actions,
        dim_context=dim_context,
        n_cat_dim=n_cat_dim,
        n_cat_per_dim=n_cat_per_dim,
        reward_function=logistic_reward_function,
        random_state=random_state,
    )
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    return bandit_feedback
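A hypothetical test (not from the original suite) showing how the fixture above is consumed via pytest's dependency injection; the expected values follow from the fixture's settings (n_rounds=10000, n_actions=10, n_cat_dim=3).

from obp.types import BanditFeedback


def test_fixture_shapes_sketch(
    synthetic_bandit_feedback_with_embed: BanditFeedback,
) -> None:
    feedback = synthetic_bandit_feedback_with_embed
    assert feedback["n_rounds"] == 10000
    assert feedback["n_actions"] == 10
    # action embeddings have one column per categorical embedding dimension
    assert feedback["action_embed"].shape == (10000, 3)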
Code example #7: default initialization
def test_synthetic_init():
    # when reward_function is None, expected_reward is randomly sampled in [0, 1]
    # this check includes the test of `sample_contextfree_expected_reward` function
    dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=2, beta=0)
    assert len(dataset.expected_reward) == 2
    assert np.all(0 <= dataset.expected_reward) and np.all(
        dataset.expected_reward <= 1)

    # one-hot action_context when None is given
    ohe = np.eye(2, dtype=int)
    assert np.allclose(dataset.action_context, ohe)
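The test above covers the default, context-free expected reward. As a complementary sketch, the dataset can also be constructed with an explicit reward function, as the other snippets on this page do with logistic_reward_function:

from obp.dataset import (
    SyntheticBanditDatasetWithActionEmbeds,
    logistic_reward_function,
)

dataset = SyntheticBanditDatasetWithActionEmbeds(
    n_actions=10,
    dim_context=5,
    reward_function=logistic_reward_function,  # context-dependent expected reward
    random_state=12345,
)
bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=100)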
Code example #8: end-to-end off-policy evaluation experiment for a single random seed
    def process(i: int):
        # NOTE: this function is nested inside a benchmark script; n_actions,
        # dim_context, n_rounds, base_model_dict, hyperparams, ope_estimators,
        # bipw_model_configurations, and the model-name variables are defined in
        # the enclosing scope and are not shown in this snippet.
        # synthetic data generator
        dataset = SyntheticBanditDatasetWithActionEmbeds(
            n_actions=n_actions,
            dim_context=dim_context,
            beta=3.0,
            n_cat_dim=3,
            n_cat_per_dim=5,
            reward_function=logistic_reward_function,
            random_state=i,
        )
        # define evaluation policy using IPWLearner
        evaluation_policy = IPWLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_iw_estimator](
                **hyperparams[base_model_for_iw_estimator]),
        )
        # sample new training and test sets of synthetic logged bandit data
        bandit_feedback_train = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        bandit_feedback_test = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        # train the evaluation policy on the training set of the synthetic logged bandit data
        evaluation_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )
        # predict the action decisions for the test set of the synthetic logged bandit data
        action_dist = evaluation_policy.predict_proba(
            context=bandit_feedback_test["context"])
        # estimate the reward function of the test set of synthetic bandit feedback with ML model
        regression_model = RegressionModel(
            n_actions=dataset.n_actions,
            action_context=dataset.action_context,
            base_model=base_model_dict[base_model_for_reg_model](
                **hyperparams[base_model_for_reg_model]),
        )
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            context=bandit_feedback_test["context"],
            action=bandit_feedback_test["action"],
            reward=bandit_feedback_test["reward"],
            n_folds=2,
            random_state=12345,
        )
        # fit propensity score estimators
        pscore_estimator = PropensityScoreEstimator(
            len_list=1,
            n_actions=n_actions,
            base_model=base_model_dict[base_model_for_pscore_estimator](
                **hyperparams[base_model_for_pscore_estimator]),
            calibration_cv=3,
        )
        estimated_pscore = pscore_estimator.fit_predict(
            action=bandit_feedback_test["action"],
            position=bandit_feedback_test["position"],
            context=bandit_feedback_test["context"],
            n_folds=3,
            random_state=12345,
        )
        # fit importance weight estimators
        estimated_importance_weights_dict = {}
        for clf_name, clf_arguments in bipw_model_configurations.items():
            clf = ImportanceWeightEstimator(
                len_list=1,
                n_actions=n_actions,
                fitting_method=clf_arguments["fitting_method"],
                base_model=clf_arguments["base_model"],
            )
            estimated_importance_weights_dict[clf_name] = clf.fit_predict(
                action=bandit_feedback_test["action"],
                context=bandit_feedback_test["context"],
                action_dist=action_dist,
                position=bandit_feedback_test["position"],
                n_folds=2,
                evaluate_model_performance=False,
                random_state=12345,
            )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback_test,
            ope_estimators=ope_estimators + [
                MarginalizedInverseProbabilityWeighting(n_actions=n_actions,
                                                        estimator_name="mipw"),
                MarginalizedInverseProbabilityWeighting(
                    n_actions=n_actions,
                    embedding_selection_method="greedy",
                    estimator_name="mipw (greedy selection)",
                ),
                SelfNormalizedMarginalizedInverseProbabilityWeighting(
                    n_actions=n_actions, estimator_name="snmipw"),
            ],
        )
        relative_ee_i = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
                expected_reward=bandit_feedback_test["expected_reward"],
                action_dist=action_dist,
            ),
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            estimated_pscore=estimated_pscore,
            estimated_importance_weights=estimated_importance_weights_dict,
            action_embed=bandit_feedback_test["action_embed"],
            pi_b=bandit_feedback_test["pi_b"],
            metric="relative-ee",
        )

        return relative_ee_i
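process(i) returns the relative estimation error of each estimator for one random seed. Below is a minimal sketch of running it over several seeds in parallel with joblib; n_runs and n_jobs are assumed to be defined in the enclosing script.

from joblib import Parallel, delayed

processed = Parallel(n_jobs=n_jobs, verbose=10)(
    delayed(process)(i) for i in range(n_runs)
)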
Code example #9: obtain_batch_bandit_feedback
def test_synthetic_obtain_batch_bandit_feedback():
    # n_rounds
    with pytest.raises(ValueError):
        dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=2)
        dataset.obtain_batch_bandit_feedback(n_rounds=0)

    with pytest.raises(TypeError):
        dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=2)
        dataset.obtain_batch_bandit_feedback(n_rounds="3")

    # bandit feedback
    n_rounds = 10
    n_actions = 5
    n_cat_dim = 3
    n_cat_per_dim = 5
    for n_deficient_actions in [0, 2]:
        dataset = SyntheticBanditDatasetWithActionEmbeds(
            n_actions=n_actions,
            beta=0,
            n_cat_per_dim=n_cat_per_dim,
            n_cat_dim=n_cat_dim,
            reward_function=logistic_reward_function,
            n_deficient_actions=n_deficient_actions,
        )
        bandit_feedback = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        assert bandit_feedback["n_rounds"] == n_rounds
        assert bandit_feedback["n_actions"] == n_actions
        assert (bandit_feedback["context"].shape[0] == n_rounds  # n_rounds
                and
                bandit_feedback["context"].shape[1] == 1  # default dim_context
                )
        assert (bandit_feedback["action_context"].shape[0] == n_actions
                and bandit_feedback["action_context"].shape[1] == n_cat_dim)
        assert (bandit_feedback["action_embed"].shape[0] == n_rounds
                and bandit_feedback["action_embed"].shape[1] == n_cat_dim)
        assert (bandit_feedback["action"].ndim == 1
                and len(bandit_feedback["action"]) == n_rounds)
        assert bandit_feedback["position"] is None
        assert (bandit_feedback["reward"].ndim == 1
                and len(bandit_feedback["reward"]) == n_rounds)
        assert (bandit_feedback["expected_reward"].shape[0] == n_rounds
                and bandit_feedback["expected_reward"].shape[1] == n_actions)
        assert (bandit_feedback["q_x_e"].shape[0] == n_rounds
                and bandit_feedback["q_x_e"].shape[1] == n_cat_per_dim
                and bandit_feedback["q_x_e"].shape[2] == n_cat_dim)
        assert (bandit_feedback["p_e_a"].shape[0] == n_actions
                and bandit_feedback["p_e_a"].shape[1] == n_cat_per_dim
                and bandit_feedback["p_e_a"].shape[2] == n_cat_dim)
        assert (bandit_feedback["pi_b"].shape[0] == n_rounds
                and bandit_feedback["pi_b"].shape[1] == n_actions)
        # when `beta=0`, behavior_policy should be uniform
        if n_deficient_actions == 0:
            uniform_policy = np.ones_like(bandit_feedback["pi_b"]) / n_actions
            assert np.allclose(bandit_feedback["pi_b"], uniform_policy)
        assert np.allclose(bandit_feedback["pi_b"][:, :, 0].sum(1),
                           np.ones(n_rounds))
        assert (bandit_feedback["pi_b"] == 0
                ).sum() == n_deficient_actions * n_rounds
        assert (bandit_feedback["pscore"].ndim == 1
                and len(bandit_feedback["pscore"]) == n_rounds)