# Imports assumed for these excerpts (tests and a benchmark script for
# obp's SyntheticBanditDatasetWithActionEmbeds).
import numpy as np
import pytest

from obp.dataset import (
    SyntheticBanditDatasetWithActionEmbeds,
    logistic_reward_function,
)
from obp.ope import (
    ImportanceWeightEstimator,
    MarginalizedInverseProbabilityWeighting,
    OffPolicyEvaluation,
    PropensityScoreEstimator,
    RegressionModel,
    SelfNormalizedMarginalizedInverseProbabilityWeighting,
)
from obp.policy import IPWLearner
from obp.types import BanditFeedback


def test_synthetic_sample_reward_using_invalid_inputs(context, action, description):
    n_actions = 10
    dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=n_actions)

    with pytest.raises(ValueError, match=f"{description}*"):
        _ = dataset.sample_reward(context=context, action=action)

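# A hypothetical, minimal parametrization sketch for the test above; the real
# test suite supplies a longer list of invalid (context, action, description)
# cases via `@pytest.mark.parametrize("context, action, description", ...)`
# placed directly above the test function. The values below are illustrative
# assumptions, not the repository's actual fixtures.
invalid_input_of_sample_reward_sketch = [
    (np.ones(3), np.ones(3, dtype=int), "`context` must be 2D array"),  # context is 1D
    (np.ones((3, 2)), "3", "`action` must be 1D array"),  # action is not an ndarray
]
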
def test_synthetic_sample_reward_using_valid_inputs(context, action, description):
    n_actions = 10
    dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=n_actions, dim_context=3)

    reward = dataset.sample_reward(context=context, action=action)
    assert isinstance(reward, np.ndarray), "Invalid response of sample_reward"
    assert reward.shape == action.shape, "Invalid response of sample_reward"

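# A small usage sketch (illustrative, not part of the test suite): for
# `sample_reward`, `context` is a 2D array of shape (n_rounds, dim_context)
# and `action` is a 1D integer array of length n_rounds; the sampled reward
# has the same shape as `action`.
def _sample_reward_usage_sketch():
    dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=10, dim_context=3)
    context = np.random.normal(size=(5, 3))  # 5 rounds, dim_context=3
    action = np.random.randint(10, size=5)  # 5 logged actions in [0, n_actions)
    reward = dataset.sample_reward(context=context, action=action)
    assert reward.shape == action.shape
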
def test_synthetic_calc_policy_value_using_invalid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=n_actions)

    with pytest.raises(ValueError, match=f"{description}*"):
        _ = dataset.calc_ground_truth_policy_value(
            expected_reward=expected_reward, action_dist=action_dist
        )

def test_synthetic_calc_policy_value_using_valid_inputs(
    expected_reward,
    action_dist,
    description,
):
    n_actions = 10
    dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=n_actions)

    policy_value = dataset.calc_ground_truth_policy_value(
        expected_reward=expected_reward, action_dist=action_dist
    )
    assert isinstance(
        policy_value, float
    ), "Invalid response of calc_ground_truth_policy_value"

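# An illustrative shape sketch (not part of the test suite): `expected_reward`
# has shape (n_rounds, n_actions) and `action_dist` has shape
# (n_rounds, n_actions, len_list); the ground-truth value of the evaluation
# policy is returned as a float.
def _calc_policy_value_usage_sketch():
    n_rounds, n_actions = 5, 10
    dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=n_actions)
    expected_reward = np.random.uniform(size=(n_rounds, n_actions))
    action_dist = np.ones((n_rounds, n_actions, 1)) / n_actions  # uniform evaluation policy
    policy_value = dataset.calc_ground_truth_policy_value(
        expected_reward=expected_reward, action_dist=action_dist
    )
    assert isinstance(policy_value, float)
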
def test_synthetic_init_using_invalid_inputs(
    n_actions,
    dim_context,
    reward_type,
    reward_std,
    beta,
    n_cat_per_dim,
    latent_param_mat_dim,
    n_cat_dim,
    p_e_a_param_std,
    n_unobserved_cat_dim,
    n_irrelevant_cat_dim,
    n_deficient_actions,
    action_context,
    random_state,
    err,
    description,
):
    with pytest.raises(err, match=f"{description}*"):
        _ = SyntheticBanditDatasetWithActionEmbeds(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_type=reward_type,
            reward_std=reward_std,
            beta=beta,
            n_deficient_actions=n_deficient_actions,
            n_cat_per_dim=n_cat_per_dim,
            latent_param_mat_dim=latent_param_mat_dim,
            n_cat_dim=n_cat_dim,
            p_e_a_param_std=p_e_a_param_std,
            n_unobserved_cat_dim=n_unobserved_cat_dim,
            n_irrelevant_cat_dim=n_irrelevant_cat_dim,
            action_context=action_context,
            random_state=random_state,
        )

def synthetic_bandit_feedback_with_embed() -> BanditFeedback:
    # generates logged bandit data with action embeddings shared across tests
    # (in the test suite this is typically registered as a pytest fixture)
    n_actions = 10
    dim_context = 5
    n_cat_dim = 3
    n_cat_per_dim = 5
    random_state = 12345
    n_rounds = 10000

    dataset = SyntheticBanditDatasetWithActionEmbeds(
        n_actions=n_actions,
        dim_context=dim_context,
        n_cat_dim=n_cat_dim,
        n_cat_per_dim=n_cat_per_dim,
        reward_function=logistic_reward_function,
        random_state=random_state,
    )
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    return bandit_feedback

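# An illustrative sketch (not part of the test suite) of how the logged data
# above might be fed to the MarginalizedInverseProbabilityWeighting (MIPW)
# estimator via OffPolicyEvaluation. A uniform-random evaluation policy is
# assumed here purely for brevity.
def _mipw_usage_sketch(bandit_feedback: BanditFeedback):
    n_rounds = bandit_feedback["n_rounds"]
    n_actions = bandit_feedback["n_actions"]
    action_dist = np.ones((n_rounds, n_actions, 1)) / n_actions  # uniform evaluation policy
    mipw = MarginalizedInverseProbabilityWeighting(
        n_actions=n_actions, estimator_name="mipw"
    )
    ope = OffPolicyEvaluation(bandit_feedback=bandit_feedback, ope_estimators=[mipw])
    estimated_values = ope.estimate_policy_values(
        action_dist=action_dist,
        action_embed=bandit_feedback["action_embed"],
        pi_b=bandit_feedback["pi_b"],
    )
    return estimated_values["mipw"]
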
def test_synthetic_init():
    # when reward_function is None, expected_reward is randomly sampled in [0, 1]
    # this check includes the test of `sample_contextfree_expected_reward` function
    dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=2, beta=0)
    assert len(dataset.expected_reward) == 2
    assert np.all(0 <= dataset.expected_reward) and np.all(dataset.expected_reward <= 1)

    # one-hot action_context when None is given
    ohe = np.eye(2, dtype=int)
    assert np.allclose(dataset.action_context, ohe)

# The following `process` function comes from a benchmark script; names such as
# `n_actions`, `dim_context`, `n_rounds`, `base_model_dict`, `hyperparams`,
# `base_model_for_iw_estimator`, `base_model_for_reg_model`,
# `base_model_for_pscore_estimator`, `bipw_model_configurations`, and
# `ope_estimators` are module-level configuration defined elsewhere in that script.
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDatasetWithActionEmbeds(
        n_actions=n_actions,
        dim_context=dim_context,
        beta=3.0,
        n_cat_dim=3,
        n_cat_per_dim=5,
        reward_function=logistic_reward_function,
        random_state=i,
    )
    # define evaluation policy using IPWLearner
    evaluation_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_iw_estimator](
            **hyperparams[base_model_for_iw_estimator]
        ),
    )
    # sample new training and test sets of synthetic logged bandit data
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit data
    evaluation_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit data
    action_dist = evaluation_policy.predict_proba(
        context=bandit_feedback_test["context"],
    )
    # estimate the reward function of the test set of synthetic bandit feedback with an ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_test["context"],
        action=bandit_feedback_test["action"],
        reward=bandit_feedback_test["reward"],
        n_folds=2,
        random_state=12345,
    )
    # fit the propensity score estimator
    pscore_estimator = PropensityScoreEstimator(
        len_list=1,
        n_actions=n_actions,
        base_model=base_model_dict[base_model_for_pscore_estimator](
            **hyperparams[base_model_for_pscore_estimator]
        ),
        calibration_cv=3,
    )
    estimated_pscore = pscore_estimator.fit_predict(
        action=bandit_feedback_test["action"],
        position=bandit_feedback_test["position"],
        context=bandit_feedback_test["context"],
        n_folds=3,
        random_state=12345,
    )
    # fit importance weight estimators
    estimated_importance_weights_dict = {}
    for clf_name, clf_arguments in bipw_model_configurations.items():
        clf = ImportanceWeightEstimator(
            len_list=1,
            n_actions=n_actions,
            fitting_method=clf_arguments["fitting_method"],
            base_model=clf_arguments["base_model"],
        )
        estimated_importance_weights_dict[clf_name] = clf.fit_predict(
            action=bandit_feedback_test["action"],
            context=bandit_feedback_test["context"],
            action_dist=action_dist,
            position=bandit_feedback_test["position"],
            n_folds=2,
            evaluate_model_performance=False,
            random_state=12345,
        )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback_test,
        ope_estimators=ope_estimators
        + [
            MarginalizedInverseProbabilityWeighting(
                n_actions=n_actions, estimator_name="mipw"
            ),
            MarginalizedInverseProbabilityWeighting(
                n_actions=n_actions,
                embedding_selection_method="greedy",
                estimator_name="mipw (greedy selection)",
            ),
            SelfNormalizedMarginalizedInverseProbabilityWeighting(
                n_actions=n_actions, estimator_name="snmipw"
            ),
        ],
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=action_dist,
        ),
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        estimated_pscore=estimated_pscore,
        estimated_importance_weights=estimated_importance_weights_dict,
        action_embed=bandit_feedback_test["action_embed"],
        pi_b=bandit_feedback_test["pi_b"],
        metric="relative-ee",
    )

    return relative_ee_i

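# A sketch of how `process` is typically executed over multiple random seeds and
# summarized. The parallel/aggregation pattern mirrors obp's example scripts, but
# `n_runs` and the summary format here are illustrative assumptions; joblib and
# pandas are assumed to be available (both are obp dependencies).
def _run_benchmark_sketch(n_runs: int = 5):
    from joblib import Parallel, delayed
    import pandas as pd

    # run the simulation `n_runs` times with different random seeds
    processed = Parallel(n_jobs=-1, verbose=0)(
        delayed(process)(i) for i in range(n_runs)
    )
    # collect relative-ee of each estimator across runs
    metric_dict = {est: {} for est in processed[0].keys()}
    for i, relative_ee_i in enumerate(processed):
        for est, relative_ee_ in relative_ee_i.items():
            metric_dict[est][i] = relative_ee_
    # summary statistics of relative-ee per estimator
    return pd.DataFrame(metric_dict).describe().T.round(6)
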
def test_synthetic_obtain_batch_bandit_feedback():
    # n_rounds
    with pytest.raises(ValueError):
        dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=2)
        dataset.obtain_batch_bandit_feedback(n_rounds=0)

    with pytest.raises(TypeError):
        dataset = SyntheticBanditDatasetWithActionEmbeds(n_actions=2)
        dataset.obtain_batch_bandit_feedback(n_rounds="3")

    # bandit feedback
    n_rounds = 10
    n_actions = 5
    n_cat_dim = 3
    n_cat_per_dim = 5
    for n_deficient_actions in [0, 2]:
        dataset = SyntheticBanditDatasetWithActionEmbeds(
            n_actions=n_actions,
            beta=0,
            n_cat_per_dim=n_cat_per_dim,
            n_cat_dim=n_cat_dim,
            reward_function=logistic_reward_function,
            n_deficient_actions=n_deficient_actions,
        )
        bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        assert bandit_feedback["n_rounds"] == n_rounds
        assert bandit_feedback["n_actions"] == n_actions
        assert (
            bandit_feedback["context"].shape[0] == n_rounds  # n_rounds
            and bandit_feedback["context"].shape[1] == 1  # default dim_context
        )
        assert (
            bandit_feedback["action_context"].shape[0] == n_actions
            and bandit_feedback["action_context"].shape[1] == n_cat_dim
        )
        assert (
            bandit_feedback["action_embed"].shape[0] == n_rounds
            and bandit_feedback["action_embed"].shape[1] == n_cat_dim
        )
        assert (
            bandit_feedback["action"].ndim == 1
            and len(bandit_feedback["action"]) == n_rounds
        )
        assert bandit_feedback["position"] is None
        assert (
            bandit_feedback["reward"].ndim == 1
            and len(bandit_feedback["reward"]) == n_rounds
        )
        assert (
            bandit_feedback["expected_reward"].shape[0] == n_rounds
            and bandit_feedback["expected_reward"].shape[1] == n_actions
        )
        assert (
            bandit_feedback["q_x_e"].shape[0] == n_rounds
            and bandit_feedback["q_x_e"].shape[1] == n_cat_per_dim
            and bandit_feedback["q_x_e"].shape[2] == n_cat_dim
        )
        assert (
            bandit_feedback["p_e_a"].shape[0] == n_actions
            and bandit_feedback["p_e_a"].shape[1] == n_cat_per_dim
            and bandit_feedback["p_e_a"].shape[2] == n_cat_dim
        )
        assert (
            bandit_feedback["pi_b"].shape[0] == n_rounds
            and bandit_feedback["pi_b"].shape[1] == n_actions
        )
        # when `beta=0`, behavior_policy should be uniform
        if n_deficient_actions == 0:
            uniform_policy = np.ones_like(bandit_feedback["pi_b"]) / n_actions
            assert np.allclose(bandit_feedback["pi_b"], uniform_policy)
        assert np.allclose(bandit_feedback["pi_b"][:, :, 0].sum(1), np.ones(n_rounds))
        assert (bandit_feedback["pi_b"] == 0).sum() == n_deficient_actions * n_rounds
        assert (
            bandit_feedback["pscore"].ndim == 1
            and len(bandit_feedback["pscore"]) == n_rounds
        )

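# An illustrative consistency sketch (not part of the test suite): `p_e_a[a, :, d]`
# is the categorical distribution of the d-th embedding dimension given action a,
# so it should sum to one, and the logged `action_embed` entries are category
# indices in [0, n_cat_per_dim).
def _embedding_distribution_sketch():
    n_cat_per_dim = 5
    dataset = SyntheticBanditDatasetWithActionEmbeds(
        n_actions=5, n_cat_dim=3, n_cat_per_dim=n_cat_per_dim, random_state=12345
    )
    bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=10)
    assert np.allclose(bandit_feedback["p_e_a"].sum(axis=1), 1.0)
    assert bandit_feedback["action_embed"].min() >= 0
    assert bandit_feedback["action_embed"].max() < n_cat_per_dim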