def test_ipw_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the format of ipw variants using synthetic bandit data and random evaluation policy"""
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in [ipw, snipw]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"
    # remove required arguments (to check the error handling)
    del input_dict["reward"]
    del input_dict["pscore"]
    del input_dict["action"]
    for estimator in [ipw, snipw]:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value() missing 3 required positional arguments: 'reward', 'action', and 'pscore'"
            ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)

    # prepare input dict (tensor version)
    input_tensor_dict = {
        k: v if v is None else torch.from_numpy(v)
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_tensor_dict["action_dist"] = torch.from_numpy(action_dist)
    for estimator in [ipw, snipw]:
        estimated_policy_value = estimator.estimate_policy_value_tensor(
            **input_tensor_dict
        )
        assert isinstance(
            estimated_policy_value, torch.Tensor
        ), f"invalid type response: {estimator}"
    # remove required arguments (to check the error handling)
    del input_tensor_dict["reward"]
    del input_tensor_dict["pscore"]
    del input_tensor_dict["action"]
    for estimator in [ipw, snipw]:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value_tensor() missing 3 required positional arguments: 'reward', 'action', and 'pscore'"
            ),
        ):
            _ = estimator.estimate_policy_value_tensor(**input_tensor_dict)
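# For reference, the two estimators exercised above reduce to simple weighted means
# of the logged rewards. A minimal numpy sketch, not the library implementation;
# `_naive_ipw_and_snipw` is a hypothetical helper that assumes single-slot data
# (position 0 only):
def _naive_ipw_and_snipw(reward, action, pscore, action_dist):
    pi_e = action_dist[np.arange(action.shape[0]), action, 0]
    iw = pi_e / pscore  # importance weight pi_e(a|x) / pi_b(a|x)
    ipw_value = np.mean(iw * reward)  # IPW: plain weighted mean
    snipw_value = np.sum(iw * reward) / np.sum(iw)  # SNIPW: self-normalized
    return ipw_value, snipw_value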
def test_dr_shrinkage_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the dr shrinkage estimators using synthetic bandit data and random evaluation policy"""
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis]
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    dm_value = dm.estimate_policy_value(**input_dict)
    dr_value = dr.estimate_policy_value(**input_dict)
    dr_shrink_0_value = dr_shrink_0.estimate_policy_value(**input_dict)
    dr_shrink_max_value = dr_shrink_max.estimate_policy_value(**input_dict)
    assert (
        dm_value == dr_shrink_0_value
    ), "DoublyRobustWithShrinkage (lambda=0) should be the same as DirectMethod"
    assert (
        np.abs(dr_value - dr_shrink_max_value) < 1e-5
    ), "DoublyRobustWithShrinkage (lambda=inf) should be almost the same as DoublyRobust"

    # prepare input dict (tensor version)
    input_tensor_dict = {
        k: v if v is None else torch.from_numpy(v)
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_tensor_dict["action_dist"] = torch.from_numpy(action_dist)
    input_tensor_dict["estimated_rewards_by_reg_model"] = torch.from_numpy(
        expected_reward
    )
    dm_value = dm.estimate_policy_value_tensor(**input_tensor_dict)
    dr_value = dr.estimate_policy_value_tensor(**input_tensor_dict)
    dr_shrink_0_value = dr_shrink_0.estimate_policy_value_tensor(**input_tensor_dict)
    dr_shrink_max_value = dr_shrink_max.estimate_policy_value_tensor(
        **input_tensor_dict
    )
    assert (
        dm_value.item() == dr_shrink_0_value.item()
    ), "DoublyRobustWithShrinkage (lambda=0) should be the same as DirectMethod"
    assert (
        np.abs(dr_value.item() - dr_shrink_max_value.item()) < 1e-5
    ), "DoublyRobustWithShrinkage (lambda=inf) should be almost the same as DoublyRobust"
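# The lambda hyperparameter above controls how strongly DoublyRobustWithShrinkage
# shrinks the importance weight. A hedged sketch of the shrinkage rule the two
# assertions rely on, assuming the optimistic-shrinkage form
# w_os = lambda * w / (w^2 + lambda); check the library source before relying on it:
def _shrunk_weight(iw, lambda_):
    if lambda_ == 0.0:
        return np.zeros_like(iw)  # correction term vanishes -> DirectMethod
    return lambda_ * iw / (iw ** 2 + lambda_)  # tends to iw as lambda_ -> inf, i.e., DR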
def test_switch_dr_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the switch_dr using synthetic bandit data and random evaluation policy"""
    expected_reward = np.expand_dims(
        synthetic_bandit_feedback["expected_reward"], axis=-1
    )
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    dm_value = dm.estimate_policy_value(**input_dict)
    dr_value = dr.estimate_policy_value(**input_dict)
    switch_dr_0_value = switch_dr_0.estimate_policy_value(**input_dict)
    switch_dr_max_value = switch_dr_max.estimate_policy_value(**input_dict)
    assert (
        dm_value == switch_dr_0_value
    ), "SwitchDR (tau=0) should be the same as DirectMethod"
    assert (
        dr_value == switch_dr_max_value
    ), "SwitchDR (tau=1e10) should be the same as DoublyRobust"
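# Switch-DR interpolates between DM and DR by discarding the importance-weighted
# correction wherever the weight exceeds a threshold tau. A hypothetical sketch of
# the switching rule behind the two assertions (per-round correction term only):
def _switch_dr_correction(iw, reward, q_hat_factual, tau):
    switch = (iw <= tau).astype(float)  # tau=0 disables all corrections (DM)
    return switch * iw * (reward - q_hat_factual)  # tau=1e10 keeps them all (DR)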
def test_dr_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the format of dr variants using synthetic bandit data and random evaluation policy"""
    expected_reward = np.expand_dims(
        synthetic_bandit_feedback["expected_reward"], axis=-1
    )
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    # dr estimators require all arguments
    for estimator in dr_estimators:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"
    # remove required arguments (to check the error handling)
    del input_dict["reward"]
    del input_dict["pscore"]
    del input_dict["action"]
    del input_dict["estimated_rewards_by_reg_model"]
    for estimator in dr_estimators:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value() missing 4 required positional arguments: 'reward', 'action', 'pscore', and 'estimated_rewards_by_reg_model'"
            ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
def test_sg_dr_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the sg_dr estimator using synthetic bandit data and random evaluation policy"""
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis]
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    dr_value = dr.estimate_policy_value(**input_dict)
    sg_dr_0_value = sg_dr_0.estimate_policy_value(**input_dict)
    assert (
        dr_value == sg_dr_0_value
    ), "SG-DR (lambda=0) should be the same as DoublyRobust"
    # dr with estimated pscore (set equal to the true pscore)
    input_dict["estimated_pscore"] = input_dict["pscore"]
    del input_dict["pscore"]
    dr_value_estimated_pscore = dr_estimated_pscore.estimate_policy_value(**input_dict)
    assert (
        dr_value == dr_value_estimated_pscore
    ), "DoublyRobust with estimated_pscore (which is the same as pscore) should be the same as DoublyRobust"
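# SG-DR shrinks the importance weight with a power-mean style correction. A hedged
# sketch, assuming the form w / ((1 - lambda) + lambda * w) (verify against the
# library source): at lambda=0 the weight is untouched, which is why sg_dr_0 must
# coincide with plain DR above.
def _subgaussian_weight(iw, lambda_):
    return iw / ((1.0 - lambda_) + lambda_ * iw)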
def test_boundedness_of_snipw_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the boundedness of snipw estimators using synthetic bandit data and random evaluation policy"""
    action_dist = random_action_dist
    # prepare snipw
    snipw = SelfNormalizedInverseProbabilityWeighting()
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    # make pscore too small (to check the boundedness of snipw)
    input_dict["pscore"] = input_dict["pscore"] ** 3
    estimated_policy_value = snipw.estimate_policy_value(**input_dict)
    assert (
        estimated_policy_value <= 1
    ), f"estimated policy value of snipw should be smaller than or equal to 1 (because of its 1-boundedness), but the value is: {estimated_policy_value}"

    # snipw with estimated pscore
    snipw_estimated_pscore = SelfNormalizedInverseProbabilityWeighting(
        use_estimated_pscore=True
    )
    input_dict["estimated_pscore"] = input_dict["pscore"]
    del input_dict["pscore"]
    estimated_policy_value = snipw_estimated_pscore.estimate_policy_value(**input_dict)
    assert (
        estimated_policy_value <= 1
    ), f"estimated policy value of snipw should be smaller than or equal to 1 (because of its 1-boundedness), but the value is: {estimated_policy_value}"
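# Why the 1-boundedness survives arbitrarily small pscores: SNIPW is the weighted
# average sum(iw * r) / sum(iw), which can never exceed max(r) (= 1 for the binary
# rewards in this fixture). A self-contained numerical check of that algebra:
def _snipw_is_weighted_average():
    rng = np.random.default_rng(12345)
    iw = rng.exponential(size=1000) ** 3  # extreme weights, mimicking cubed pscores
    r = rng.binomial(1, 0.3, size=1000)
    assert np.sum(iw * r) / np.sum(iw) <= r.max()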
def test_boundedness_of_sndr_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the boundedness of sndr estimators using synthetic bandit data and random evaluation policy"""
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis]
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    # make pscore too small (to check the boundedness of sndr)
    input_dict["pscore"] = input_dict["pscore"] ** 3
    estimated_policy_value = sndr.estimate_policy_value(**input_dict)
    assert (
        estimated_policy_value <= 2
    ), f"estimated policy value of sndr should be smaller than or equal to 2 (because of its 2-boundedness), but the value is: {estimated_policy_value}"

    # prepare input dict (tensor version)
    input_tensor_dict = {
        k: v if v is None else torch.from_numpy(v)
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_tensor_dict["action_dist"] = torch.from_numpy(action_dist)
    input_tensor_dict["estimated_rewards_by_reg_model"] = torch.from_numpy(
        expected_reward
    )
    # make pscore too small (to check the boundedness of sndr)
    input_tensor_dict["pscore"] = input_tensor_dict["pscore"] ** 3
    estimated_policy_value = sndr.estimate_policy_value_tensor(**input_tensor_dict)
    assert (
        estimated_policy_value.item() <= 2
    ), f"estimated policy value of sndr should be smaller than or equal to 2 (because of its 2-boundedness), but the value is: {estimated_policy_value.item()}"
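# The 2-boundedness asserted above follows from splitting SNDR into its two parts:
# the direct-method term is a convex combination of q_hat values (<= 1 for rewards
# in [0, 1]), and the self-normalized correction is a weighted average of values in
# [-1, 1]. A sketch under those assumptions (single-slot data, hypothetical helper):
def _sndr_bound_sketch(iw, reward, q_hat_factual, q_hat_pi_e):
    dm_term = q_hat_pi_e.mean()  # E_x[E_{a ~ pi_e}[q_hat(x, a)]] <= 1
    correction = np.sum(iw * (reward - q_hat_factual)) / np.sum(iw)  # in [-1, 1]
    return dm_term + correction  # hence <= 2 overall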
def test_ipw_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the format of ipw variants using synthetic bandit data and random evaluation policy"""
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in [ipw, snipw, ipw_tuning_mse, ipw_tuning_slope]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"
    # ipw with estimated pscore (set equal to the true pscore)
    ipw_estimated_pscore = InverseProbabilityWeighting(use_estimated_pscore=True)
    snipw_estimated_pscore = SelfNormalizedInverseProbabilityWeighting(
        use_estimated_pscore=True
    )
    ipw_tuning_estimated_pscore = InverseProbabilityWeightingTuning(
        lambdas=[10, 1000], use_estimated_pscore=True
    )
    input_dict["estimated_pscore"] = input_dict["pscore"]
    del input_dict["pscore"]
    for estimator in [
        ipw_estimated_pscore,
        snipw_estimated_pscore,
        ipw_tuning_estimated_pscore,
    ]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"
    # remove required arguments (to check the error handling)
    del input_dict["reward"]
    del input_dict["action"]
    for estimator in [ipw, snipw]:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value() missing 2 required positional arguments: 'reward' and 'action'"
            ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
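# ipw_tuning_mse / ipw_tuning_slope pick one value out of the candidate lambdas
# (e.g., [10, 1000]) and estimate the policy value at the chosen clipping level.
# A hedged sketch of the clipped estimate each candidate produces, assuming plain
# weight clipping min(iw, lambda); the tuning logic itself (estimated-MSE
# minimization or the slope heuristic) is omitted:
def _clipped_ipw_value(iw, reward, lambda_):
    return np.mean(np.minimum(iw, lambda_) * reward)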
def test_dr_using_random_evaluation_policy(
    synthetic_multi_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the format of dr variants using synthetic bandit data and random evaluation policy"""
    expected_reward = synthetic_multi_bandit_feedback["expected_reward"][
        :, :, np.newaxis
    ]
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_multi_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "pscore_avg", "stratum_idx", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    naive_dr = NaiveDR()
    bal_dr = BalDR()
    weighted_dr = WeightedDR()
    # dr estimators require all arguments
    for estimator in [naive_dr, bal_dr, weighted_dr]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"
    # dr estimators with estimated pscore (set equal to the true pscore)
    naive_dr = NaiveDR(use_estimated_pscore=True)
    bal_dr = BalDR(use_estimated_pscore=True)
    weighted_dr = WeightedDR(use_estimated_pscore=True)
    input_dict["estimated_pscore"] = input_dict["pscore"]
    input_dict["estimated_pscore_avg"] = input_dict["pscore"]
    for estimator in [naive_dr, bal_dr, weighted_dr]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"
    # remove required arguments (to check the error handling)
    del input_dict["reward"]
    del input_dict["action"]
    del input_dict["estimated_rewards_by_reg_model"]
    for estimator in [naive_dr, bal_dr, weighted_dr]:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value() missing 3 required positional arguments: 'reward', 'action', and 'estimated_rewards_by_reg_model'"
            ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
def test_fixture(
    synthetic_bandit_feedback: BanditFeedback,
    expected_reward_0: np.ndarray,
    feedback_key_set: Set[str],
    random_action_dist: np.ndarray,
) -> None:
    """Check the validity of the fixture data generated by conftest.py"""
    np.testing.assert_array_almost_equal(
        expected_reward_0, synthetic_bandit_feedback["expected_reward"][0]
    )
    assert feedback_key_set == set(
        synthetic_bandit_feedback.keys()
    ), f"Key set of bandit feedback should be {feedback_key_set}, but {synthetic_bandit_feedback.keys()}"
def test_performance_of_binary_outcome_models( fixed_synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray) -> None: """ Test the performance of ope estimators using synthetic bandit data and random evaluation policy when the importance weight estimator is estimated by a logistic regression """ bandit_feedback = fixed_synthetic_bandit_feedback.copy() action_dist = random_action_dist random_state = 12345 auc_scores: Dict[str, float] = {} fit_methods = ["sample", "raw"] for fit_method in fit_methods: for model_name, model in binary_model_dict.items(): importance_weight_estimator = ImportanceWeightEstimator( n_actions=bandit_feedback["n_actions"], action_context=bandit_feedback["action_context"], base_model=model(**hyperparams[model_name]), fitting_method=fit_method, len_list=1, ) # train importance weight estimator on logged bandit feedback data estimated_importance_weight = importance_weight_estimator.fit_predict( context=bandit_feedback["context"], action=bandit_feedback["action"], action_dist=action_dist, n_folds=2, # 2-fold cross-fitting random_state=random_state, evaluate_model_performance=True, ) assert np.all(estimated_importance_weight >= 0 ), "estimated_importance_weight must be non-negative" # extract predictions tmp_y = [] tmp_pred = [] for i in range(len(importance_weight_estimator.eval_result["y"])): tmp_y.append(importance_weight_estimator.eval_result["y"][i]) tmp_pred.append( importance_weight_estimator.eval_result["proba"][i]) y_test = np.array(tmp_y).flatten() y_pred = np.array(tmp_pred).flatten() auc_scores[model_name + "_" + fit_method] = roc_auc_score( y_true=y_test, y_score=y_pred, ) for model_name in auc_scores: print(f"AUC of {model_name} is {auc_scores[model_name]}") assert (auc_scores[model_name] > 0.5), f"AUC of {model_name} should be greater than 0.5"
def test_ipw_using_random_evaluation_policy(
    synthetic_multi_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the format of ipw variants using synthetic bandit data and random evaluation policy"""
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_multi_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "pscore_avg", "stratum_idx", "position"]
    }
    input_dict["action_dist"] = action_dist
    naive_ipw = NaiveIPW()
    bal_ipw = BalIPW()
    weighted_ipw = WeightedIPW()
    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in [naive_ipw, bal_ipw, weighted_ipw]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"
    # ipw with estimated pscore (set equal to the true pscore)
    naive_ipw = NaiveIPW(use_estimated_pscore=True)
    bal_ipw = BalIPW(use_estimated_pscore=True)
    weighted_ipw = WeightedIPW(use_estimated_pscore=True)
    input_dict["estimated_pscore"] = input_dict["pscore"]
    input_dict["estimated_pscore_avg"] = input_dict["pscore"]
    del input_dict["pscore"]
    del input_dict["pscore_avg"]
    for estimator in [naive_ipw, bal_ipw, weighted_ipw]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"
    # remove required arguments (to check the error handling)
    del input_dict["reward"]
    del input_dict["action"]
    for estimator in [naive_ipw, weighted_ipw, bal_ipw]:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value() missing 2 required positional arguments: 'reward' and 'action'"
            ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
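# With multiple loggers, the three estimators above differ mainly in which
# propensity they divide by. A hedged sketch of the naive vs. balanced weights,
# assuming pscore is each stratum's own propensity and pscore_avg is that of the
# average behavior policy (the weighted variant additionally re-weights strata,
# omitted here):
def _multi_logger_weights(pi_e_of_logged_action, pscore, pscore_avg):
    naive_iw = pi_e_of_logged_action / pscore  # stratum-specific propensity
    bal_iw = pi_e_of_logged_action / pscore_avg  # average behavior policy
    return naive_iw, bal_iw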
def test_dm_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the performance of the direct method using synthetic bandit data and random evaluation policy"""
    expected_reward = np.expand_dims(
        synthetic_bandit_feedback["expected_reward"], axis=-1
    )
    action_dist = random_action_dist
    # compute ground truth policy value using expected reward
    q_pi_e = np.average(expected_reward[:, :, 0], weights=action_dist[:, :, 0], axis=1)
    # compute statistics of ground truth policy value
    gt_mean = q_pi_e.mean()
    # prepare dm
    dm = DirectMethod()
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    # estimated_rewards_by_reg_model is required
    with pytest.raises(
        TypeError,
        match=re.escape(
            "estimate_policy_value() missing 1 required positional argument: 'estimated_rewards_by_reg_model'"
        ),
    ):
        _ = dm.estimate_policy_value(**input_dict)
    # add estimated_rewards_by_reg_model
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    # check expectation
    estimated_policy_value = dm.estimate_policy_value(**input_dict)
    assert (
        gt_mean == estimated_policy_value
    ), "DM should be perfect when the regression model is perfect"
    # remove unnecessary keys
    del input_dict["reward"]
    del input_dict["pscore"]
    del input_dict["action"]
    estimated_policy_value = dm.estimate_policy_value(**input_dict)
    assert (
        gt_mean == estimated_policy_value
    ), "DM should be perfect when the regression model is perfect"
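# DM uses nothing but the regression model's predictions, which is why the test can
# delete reward/pscore/action and still get the identical estimate. A minimal sketch
# of the quantity it computes (single-slot data), mirroring the q_pi_e line above:
def _naive_dm_value(action_dist, q_hat):
    # E_x[sum_a pi_e(a|x) * q_hat(x, a)]; exact when q_hat equals the true q
    return np.average(q_hat[:, :, 0], weights=action_dist[:, :, 0], axis=1).mean()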
def test_performance_of_binary_outcome_models(
    fixed_synthetic_bandit_feedback: BanditFeedback,
) -> None:
    """Test the performance of ope estimators using synthetic bandit data and random evaluation policy
    when the propensity score estimator is estimated by a logistic regression"""
    bandit_feedback = fixed_synthetic_bandit_feedback.copy()
    random_state = 12345
    auc_scores: Dict[str, float] = {}
    for model_name, model in binary_model_dict.items():
        propensity_score_estimator = PropensityScoreEstimator(
            n_actions=bandit_feedback["n_actions"],
            base_model=model(**hyperparams[model_name]),
            len_list=1,
        )
        # train propensity score estimator on logged bandit feedback data
        estimated_propensity_score = propensity_score_estimator.fit_predict(
            context=bandit_feedback["context"],
            action=bandit_feedback["action"],
            n_folds=2,  # 2-fold cross-fitting
            random_state=random_state,
            evaluate_model_performance=True,
        )
        assert np.all(
            estimated_propensity_score >= 0
        ), "estimated_propensity_score must be non-negative"
        # extract predictions accumulated over the cross-fitting folds
        tmp_y = []
        tmp_pred = []
        for i in range(len(propensity_score_estimator.eval_result["y"])):
            tmp_y.append(propensity_score_estimator.eval_result["y"][i])
            tmp_pred.append(propensity_score_estimator.eval_result["proba"][i])
        y_test = np.array(tmp_y).flatten()
        y_pred = np.array(tmp_pred).reshape(-1, tmp_pred[0].shape[1])
        auc_scores[model_name] = roc_auc_score(
            y_true=y_test, y_score=y_pred, multi_class="ovo"
        )
    for model_name in auc_scores:
        print(f"AUC (macro-ovo) of {model_name} is {auc_scores[model_name]}")
        assert (
            auc_scores[model_name] > 0.5
        ), f"AUC of {model_name} should be greater than 0.5"
def test_ipw_using_random_evaluation_policy(
    synthetic_bandit_feedback_with_embed: BanditFeedback,
    random_action_dist: np.ndarray,
) -> None:
    """Test the format of ipw variants using synthetic bandit data and random evaluation policy"""
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback_with_embed.items()
        if k in ["reward", "action", "pi_b", "action_embed", "context", "position"]
    }
    input_dict["action_dist"] = action_dist
    mipw = MIPW(n_actions=synthetic_bandit_feedback_with_embed["n_actions"])
    mipw_exact = MIPW(
        n_actions=synthetic_bandit_feedback_with_embed["n_actions"],
        embedding_selection_method="exact",
    )
    mipw_greedy = MIPW(
        n_actions=synthetic_bandit_feedback_with_embed["n_actions"],
        embedding_selection_method="greedy",
    )
    snmipw = SNMIPW(n_actions=synthetic_bandit_feedback_with_embed["n_actions"])
    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in [mipw, mipw_exact, mipw_greedy, snmipw]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"
    # remove required arguments (to check the error handling)
    del input_dict["reward"]
    del input_dict["action"]
    for estimator in [mipw, snmipw]:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value() missing 2 required positional arguments: 'reward' and 'action'"
            ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
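# MIPW replaces the action-level importance weight with one defined on the action
# embedding, which is why the input dict carries pi_b and action_embed instead of
# pscore. A hedged sketch of the marginal weight for a discrete embedding, assuming
# known conditional embedding probabilities p(e|a) (the library estimates the ratio
# from data instead):
def _marginal_importance_weight(p_e_given_a, pi_e, pi_b):
    # p_e_given_a: (n_actions, n_embed); pi_e, pi_b: (n_rounds, n_actions)
    p_e_under_pi_e = pi_e @ p_e_given_a  # p(e|x, pi_e)
    p_e_under_pi_b = pi_b @ p_e_given_a  # p(e|x, pi_b)
    return p_e_under_pi_e / p_e_under_pi_b  # w(x, e), shape (n_rounds, n_embed)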
def test_bipw_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the format of bipw variants using synthetic bandit data and random evaluation policy"""
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    # insert dummy values
    input_dict["estimated_importance_weights"] = np.ones(action_dist.shape[0])
    # check response
    for estimator in [bipw]:
        estimated_policy_value = estimator.estimate_policy_value(**input_dict)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"
    # make estimated_importance_weights too small (to check the boundedness of bipw)
    input_dict["estimated_importance_weights"] = input_dict["pscore"] ** 3
    estimated_policy_value = bipw.estimate_policy_value(**input_dict)
    assert (
        estimated_policy_value <= 1
    ), f"estimated policy value of bipw should be smaller than or equal to 1 (because of its 1-boundedness), but the value is: {estimated_policy_value}"
    # remove required arguments (to check the error handling)
    del input_dict["reward"]
    del input_dict["action"]
    del input_dict["estimated_importance_weights"]
    for estimator in [bipw]:
        with pytest.raises(
            TypeError,
            match=re.escape(
                "estimate_policy_value() missing 3 required positional arguments: 'reward', 'action', and 'estimated_importance_weights'"
            ),
        ):
            _ = estimator.estimate_policy_value(**input_dict)
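# The 1-boundedness checked above is consistent with bipw self-normalizing the
# supplied estimated_importance_weights: a weighted average of rewards in [0, 1]
# cannot exceed 1, no matter how badly the weights are scaled. A sketch under that
# assumption (hypothetical helper, not the library implementation):
def _self_normalized_value(w_hat, reward):
    return np.sum(w_hat * reward) / np.sum(w_hat)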
def test_performance_of_binary_outcome_models(
    fixed_synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """Test the performance of ope estimators using synthetic bandit data and random evaluation policy
    when the regression model is estimated by a logistic regression"""
    bandit_feedback = fixed_synthetic_bandit_feedback.copy()
    expected_reward = np.expand_dims(bandit_feedback["expected_reward"], axis=-1)
    action_dist = random_action_dist
    # compute ground truth policy value using expected reward
    q_pi_e = np.average(expected_reward[:, :, 0], weights=action_dist[:, :, 0], axis=1)
    # compute statistics of ground truth policy value
    gt_mean = q_pi_e.mean()
    random_state = 12345
    auc_scores: Dict[str, float] = {}
    # check ground truth
    print(f"gt_mean: {gt_mean}")
    # check the performance of regression models using the doubly robust criterion
    # (|\hat{q} - q| <= |q| should be satisfied with a high probability)
    dr_criteria_pass_rate = 0.8
    fit_methods = ["normal", "iw", "mrdr"]
    for fit_method in fit_methods:
        for model_name, model in binary_model_dict.items():
            regression_model = RegressionModel(
                n_actions=bandit_feedback["n_actions"],
                len_list=int(bandit_feedback["position"].max() + 1),
                action_context=bandit_feedback["action_context"],
                base_model=model(**hyperparams[model_name]),
                fitting_method=fit_method,
            )
            if fit_method == "normal":
                # train regression model on logged bandit feedback data
                estimated_rewards_by_reg_model = regression_model.fit_predict(
                    context=bandit_feedback["context"],
                    action=bandit_feedback["action"],
                    reward=bandit_feedback["reward"],
                    n_folds=3,  # 3-fold cross-fitting
                    random_state=random_state,
                )
            else:
                # train regression model on logged bandit feedback data
                estimated_rewards_by_reg_model = regression_model.fit_predict(
                    context=bandit_feedback["context"],
                    action=bandit_feedback["action"],
                    reward=bandit_feedback["reward"],
                    pscore=bandit_feedback["pscore"],
                    position=bandit_feedback["position"],
                    action_dist=action_dist,
                    n_folds=3,  # 3-fold cross-fitting
                    random_state=random_state,
                )
            auc_scores[model_name + "_" + fit_method] = roc_auc_score(
                y_true=bandit_feedback["reward"],
                y_score=estimated_rewards_by_reg_model[
                    np.arange(bandit_feedback["reward"].shape[0]),
                    bandit_feedback["action"],
                    bandit_feedback["position"],
                ],
            )
            # compare against the dr criterion
            dr_criteria = np.abs(gt_mean - estimated_rewards_by_reg_model) - np.abs(
                gt_mean
            )
            print(
                f"DR criterion is satisfied with probability {np.mean(dr_criteria <= 0)} ------ model: {model_name} ({fit_method})"
            )
            assert (
                np.mean(dr_criteria <= 0) >= dr_criteria_pass_rate
            ), f"DR criterion should be satisfied with probability at least {dr_criteria_pass_rate}"
    for model_name in auc_scores:
        print(f"AUC of {model_name} is {auc_scores[model_name]}")
        assert (
            auc_scores[model_name] > 0.5
        ), f"AUC of {model_name} should be greater than 0.5"
def test_dr_using_random_evaluation_policy( synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray) -> None: """ Test the format of dr variants using synthetic bandit data and random evaluation policy """ expected_reward = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis] action_dist = random_action_dist # prepare input dict input_dict = { k: v for k, v in synthetic_bandit_feedback.items() if k in ["reward", "action", "pscore", "position"] } input_dict["action_dist"] = action_dist input_dict["estimated_rewards_by_reg_model"] = expected_reward # dr estimtors require all arguments for estimator in dr_estimators: estimated_policy_value = estimator.estimate_policy_value(**input_dict) assert isinstance(estimated_policy_value, float), f"invalid type response: {estimator}" # remove necessary keys del input_dict["reward"] del input_dict["pscore"] del input_dict["action"] del input_dict["estimated_rewards_by_reg_model"] for estimator in dr_estimators: with pytest.raises( TypeError, match=re.escape( "estimate_policy_value() missing 4 required positional arguments: 'reward', 'action', 'pscore', and 'estimated_rewards_by_reg_model'" ), ): _ = estimator.estimate_policy_value(**input_dict) # prepare input dict input_tensor_dict = { k: v if v is None else torch.from_numpy(v) for k, v in synthetic_bandit_feedback.items() if k in ["reward", "action", "pscore", "position"] } input_tensor_dict["action_dist"] = torch.from_numpy(action_dist) input_tensor_dict["estimated_rewards_by_reg_model"] = torch.from_numpy( expected_reward) # dr estimtors require all arguments for estimator in dr_estimators: if estimator.estimator_name == "switch-dr": with pytest.raises( NotImplementedError, match=re.escape( "This is not implemented for Swtich-DR because it is indifferentiable." ), ): _ = estimator.estimate_policy_value_tensor(**input_tensor_dict) else: estimated_policy_value = estimator.estimate_policy_value_tensor( **input_tensor_dict) assert isinstance( estimated_policy_value, torch.Tensor), f"invalid type response: {estimator}" # remove necessary keys del input_tensor_dict["reward"] del input_tensor_dict["pscore"] del input_tensor_dict["action"] del input_tensor_dict["estimated_rewards_by_reg_model"] for estimator in dr_estimators: if estimator.estimator_name == "switch-dr": with pytest.raises( NotImplementedError, match=re.escape( "This is not implemented for Swtich-DR because it is indifferentiable." ), ): _ = estimator.estimate_policy_value_tensor(**input_tensor_dict) else: with pytest.raises( TypeError, match=re.escape( "estimate_policy_value_tensor() missing 4 required positional arguments: 'reward', 'action', 'pscore', and 'estimated_rewards_by_reg_model'" ), ): _ = estimator.estimate_policy_value_tensor(**input_tensor_dict)