def test_ipw_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Check the response format of the ipw variants, through both the numpy and
    the tensor entry points, using synthetic bandit data and a random
    evaluation policy
    """
    feedback_keys = ("reward", "action", "pscore", "position")
    required_keys = ("reward", "pscore", "action")

    # numpy inputs: the selected feedback entries plus the action distribution
    np_inputs = {
        key: value
        for key, value in synthetic_bandit_feedback.items()
        if key in feedback_keys
    }
    np_inputs["action_dist"] = random_action_dist
    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in (ipw, snipw):
        estimated_policy_value = estimator.estimate_policy_value(**np_inputs)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"
    # without the required keys the call is expected to raise a TypeError
    for key in required_keys:
        del np_inputs[key]
    np_error_message = re.escape(
        "estimate_policy_value() missing 3 required positional arguments: 'reward', 'action', and 'pscore'"
    )
    for estimator in (ipw, snipw):
        with pytest.raises(TypeError, match=np_error_message):
            _ = estimator.estimate_policy_value(**np_inputs)

    # tensor inputs: same entries converted with torch.from_numpy (None is kept as is)
    tensor_inputs = {
        key: None if value is None else torch.from_numpy(value)
        for key, value in synthetic_bandit_feedback.items()
        if key in feedback_keys
    }
    tensor_inputs["action_dist"] = torch.from_numpy(random_action_dist)
    for estimator in (ipw, snipw):
        estimated_policy_value = estimator.estimate_policy_value_tensor(
            **tensor_inputs
        )
        assert isinstance(
            estimated_policy_value, torch.Tensor
        ), f"invalid type response: {estimator}"
    # without the required keys the tensor call is expected to raise a TypeError
    for key in required_keys:
        del tensor_inputs[key]
    tensor_error_message = re.escape(
        "estimate_policy_value_tensor() missing 3 required positional arguments: 'reward', 'action', and 'pscore'"
    )
    for estimator in (ipw, snipw):
        with pytest.raises(TypeError, match=tensor_error_message):
            _ = estimator.estimate_policy_value_tensor(**tensor_inputs)
def test_dr_shrinkage_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the dr shrinkage estimators using synthetic bandit data and random evaluation policy

    Two relations are asserted, once with numpy inputs and once with tensor inputs:
    the lambda=0 shrinkage estimate equals the DirectMethod estimate, and the
    lambda=inf shrinkage estimate is within 1e-5 of the DoublyRobust estimate.
    """
    # the expected reward (with an added trailing axis) is used as the regression model output
    expected_reward = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis]
    action_dist = random_action_dist
    # prepare input dict
    input_dict = {
        k: v
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_dict["action_dist"] = action_dist
    input_dict["estimated_rewards_by_reg_model"] = expected_reward
    dm_value = dm.estimate_policy_value(**input_dict)
    dr_value = dr.estimate_policy_value(**input_dict)
    dr_shrink_0_value = dr_shrink_0.estimate_policy_value(**input_dict)
    dr_shrink_max_value = dr_shrink_max.estimate_policy_value(**input_dict)
    assert (
        dm_value == dr_shrink_0_value
    ), "DoublyRobustWithShrinkage (lambda=0) should be the same as DirectMethod"
    assert (
        np.abs(dr_value - dr_shrink_max_value) < 1e-5
    ), "DoublyRobustWithShrinkage (lambda=inf) should be almost the same as DoublyRobust"

    # prepare input dict (tensor version; None entries are passed through unchanged)
    input_tensor_dict = {
        k: v if v is None else torch.from_numpy(v)
        for k, v in synthetic_bandit_feedback.items()
        if k in ["reward", "action", "pscore", "position"]
    }
    input_tensor_dict["action_dist"] = torch.from_numpy(action_dist)
    input_tensor_dict["estimated_rewards_by_reg_model"] = torch.from_numpy(
        expected_reward
    )
    dm_value = dm.estimate_policy_value_tensor(**input_tensor_dict)
    dr_value = dr.estimate_policy_value_tensor(**input_tensor_dict)
    dr_shrink_0_value = dr_shrink_0.estimate_policy_value_tensor(**input_tensor_dict)
    dr_shrink_max_value = dr_shrink_max.estimate_policy_value_tensor(
        **input_tensor_dict
    )
    # .item() extracts Python scalars so the comparisons mirror the numpy half
    assert (
        dm_value.item() == dr_shrink_0_value.item()
    ), "DoublyRobustWithShrinkage (lambda=0) should be the same as DirectMethod"
    # the message below was split across two physical lines inside the string
    # literal (a syntax error); it is rejoined into a single literal here
    assert (
        np.abs(dr_value.item() - dr_shrink_max_value.item()) < 1e-5
    ), "DoublyRobustWithShrinkage (lambda=inf) should be almost the same as DoublyRobust"
def test_boundedness_of_snipw_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Check that snipw estimates stay at or below 1 when the propensity scores
    are shrunk, using synthetic bandit data and a random evaluation policy
    """
    feedback_keys = ("reward", "action", "pscore", "position")
    # estimator that reads the logged pscore
    snipw_with_pscore = SelfNormalizedInverseProbabilityWeighting()

    inputs = {
        key: value
        for key, value in synthetic_bandit_feedback.items()
        if key in feedback_keys
    }
    inputs["action_dist"] = random_action_dist
    # cube the pscore to make it too small (to check the boundedness of snipw)
    inputs["pscore"] = inputs["pscore"] ** 3
    estimated_policy_value = snipw_with_pscore.estimate_policy_value(**inputs)
    assert (
        estimated_policy_value <= 1
    ), f"estimated policy value of snipw should be smaller than or equal to 1 (because of its 1-boundedness), but the value is: {estimated_policy_value}"

    # same check with the shrunk scores supplied as estimated_pscore instead of pscore
    snipw_estimated_pscore = SelfNormalizedInverseProbabilityWeighting(
        use_estimated_pscore=True
    )
    inputs["estimated_pscore"] = inputs.pop("pscore")
    estimated_policy_value = snipw_estimated_pscore.estimate_policy_value(**inputs)
    assert (
        estimated_policy_value <= 1
    ), f"estimated policy value of snipw should be smaller than or equal to 1 (because of its 1-boundedness), but the value is: {estimated_policy_value}"
def test_switch_dr_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Compare switch_dr at its two extreme settings with dm and dr, using
    synthetic bandit data and a random evaluation policy
    """
    feedback_keys = ("reward", "action", "pscore", "position")
    # the expected reward (with an added trailing axis) serves as the regression model output
    reg_model_output = np.expand_dims(
        synthetic_bandit_feedback["expected_reward"], axis=-1
    )

    inputs = {
        key: value
        for key, value in synthetic_bandit_feedback.items()
        if key in feedback_keys
    }
    inputs["action_dist"] = random_action_dist
    inputs["estimated_rewards_by_reg_model"] = reg_model_output

    dm_value = dm.estimate_policy_value(**inputs)
    dr_value = dr.estimate_policy_value(**inputs)
    switch_dr_0_value = switch_dr_0.estimate_policy_value(**inputs)
    switch_dr_max_value = switch_dr_max.estimate_policy_value(**inputs)

    assert (
        dm_value == switch_dr_0_value
    ), "SwitchDR (tau=0) should be the same as DirectMethod"
    assert (
        dr_value == switch_dr_max_value
    ), "SwitchDR (tau=1e10) should be the same as DoublyRobust"
def test_dr_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Check the response format of the dr variants using synthetic bandit data
    and a random evaluation policy
    """
    feedback_keys = ("reward", "action", "pscore", "position")
    required_keys = ("reward", "pscore", "action", "estimated_rewards_by_reg_model")
    # the expected reward (with an added trailing axis) serves as the regression model output
    reg_model_output = np.expand_dims(
        synthetic_bandit_feedback["expected_reward"], axis=-1
    )

    inputs = {
        key: value
        for key, value in synthetic_bandit_feedback.items()
        if key in feedback_keys
    }
    inputs["action_dist"] = random_action_dist
    inputs["estimated_rewards_by_reg_model"] = reg_model_output

    # dr estimators require all arguments
    for estimator in dr_estimators:
        estimated_policy_value = estimator.estimate_policy_value(**inputs)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"

    # without the required keys the call is expected to raise a TypeError
    for key in required_keys:
        del inputs[key]
    error_message = re.escape(
        "estimate_policy_value() missing 4 required positional arguments: 'reward', 'action', 'pscore', and 'estimated_rewards_by_reg_model'"
    )
    for estimator in dr_estimators:
        with pytest.raises(TypeError, match=error_message):
            _ = estimator.estimate_policy_value(**inputs)
def test_sg_dr_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Compare sg_dr (lambda=0) and dr with estimated_pscore against plain dr,
    using synthetic bandit data and a random evaluation policy
    """
    feedback_keys = ("reward", "action", "pscore", "position")
    # the expected reward (with an added trailing axis) serves as the regression model output
    reg_model_output = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis]

    inputs = {
        key: value
        for key, value in synthetic_bandit_feedback.items()
        if key in feedback_keys
    }
    inputs["action_dist"] = random_action_dist
    inputs["estimated_rewards_by_reg_model"] = reg_model_output

    dr_value = dr.estimate_policy_value(**inputs)
    sg_dr_0_value = sg_dr_0.estimate_policy_value(**inputs)
    assert (
        dr_value == sg_dr_0_value
    ), "SG-DR (lambda=0) should be the same as DoublyRobust"

    # hand the true pscore over under the estimated_pscore key
    inputs["estimated_pscore"] = inputs.pop("pscore")
    dr_value_estimated_pscore = dr_estimated_pscore.estimate_policy_value(**inputs)
    assert (
        dr_value == dr_value_estimated_pscore
    ), "DoublyRobust with estimated_pscore (which is the same as pscore) should be the same as DoublyRobust"
def test_boundedness_of_sndr_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Check that sndr estimates stay at or below 2 when the propensity scores
    are shrunk, for both numpy and tensor inputs, using synthetic bandit data
    and a random evaluation policy
    """
    feedback_keys = ("reward", "action", "pscore", "position")
    # the expected reward (with an added trailing axis) serves as the regression model output
    reg_model_output = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis]

    # numpy inputs
    np_inputs = {
        key: value
        for key, value in synthetic_bandit_feedback.items()
        if key in feedback_keys
    }
    np_inputs["action_dist"] = random_action_dist
    np_inputs["estimated_rewards_by_reg_model"] = reg_model_output
    # cube the pscore to make it too small (to check the boundedness of sndr)
    np_inputs["pscore"] = np_inputs["pscore"] ** 3
    estimated_policy_value = sndr.estimate_policy_value(**np_inputs)
    assert (
        estimated_policy_value <= 2
    ), f"estimated policy value of sndr should be smaller than or equal to 2 (because of its 2-boundedness), but the value is: {estimated_policy_value}"

    # tensor inputs (None entries are passed through unchanged)
    tensor_inputs = {
        key: None if value is None else torch.from_numpy(value)
        for key, value in synthetic_bandit_feedback.items()
        if key in feedback_keys
    }
    tensor_inputs["action_dist"] = torch.from_numpy(random_action_dist)
    tensor_inputs["estimated_rewards_by_reg_model"] = torch.from_numpy(
        reg_model_output
    )
    # cube the pscore to make it too small (to check the boundedness of sndr)
    tensor_inputs["pscore"] = tensor_inputs["pscore"] ** 3
    estimated_policy_value = sndr.estimate_policy_value_tensor(**tensor_inputs)
    assert (
        estimated_policy_value.item() <= 2
    ), f"estimated policy value of sndr should be smaller than or equal to 2 (because of its 2-boundedness), but the value is: {estimated_policy_value.item()}"
def test_ipw_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Check the response format of the ipw variants (including the tuning and
    estimated-pscore versions) using synthetic bandit data and a random
    evaluation policy
    """
    feedback_keys = ("reward", "action", "pscore", "position")

    inputs = {
        key: value
        for key, value in synthetic_bandit_feedback.items()
        if key in feedback_keys
    }
    inputs["action_dist"] = random_action_dist

    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in (ipw, snipw, ipw_tuning_mse, ipw_tuning_slope):
        estimated_policy_value = estimator.estimate_policy_value(**inputs)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"

    # ipw with estimated pscore
    estimated_pscore_estimators = [
        InverseProbabilityWeighting(use_estimated_pscore=True),
        SelfNormalizedInverseProbabilityWeighting(use_estimated_pscore=True),
        InverseProbabilityWeightingTuning(
            lambdas=[10, 1000], use_estimated_pscore=True
        ),
    ]
    # hand the logged pscore over under the estimated_pscore key
    inputs["estimated_pscore"] = inputs.pop("pscore")
    for estimator in estimated_pscore_estimators:
        estimated_policy_value = estimator.estimate_policy_value(**inputs)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"

    # without the required keys the call is expected to raise a TypeError
    for key in ("reward", "action"):
        del inputs[key]
    error_message = re.escape(
        "estimate_policy_value() missing 2 required positional arguments: 'reward' and 'action'"
    )
    for estimator in (ipw, snipw):
        with pytest.raises(TypeError, match=error_message):
            _ = estimator.estimate_policy_value(**inputs)
def test_dr_using_random_evaluation_policy(
    synthetic_multi_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Check the response format of the NaiveDR, BalDR and WeightedDR estimators
    using synthetic bandit data and a random evaluation policy
    """
    feedback_keys = (
        "reward",
        "action",
        "pscore",
        "pscore_avg",
        "stratum_idx",
        "position",
    )
    # the expected reward (with an added trailing axis) serves as the regression model output
    reg_model_output = synthetic_multi_bandit_feedback["expected_reward"][
        :, :, np.newaxis
    ]

    inputs = {
        key: value
        for key, value in synthetic_multi_bandit_feedback.items()
        if key in feedback_keys
    }
    inputs["action_dist"] = random_action_dist
    inputs["estimated_rewards_by_reg_model"] = reg_model_output

    # dr estimators require all arguments
    for estimator in [NaiveDR(), BalDR(), WeightedDR()]:
        estimated_policy_value = estimator.estimate_policy_value(**inputs)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"

    # same estimators configured to read the estimated pscore entries
    estimated_pscore_estimators = [
        NaiveDR(use_estimated_pscore=True),
        BalDR(use_estimated_pscore=True),
        WeightedDR(use_estimated_pscore=True),
    ]
    inputs["estimated_pscore"] = inputs["pscore"]
    # NOTE(review): estimated_pscore_avg is filled from pscore, not pscore_avg -- confirm this is intended
    inputs["estimated_pscore_avg"] = inputs["pscore"]
    for estimator in estimated_pscore_estimators:
        estimated_policy_value = estimator.estimate_policy_value(**inputs)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"

    # without the required keys the call is expected to raise a TypeError
    for key in ("reward", "action", "estimated_rewards_by_reg_model"):
        del inputs[key]
    error_message = re.escape(
        "estimate_policy_value() missing 3 required positional arguments: 'reward', 'action', and 'estimated_rewards_by_reg_model'"
    )
    for estimator in estimated_pscore_estimators:
        with pytest.raises(TypeError, match=error_message):
            _ = estimator.estimate_policy_value(**inputs)
def test_ipw_using_random_evaluation_policy(
    synthetic_multi_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Check the response format of the NaiveIPW, BalIPW and WeightedIPW
    estimators using synthetic bandit data and a random evaluation policy
    """
    feedback_keys = (
        "reward",
        "action",
        "pscore",
        "pscore_avg",
        "stratum_idx",
        "position",
    )

    inputs = {
        key: value
        for key, value in synthetic_multi_bandit_feedback.items()
        if key in feedback_keys
    }
    inputs["action_dist"] = random_action_dist

    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in [NaiveIPW(), BalIPW(), WeightedIPW()]:
        estimated_policy_value = estimator.estimate_policy_value(**inputs)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"

    # ipw with estimated pscore
    naive_ipw = NaiveIPW(use_estimated_pscore=True)
    bal_ipw = BalIPW(use_estimated_pscore=True)
    weighted_ipw = WeightedIPW(use_estimated_pscore=True)
    inputs["estimated_pscore"] = inputs["pscore"]
    # NOTE(review): estimated_pscore_avg is filled from pscore, not pscore_avg -- confirm this is intended
    inputs["estimated_pscore_avg"] = inputs["pscore"]
    for key in ("pscore", "pscore_avg"):
        del inputs[key]
    for estimator in (naive_ipw, bal_ipw, weighted_ipw):
        estimated_policy_value = estimator.estimate_policy_value(**inputs)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"

    # without the required keys the call is expected to raise a TypeError
    for key in ("reward", "action"):
        del inputs[key]
    error_message = re.escape(
        "estimate_policy_value() missing 2 required positional arguments: 'reward' and 'action'"
    )
    for estimator in (naive_ipw, weighted_ipw, bal_ipw):
        with pytest.raises(TypeError, match=error_message):
            _ = estimator.estimate_policy_value(**inputs)
def test_dm_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Check that the direct method reproduces the ground-truth policy value when
    it is given the expected reward as its regression model output, using
    synthetic bandit data and a random evaluation policy
    """
    feedback_keys = ("reward", "action", "pscore", "position")
    expected_reward = np.expand_dims(
        synthetic_bandit_feedback["expected_reward"], axis=-1
    )

    # ground truth: expected reward averaged over actions with the evaluation
    # policy as weights, then averaged over the first axis
    q_pi_e = np.average(
        expected_reward[:, :, 0], weights=random_action_dist[:, :, 0], axis=1
    )
    gt_mean = q_pi_e.mean()

    direct_method = DirectMethod()
    inputs = {
        key: value
        for key, value in synthetic_bandit_feedback.items()
        if key in feedback_keys
    }
    inputs["action_dist"] = random_action_dist

    # estimated_rewards_by_reg_model is required
    missing_argument_message = re.escape(
        "estimate_policy_value() missing 1 required positional argument: 'estimated_rewards_by_reg_model'"
    )
    with pytest.raises(TypeError, match=missing_argument_message):
        _ = direct_method.estimate_policy_value(**inputs)

    # add estimated_rewards_by_reg_model and check the estimate
    inputs["estimated_rewards_by_reg_model"] = expected_reward
    estimated_policy_value = direct_method.estimate_policy_value(**inputs)
    assert (
        gt_mean == estimated_policy_value
    ), "DM should be perfect when the regression model is perfect"

    # the estimate must be unchanged once the unnecessary keys are removed
    for key in ("reward", "pscore", "action"):
        del inputs[key]
    estimated_policy_value = direct_method.estimate_policy_value(**inputs)
    assert (
        gt_mean == estimated_policy_value
    ), "DM should be perfect when the regression model is perfect"
def test_ipw_using_random_evaluation_policy(
    synthetic_bandit_feedback_with_embed: BanditFeedback,
    random_action_dist: np.ndarray,
) -> None:
    """
    Check the response format of the MIPW and SNMIPW estimators using
    synthetic bandit data and a random evaluation policy
    """
    feedback_keys = (
        "reward",
        "action",
        "pi_b",
        "action_embed",
        "context",
        "position",
    )
    n_actions = synthetic_bandit_feedback_with_embed["n_actions"]

    inputs = {
        key: value
        for key, value in synthetic_bandit_feedback_with_embed.items()
        if key in feedback_keys
    }
    inputs["action_dist"] = random_action_dist

    mipw = MIPW(n_actions=n_actions)
    mipw_exact = MIPW(n_actions=n_actions, embedding_selection_method="exact")
    mipw_greedy = MIPW(n_actions=n_actions, embedding_selection_method="greedy")
    snmipw = SNMIPW(n_actions=n_actions)

    # ipw estimators can be used without estimated_rewards_by_reg_model
    for estimator in (mipw, mipw_exact, mipw_greedy, snmipw):
        estimated_policy_value = estimator.estimate_policy_value(**inputs)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"

    # without the required keys the call is expected to raise a TypeError
    for key in ("reward", "action"):
        del inputs[key]
    error_message = re.escape(
        "estimate_policy_value() missing 2 required positional arguments: 'reward' and 'action'"
    )
    for estimator in (mipw, snmipw):
        with pytest.raises(TypeError, match=error_message):
            _ = estimator.estimate_policy_value(**inputs)
def test_bipw_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Check the response format and the upper bound of the bipw estimator using
    synthetic bandit data and a random evaluation policy
    """
    feedback_keys = ("reward", "action", "pscore", "position")

    inputs = {
        key: value
        for key, value in synthetic_bandit_feedback.items()
        if key in feedback_keys
    }
    inputs["action_dist"] = random_action_dist
    # dummy importance weights: all ones, one entry per row of the action distribution
    inputs["estimated_importance_weights"] = np.ones(random_action_dist.shape[0])

    # check response
    for estimator in (bipw,):
        estimated_policy_value = estimator.estimate_policy_value(**inputs)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"

    # make estimated_importance_weights too small (to check the boundedness of bipw)
    inputs["estimated_importance_weights"] = inputs["pscore"] ** 3
    estimated_policy_value = bipw.estimate_policy_value(**inputs)
    assert (
        estimated_policy_value <= 1
    ), f"estimated policy value of bipw should be smaller than or equal to 1 (because of its 1-boundedness), but the value is: {estimated_policy_value}"

    # without the required keys the call is expected to raise a TypeError
    for key in ("reward", "action", "estimated_importance_weights"):
        del inputs[key]
    error_message = re.escape(
        "estimate_policy_value() missing 3 required positional arguments: 'reward', 'action', and 'estimated_importance_weights'"
    )
    for estimator in (bipw,):
        with pytest.raises(TypeError, match=error_message):
            _ = estimator.estimate_policy_value(**inputs)
def test_dr_using_random_evaluation_policy(
    synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Check the response format of the dr variants, through both the numpy and
    the tensor entry points, using synthetic bandit data and a random
    evaluation policy
    """
    feedback_keys = ("reward", "action", "pscore", "position")
    required_keys = ("reward", "pscore", "action", "estimated_rewards_by_reg_model")
    # the expected reward (with an added trailing axis) serves as the regression model output
    reg_model_output = synthetic_bandit_feedback["expected_reward"][:, :, np.newaxis]

    # numpy inputs
    np_inputs = {
        key: value
        for key, value in synthetic_bandit_feedback.items()
        if key in feedback_keys
    }
    np_inputs["action_dist"] = random_action_dist
    np_inputs["estimated_rewards_by_reg_model"] = reg_model_output
    # dr estimators require all arguments
    for estimator in dr_estimators:
        estimated_policy_value = estimator.estimate_policy_value(**np_inputs)
        assert isinstance(
            estimated_policy_value, float
        ), f"invalid type response: {estimator}"
    # without the required keys the call is expected to raise a TypeError
    for key in required_keys:
        del np_inputs[key]
    np_error_message = re.escape(
        "estimate_policy_value() missing 4 required positional arguments: 'reward', 'action', 'pscore', and 'estimated_rewards_by_reg_model'"
    )
    for estimator in dr_estimators:
        with pytest.raises(TypeError, match=np_error_message):
            _ = estimator.estimate_policy_value(**np_inputs)

    # tensor inputs (None entries are passed through unchanged)
    tensor_inputs = {
        key: None if value is None else torch.from_numpy(value)
        for key, value in synthetic_bandit_feedback.items()
        if key in feedback_keys
    }
    tensor_inputs["action_dist"] = torch.from_numpy(random_action_dist)
    tensor_inputs["estimated_rewards_by_reg_model"] = torch.from_numpy(
        reg_model_output
    )
    # message expected from the "switch-dr" estimator's tensor entry point
    not_implemented_message = re.escape(
        "This is not implemented for Swtich-DR because it is indifferentiable."
    )
    # dr estimators require all arguments
    for estimator in dr_estimators:
        if estimator.estimator_name == "switch-dr":
            with pytest.raises(NotImplementedError, match=not_implemented_message):
                _ = estimator.estimate_policy_value_tensor(**tensor_inputs)
            continue
        estimated_policy_value = estimator.estimate_policy_value_tensor(
            **tensor_inputs
        )
        assert isinstance(
            estimated_policy_value, torch.Tensor
        ), f"invalid type response: {estimator}"

    # without the required keys: "switch-dr" is still expected to raise
    # NotImplementedError, every other estimator a TypeError
    for key in required_keys:
        del tensor_inputs[key]
    tensor_error_message = re.escape(
        "estimate_policy_value_tensor() missing 4 required positional arguments: 'reward', 'action', 'pscore', and 'estimated_rewards_by_reg_model'"
    )
    for estimator in dr_estimators:
        if estimator.estimator_name == "switch-dr":
            expected_error, expected_message = (
                NotImplementedError,
                not_implemented_message,
            )
        else:
            expected_error, expected_message = TypeError, tensor_error_message
        with pytest.raises(expected_error, match=expected_message):
            _ = estimator.estimate_policy_value_tensor(**tensor_inputs)