def test_dr_init_using_valid_input_data(lambda_: float, description: str) -> None:
    _ = DoublyRobust(lambda_=lambda_)
    _ = DoublyRobustWithShrinkage(lambda_=lambda_)
    _ = SwitchDoublyRobust(lambda_=lambda_)
    if lambda_ < np.inf:
        _ = SubGaussianDoublyRobust(lambda_=lambda_)
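# A minimal sketch of how the test above is typically parametrized with pytest;
# the list name and cases below are illustrative assumptions, not the repo's
# actual fixtures.
valid_input_of_dr_init = [
    (0.5, "lambda_=0.5"),
    (1.0, "lambda_=1.0"),
    (np.inf, "lambda_=np.inf"),
]
# applied as:
# @pytest.mark.parametrize("lambda_, description", valid_input_of_dr_init)
# def test_dr_init_using_valid_input_data(lambda_: float, description: str) -> None: ...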
def test_dr_init_using_invalid_inputs(
    lambda_,
    use_estimated_pscore,
    err,
    description,
):
    with pytest.raises(err, match=f"{description}*"):
        _ = DoublyRobust(lambda_=lambda_, use_estimated_pscore=use_estimated_pscore)
    with pytest.raises(err, match=f"{description}*"):
        _ = SwitchDoublyRobust(
            lambda_=lambda_, use_estimated_pscore=use_estimated_pscore
        )
    with pytest.raises(err, match=f"{description}*"):
        _ = DoublyRobustWithShrinkage(
            lambda_=lambda_, use_estimated_pscore=use_estimated_pscore
        )
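# Illustrative invalid cases for the test above; the exact error types and
# match patterns are assumptions here and depend on obp's input validation.
invalid_input_of_dr_init = [
    ("", False, TypeError, "lambda_ must be an instance of"),
    (-1.0, False, ValueError, "lambda_ == -1.0, must be >= 0"),
    (np.nan, False, ValueError, "lambda_ must not be nan"),
    (1.0, "True", TypeError, "use_estimated_pscore must be a bool"),
]
# applied as:
# @pytest.mark.parametrize(
#     "lambda_, use_estimated_pscore, err, description", invalid_input_of_dr_init
# )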
def test_dr_using_invalid_input_data(
    action_dist: np.ndarray,
    action: np.ndarray,
    reward: np.ndarray,
    pscore: np.ndarray,
    position: np.ndarray,
    estimated_rewards_by_reg_model: np.ndarray,
    use_estimated_pscore: bool,
    estimated_pscore: np.ndarray,
    description: str,
) -> None:
    dr = DoublyRobust(use_estimated_pscore=use_estimated_pscore)
    dr_tuning = DoublyRobustTuning(
        lambdas=[1, 100],
        estimator_name="dr_tuning",
        use_estimated_pscore=use_estimated_pscore,
    )
    sndr = SelfNormalizedDoublyRobust(use_estimated_pscore=use_estimated_pscore)
    # both estimate_policy_value and estimate_interval of all estimators raise ValueError
    for estimator in [dr, sndr, dr_tuning]:
        with pytest.raises(ValueError, match=f"{description}*"):
            _ = estimator.estimate_policy_value(
                action_dist=action_dist,
                action=action,
                reward=reward,
                pscore=pscore,
                position=position,
                estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
                estimated_pscore=estimated_pscore,
            )
        with pytest.raises(ValueError, match=f"{description}*"):
            _ = estimator.estimate_interval(
                action_dist=action_dist,
                action=action,
                reward=reward,
                pscore=pscore,
                position=position,
                estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
                estimated_pscore=estimated_pscore,
            )
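# For contrast with the invalid-input cases, a minimal well-formed call,
# assuming obp's expected shapes: action_dist is (n_rounds, n_actions, len_list).
# The sizes and random inputs below are illustrative only.
def example_valid_dr_call() -> float:
    rng = np.random.default_rng(12345)
    n_rounds, n_actions, len_list = 5, 4, 3
    return DoublyRobust().estimate_policy_value(
        action_dist=np.full((n_rounds, n_actions, len_list), 1.0 / n_actions),
        action=rng.integers(n_actions, size=n_rounds),
        reward=rng.binomial(1, 0.5, size=n_rounds),
        pscore=np.full(n_rounds, 1.0 / n_actions),
        position=rng.integers(len_list, size=n_rounds),
        estimated_rewards_by_reg_model=rng.uniform(
            size=(n_rounds, n_actions, len_list)
        ),
    )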
# hyperparameters for the regression model used in model-dependent OPE estimators;
# each key of base_model_dict maps to kwargs for the corresponding sklearn class
with open("./conf/hyperparams.yaml", "rb") as f:
    hyperparams = yaml.safe_load(f)

base_model_dict = dict(
    logistic_regression=LogisticRegression,
    lightgbm=HistGradientBoostingClassifier,
    random_forest=RandomForestClassifier,
)

# compared OPE estimators
ope_estimators = [
    DirectMethod(),
    InverseProbabilityWeighting(),
    SelfNormalizedInverseProbabilityWeighting(),
    DoublyRobust(),
    SelfNormalizedDoublyRobust(),
    SwitchDoublyRobust(tau=1.0, estimator_name="switch-dr (tau=1)"),
    SwitchDoublyRobust(tau=100.0, estimator_name="switch-dr (tau=100)"),
    DoublyRobustWithShrinkage(lambda_=1.0, estimator_name="dr-os (lambda=1)"),
    DoublyRobustWithShrinkage(lambda_=100.0, estimator_name="dr-os (lambda=100)"),
]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="evaluate off-policy estimators with synthetic bandit data."
    )
    parser.add_argument(
        "--n_runs", type=int, default=1, help="number of simulations in the experiment."
    )
    parser.add_argument(
data_path = Path("../open_bandit_dataset")
obd = OpenBanditDataset(
    behavior_policy=behavior_policy, campaign=campaign, data_path=data_path
)
# hyperparameters for counterfactual policies
kwargs = dict(
    n_actions=obd.n_actions, len_list=obd.len_list, random_state=random_state
)
if counterfactual_policy == "bts":
    kwargs["alpha"] = production_prior_for_bts[campaign]["alpha"]
    kwargs["beta"] = production_prior_for_bts[campaign]["beta"]
    kwargs["batch_size"] = production_batch_size_for_bts[campaign]
policy = counterfactual_policy_dict[counterfactual_policy](**kwargs)
# compared OPE estimators
ope_estimators = [DirectMethod(), InverseProbabilityWeighting(), DoublyRobust()]
# a base ML model for the regression model used in Direct Method and Doubly Robust
base_model = CalibratedClassifierCV(LogisticRegression(**hyperparams))
# ground-truth policy value of the counterfactual policy,
# estimated with factual (observed) rewards (on-policy estimation)
ground_truth_policy_value = OpenBanditDataset.calc_on_policy_policy_value_estimate(
    behavior_policy=counterfactual_policy, campaign=campaign, data_path=data_path
)
evaluation_of_ope_results = {
    est.estimator_name: np.zeros(n_boot_samples) for est in ope_estimators
}
for b in np.arange(n_boot_samples):
    # sample a bootstrapped version of the batch logged bandit feedback
    boot_bandit_feedback = obd.sample_bootstrap_bandit_feedback(random_state=b)
    # run a counterfactual bandit algorithm on the logged bandit feedback data
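    # (illustrative sketch, not the repo's exact continuation) the loop body
    # would then simulate the policy on this bootstrap sample and evaluate the
    # estimators against the on-policy ground truth; `run_bandit_simulation`,
    # `regression_model`, and `estimated_rewards_by_reg_model` are assumed here:
    #
    # action_dist = run_bandit_simulation(
    #     bandit_feedback=boot_bandit_feedback, policy=policy
    # )
    # ope = OffPolicyEvaluation(
    #     bandit_feedback=boot_bandit_feedback, ope_estimators=ope_estimators
    # )
    # relative_ee_b = ope.evaluate_performance_of_estimators(
    #     ground_truth_policy_value=ground_truth_policy_value,
    #     action_dist=action_dist,
    #     estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    #     metric="relative-ee",
    # )
    # for est_name, relative_ee in relative_ee_b.items():
    #     evaluation_of_ope_results[est_name][b] = relative_ee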
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # define an ML regression model to estimate the mean reward function
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    ope_estimator = DoublyRobust()
    # define evaluation policy using NNPolicyLearner
    nn_policy = NNPolicyLearner(
        n_actions=dataset.n_actions,
        dim_context=dim_context,
        off_policy_objective=ope_estimator.estimate_policy_value_tensor,
    )
    # baseline method 1: RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2: UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # estimate the mean reward function of the training set with the ML model
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=12345,
    )
    # train the evaluation policies on the training set
    nn_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    uniform_sample_weight_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set
    nn_policy_action_dist = nn_policy.predict(
        context=bandit_feedback_test["context"],
    )
    random_action_dist = random_policy.predict(
        context=bandit_feedback_test["context"],
    )
    uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # compute the ground-truth policy value of each learner
    gt_nn_policy_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=nn_policy_action_dist,
    )
    gt_random_policy = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=uniform_sample_weight_action_dist,
    )
    return gt_nn_policy_learner, gt_random_policy, gt_uniform_sample_weight_learner
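# A sketch of driving `process` across seeds in parallel with joblib; `n_runs`
# and `n_jobs` are assumed to come from this script's CLI arguments.
from joblib import Parallel, delayed

processed = Parallel(n_jobs=n_jobs, verbose=50)(
    [delayed(process)(i) for i in np.arange(n_runs)]
)
# per-seed ground-truth policy values of the three evaluated policies
gt_nn, gt_random, gt_uniform = map(np.asarray, zip(*processed))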
import pytest
import numpy as np

from obp.types import BanditFeedback
from obp.ope import (
    DirectMethod,
    DoublyRobust,
    DoublyRobustWithShrinkage,
    SwitchDoublyRobust,
    SelfNormalizedDoublyRobust,
)
from conftest import generate_action_dist

# prepare instances
dm = DirectMethod()
dr = DoublyRobust()
dr_shrink_0 = DoublyRobustWithShrinkage(lambda_=0.0)
dr_shrink_max = DoublyRobustWithShrinkage(lambda_=1e10)
sndr = SelfNormalizedDoublyRobust()
switch_dr_0 = SwitchDoublyRobust(tau=0.0)
switch_dr_max = SwitchDoublyRobust(tau=1e10)
dr_estimators = [dr, dr_shrink_0, sndr, switch_dr_0]  # DR variants, excluding the max-hyperparameter instances

# action_dist, action, reward, pscore, position, estimated_rewards_by_reg_model, description
invalid_input_of_dr = [
    (
        generate_action_dist(5, 4, 3),
        None,
        np.zeros(5, dtype=int),
        lambdas=lambdas,
        tuning_method=tuning_method,
    )
    _ = SwitchDoublyRobustTuning(
        lambdas=lambdas,
        tuning_method=tuning_method,
    )
    _ = SubGaussianDoublyRobustTuning(
        lambdas=lambdas,
        tuning_method=tuning_method,
    )


# prepare instances
dm = DirectMethod()
dr = DoublyRobust()
dr_tuning_mse = DoublyRobustTuning(
    lambdas=[1, 100], tuning_method="mse", estimator_name="dr_tuning_mse"
)
dr_tuning_slope = DoublyRobustTuning(
    lambdas=[1, 100], tuning_method="slope", estimator_name="dr_tuning_slope"
)
dr_os_0 = DoublyRobustWithShrinkage(lambda_=0.0)
dr_os_tuning_mse = DoublyRobustWithShrinkageTuning(
    lambdas=[1, 100], tuning_method="mse", estimator_name="dr_os_tuning_mse"
)
dr_os_tuning_slope = DoublyRobustWithShrinkageTuning(
    lambdas=[1, 100], tuning_method="slope", estimator_name="dr_os_tuning_slope"
)
dr_os_max = DoublyRobustWithShrinkage(lambda_=np.inf)
sndr = SelfNormalizedDoublyRobust()
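# A sketch showing that tuned and fixed-lambda estimators share the same
# estimation API (the tuned classes search their `lambdas` list internally);
# the shapes and random inputs below are illustrative assumptions.
def example_tuned_vs_fixed() -> tuple:
    rng = np.random.default_rng(12345)
    n_rounds, n_actions, len_list = 1000, 4, 1
    inputs = dict(
        action_dist=np.full((n_rounds, n_actions, len_list), 1.0 / n_actions),
        action=rng.integers(n_actions, size=n_rounds),
        reward=rng.binomial(1, 0.5, size=n_rounds),
        pscore=np.full(n_rounds, 1.0 / n_actions),
        position=np.zeros(n_rounds, dtype=int),
        estimated_rewards_by_reg_model=rng.uniform(
            size=(n_rounds, n_actions, len_list)
        ),
    )
    value_fixed = dr_os_0.estimate_policy_value(**inputs)
    value_tuned = dr_os_tuning_mse.estimate_policy_value(**inputs)
    return value_fixed, value_tuned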
    OffPolicyEvaluation,
    InverseProbabilityWeighting,
    SelfNormalizedInverseProbabilityWeighting,
    DirectMethod,
    DoublyRobust,
    SelfNormalizedDoublyRobust,
    SwitchDoublyRobust,
    DoublyRobustWithShrinkage,
)

# OPE estimators compared
ope_estimators = [
    DirectMethod(),
    InverseProbabilityWeighting(),
    SelfNormalizedInverseProbabilityWeighting(),
    DoublyRobust(),
    DoublyRobust(estimator_name="mrdr"),
    SelfNormalizedDoublyRobust(),
    SwitchDoublyRobust(tau=5.0, estimator_name="switch-dr (tau=5)"),
    SwitchDoublyRobust(tau=10.0, estimator_name="switch-dr (tau=10)"),
    SwitchDoublyRobust(tau=50.0, estimator_name="switch-dr (tau=50)"),
    SwitchDoublyRobust(tau=100.0, estimator_name="switch-dr (tau=100)"),
    SwitchDoublyRobust(tau=500.0, estimator_name="switch-dr (tau=500)"),
    SwitchDoublyRobust(tau=1000.0, estimator_name="switch-dr (tau=1000)"),
    DoublyRobustWithShrinkage(lambda_=5.0, estimator_name="dr-os (lambda=5)"),
    DoublyRobustWithShrinkage(lambda_=10.0, estimator_name="dr-os (lambda=10)"),
    DoublyRobustWithShrinkage(lambda_=50.0, estimator_name="dr-os (lambda=50)"),
    DoublyRobustWithShrinkage(lambda_=100.0, estimator_name="dr-os (lambda=100)"),
def main(cfg: DictConfig) -> None:
    print(cfg)
    logger.info(f"The current working directory is {Path().cwd()}")
    start_time = time.time()
    logger.info("initializing experimental condition..")

    # compared OPE estimators
    lambdas = list(dict(cfg.estimator_hyperparams)["lambdas"])
    ope_estimators = [
        InverseProbabilityWeighting(estimator_name="IPW"),
        SelfNormalizedInverseProbabilityWeighting(estimator_name="SNIPW"),
        DirectMethod(estimator_name="DM"),
        DoublyRobust(estimator_name="DR"),
        SelfNormalizedDoublyRobust(estimator_name="SNDR"),
        SwitchDoublyRobustTuning(lambdas=lambdas, estimator_name="Switch-DR"),
        DoublyRobustWithShrinkageTuning(lambdas=lambdas, estimator_name="DRos"),
    ]

    # configurations
    n_seeds = cfg.setting.n_seeds
    sample_size = cfg.setting.sample_size
    reg_model = cfg.setting.reg_model
    campaign = cfg.setting.campaign
    behavior_policy = cfg.setting.behavior_policy
    test_size = cfg.setting.test_size
    is_timeseries_split = cfg.setting.is_timeseries_split
    n_folds = cfg.setting.n_folds
    obd_path = (
        Path().cwd().parents[5] / "open_bandit_dataset"
        if cfg.setting.is_full_obd
        else None
    )
    random_state = cfg.setting.random_state
    np.random.seed(random_state)

    # define dataset
    dataset_ts = OpenBanditDataset(
        behavior_policy="bts", campaign=campaign, data_path=obd_path
    )
    dataset_ur = OpenBanditDataset(
        behavior_policy="random", campaign=campaign, data_path=obd_path
    )

    # prepare logged bandit feedback and evaluation policies
    if behavior_policy == "random":
        if is_timeseries_split:
            bandit_feedback_ur = dataset_ur.obtain_batch_bandit_feedback(
                test_size=test_size,
                is_timeseries_split=True,
            )[0]
        else:
            bandit_feedback_ur = dataset_ur.obtain_batch_bandit_feedback()
        bandit_feedbacks = [bandit_feedback_ur]
        # obtain the ground-truth policy value
        ground_truth_ts = OpenBanditDataset.calc_on_policy_policy_value_estimate(
            behavior_policy="bts",
            campaign=campaign,
            data_path=obd_path,
            test_size=test_size,
            is_timeseries_split=is_timeseries_split,
        )
        # obtain action choice probabilities and define evaluation policies
        policy_ts = BernoulliTS(
            n_actions=dataset_ts.n_actions,
            len_list=dataset_ts.len_list,
            random_state=random_state,
            is_zozotown_prior=True,
            campaign=campaign,
        )
        action_dist_ts = policy_ts.compute_batch_action_dist(n_rounds=1000000)
        evaluation_policies = [(ground_truth_ts, action_dist_ts)]
    else:
        if is_timeseries_split:
            bandit_feedback_ts = dataset_ts.obtain_batch_bandit_feedback(
                test_size=test_size,
                is_timeseries_split=True,
            )[0]
        else:
            bandit_feedback_ts = dataset_ts.obtain_batch_bandit_feedback()
        bandit_feedbacks = [bandit_feedback_ts]
        # obtain the ground-truth policy value
        ground_truth_ur = OpenBanditDataset.calc_on_policy_policy_value_estimate(
            behavior_policy="random",
            campaign=campaign,
            data_path=obd_path,
            test_size=test_size,
            is_timeseries_split=is_timeseries_split,
        )
        # obtain action choice probabilities and define evaluation policies
        policy_ur = Random(
            n_actions=dataset_ur.n_actions,
            len_list=dataset_ur.len_list,
            random_state=random_state,
        )
        action_dist_ur = policy_ur.compute_batch_action_dist(n_rounds=1000000)
        evaluation_policies = [(ground_truth_ur, action_dist_ur)]

    # regression models used in OPE estimators
    hyperparams = dict(cfg.reg_model_hyperparams)[reg_model]
    regression_models = [reg_model_dict[reg_model](**hyperparams)]

    # define an evaluator class
    evaluator = InterpretableOPEEvaluator(
        random_states=np.arange(n_seeds),
        bandit_feedbacks=bandit_feedbacks,
        evaluation_policies=evaluation_policies,
        ope_estimators=ope_estimators,
        regression_models=regression_models,
    )

    # conduct an evaluation of OPE experiment
logger.info("experiment started") _ = evaluator.estimate_policy_value(sample_size=sample_size, n_folds_=n_folds) # calculate statistics mean = evaluator.calculate_mean(root=True) mean_scaled = evaluator.calculate_mean(scale=True, root=True) # save results of the evaluation of off-policy estimators log_path = Path("./outputs") log_path.mkdir(exist_ok=True, parents=True) # save root mse root_mse_df = DataFrame() root_mse_df["estimator"] = list(mean.keys()) root_mse_df["mean"] = list(mean.values()) root_mse_df["mean(scaled)"] = list(mean_scaled.values()) root_mse_df.to_csv(log_path / "root_mse.csv") # conduct pairwise t-tests se_df = DataFrame(evaluator.calculate_squared_error()) se_df = DataFrame(se_df.stack()).reset_index(1) se_df.rename(columns={"level_1": "estimators", 0: "se"}, inplace=True) nonparam_ttests = (pg.pairwise_ttests( data=se_df, dv="se", parametric=False, between="estimators", ).round(4).drop(["Contrast", "Parametric", "Paired"], axis=1)) nonparam_ttests.to_csv(log_path / "nonparam_ttests.csv") # save reg model metrics DataFrame(evaluator.reg_model_metrics).describe().to_csv( log_path / "reg_model_metrics.csv") # print result print(root_mse_df) experiment = f"{campaign}-{behavior_policy}-{sample_size}" elapsed_time = np.round((time.time() - start_time) / 60, 2) logger.info(f"finish experiment {experiment} in {elapsed_time}min")