import numpy as np
import pandas as pd
import pytest
from typing import Dict

from obp.dataset import OpenBanditDataset


def test_real_init():
    # behavior_policy
    with pytest.raises(ValueError):
        OpenBanditDataset(behavior_policy="aaa", campaign="all")

    # campaign
    with pytest.raises(ValueError):
        OpenBanditDataset(behavior_policy="random", campaign="aaa")

    # data_path
    with pytest.raises(ValueError):
        OpenBanditDataset(behavior_policy="random", campaign="all", data_path=5)

    # load_raw_data
    obd = OpenBanditDataset(behavior_policy="random", campaign="all")
    # check that the loaded attributes exist and have the right types
    assert (
        isinstance(obd.data, pd.DataFrame)
        and isinstance(obd.item_context, pd.DataFrame)
        and isinstance(obd.action, np.ndarray)
        and isinstance(obd.position, np.ndarray)
        and isinstance(obd.reward, np.ndarray)
        and isinstance(obd.pscore, np.ndarray)
    )

    # pre_process (context and action_context)
    assert isinstance(obd.context, np.ndarray) and isinstance(
        obd.action_context, np.ndarray
    )
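
# Illustrative sketch (not part of the test suite): the attributes asserted
# above form the basic API surface of OpenBanditDataset. This assumes obp's
# small bundled sample data, which is what the class loads when no data_path
# is given; the shape comments restate what the tests above imply, not
# additional guarantees.
def _sketch_inspect_dataset():
    obd = OpenBanditDataset(behavior_policy="random", campaign="all")
    print(obd.n_rounds, obd.n_actions, obd.len_list)  # dataset dimensions
    print(obd.context.shape)  # (n_rounds, dim_context)
    print(obd.action_context.shape)  # one row of features per action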

def test_calc_on_policy_policy_value_estimate():
    ground_truth_policy_value = OpenBanditDataset.calc_on_policy_policy_value_estimate(
        behavior_policy="random", campaign="all"
    )
    assert isinstance(ground_truth_policy_value, float)
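
# Illustrative sketch (not part of the test suite): conceptually, the
# on-policy policy value estimate is the empirical mean of the rewards
# observed under the logging policy. Exact agreement with the class method is
# not asserted here, since the method may compute the mean on a specific data
# split.
def _sketch_on_policy_estimate():
    obd = OpenBanditDataset(behavior_policy="random", campaign="all")
    feedback = obd.obtain_batch_bandit_feedback()
    print(feedback["reward"].mean())  # empirical mean reward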

def test_obtain_batch_bandit_feedback():
    # invalid test_size
    with pytest.raises(ValueError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.obtain_batch_bandit_feedback(is_timeseries_split=True, test_size=1.3)

    with pytest.raises(ValueError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.obtain_batch_bandit_feedback(is_timeseries_split=True, test_size=-0.5)

    with pytest.raises(TypeError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.obtain_batch_bandit_feedback(is_timeseries_split=True, test_size="0.5")

    with pytest.raises(TypeError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.obtain_batch_bandit_feedback(is_timeseries_split="True", test_size=0.5)

    # existence of keys
    # is_timeseries_split=False (default)
    dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
    bandit_feedback = dataset.obtain_batch_bandit_feedback()
    bf_elems = {
        "n_rounds",
        "n_actions",
        "action",
        "position",
        "reward",
        "pscore",
        "context",
        "action_context",
    }
    assert all(k in bandit_feedback.keys() for k in bf_elems)

    # is_timeseries_split=True
    bandit_feedback_timeseries = dataset.obtain_batch_bandit_feedback(
        is_timeseries_split=True
    )
    assert isinstance(bandit_feedback_timeseries, tuple)
    bandit_feedback_train = bandit_feedback_timeseries[0]
    bandit_feedback_test = bandit_feedback_timeseries[1]
    assert all(k in bandit_feedback_train.keys() for k in bf_elems)
    assert all(k in bandit_feedback_test.keys() for k in bf_elems)
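
# Illustrative sketch (not part of the test suite): with
# is_timeseries_split=True the method returns a (train, test) pair, so it can
# be unpacked directly; the keys checked above are present in both folds.
def _sketch_timeseries_split():
    obd = OpenBanditDataset(behavior_policy="random", campaign="all")
    train_bf, test_bf = obd.obtain_batch_bandit_feedback(
        is_timeseries_split=True, test_size=0.3
    )
    print(train_bf["n_rounds"], test_bf["n_rounds"])  # sizes of the two folds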

def test_sample_bootstrap_bandit_feedback():
    with pytest.raises(ValueError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.sample_bootstrap_bandit_feedback(
            is_timeseries_split=True, test_size=1.3
        )

    with pytest.raises(ValueError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.sample_bootstrap_bandit_feedback(
            is_timeseries_split=True, test_size=-0.5
        )

    with pytest.raises(ValueError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.sample_bootstrap_bandit_feedback(sample_size=-50)

    with pytest.raises(TypeError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.sample_bootstrap_bandit_feedback(sample_size=50.0)

    with pytest.raises(ValueError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.sample_bootstrap_bandit_feedback(sample_size=10000000)

    dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
    bandit_feedback = dataset.obtain_batch_bandit_feedback()
    bootstrap_bf = dataset.sample_bootstrap_bandit_feedback()
    bf_keys = {"action", "position", "reward", "pscore", "context"}
    for k in bf_keys:
        assert len(bandit_feedback[k]) == len(bootstrap_bf[k])

    bandit_feedback_timeseries: Dict = dataset.obtain_batch_bandit_feedback(
        is_timeseries_split=True
    )[0]
    bootstrap_bf_timeseries = dataset.sample_bootstrap_bandit_feedback(
        is_timeseries_split=True
    )
    for k in bf_keys:
        assert len(bandit_feedback_timeseries[k]) == len(bootstrap_bf_timeseries[k])

    sample_size = 1000
    dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
    bootstrap_bf = dataset.sample_bootstrap_bandit_feedback(sample_size=sample_size)
    assert bootstrap_bf["n_rounds"] == sample_size
    for k in bf_keys:
        assert len(bootstrap_bf[k]) == sample_size
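
# Illustrative sketch (not obp's exact implementation): bootstrap sampling
# amounts to drawing round indices with replacement and re-indexing each
# round-wise array in the feedback dictionary, which is why the lengths
# compared above must match.
def _sketch_manual_bootstrap(seed: int = 0):
    obd = OpenBanditDataset(behavior_policy="random", campaign="all")
    feedback = obd.obtain_batch_bandit_feedback()
    rng = np.random.RandomState(seed)
    idx = rng.choice(feedback["n_rounds"], size=feedback["n_rounds"], replace=True)
    keys = ("action", "position", "reward", "pscore", "context")
    boot = {k: feedback[k][idx] for k in keys}
    assert len(boot["reward"]) == feedback["n_rounds"]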
help="campaign name, men, women, or all.", ) parser.add_argument("--random_state", type=int, default=12345) args = parser.parse_args() print(args) n_boot_samples = args.n_boot_samples counterfactual_policy = args.counterfactual_policy behavior_policy = args.behavior_policy campaign = args.campaign random_state = args.random_state np.random.seed(random_state) data_path = Path("../open_bandit_dataset") obd = OpenBanditDataset( behavior_policy=behavior_policy, campaign=campaign, data_path=data_path ) # hyparparameters for counterfactual policies kwargs = dict( n_actions=obd.n_actions, len_list=obd.len_list, random_state=random_state ) if counterfactual_policy == "bts": kwargs["alpha"] = production_prior_for_bts[campaign]["alpha"] kwargs["beta"] = production_prior_for_bts[campaign]["beta"] kwargs["batch_size"] = production_batch_size_for_bts[campaign] policy = counterfactual_policy_dict[counterfactual_policy](**kwargs) # compared OPE estimators ope_estimators = [DirectMethod(), InverseProbabilityWeighting(), DoublyRobust()] # a base ML model for regression model used in Direct Method and Doubly Robust base_model = CalibratedClassifierCV(LogisticRegression(**hyperparams)) # ground-truth policy value of a counterfactual policy

print(args)

# configurations
n_runs = args.n_runs
base_model = args.base_model
evaluation_policy = args.evaluation_policy
behavior_policy = args.behavior_policy
campaign = args.campaign
n_sim_to_compute_action_dist = args.n_sim_to_compute_action_dist
n_jobs = args.n_jobs
random_state = args.random_state
np.random.seed(random_state)
data_path = Path(".").resolve().parents[1] / "obd"

obd = OpenBanditDataset(
    behavior_policy=behavior_policy, campaign=campaign, data_path=data_path
)

# compute the action distribution of the evaluation policy
kwargs = dict(
    n_actions=obd.n_actions, len_list=obd.len_list, random_state=random_state
)
if evaluation_policy == "bts":
    kwargs["is_zozotown_prior"] = True
    kwargs["campaign"] = campaign
policy = evaluation_policy_dict[evaluation_policy](**kwargs)
action_dist_single_round = policy.compute_batch_action_dist(
    n_sim=n_sim_to_compute_action_dist
)

# ground-truth policy value of the evaluation policy,
# which is estimated with factual (observed) rewards (on-policy estimation)
ground_truth_policy_value = OpenBanditDataset.calc_on_policy_policy_value_estimate(
    behavior_policy=evaluation_policy,
    campaign=campaign,
    data_path=data_path,
)
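
# Illustrative sketch (an assumption about downstream use, not shown in this
# fragment): compute_batch_action_dist called with the default n_rounds=1
# yields an array of shape (1, n_actions, len_list), so repeating it along the
# first axis gives a fixed action distribution for every logged round:
#
# action_dist = np.tile(action_dist_single_round, (n_rounds, 1, 1))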

is_timeseries_split = args.is_timeseries_split
n_sim_to_compute_action_dist = args.n_sim_to_compute_action_dist
n_jobs = args.n_jobs
random_state = args.random_state
np.random.seed(random_state)
data_path = Path("../open_bandit_dataset")

# prepare path
log_path = (
    Path("./logs") / behavior_policy / campaign / "out_sample" / base_model
    if is_timeseries_split
    else Path("./logs") / behavior_policy / campaign / "in_sample" / base_model
)
reg_model_path = log_path / "trained_reg_models"
reg_model_path.mkdir(exist_ok=True, parents=True)

obd = OpenBanditDataset(
    behavior_policy=behavior_policy, campaign=campaign, data_path=data_path
)

# ground-truth policy value of an evaluation policy,
# which is estimated with factual (observed) rewards (on-policy estimation)
ground_truth_policy_value = OpenBanditDataset.calc_on_policy_policy_value_estimate(
    behavior_policy=evaluation_policy,
    campaign=campaign,
    data_path=data_path,
    test_size=test_size,
    is_timeseries_split=is_timeseries_split,
)

# compute the action distribution of the evaluation policy
if evaluation_policy == "bts":
    policy = BernoulliTS(
        n_actions=obd.n_actions,
        len_list=obd.len_list,
        is_zozotown_prior=True,  # replicate the policy in the ZOZOTOWN production
        campaign=campaign,
        random_state=random_state,
    )

campaign = args.campaign
n_sim_for_action_dist = args.n_sim_for_action_dist
test_size = args.test_size
is_timeseries_split = args.is_timeseries_split
n_sim_to_compute_action_dist = args.n_sim_to_compute_action_dist
random_state = args.random_state
data_path = Path("../open_bandit_dataset")

# prepare path
log_path = (
    Path("./logs") / behavior_policy / campaign / "out_sample" / base_model
    if is_timeseries_split
    else Path("./logs") / behavior_policy / campaign / "in_sample" / base_model
)
reg_model_path = log_path / "trained_reg_models"
reg_model_path.mkdir(exist_ok=True, parents=True)

obd = OpenBanditDataset(
    behavior_policy=behavior_policy, campaign=campaign, data_path=data_path
)

# ground-truth policy value of an evaluation policy,
# which is estimated with factual (observed) rewards (on-policy estimation)
ground_truth_policy_value = OpenBanditDataset.calc_on_policy_policy_value_estimate(
    behavior_policy=evaluation_policy,
    campaign=campaign,
    data_path=data_path,
    test_size=test_size,
    is_timeseries_split=is_timeseries_split,
)

start = time.time()
relative_ee = {
    est.estimator_name: np.zeros(n_boot_samples) for est in ope_estimators
}
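
# Illustrative sketch (an assumption about how relative_ee is filled in the
# omitted loop body): the benchmark's relative estimation error compares an
# estimator's value against the on-policy ground truth.
def _relative_estimation_error(estimated_value: float, ground_truth: float) -> float:
    # |V_hat - V| / V: absolute error scaled by the true policy value
    return abs(estimated_value - ground_truth) / ground_truth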

is_mrdr = args.is_mrdr
n_sim_to_compute_action_dist = args.n_sim_to_compute_action_dist
n_jobs = args.n_jobs
random_state = args.random_state
np.random.seed(random_state)
data_path = Path("../open_bandit_dataset")

# prepare path
log_path = (
    Path("./logs") / behavior_policy / campaign / "out_sample" / base_model
    if is_timeseries_split
    else Path("./logs") / behavior_policy / campaign / "in_sample" / base_model
)
reg_model_path = log_path / "trained_reg_models"
reg_model_path.mkdir(exist_ok=True, parents=True)

obd = OpenBanditDataset(
    behavior_policy=behavior_policy, campaign=campaign, data_path=data_path
)

# action distribution of the evaluation policy
# (the more robust doubly robust (MRDR) estimator needs evaluation policy information)
if is_mrdr:
    if behavior_policy == "random":
        policy = BernoulliTS(
            n_actions=obd.n_actions,
            len_list=obd.len_list,
            is_zozotown_prior=True,  # replicate the policy in the ZOZOTOWN production
            campaign=campaign,
            random_state=random_state,
        )
    else:
        policy = Random(
            n_actions=obd.n_actions,
            len_list=obd.len_list,
            random_state=random_state,
        )

def test_obtain_batch_bandit_feedback():
    # invalid test_size
    with pytest.raises(ValueError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.obtain_batch_bandit_feedback(is_timeseries_split=True, test_size=1.3)

    with pytest.raises(ValueError):
        dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
        dataset.obtain_batch_bandit_feedback(is_timeseries_split=True, test_size=-0.5)

    # existence of keys
    # is_timeseries_split=False (default)
    dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
    bandit_feedback = dataset.obtain_batch_bandit_feedback()
    for key in (
        "n_rounds",
        "n_actions",
        "action",
        "position",
        "reward",
        "pscore",
        "context",
        "action_context",
    ):
        assert key in bandit_feedback.keys()

    # is_timeseries_split=True
    dataset2 = OpenBanditDataset(behavior_policy="random", campaign="all")
    bandit_feedback2 = dataset2.obtain_batch_bandit_feedback(is_timeseries_split=True)
    for key in (
        "n_rounds",
        "n_actions",
        "action",
        "action_test",
        "position",
        "position_test",
        "reward",
        "reward_test",
        "pscore",
        "pscore_test",
        "context",
        "context_test",
        "action_context",
    ):
        assert key in bandit_feedback2.keys()

def main(cfg: DictConfig) -> None:
    print(cfg)
    logger.info(f"The current working directory is {Path().cwd()}")
    start_time = time.time()
    logger.info("initializing experimental condition..")

    # compared OPE estimators
    lambdas = list(dict(cfg.estimator_hyperparams)["lambdas"])
    ope_estimators = [
        DoublyRobustWithShrinkage(lambda_=lam_, estimator_name=f"DRos ({lam_})")
        for lam_ in lambdas
    ] + [
        DoublyRobustWithShrinkageTuning(
            lambdas=lambdas, estimator_name="DRos (tuning)"
        ),
    ]

    # configurations
    n_seeds = cfg.setting.n_seeds
    sample_size = cfg.setting.sample_size
    reg_model = cfg.setting.reg_model
    campaign = cfg.setting.campaign
    behavior_policy = cfg.setting.behavior_policy
    test_size = cfg.setting.test_size
    is_timeseries_split = cfg.setting.is_timeseries_split
    n_folds = cfg.setting.n_folds
    obd_path = (
        Path().cwd().parents[5] / "open_bandit_dataset"
        if cfg.setting.is_full_obd
        else None
    )
    random_state = cfg.setting.random_state
    np.random.seed(random_state)

    # define dataset
    dataset_ts = OpenBanditDataset(
        behavior_policy="bts", campaign=campaign, data_path=obd_path
    )
    dataset_ur = OpenBanditDataset(
        behavior_policy="random", campaign=campaign, data_path=obd_path
    )

    # prepare logged bandit feedback and evaluation policies
    if behavior_policy == "random":
        if is_timeseries_split:
            bandit_feedback_ur = dataset_ur.obtain_batch_bandit_feedback(
                test_size=test_size,
                is_timeseries_split=True,
            )[0]
        else:
            bandit_feedback_ur = dataset_ur.obtain_batch_bandit_feedback()
        bandit_feedbacks = [bandit_feedback_ur]
        # obtain the ground-truth policy value
        ground_truth_ts = OpenBanditDataset.calc_on_policy_policy_value_estimate(
            behavior_policy="bts",
            campaign=campaign,
            data_path=obd_path,
            test_size=test_size,
            is_timeseries_split=is_timeseries_split,
        )
        # obtain action choice probabilities and define evaluation policies
        policy_ts = BernoulliTS(
            n_actions=dataset_ts.n_actions,
            len_list=dataset_ts.len_list,
            random_state=random_state,
            is_zozotown_prior=True,
            campaign=campaign,
        )
        action_dist_ts = policy_ts.compute_batch_action_dist(n_rounds=1000000)
        evaluation_policies = [(ground_truth_ts, action_dist_ts)]
    else:
        if is_timeseries_split:
            bandit_feedback_ts = dataset_ts.obtain_batch_bandit_feedback(
                test_size=test_size,
                is_timeseries_split=True,
            )[0]
        else:
            bandit_feedback_ts = dataset_ts.obtain_batch_bandit_feedback()
        bandit_feedbacks = [bandit_feedback_ts]
        # obtain the ground-truth policy value
        ground_truth_ur = OpenBanditDataset.calc_on_policy_policy_value_estimate(
            behavior_policy="random",
            campaign=campaign,
            data_path=obd_path,
            test_size=test_size,
            is_timeseries_split=is_timeseries_split,
        )
        # obtain action choice probabilities and define evaluation policies
        policy_ur = Random(
            n_actions=dataset_ur.n_actions,
            len_list=dataset_ur.len_list,
            random_state=random_state,
        )
        action_dist_ur = policy_ur.compute_batch_action_dist(n_rounds=1000000)
        evaluation_policies = [(ground_truth_ur, action_dist_ur)]

    # regression models used in OPE estimators
    hyperparams = dict(cfg.reg_model_hyperparams)[reg_model]
    regression_models = [reg_model_dict[reg_model](**hyperparams)]

    # define an evaluator class
    evaluator = InterpretableOPEEvaluator(
        random_states=np.arange(n_seeds),
        bandit_feedbacks=bandit_feedbacks,
        evaluation_policies=evaluation_policies,
        ope_estimators=ope_estimators,
        regression_models=regression_models,
    )

    # conduct an evaluation of OPE experiment
    logger.info("experiment started")
    _ = evaluator.estimate_policy_value(sample_size=sample_size, n_folds_=n_folds)

    # calculate statistics
    mean = evaluator.calculate_mean(root=True)
    mean_scaled = evaluator.calculate_mean(scale=True, root=True)

    # save results of the evaluation of off-policy estimators
    log_path = Path("./outputs/hypara")
    log_path.mkdir(exist_ok=True, parents=True)
    # save root mse
    root_mse_df = DataFrame()
    root_mse_df["estimator"] = list(mean.keys())
    root_mse_df["mean"] = list(mean.values())
    root_mse_df["mean(scaled)"] = list(mean_scaled.values())
    root_mse_df.to_csv(log_path / "root_mse.csv")
    # conduct pairwise t-tests
    se_df = DataFrame(evaluator.calculate_squared_error())
    se_df = DataFrame(se_df.stack()).reset_index(1)
    se_df.rename(columns={"level_1": "estimators", 0: "se"}, inplace=True)
    nonparam_ttests = (
        pg.pairwise_ttests(
            data=se_df,
            dv="se",
            parametric=False,
            between="estimators",
        )
        .round(4)
        .drop(["Contrast", "Parametric", "Paired"], axis=1)
    )
    nonparam_ttests.to_csv(log_path / "nonparam_ttests.csv")

    # print result
    print(root_mse_df)
    experiment = f"{campaign}-{behavior_policy}-{sample_size}"
    elapsed_time = np.round((time.time() - start_time) / 60, 2)
    logger.info(f"finish experiment {experiment} in {elapsed_time}min")

base_model = args.base_model
behavior_policy = args.behavior_policy
campaign = args.campaign
test_size = args.test_size
is_timeseries_split = args.is_timeseries_split
random_state = args.random_state
data_path = Path("../open_bandit_dataset")

# prepare path
log_path = (
    Path("./logs") / behavior_policy / campaign / "out_sample" / base_model
    if is_timeseries_split
    else Path("./logs") / behavior_policy / campaign / "in_sample" / base_model
)
reg_model_path = log_path / "trained_reg_models"
reg_model_path.mkdir(exist_ok=True, parents=True)

obd = OpenBanditDataset(
    behavior_policy=behavior_policy, campaign=campaign, data_path=data_path
)

start_time = time.time()
performance_of_reg_model = {
    metrics[i]: np.zeros(n_boot_samples) for i in np.arange(len(metrics))
}
for b in np.arange(n_boot_samples):
    # sample bootstrap samples from batch logged bandit feedback
    boot_bandit_feedback = obd.sample_bootstrap_bandit_feedback(
        test_size=test_size,
        is_timeseries_split=is_timeseries_split,
        random_state=b,
    )
    # split data into two folds (data for training reg_model and for ope)
    is_for_reg_model = np.random.binomial(
        n=1, p=0.3, size=boot_bandit_feedback["n_rounds"]
    ).astype(bool)
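
# Illustrative sketch (a hypothetical helper, not part of the script): the
# is_for_reg_model mask above splits the bootstrap sample round-wise, roughly
# 30% for fitting the regression model and 70% for OPE.
def _sketch_split_feedback(feedback, mask):
    keys = ("action", "position", "reward", "pscore", "context")
    reg_fold = {k: feedback[k][mask] for k in keys}  # for training reg_model
    ope_fold = {k: feedback[k][~mask] for k in keys}  # for off-policy evaluation
    return reg_fold, ope_fold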