def random_action_dist(synthetic_bandit_feedback) -> np.ndarray:
    n_actions = synthetic_bandit_feedback["n_actions"]
    evaluation_policy = Random(n_actions=n_actions, len_list=1)
    action_dist = evaluation_policy.compute_batch_action_dist(
        n_rounds=synthetic_bandit_feedback["n_rounds"]
    )
    return action_dist
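# A minimal, self-contained sketch of how an action distribution like the one returned
# above can be fed to an off-policy estimator. This is illustrative only: the variable
# names (`dataset`, `feedback`) and the synthetic-data settings are assumptions, not
# part of the original script.
import numpy as np
from obp.dataset import SyntheticBanditDataset
from obp.ope import InverseProbabilityWeighting
from obp.policy import Random

dataset = SyntheticBanditDataset(n_actions=10, dim_context=5, random_state=12345)
feedback = dataset.obtain_batch_bandit_feedback(n_rounds=10000)
action_dist = Random(n_actions=feedback["n_actions"], len_list=1).compute_batch_action_dist(
    n_rounds=feedback["n_rounds"]
)
# IPW re-weights observed rewards by the ratio of evaluation to behavior policy probabilities
ipw = InverseProbabilityWeighting()
estimated_value = ipw.estimate_policy_value(
    reward=feedback["reward"],
    action=feedback["action"],
    pscore=feedback["pscore"],
    action_dist=action_dist,
)
print(estimated_value)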
        is_timeseries_split=is_timeseries_split,
    )

    # compute action distribution by evaluation policy
    if evaluation_policy == "bts":
        policy = BernoulliTS(
            n_actions=obd.n_actions,
            len_list=obd.len_list,
            is_zozotown_prior=True,  # replicate the policy in the ZOZOTOWN production
            campaign=campaign,
            random_state=random_state,
        )
    else:
        policy = Random(
            n_actions=obd.n_actions,
            len_list=obd.len_list,
            random_state=random_state,
        )
    action_dist_single_round = policy.compute_batch_action_dist(
        n_sim=n_sim_to_compute_action_dist
    )

    def process(b: int):
        # load the pre-trained regression model
        with open(reg_model_path / f"reg_model_{b}.pkl", "rb") as f:
            reg_model = pickle.load(f)
        with open(reg_model_path / f"reg_model_mrdr_{b}.pkl", "rb") as f:
            reg_model_mrdr = pickle.load(f)
        with open(reg_model_path / f"is_for_reg_model_{b}.pkl", "rb") as f:
            is_for_reg_model = pickle.load(f)
        # sample bootstrap samples from batch logged bandit feedback
        bandit_feedback = obd.sample_bootstrap_bandit_feedback(
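# A hedged, self-contained sketch of the step the process() helper above builds toward:
# running OPE on one bootstrapped replicate of the Open Bandit Dataset (the small sample
# shipped with obp when data_path is left as None). The estimator choice and the tiling
# of the single-round action distribution are illustrative assumptions, not the
# benchmark's exact configuration.
import numpy as np
from obp.dataset import OpenBanditDataset
from obp.ope import OffPolicyEvaluation, InverseProbabilityWeighting
from obp.policy import BernoulliTS

obd = OpenBanditDataset(behavior_policy="random", campaign="all")
bandit_feedback = obd.sample_bootstrap_bandit_feedback(random_state=12345)
policy = BernoulliTS(
    n_actions=obd.n_actions,
    len_list=obd.len_list,
    is_zozotown_prior=True,
    campaign="all",
    random_state=12345,
)
# one set of action choice probabilities, replicated for every round in the bootstrap sample
action_dist_single_round = policy.compute_batch_action_dist(n_sim=100000)
action_dist = np.tile(action_dist_single_round, (bandit_feedback["n_rounds"], 1, 1))
ope = OffPolicyEvaluation(
    bandit_feedback=bandit_feedback,
    ope_estimators=[InverseProbabilityWeighting()],
)
print(ope.estimate_policy_values(action_dist=action_dist))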
    # estimate the mean reward function of the train set of synthetic bandit feedback with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=random_state,
    )
    # define random evaluation policy
    random_policy = Random(n_actions=dataset.n_actions, random_state=random_state)
    # define evaluation policy using IPWLearner
    ipw_learner = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # define evaluation policy using NNPolicyLearner
    nn_policy_learner = NNPolicyLearner(
        n_actions=dataset.n_actions,
        dim_context=dim_context,
        off_policy_objective=ope_estimator_dict[ope_estimator].estimate_policy_value_tensor,
        hidden_layer_size=tuple(n_hidden for _ in range(n_layers)),
        activation=activation,
        solver=solver,
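# A hedged, self-contained sketch of the off-policy learning flow the block above sets up:
# train an IPWLearner on synthetic logged feedback and evaluate it with the cross-fitted
# regression model's reward predictions via the model-based (DM) estimator. The base
# models, data sizes, and estimator choice are illustrative assumptions.
from sklearn.linear_model import LogisticRegression
from obp.dataset import SyntheticBanditDataset
from obp.ope import DirectMethod, OffPolicyEvaluation, RegressionModel
from obp.policy import IPWLearner

dataset = SyntheticBanditDataset(n_actions=10, dim_context=5, random_state=12345)
bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=10000)
bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=10000)

# off-policy learning: IPWLearner reduces policy learning to weighted classification
ipw_learner = IPWLearner(n_actions=dataset.n_actions, base_classifier=LogisticRegression())
ipw_learner.fit(
    context=bandit_feedback_train["context"],
    action=bandit_feedback_train["action"],
    reward=bandit_feedback_train["reward"],
    pscore=bandit_feedback_train["pscore"],
)
action_dist_ipw = ipw_learner.predict(context=bandit_feedback_test["context"])

# cross-fitted reward regression feeds the model-based estimator
regression_model = RegressionModel(
    n_actions=dataset.n_actions,
    action_context=dataset.action_context,
    base_model=LogisticRegression(),
)
estimated_rewards = regression_model.fit_predict(
    context=bandit_feedback_test["context"],
    action=bandit_feedback_test["action"],
    reward=bandit_feedback_test["reward"],
    n_folds=3,
    random_state=12345,
)
ope = OffPolicyEvaluation(bandit_feedback=bandit_feedback_test, ope_estimators=[DirectMethod()])
print(ope.estimate_policy_values(
    action_dist=action_dist_ipw,
    estimated_rewards_by_reg_model=estimated_rewards,
))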
def main(cfg: DictConfig) -> None:
    print(cfg)
    logger.info(f"The current working directory is {Path().cwd()}")
    start_time = time.time()
    logger.info("initializing experimental condition..")

    # compared ope estimators
    lambdas = list(dict(cfg.estimator_hyperparams)["lambdas"])
    ope_estimators = [
        DoublyRobustWithShrinkage(lambda_=lam_, estimator_name=f"DRos ({lam_})")
        for lam_ in lambdas
    ] + [
        DoublyRobustWithShrinkageTuning(lambdas=lambdas, estimator_name="DRos (tuning)"),
    ]

    # configurations
    n_seeds = cfg.setting.n_seeds
    sample_size = cfg.setting.sample_size
    reg_model = cfg.setting.reg_model
    campaign = cfg.setting.campaign
    behavior_policy = cfg.setting.behavior_policy
    test_size = cfg.setting.test_size
    is_timeseries_split = cfg.setting.is_timeseries_split
    n_folds = cfg.setting.n_folds
    obd_path = (
        Path().cwd().parents[5] / "open_bandit_dataset"
        if cfg.setting.is_full_obd
        else None
    )
    random_state = cfg.setting.random_state
    np.random.seed(random_state)

    # define dataset
    dataset_ts = OpenBanditDataset(
        behavior_policy="bts", campaign=campaign, data_path=obd_path
    )
    dataset_ur = OpenBanditDataset(
        behavior_policy="random", campaign=campaign, data_path=obd_path
    )

    # prepare logged bandit feedback and evaluation policies
    if behavior_policy == "random":
        if is_timeseries_split:
            bandit_feedback_ur = dataset_ur.obtain_batch_bandit_feedback(
                test_size=test_size,
                is_timeseries_split=True,
            )[0]
        else:
            bandit_feedback_ur = dataset_ur.obtain_batch_bandit_feedback()
        bandit_feedbacks = [bandit_feedback_ur]
        # obtain the ground-truth policy value
        ground_truth_ts = OpenBanditDataset.calc_on_policy_policy_value_estimate(
            behavior_policy="bts",
            campaign=campaign,
            data_path=obd_path,
            test_size=test_size,
            is_timeseries_split=is_timeseries_split,
        )
        # obtain action choice probabilities and define evaluation policies
        policy_ts = BernoulliTS(
            n_actions=dataset_ts.n_actions,
            len_list=dataset_ts.len_list,
            random_state=random_state,
            is_zozotown_prior=True,
            campaign=campaign,
        )
        action_dist_ts = policy_ts.compute_batch_action_dist(n_rounds=1000000)
        evaluation_policies = [(ground_truth_ts, action_dist_ts)]
    else:
        if is_timeseries_split:
            bandit_feedback_ts = dataset_ts.obtain_batch_bandit_feedback(
                test_size=test_size,
                is_timeseries_split=True,
            )[0]
        else:
            bandit_feedback_ts = dataset_ts.obtain_batch_bandit_feedback()
        bandit_feedbacks = [bandit_feedback_ts]
        # obtain the ground-truth policy value
        ground_truth_ur = OpenBanditDataset.calc_on_policy_policy_value_estimate(
            behavior_policy="random",
            campaign=campaign,
            data_path=obd_path,
            test_size=test_size,
            is_timeseries_split=is_timeseries_split,
        )
        # obtain action choice probabilities and define evaluation policies
        policy_ur = Random(
            n_actions=dataset_ur.n_actions,
            len_list=dataset_ur.len_list,
            random_state=random_state,
        )
        action_dist_ur = policy_ur.compute_batch_action_dist(n_rounds=1000000)
        evaluation_policies = [(ground_truth_ur, action_dist_ur)]

    # regression models used in ope estimators
    hyperparams = dict(cfg.reg_model_hyperparams)[reg_model]
    regression_models = [reg_model_dict[reg_model](**hyperparams)]

    # define an evaluator class
    evaluator = InterpretableOPEEvaluator(
        random_states=np.arange(n_seeds),
        bandit_feedbacks=bandit_feedbacks,
        evaluation_policies=evaluation_policies,
        ope_estimators=ope_estimators,
        regression_models=regression_models,
    )

    # conduct an evaluation of OPE experiment
    logger.info("experiment started")
    _ = evaluator.estimate_policy_value(sample_size=sample_size, n_folds_=n_folds)

    # calculate statistics
    mean = evaluator.calculate_mean(root=True)
    mean_scaled = evaluator.calculate_mean(scale=True, root=True)

    # save results of the evaluation of off-policy estimators
    log_path = Path("./outputs/hypara")
    log_path.mkdir(exist_ok=True, parents=True)
    # save root mse
    root_mse_df = DataFrame()
    root_mse_df["estimator"] = list(mean.keys())
    root_mse_df["mean"] = list(mean.values())
    root_mse_df["mean(scaled)"] = list(mean_scaled.values())
    root_mse_df.to_csv(log_path / "root_mse.csv")
    # conduct pairwise t-tests
    se_df = DataFrame(evaluator.calculate_squared_error())
    se_df = DataFrame(se_df.stack()).reset_index(1)
    se_df.rename(columns={"level_1": "estimators", 0: "se"}, inplace=True)
    nonparam_ttests = (
        pg.pairwise_ttests(
            data=se_df,
            dv="se",
            parametric=False,
            between="estimators",
        )
        .round(4)
        .drop(["Contrast", "Parametric", "Paired"], axis=1)
    )
    nonparam_ttests.to_csv(log_path / "nonparam_ttests.csv")

    # print result
    print(root_mse_df)
    experiment = f"{campaign}-{behavior_policy}-{sample_size}"
    elapsed_time = np.round((time.time() - start_time) / 60, 2)
    logger.info(f"finish experiment {experiment} in {elapsed_time}min")
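# A hedged, self-contained sketch of the estimator family compared in main() above:
# DR with optimistic shrinkage (DRos) at several fixed lambdas plus its data-driven
# tuning variant, run on synthetic feedback instead of the Open Bandit Dataset. The
# lambda grid, base model, and random evaluation policy are illustrative assumptions,
# not the benchmark's configuration.
from sklearn.linear_model import LogisticRegression
from obp.dataset import SyntheticBanditDataset
from obp.ope import (
    DoublyRobustWithShrinkage,
    DoublyRobustWithShrinkageTuning,
    OffPolicyEvaluation,
    RegressionModel,
)
from obp.policy import Random

lambdas = [10.0, 100.0, 1000.0]
ope_estimators = [
    DoublyRobustWithShrinkage(lambda_=lam_, estimator_name=f"DRos ({lam_})") for lam_ in lambdas
] + [DoublyRobustWithShrinkageTuning(lambdas=lambdas, estimator_name="DRos (tuning)")]

dataset = SyntheticBanditDataset(n_actions=10, dim_context=5, random_state=12345)
bandit_feedback = dataset.obtain_batch_bandit_feedback(n_rounds=10000)
action_dist = Random(n_actions=dataset.n_actions).compute_batch_action_dist(
    n_rounds=bandit_feedback["n_rounds"]
)
estimated_rewards = RegressionModel(
    n_actions=dataset.n_actions,
    action_context=dataset.action_context,
    base_model=LogisticRegression(),
).fit_predict(
    context=bandit_feedback["context"],
    action=bandit_feedback["action"],
    reward=bandit_feedback["reward"],
    n_folds=3,
    random_state=12345,
)
ope = OffPolicyEvaluation(bandit_feedback=bandit_feedback, ope_estimators=ope_estimators)
print(ope.estimate_policy_values(
    action_dist=action_dist,
    estimated_rewards_by_reg_model=estimated_rewards,
))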