def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # define evaluation policy using IPWLearner
    evaluation_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    evaluation_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    action_dist = evaluation_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # estimate the mean reward function of the test set of synthetic bandit feedback with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_test["context"],
        action=bandit_feedback_test["action"],
        reward=bandit_feedback_test["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=random_state,
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback_test,
        ope_estimators=ope_estimators,
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=action_dist,
        ),
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )

    return relative_ee_i
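
# A minimal sketch of a driver for process() above; it assumes joblib is available
# and that `n_runs` and `n_jobs` are defined alongside the other module-level
# settings (n_actions, ope_estimators, ...). This is an illustration, not part of
# the function itself.
from joblib import Parallel, delayed

processed = Parallel(n_jobs=n_jobs, verbose=50)(
    delayed(process)(i) for i in np.arange(n_runs)
)
# collect the relative-ee of each estimator over the independent runs
relative_ee_dict = {est.estimator_name: [] for est in ope_estimators}
for relative_ee_i in processed:
    for estimator_name, relative_ee_ in relative_ee_i.items():
        relative_ee_dict[estimator_name].append(relative_ee_)
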
def process(b: int):
    # sample bootstrap from batch logged bandit feedback
    bandit_feedback = obd.sample_bootstrap_bandit_feedback(random_state=b)
    # estimate the mean reward function with an ML model
    regression_model = RegressionModel(
        n_actions=obd.n_actions,
        len_list=obd.len_list,
        action_context=obd.action_context,
        base_model=base_model_dict[base_model](**hyperparams[base_model]),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback["context"],
        action=bandit_feedback["action"],
        reward=bandit_feedback["reward"],
        position=bandit_feedback["position"],
        pscore=bandit_feedback["pscore"],
        n_folds=3,  # 3-fold cross-fitting
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        ope_estimators=ope_estimators,
    )
    action_dist = np.tile(
        action_dist_single_round, (bandit_feedback["n_rounds"], 1, 1)
    )
    relative_ee_b = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )

    return relative_ee_b
def process(i: int):
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    evaluation_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    action_dist = evaluation_policy.predict_proba(
        context=bandit_feedback_test["context"],
        tau=0.1,  # temperature hyperparameter
    )
    # estimate the ground-truth policy value of the evaluation policy
    # using the full expected reward contained in the test set of synthetic bandit feedback
    ground_truth_policy_value = np.average(
        bandit_feedback_test["expected_reward"],
        weights=action_dist[:, :, 0],
        axis=1,
    ).mean()
    # estimate the mean reward function of the test set of synthetic bandit feedback with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        len_list=dataset.len_list,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_test["context"],
        action=bandit_feedback_test["action"],
        reward=bandit_feedback_test["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=random_state,
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback_test,
        ope_estimators=ope_estimators,
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )

    return relative_ee_i
def test_fitting_regression_models_using_invalid_input_data(
    context: np.ndarray,
    action: np.ndarray,
    reward: np.ndarray,
    pscore: np.ndarray,
    position: np.ndarray,
    action_context: np.ndarray,
    n_actions: int,
    len_list: int,
    fitting_method: str,
    base_model: BaseEstimator,
    action_dist: np.ndarray,
    n_folds: int,
    random_state: int,
    err,
    description: str,
) -> None:
    # fit_predict function raises ValueError
    with pytest.raises(err, match=f"{description}*"):
        regression_model = RegressionModel(
            n_actions=n_actions,
            len_list=len_list,
            action_context=action_context,
            base_model=base_model,
            fitting_method=fitting_method,
        )
        if fitting_method == "normal":
            # train regression model on logged bandit feedback data
            _ = regression_model.fit_predict(
                context=context,
                action=action,
                reward=reward,
                position=position,
                n_folds=n_folds,
                random_state=random_state,
            )
        else:
            # train regression model on logged bandit feedback data
            _ = regression_model.fit_predict(
                context=context,
                action=action,
                reward=reward,
                pscore=pscore,
                position=position,
                action_dist=action_dist,
                n_folds=n_folds,
                random_state=random_state,
            )
def process(i: int):
    # split the original data into training and evaluation sets
    dataset.split_train_eval(eval_size=eval_size, random_state=i)
    # obtain logged bandit feedback generated by behavior policy
    bandit_feedback = dataset.obtain_batch_bandit_feedback(random_state=i)
    # obtain action choice probabilities by an evaluation policy
    action_dist = dataset.obtain_action_dist_by_eval_policy(
        base_classifier_e=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
        alpha_e=alpha_e,
    )
    # calculate the ground-truth performance of the evaluation policy
    ground_truth_policy_value = dataset.calc_ground_truth_policy_value(
        action_dist=action_dist
    )
    # estimate the mean reward function of the evaluation set of multi-class classification data with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback["context"],
        action=bandit_feedback["action"],
        reward=bandit_feedback["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=random_state,
    )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        ope_estimators=ope_estimators,
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=ground_truth_policy_value,
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )

    return relative_ee_i
def test_initializing_regression_models_using_invalid_input_data(
    action_context: np.ndarray,
    n_actions: int,
    len_list: int,
    fitting_method: str,
    base_model: BaseEstimator,
    description: str,
) -> None:
    # initialization raises ValueError
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = RegressionModel(
            n_actions=n_actions,
            len_list=len_list,
            action_context=action_context,
            base_model=base_model,
            fitting_method=fitting_method,
        )
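
# A hedged sketch of the parametrization that a test like the one above usually
# relies on; the single example tuple below is an assumption for illustration,
# not the repository's actual list of invalid inputs.
import numpy as np
import pytest
from sklearn.linear_model import LogisticRegression

from obp.ope import RegressionModel

invalid_input_of_initializing_regression_models = [
    # (action_context, n_actions, len_list, fitting_method, base_model, description)
    (
        np.ones((3, 2)),
        3,
        0,  # len_list must be a positive integer, so initialization should fail
        "normal",
        LogisticRegression(),
        "len_list",  # substring expected in the error message (assumed wording)
    ),
]


@pytest.mark.parametrize(
    "action_context, n_actions, len_list, fitting_method, base_model, description",
    invalid_input_of_initializing_regression_models,
)
def test_initializing_regression_models_using_invalid_input_data_example(
    action_context, n_actions, len_list, fitting_method, base_model, description
) -> None:
    # same check as the test above: initialization should raise ValueError
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = RegressionModel(
            n_actions=n_actions,
            len_list=len_list,
            action_context=action_context,
            base_model=base_model,
            fitting_method=fitting_method,
        )
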
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDatasetWithActionEmbeds(
        n_actions=n_actions,
        dim_context=dim_context,
        beta=3.0,
        n_cat_dim=3,
        n_cat_per_dim=5,
        reward_function=logistic_reward_function,
        random_state=i,
    )
    # define evaluation policy using IPWLearner
    evaluation_policy = IPWLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_iw_estimator](
            **hyperparams[base_model_for_iw_estimator]
        ),
    )
    # sample new training and test sets of synthetic logged bandit data
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # train the evaluation policy on the training set of the synthetic logged bandit data
    evaluation_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit data
    action_dist = evaluation_policy.predict_proba(
        context=bandit_feedback_test["context"],
    )
    # estimate the reward function of the test set of synthetic bandit feedback with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_test["context"],
        action=bandit_feedback_test["action"],
        reward=bandit_feedback_test["reward"],
        n_folds=2,
        random_state=12345,
    )
    # fit propensity score estimators
    pscore_estimator = PropensityScoreEstimator(
        len_list=1,
        n_actions=n_actions,
        base_model=base_model_dict[base_model_for_pscore_estimator](
            **hyperparams[base_model_for_pscore_estimator]
        ),
        calibration_cv=3,
    )
    estimated_pscore = pscore_estimator.fit_predict(
        action=bandit_feedback_test["action"],
        position=bandit_feedback_test["position"],
        context=bandit_feedback_test["context"],
        n_folds=3,
        random_state=12345,
    )
    # fit importance weight estimators
    estimated_importance_weights_dict = {}
    for clf_name, clf_arguments in bipw_model_configurations.items():
        clf = ImportanceWeightEstimator(
            len_list=1,
            n_actions=n_actions,
            fitting_method=clf_arguments["fitting_method"],
            base_model=clf_arguments["base_model"],
        )
        estimated_importance_weights_dict[clf_name] = clf.fit_predict(
            action=bandit_feedback_test["action"],
            context=bandit_feedback_test["context"],
            action_dist=action_dist,
            position=bandit_feedback_test["position"],
            n_folds=2,
            evaluate_model_performance=False,
            random_state=12345,
        )
    # evaluate estimators' performances using relative estimation error (relative-ee)
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback_test,
        ope_estimators=ope_estimators
        + [
            MarginalizedInverseProbabilityWeighting(
                n_actions=n_actions, estimator_name="mipw"
            ),
            MarginalizedInverseProbabilityWeighting(
                n_actions=n_actions,
                embedding_selection_method="greedy",
                estimator_name="mipw (greedy selection)",
            ),
            SelfNormalizedMarginalizedInverseProbabilityWeighting(
                n_actions=n_actions, estimator_name="snmipw"
            ),
        ],
    )
    relative_ee_i = ope.evaluate_performance_of_estimators(
        ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=action_dist,
        ),
        action_dist=action_dist,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        estimated_pscore=estimated_pscore,
        estimated_importance_weights=estimated_importance_weights_dict,
        action_embed=bandit_feedback_test["action_embed"],
        pi_b=bandit_feedback_test["pi_b"],
        metric="relative-ee",
    )

    return relative_ee_i
evaluation_of_ope_results = {
    est.estimator_name: np.zeros(n_boot_samples) for est in ope_estimators
}
for b in np.arange(n_boot_samples):
    # sample bootstrap from batch logged bandit feedback
    boot_bandit_feedback = obd.sample_bootstrap_bandit_feedback(random_state=b)
    # run a counterfactual bandit algorithm on logged bandit feedback data
    selected_actions = run_bandit_simulation(
        bandit_feedback=boot_bandit_feedback, policy=policy
    )
    # evaluate the estimation performance of OPE estimators
    ope = OffPolicyEvaluation(
        bandit_feedback=boot_bandit_feedback,
        action_context=obd.action_context,
        regression_model=RegressionModel(base_model=base_model),
        ope_estimators=ope_estimators,
    )
    relative_estimation_errors = ope.evaluate_performance_of_estimators(
        selected_actions=selected_actions,
        ground_truth_policy_value=ground_truth_policy_value,
    )
    policy.initialize()
    # store relative estimation errors of OPE estimators at each split
    for (
        estimator_name,
        relative_estimation_error,
    ) in relative_estimation_errors.items():
        evaluation_of_ope_results[estimator_name][b] = relative_estimation_error

# estimate confidence intervals of relative estimation by nonparametric bootstrap method
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # estimate the mean reward function of the train set of synthetic bandit feedback with ML model
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    ope_estimator = DoublyRobust()
    # define evaluation policy using NNPolicyLearner
    nn_policy = NNPolicyLearner(
        n_actions=dataset.n_actions,
        dim_context=dim_context,
        off_policy_objective=ope_estimator.estimate_policy_value_tensor,
    )
    # baseline method 1. RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2. UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # estimate the mean reward function of the train set of synthetic bandit feedback with ML model
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=12345,
    )
    # train the evaluation policy on the training set of the synthetic logged bandit feedback
    nn_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    uniform_sample_weight_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set of the synthetic logged bandit feedback
    nn_policy_action_dist = nn_policy.predict(
        context=bandit_feedback_test["context"],
    )
    random_action_dist = random_policy.predict(
        context=bandit_feedback_test["context"],
    )
    uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # get the ground truth policy value for each learner
    gt_nn_policy_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=nn_policy_action_dist,
    )
    gt_random_policy = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=uniform_sample_weight_action_dist,
    )

    return gt_nn_policy_learner, gt_random_policy, gt_uniform_sample_weight_learner
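
# A hedged sketch (assumed driver, not part of process itself) that collects the
# ground-truth policy values of the three learners over `n_runs` independent
# runs and reports their averages; `n_runs` and the use of pandas are assumptions.
import pandas as pd

results = [process(i) for i in range(n_runs)]
policy_value_df = pd.DataFrame(
    results,
    columns=["nn_policy_learner", "random_policy", "uniform_sample_weight_learner"],
)
print(policy_value_df.mean())  # average ground-truth policy value per learner
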
def test_performance_of_binary_outcome_models(
    fixed_synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the performance of ope estimators using synthetic bandit data and random evaluation policy
    when the regression model is estimated by a logistic regression
    """
    bandit_feedback = fixed_synthetic_bandit_feedback.copy()
    expected_reward = np.expand_dims(bandit_feedback["expected_reward"], axis=-1)
    action_dist = random_action_dist
    # compute ground truth policy value using expected reward
    q_pi_e = np.average(expected_reward[:, :, 0], weights=action_dist[:, :, 0], axis=1)
    # compute statistics of ground truth policy value
    gt_mean = q_pi_e.mean()
    random_state = 12345
    auc_scores: Dict[str, float] = {}
    # check ground truth
    print(f"gt_mean: {gt_mean}")
    # check the performance of regression models using doubly robust criteria
    # (|\hat{q} - q| <= |q| is satisfied with a high probability)
    dr_criteria_pass_rate = 0.8
    fit_methods = ["normal", "iw", "mrdr"]
    for fit_method in fit_methods:
        for model_name, model in binary_model_dict.items():
            regression_model = RegressionModel(
                n_actions=bandit_feedback["n_actions"],
                len_list=int(bandit_feedback["position"].max() + 1),
                action_context=bandit_feedback["action_context"],
                base_model=model(**hyperparams[model_name]),
                fitting_method=fit_method,
            )
            if fit_method == "normal":
                # train regression model on logged bandit feedback data
                estimated_rewards_by_reg_model = regression_model.fit_predict(
                    context=bandit_feedback["context"],
                    action=bandit_feedback["action"],
                    reward=bandit_feedback["reward"],
                    n_folds=3,  # 3-fold cross-fitting
                    random_state=random_state,
                )
            else:
                # train regression model on logged bandit feedback data
                estimated_rewards_by_reg_model = regression_model.fit_predict(
                    context=bandit_feedback["context"],
                    action=bandit_feedback["action"],
                    reward=bandit_feedback["reward"],
                    pscore=bandit_feedback["pscore"],
                    position=bandit_feedback["position"],
                    action_dist=action_dist,
                    n_folds=3,  # 3-fold cross-fitting
                    random_state=random_state,
                )
            auc_scores[model_name + "_" + fit_method] = roc_auc_score(
                y_true=bandit_feedback["reward"],
                y_score=estimated_rewards_by_reg_model[
                    np.arange(bandit_feedback["reward"].shape[0]),
                    bandit_feedback["action"],
                    bandit_feedback["position"],
                ],
            )
            # compare dr criteria
            dr_criteria = np.abs((gt_mean - estimated_rewards_by_reg_model)) - np.abs(
                gt_mean
            )
            print(
                f"Dr criteria is satisfied with probability {np.mean(dr_criteria <= 0)} ------ model: {model_name} ({fit_method}),"
            )
            assert (
                np.mean(dr_criteria <= 0) >= dr_criteria_pass_rate
            ), f" should be satisfied with a probability at least {dr_criteria_pass_rate}"

    for model_name in auc_scores:
        print(f"AUC of {model_name} is {auc_scores[model_name]}")
        assert (
            auc_scores[model_name] > 0.5
        ), f"AUC of {model_name} should be greater than 0.5"
dataset = SyntheticBanditDataset(
    n_actions=n_actions,
    dim_context=dim_context,
    reward_function=logistic_reward_function,
    behavior_policy_function=linear_behavior_policy,
    random_state=random_state,
)
# sample new training and test sets of synthetic logged bandit feedback
bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
# estimate the mean reward function of the train set of synthetic bandit feedback with ML model
regression_model = RegressionModel(
    n_actions=dataset.n_actions,
    action_context=dataset.action_context,
    base_model=base_model_dict[base_model_for_reg_model](
        **hyperparams[base_model_for_reg_model]
    ),
)
estimated_rewards_by_reg_model = regression_model.fit_predict(
    context=bandit_feedback_train["context"],
    action=bandit_feedback_train["action"],
    reward=bandit_feedback_train["reward"],
    n_folds=3,  # 3-fold cross-fitting
    random_state=random_state,
)
# define random evaluation policy
random_policy = Random(n_actions=dataset.n_actions, random_state=random_state)
# define evaluation policy using IPWLearner
ipw_learner = IPWLearner(
    n_actions=dataset.n_actions,
def process(b: int):
    # sample bootstrap from batch logged bandit feedback
    bandit_feedback = obd.sample_bootstrap_bandit_feedback(
        test_size=test_size,
        is_timeseries_split=is_timeseries_split,
        random_state=b,
    )
    # split data into two folds (data for training reg_model and for ope)
    is_for_reg_model = np.random.binomial(
        n=1, p=0.3, size=bandit_feedback["n_rounds"]
    ).astype(bool)
    with open(reg_model_path / f"is_for_reg_model_{b}.pkl", "wb") as f:
        pickle.dump(
            is_for_reg_model,
            f,
        )
    if is_mrdr:
        reg_model = RegressionModel(
            n_actions=obd.n_actions,
            len_list=obd.len_list,
            action_context=bandit_feedback["action_context"],
            base_model=base_model_dict[base_model](**hyperparams[base_model]),
            fitting_method="mrdr",
        )
        # train regression model on logged bandit feedback data
        reg_model.fit(
            context=bandit_feedback["context"][is_for_reg_model],
            action=bandit_feedback["action"][is_for_reg_model],
            reward=bandit_feedback["reward"][is_for_reg_model],
            pscore=bandit_feedback["pscore"][is_for_reg_model],
            position=bandit_feedback["position"][is_for_reg_model],
            action_dist=np.tile(
                action_dist_single_round, (is_for_reg_model.sum(), 1, 1)
            ),
        )
        with open(reg_model_path / f"reg_model_mrdr_{b}.pkl", "wb") as f:
            pickle.dump(
                reg_model,
                f,
            )
    else:
        reg_model = RegressionModel(
            n_actions=obd.n_actions,
            len_list=obd.len_list,
            action_context=bandit_feedback["action_context"],
            base_model=base_model_dict[base_model](**hyperparams[base_model]),
            fitting_method="normal",
        )
        # train regression model on logged bandit feedback data
        reg_model.fit(
            context=bandit_feedback["context"][is_for_reg_model],
            action=bandit_feedback["action"][is_for_reg_model],
            reward=bandit_feedback["reward"][is_for_reg_model],
            position=bandit_feedback["position"][is_for_reg_model],
        )
        with open(reg_model_path / f"reg_model_{b}.pkl", "wb") as f:
            pickle.dump(
                reg_model,
                f,
            )
    # evaluate the estimation performance of the regression model by AUC and RCE
    if is_timeseries_split:
        estimated_rewards_by_reg_model = reg_model.predict(
            context=bandit_feedback["context_test"],
        )
    else:
        estimated_rewards_by_reg_model = reg_model.predict(
            context=bandit_feedback["context"][~is_for_reg_model],
        )
    performance_reg_model_b = evaluate_reg_model(
        bandit_feedback=bandit_feedback,
        is_timeseries_split=is_timeseries_split,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        is_for_reg_model=is_for_reg_model,
    )

    return performance_reg_model_b
def process(b: int) -> Dict[str, float]:
    # sample bootstrap from batch logged bandit feedback
    if is_timeseries_split:
        bandit_feedback_train = obd.sample_bootstrap_bandit_feedback(
            test_size=test_size,
            is_timeseries_split=True,
            random_state=b,
        )
        bandit_feedback_test = obd.obtain_batch_bandit_feedback(
            test_size=test_size,
            is_timeseries_split=True,
        )[1]
    else:
        bandit_feedback_train = obd.sample_bootstrap_bandit_feedback(
            random_state=b,
        )
        bandit_feedback_test = deepcopy(bandit_feedback_train)
        # split data into two folds (data for training reg_model and for ope)
        is_for_reg_model = np.random.binomial(
            n=1, p=0.3, size=bandit_feedback_train["n_rounds"]
        ).astype(bool)
        with open(reg_model_path / f"is_for_reg_model_{b}.pkl", "wb") as f:
            pickle.dump(
                is_for_reg_model,
                f,
            )
        bandit_feedback_train["n_rounds"] = is_for_reg_model.sum()
        bandit_feedback_test["n_rounds"] = (~is_for_reg_model).sum()
        for key in ["context", "action", "reward", "pscore", "position"]:
            bandit_feedback_train[key] = bandit_feedback_train[key][is_for_reg_model]
            bandit_feedback_test[key] = bandit_feedback_test[key][~is_for_reg_model]
    model_file_name = f"reg_model_mrdr_{b}.pkl" if is_mrdr else f"reg_model_{b}.pkl"
    reg_model = RegressionModel(
        n_actions=obd.n_actions,
        len_list=obd.len_list,
        action_context=bandit_feedback_train["action_context"],
        base_model=base_model_dict[base_model](**hyperparams[base_model]),
        fitting_method=fitting_method,
    )
    # train regression model on logged bandit feedback data
    reg_model.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
        position=bandit_feedback_train["position"],
        action_dist=np.tile(
            action_dist_single_round, (bandit_feedback_train["n_rounds"], 1, 1)
        ),
    )
    with open(reg_model_path / model_file_name, "wb") as f:
        pickle.dump(
            reg_model,
            f,
        )
    # evaluate the estimation performance of the regression model by AUC and RCE
    estimated_rewards_by_reg_model = reg_model.predict(
        context=bandit_feedback_test["context"],
    )
    performance_reg_model_b = evaluate_reg_model(
        bandit_feedback=bandit_feedback_test,
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )

    return performance_reg_model_b
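
# A hedged sketch of aggregating the per-bootstrap regression-model performance
# returned by process(); `n_boot_samples` is assumed to be defined alongside the
# other settings, and the metric keys (e.g. AUC, RCE) come from evaluate_reg_model,
# whose exact output keys are an assumption here.
import pandas as pd

processed = [process(b) for b in np.arange(n_boot_samples)]
performance_df = pd.DataFrame(processed)  # one row per bootstrap sample
print(performance_df.agg(["mean", "std"]).T)  # summary over bootstrap samples
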
    metrics[i]: np.zeros(n_boot_samples) for i in np.arange(len(metrics))
}
for b in np.arange(n_boot_samples):
    # sample bootstrap samples from batch logged bandit feedback
    boot_bandit_feedback = obd.sample_bootstrap_bandit_feedback(
        test_size=test_size, is_timeseries_split=is_timeseries_split, random_state=b
    )
    # split data into two folds (data for training reg_model and for ope)
    is_for_reg_model = np.random.binomial(
        n=1, p=0.3, size=boot_bandit_feedback["n_rounds"]
    ).astype(bool)
    # define regression model
    reg_model = RegressionModel(
        n_actions=obd.n_actions,
        len_list=obd.len_list,
        action_context=boot_bandit_feedback["action_context"],
        base_model=base_model_dict[base_model](**hyperparams[base_model]),
    )
    # train regression model on logged bandit feedback data
    reg_model.fit(
        context=boot_bandit_feedback["context"][is_for_reg_model],
        action=boot_bandit_feedback["action"][is_for_reg_model],
        reward=boot_bandit_feedback["reward"][is_for_reg_model],
        position=boot_bandit_feedback["position"][is_for_reg_model],
    )
    # evaluate the estimation performance of the regression model by AUC and RCE
    if is_timeseries_split:
        estimated_reward_by_reg_model = reg_model.predict(
            context=boot_bandit_feedback["context_test"],
        )
        rewards = boot_bandit_feedback["reward_test"]
        estimated_rewards_ = estimated_reward_by_reg_model[