Example #1
    def process(i: int):
        # synthetic data generator
        dataset = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=linear_behavior_policy,
            random_state=i,
        )
        # define evaluation policy using IPWLearner
        evaluation_policy = IPWLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]
            ),
        )
        # sample new training and test sets of synthetic logged bandit feedback
        bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
        # train the evaluation policy on the training set of the synthetic logged bandit feedback
        evaluation_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )
        # predict the action decisions for the test set of the synthetic logged bandit feedback
        action_dist = evaluation_policy.predict(
            context=bandit_feedback_test["context"],
        )
        # estimate the mean reward function of the test set of synthetic bandit feedback with ML model
        regression_model = RegressionModel(
            n_actions=dataset.n_actions,
            action_context=dataset.action_context,
            base_model=base_model_dict[base_model_for_reg_model](
                **hyperparams[base_model_for_reg_model]
            ),
        )
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            context=bandit_feedback_test["context"],
            action=bandit_feedback_test["action"],
            reward=bandit_feedback_test["reward"],
            n_folds=3,  # 3-fold cross-fitting
            random_state=random_state,
        )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback_test,
            ope_estimators=ope_estimators,
        )
        relative_ee_i = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
                expected_reward=bandit_feedback_test["expected_reward"],
                action_dist=action_dist,
            ),
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )

        return relative_ee_i
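The snippet above leaves its experiment-level names (base_model_dict, hyperparams, ope_estimators, n_actions, dim_context, n_rounds, random_state) to the surrounding script. A minimal sketch of what that setup might look like with obp; the parameter values and model choices are illustrative assumptions, not taken from the original script.

# Hypothetical setup for the names the example leaves undefined; values are illustrative.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from obp.dataset import (
    SyntheticBanditDataset,
    linear_behavior_policy,
    logistic_reward_function,
)
from obp.ope import (
    DirectMethod,
    DoublyRobust,
    InverseProbabilityWeighting,
    OffPolicyEvaluation,
    RegressionModel,
)
from obp.policy import IPWLearner

# experiment settings (illustrative)
n_rounds = 10000
n_actions = 10
dim_context = 5
random_state = 12345

# candidate ML models, keyed by name
base_model_dict = {
    "logistic_regression": LogisticRegression,
    "random_forest": RandomForestClassifier,
}
hyperparams = {
    "logistic_regression": {"C": 100, "max_iter": 10000, "random_state": 12345},
    "random_forest": {"n_estimators": 100, "max_depth": 5, "random_state": 12345},
}
base_model_for_evaluation_policy = "logistic_regression"
base_model_for_reg_model = "random_forest"

# OPE estimators whose accuracy is compared
ope_estimators = [DirectMethod(), InverseProbabilityWeighting(), DoublyRobust()]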
Example #2
    def process(b: int):
        # sample bootstrap from batch logged bandit feedback
        bandit_feedback = obd.sample_bootstrap_bandit_feedback(random_state=b)
        # estimate the mean reward function with an ML model
        regression_model = RegressionModel(
            n_actions=obd.n_actions,
            len_list=obd.len_list,
            action_context=obd.action_context,
            base_model=base_model_dict[base_model](**hyperparams[base_model]),
        )
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            context=bandit_feedback["context"],
            action=bandit_feedback["action"],
            reward=bandit_feedback["reward"],
            position=bandit_feedback["position"],
            pscore=bandit_feedback["pscore"],
            n_folds=3,  # 3-fold cross-fitting
        )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback,
            ope_estimators=ope_estimators,
        )
        action_dist = np.tile(action_dist_single_round,
                              (bandit_feedback["n_rounds"], 1, 1))
        relative_ee_b = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )

        return relative_ee_b
Example #3
    def process(i: int):
        # sample new training and test sets of synthetic logged bandit feedback
        bandit_feedback_train = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        bandit_feedback_test = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        # train the evaluation policy on the training set of the synthetic logged bandit feedback
        evaluation_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )
        # predict the action decisions for the test set of the synthetic logged bandit feedback
        action_dist = evaluation_policy.predict_proba(
            context=bandit_feedback_test["context"],
            tau=0.1,  # temperature hyperparameter
        )
        # estimate the ground-truth policy values of the evaluation policy
        # using the full expected reward contained in the test set of synthetic bandit feedback
        ground_truth_policy_value = np.average(
            bandit_feedback_test["expected_reward"],
            weights=action_dist[:, :, 0],
            axis=1,
        ).mean()
        # estimate the mean reward function of the test set of synthetic bandit feedback with ML model
        regression_model = RegressionModel(
            n_actions=dataset.n_actions,
            len_list=dataset.len_list,
            action_context=dataset.action_context,
            base_model=base_model_dict[base_model_for_reg_model](
                **hyperparams[base_model_for_reg_model]),
        )
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            context=bandit_feedback_test["context"],
            action=bandit_feedback_test["action"],
            reward=bandit_feedback_test["reward"],
            n_folds=3,  # 3-fold cross-fitting
            random_state=random_state,
        )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback_test,
            ope_estimators=ope_estimators,
        )
        relative_ee_i = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )

        return relative_ee_i
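These process functions are usually executed over many random seeds and their relative-ee results aggregated per estimator. A sketch of such a driver with joblib, assuming process and ope_estimators from the snippet above and an illustrative n_runs:

# Illustrative driver: run process() over several seeds and summarize relative-ee per estimator.
import numpy as np
from joblib import Parallel, delayed
from pandas import DataFrame

n_runs = 20  # illustrative number of simulation runs
processed = Parallel(n_jobs=-1, verbose=10)(
    delayed(process)(i) for i in np.arange(n_runs)
)

# collect {estimator_name: {run_index: relative-ee}} and summarize
metric_dict = {est.estimator_name: {} for est in ope_estimators}
for i, relative_ee_i in enumerate(processed):
    for estimator_name, relative_ee_ in relative_ee_i.items():
        metric_dict[estimator_name][i] = relative_ee_
results_df = DataFrame(metric_dict).describe().T.round(6)
print(results_df[["mean", "std"]])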
Example #4
def test_fitting_regression_models_using_invalid_input_data(
    context: np.ndarray,
    action: np.ndarray,
    reward: np.ndarray,
    pscore: np.ndarray,
    position: np.ndarray,
    action_context: np.ndarray,
    n_actions: int,
    len_list: int,
    fitting_method: str,
    base_model: BaseEstimator,
    action_dist: np.ndarray,
    n_folds: int,
    random_state: int,
    err,
    description: str,
) -> None:
    # fit_predict function raises ValueError
    with pytest.raises(err, match=f"{description}*"):
        regression_model = RegressionModel(
            n_actions=n_actions,
            len_list=len_list,
            action_context=action_context,
            base_model=base_model,
            fitting_method=fitting_method,
        )
        if fitting_method == "normal":
            # train regression model on logged bandit feedback data
            _ = regression_model.fit_predict(
                context=context,
                action=action,
                reward=reward,
                position=position,
                n_folds=n_folds,
                random_state=random_state,
            )
        else:
            # train regression model on logged bandit feedback data
            _ = regression_model.fit_predict(
                context=context,
                action=action,
                reward=reward,
                pscore=pscore,
                position=position,
                action_dist=action_dist,
                n_folds=n_folds,
                random_state=random_state,
            )
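Tests like the one above are normally fed by @pytest.mark.parametrize with lists of invalid inputs and expected error messages. The following self-contained sketch shows only that mechanism; the validation rule, inputs, and message are invented for illustration and are not obp's actual ones.

import numpy as np
import pytest


def fit_dummy_regression_model(context: np.ndarray) -> None:
    # stand-in for the input validation performed by a regression model's fit method
    if context.ndim != 2:
        raise ValueError("context must be 2-dimensional ndarray")


@pytest.mark.parametrize(
    "context, err, description",
    [
        (np.ones(5), ValueError, "context must be 2-dimensional"),
    ],
)
def test_fitting_dummy_model_using_invalid_input_data(context, err, description) -> None:
    # fit function raises the expected error with the expected message
    with pytest.raises(err, match=f"{description}*"):
        fit_dummy_regression_model(context=context)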
Example #5
    def process(i: int):
        # split the original data into training and evaluation sets
        dataset.split_train_eval(eval_size=eval_size, random_state=i)
        # obtain logged bandit feedback generated by behavior policy
        bandit_feedback = dataset.obtain_batch_bandit_feedback(random_state=i)
        # obtain action choice probabilities by an evaluation policy
        action_dist = dataset.obtain_action_dist_by_eval_policy(
            base_classifier_e=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]
            ),
            alpha_e=alpha_e,
        )
        # calculate the ground-truth performance of the evaluation policy
        ground_truth_policy_value = dataset.calc_ground_truth_policy_value(
            action_dist=action_dist)
        # estimate the mean reward function of the evaluation set of multi-class classification data with ML model
        regression_model = RegressionModel(
            n_actions=dataset.n_actions,
            base_model=base_model_dict[base_model_for_reg_model](
                **hyperparams[base_model_for_reg_model]),
        )
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            context=bandit_feedback["context"],
            action=bandit_feedback["action"],
            reward=bandit_feedback["reward"],
            n_folds=3,  # 3-fold cross-fitting
            random_state=random_state,
        )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback,
            ope_estimators=ope_estimators,
        )
        relative_ee_i = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=ground_truth_policy_value,
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )

        return relative_ee_i
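The dataset object in Example #5, with split_train_eval and obtain_action_dist_by_eval_policy, corresponds to obp's MultiClassToBanditReduction. One way it might be constructed, with illustrative classification data and parameter values:

# Illustrative construction of the multi-class-to-bandit dataset assumed above.
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from obp.dataset import MultiClassToBanditReduction

X, y = load_digits(return_X_y=True)
dataset = MultiClassToBanditReduction(
    X=X,
    y=y,
    base_classifier_b=LogisticRegression(max_iter=10000, random_state=12345),
    alpha_b=0.8,  # quality of the behavior policy
    dataset_name="digits",
)
eval_size = 0.7
alpha_e = 0.9  # quality of the evaluation policy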
Example #6
def test_initializing_regression_models_using_invalid_input_data(
    action_context: np.ndarray,
    n_actions: int,
    len_list: int,
    fitting_method: str,
    base_model: BaseEstimator,
    description: str,
) -> None:
    # initialization raises ValueError
    with pytest.raises(ValueError, match=f"{description}*"):
        _ = RegressionModel(
            n_actions=n_actions,
            len_list=len_list,
            action_context=action_context,
            base_model=base_model,
            fitting_method=fitting_method,
        )
Example #7
    def process(i: int):
        # synthetic data generator
        dataset = SyntheticBanditDatasetWithActionEmbeds(
            n_actions=n_actions,
            dim_context=dim_context,
            beta=3.0,
            n_cat_dim=3,
            n_cat_per_dim=5,
            reward_function=logistic_reward_function,
            random_state=i,
        )
        # define evaluation policy using IPWLearner
        evaluation_policy = IPWLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_iw_estimator](
                **hyperparams[base_model_for_iw_estimator]),
        )
        # sample new training and test sets of synthetic logged bandit data
        bandit_feedback_train = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        bandit_feedback_test = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        # train the evaluation policy on the training set of the synthetic logged bandit data
        evaluation_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )
        # predict the action decisions for the test set of the synthetic logged bandit data
        action_dist = evaluation_policy.predict_proba(
            context=bandit_feedback_test["context"], )
        # estimate the reward function of the test set of synthetic bandit feedback with ML model
        regression_model = RegressionModel(
            n_actions=dataset.n_actions,
            action_context=dataset.action_context,
            base_model=base_model_dict[base_model_for_reg_model](
                **hyperparams[base_model_for_reg_model]),
        )
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            context=bandit_feedback_test["context"],
            action=bandit_feedback_test["action"],
            reward=bandit_feedback_test["reward"],
            n_folds=2,
            random_state=12345,
        )
        # fit propensity score estimators
        pscore_estimator = PropensityScoreEstimator(
            len_list=1,
            n_actions=n_actions,
            base_model=base_model_dict[base_model_for_pscore_estimator](
                **hyperparams[base_model_for_pscore_estimator]),
            calibration_cv=3,
        )
        estimated_pscore = pscore_estimator.fit_predict(
            action=bandit_feedback_test["action"],
            position=bandit_feedback_test["position"],
            context=bandit_feedback_test["context"],
            n_folds=3,
            random_state=12345,
        )
        # fit importance weight estimators
        estimated_importance_weights_dict = {}
        for clf_name, clf_arguments in bipw_model_configurations.items():
            clf = ImportanceWeightEstimator(
                len_list=1,
                n_actions=n_actions,
                fitting_method=clf_arguments["fitting_method"],
                base_model=clf_arguments["base_model"],
            )
            estimated_importance_weights_dict[clf_name] = clf.fit_predict(
                action=bandit_feedback_test["action"],
                context=bandit_feedback_test["context"],
                action_dist=action_dist,
                position=bandit_feedback_test["position"],
                n_folds=2,
                evaluate_model_performance=False,
                random_state=12345,
            )
        # evaluate estimators' performances using relative estimation error (relative-ee)
        ope = OffPolicyEvaluation(
            bandit_feedback=bandit_feedback_test,
            ope_estimators=ope_estimators + [
                MarginalizedInverseProbabilityWeighting(n_actions=n_actions,
                                                        estimator_name="mipw"),
                MarginalizedInverseProbabilityWeighting(
                    n_actions=n_actions,
                    embedding_selection_method="greedy",
                    estimator_name="mipw (greedy selection)",
                ),
                SelfNormalizedMarginalizedInverseProbabilityWeighting(
                    n_actions=n_actions, estimator_name="snmipw"),
            ],
        )
        relative_ee_i = ope.evaluate_performance_of_estimators(
            ground_truth_policy_value=dataset.calc_ground_truth_policy_value(
                expected_reward=bandit_feedback_test["expected_reward"],
                action_dist=action_dist,
            ),
            action_dist=action_dist,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
            estimated_pscore=estimated_pscore,
            estimated_importance_weights=estimated_importance_weights_dict,
            action_embed=bandit_feedback_test["action_embed"],
            pi_b=bandit_feedback_test["pi_b"],
            metric="relative-ee",
        )

        return relative_ee_i
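Example #7 relies on the action-embedding components of obp. The imports it assumes, plus one possible bipw_model_configurations, are sketched below; the base models, hyperparameters, and the fitting_method value are assumptions, not settings taken from the original script.

# Imports assumed by the example (names provided by recent obp versions).
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from obp.dataset import SyntheticBanditDatasetWithActionEmbeds, logistic_reward_function
from obp.ope import (
    ImportanceWeightEstimator,
    MarginalizedInverseProbabilityWeighting,
    OffPolicyEvaluation,
    PropensityScoreEstimator,
    RegressionModel,
    SelfNormalizedMarginalizedInverseProbabilityWeighting,
)
from obp.policy import IPWLearner

# one possible configuration for the importance-weight classifiers (assumed, not from the source)
bipw_model_configurations = {
    "bipw (random_forest)": {
        "fitting_method": "sample",  # assumed option of ImportanceWeightEstimator
        "base_model": RandomForestClassifier(n_estimators=100, random_state=12345),
    },
}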
Example #8
    evaluation_of_ope_results = {
        est.estimator_name: np.zeros(n_boot_samples) for est in ope_estimators
    }
    for b in np.arange(n_boot_samples):
        # sample bootstrap from batch logged bandit feedback
        boot_bandit_feedback = obd.sample_bootstrap_bandit_feedback(random_state=b)
        # run a counterfactual bandit algorithm on logged bandit feedback data
        selected_actions = run_bandit_simulation(
            bandit_feedback=boot_bandit_feedback, policy=policy
        )
        # evaluate the estimation performance of OPE estimators
        ope = OffPolicyEvaluation(
            bandit_feedback=boot_bandit_feedback,
            action_context=obd.action_context,
            regression_model=RegressionModel(base_model=base_model),
            ope_estimators=ope_estimators,
        )
        relative_estimation_errors = ope.evaluate_performance_of_estimators(
            selected_actions=selected_actions,
            ground_truth_policy_value=ground_truth_policy_value,
        )
        policy.initialize()
        # store relative estimation errors of OPE estimators at each split
        for (
            estimator_name,
            relative_estimation_error,
        ) in relative_estimation_errors.items():
            evaluation_of_ope_results[estimator_name][b] = relative_estimation_error

    # estimate confidence intervals of relative estimation by nonparametric bootstrap method
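The closing comment points at estimating confidence intervals of the relative estimation errors via the nonparametric bootstrap. A plain-NumPy sketch of that step (not an obp utility), assuming the evaluation_of_ope_results dictionary filled above:

# Percentile-bootstrap confidence intervals of the per-bootstrap relative estimation errors.
import numpy as np

n_resamples = 10000
alpha = 0.05
rng = np.random.default_rng(12345)
for estimator_name, relative_ees in evaluation_of_ope_results.items():
    boot_means = np.array([
        rng.choice(relative_ees, size=relative_ees.shape[0], replace=True).mean()
        for _ in range(n_resamples)
    ])
    lower, upper = np.percentile(boot_means, [100 * alpha / 2, 100 * (1 - alpha / 2)])
    print(
        f"{estimator_name}: mean={relative_ees.mean():.5f}, "
        f"{1 - alpha:.0%} CI=({lower:.5f}, {upper:.5f})"
    )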
Example #9
    def process(i: int):
        # synthetic data generator
        dataset = SyntheticBanditDataset(
            n_actions=n_actions,
            dim_context=dim_context,
            reward_function=logistic_reward_function,
            behavior_policy_function=linear_behavior_policy,
            random_state=i,
        )
        # estimate the mean reward function of the train set of synthetic bandit feedback with ML model
        regression_model = RegressionModel(
            n_actions=dataset.n_actions,
            action_context=dataset.action_context,
            base_model=base_model_dict[base_model_for_reg_model](
                **hyperparams[base_model_for_reg_model]),
        )
        ope_estimator = DoublyRobust()
        # define evaluation policy using NNPolicyLearner
        nn_policy = NNPolicyLearner(
            n_actions=dataset.n_actions,
            dim_context=dim_context,
            off_policy_objective=ope_estimator.estimate_policy_value_tensor,
        )
        # baseline method 1. RandomPolicy
        random_policy = RandomPolicy(n_actions=dataset.n_actions)
        # baseline method 2. UniformSampleWeightLearner
        uniform_sample_weight_policy = UniformSampleWeightLearner(
            n_actions=dataset.n_actions,
            base_classifier=base_model_dict[base_model_for_evaluation_policy](
                **hyperparams[base_model_for_evaluation_policy]),
        )
        # sample new training and test sets of synthetic logged bandit feedback
        bandit_feedback_train = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        bandit_feedback_test = dataset.obtain_batch_bandit_feedback(
            n_rounds=n_rounds)
        # estimate the mean reward function of the train set of synthetic bandit feedback with ML model
        estimated_rewards_by_reg_model = regression_model.fit_predict(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            n_folds=3,  # 3-fold cross-fitting
            random_state=12345,
        )
        # train the evaluation policy on the training set of the synthetic logged bandit feedback
        nn_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )
        uniform_sample_weight_policy.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
        )
        # predict the action decisions for the test set of the synthetic logged bandit feedback
        nn_policy_action_dist = nn_policy.predict(
            context=bandit_feedback_test["context"], )
        random_action_dist = random_policy.predict(
            context=bandit_feedback_test["context"], )
        uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
            context=bandit_feedback_test["context"], )
        # get the ground truth policy value for each learner
        gt_nn_policy_learner = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=nn_policy_action_dist,
        )
        gt_random_policy = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=random_action_dist,
        )
        gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=uniform_sample_weight_action_dist,
        )

        return gt_nn_policy_learner, gt_random_policy, gt_uniform_sample_weight_learner
Example #10
def test_performance_of_binary_outcome_models(
    fixed_synthetic_bandit_feedback: BanditFeedback, random_action_dist: np.ndarray
) -> None:
    """
    Test the performance of ope estimators using synthetic bandit data and random evaluation policy
    when the regression model is estimated by a logistic regression
    """
    bandit_feedback = fixed_synthetic_bandit_feedback.copy()
    expected_reward = np.expand_dims(bandit_feedback["expected_reward"], axis=-1)
    action_dist = random_action_dist
    # compute ground truth policy value using expected reward
    q_pi_e = np.average(expected_reward[:, :, 0], weights=action_dist[:, :, 0], axis=1)
    # compute statistics of ground truth policy value
    gt_mean = q_pi_e.mean()
    random_state = 12345
    auc_scores: Dict[str, float] = {}
    # check ground truth
    print(f"gt_mean: {gt_mean}")
    # check the performance of regression models using doubly robust criteria (|\hat{q} - q| <= |q| is satisfied with a high probability)
    dr_criteria_pass_rate = 0.8
    fit_methods = ["normal", "iw", "mrdr"]
    for fit_method in fit_methods:
        for model_name, model in binary_model_dict.items():
            regression_model = RegressionModel(
                n_actions=bandit_feedback["n_actions"],
                len_list=int(bandit_feedback["position"].max() + 1),
                action_context=bandit_feedback["action_context"],
                base_model=model(**hyperparams[model_name]),
                fitting_method=fit_method,
            )
            if fit_method == "normal":
                # train regression model on logged bandit feedback data
                estimated_rewards_by_reg_model = regression_model.fit_predict(
                    context=bandit_feedback["context"],
                    action=bandit_feedback["action"],
                    reward=bandit_feedback["reward"],
                    n_folds=3,  # 3-fold cross-fitting
                    random_state=random_state,
                )
            else:
                # train regression model on logged bandit feedback data
                estimated_rewards_by_reg_model = regression_model.fit_predict(
                    context=bandit_feedback["context"],
                    action=bandit_feedback["action"],
                    reward=bandit_feedback["reward"],
                    pscore=bandit_feedback["pscore"],
                    position=bandit_feedback["position"],
                    action_dist=action_dist,
                    n_folds=3,  # 3-fold cross-fitting
                    random_state=random_state,
                )
            auc_scores[model_name + "_" + fit_method] = roc_auc_score(
                y_true=bandit_feedback["reward"],
                y_score=estimated_rewards_by_reg_model[
                    np.arange(bandit_feedback["reward"].shape[0]),
                    bandit_feedback["action"],
                    bandit_feedback["position"],
                ],
            )
            # compare dr criteria
            dr_criteria = np.abs((gt_mean - estimated_rewards_by_reg_model)) - np.abs(
                gt_mean
            )
            print(
                f"Dr criteria is satisfied with probability {np.mean(dr_criteria <= 0)} ------ model: {model_name} ({fit_method}),"
            )
            assert (
                np.mean(dr_criteria <= 0) >= dr_criteria_pass_rate
            ), f" should be satisfied with a probability at least {dr_criteria_pass_rate}"

    for model_name in auc_scores:
        print(f"AUC of {model_name} is {auc_scores[model_name]}")
        assert (
            auc_scores[model_name] > 0.5
        ), f"AUC of {model_name} should be greater than 0.5"
Example #11
 dataset = SyntheticBanditDataset(
     n_actions=n_actions,
     dim_context=dim_context,
     reward_function=logistic_reward_function,
     behavior_policy_function=linear_behavior_policy,
     random_state=random_state,
 )
 # sample new training and test sets of synthetic logged bandit feedback
 bandit_feedback_train = dataset.obtain_batch_bandit_feedback(
     n_rounds=n_rounds)
 bandit_feedback_test = dataset.obtain_batch_bandit_feedback(
     n_rounds=n_rounds)
 # estimate the mean reward function of the train set of synthetic bandit feedback with ML model
 regression_model = RegressionModel(
     n_actions=dataset.n_actions,
     action_context=dataset.action_context,
     base_model=base_model_dict[base_model_for_reg_model](
         **hyperparams[base_model_for_reg_model]),
 )
 estimated_rewards_by_reg_model = regression_model.fit_predict(
     context=bandit_feedback_train["context"],
     action=bandit_feedback_train["action"],
     reward=bandit_feedback_train["reward"],
     n_folds=3,  # 3-fold cross-fitting
     random_state=random_state,
 )
 # define random evaluation policy
 random_policy = Random(n_actions=dataset.n_actions,
                        random_state=random_state)
 # define evaluation policy using IPWLearner
 ipw_learner = IPWLearner(
     n_actions=dataset.n_actions,
Example #12
    def process(b: int):
        # sample bootstrap from batch logged bandit feedback
        bandit_feedback = obd.sample_bootstrap_bandit_feedback(
            test_size=test_size,
            is_timeseries_split=is_timeseries_split,
            random_state=b,
        )
        # split data into two folds (data for training reg_model and for ope)
        is_for_reg_model = np.random.binomial(
            n=1, p=0.3, size=bandit_feedback["n_rounds"]).astype(bool)
        with open(reg_model_path / f"is_for_reg_model_{b}.pkl", "wb") as f:
            pickle.dump(
                is_for_reg_model,
                f,
            )
        if is_mrdr:
            reg_model = RegressionModel(
                n_actions=obd.n_actions,
                len_list=obd.len_list,
                action_context=bandit_feedback["action_context"],
                base_model=base_model_dict[base_model](
                    **hyperparams[base_model]),
                fitting_method="mrdr",
            )
            # train regression model on logged bandit feedback data
            reg_model.fit(
                context=bandit_feedback["context"][is_for_reg_model],
                action=bandit_feedback["action"][is_for_reg_model],
                reward=bandit_feedback["reward"][is_for_reg_model],
                pscore=bandit_feedback["pscore"][is_for_reg_model],
                position=bandit_feedback["position"][is_for_reg_model],
                action_dist=np.tile(action_dist_single_round,
                                    (is_for_reg_model.sum(), 1, 1)),
            )
            with open(reg_model_path / f"reg_model_mrdr_{b}.pkl", "wb") as f:
                pickle.dump(
                    reg_model,
                    f,
                )
        else:
            reg_model = RegressionModel(
                n_actions=obd.n_actions,
                len_list=obd.len_list,
                action_context=bandit_feedback["action_context"],
                base_model=base_model_dict[base_model](
                    **hyperparams[base_model]),
                fitting_method="normal",
            )
            # train regression model on logged bandit feedback data
            reg_model.fit(
                context=bandit_feedback["context"][is_for_reg_model],
                action=bandit_feedback["action"][is_for_reg_model],
                reward=bandit_feedback["reward"][is_for_reg_model],
                position=bandit_feedback["position"][is_for_reg_model],
            )
            with open(reg_model_path / f"reg_model_{b}.pkl", "wb") as f:
                pickle.dump(
                    reg_model,
                    f,
                )
            # evaluate the estimation performance of the regression model by AUC and RCE
            if is_timeseries_split:
                estimated_rewards_by_reg_model = reg_model.predict(
                    context=bandit_feedback["context_test"], )
            else:
                estimated_rewards_by_reg_model = reg_model.predict(
                    context=bandit_feedback["context"][~is_for_reg_model], )
            performance_reg_model_b = evaluate_reg_model(
                bandit_feedback=bandit_feedback,
                is_timeseries_split=is_timeseries_split,
                estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
                is_for_reg_model=is_for_reg_model,
            )

            return performance_reg_model_b
Example #13
    def process(b: int) -> Dict[str, float]:
        # sample bootstrap from batch logged bandit feedback
        if is_timeseries_split:
            bandit_feedback_train = obd.sample_bootstrap_bandit_feedback(
                test_size=test_size,
                is_timeseries_split=True,
                random_state=b,
            )
            bandit_feedback_test = obd.obtain_batch_bandit_feedback(
                test_size=test_size,
                is_timeseries_split=True,
            )[1]
        else:
            bandit_feedback_train = obd.sample_bootstrap_bandit_feedback(
                random_state=b, )
            bandit_feedback_test = deepcopy(bandit_feedback_train)
            # split data into two folds (data for training reg_model and for ope)
            is_for_reg_model = np.random.binomial(
                n=1, p=0.3,
                size=bandit_feedback_train["n_rounds"]).astype(bool)
            with open(reg_model_path / f"is_for_reg_model_{b}.pkl", "wb") as f:
                pickle.dump(
                    is_for_reg_model,
                    f,
                )
            bandit_feedback_train["n_rounds"] = is_for_reg_model.sum()
            bandit_feedback_test["n_rounds"] = (~is_for_reg_model).sum()
            for key in ["context", "action", "reward", "pscore", "position"]:
                bandit_feedback_train[key] = bandit_feedback_train[key][
                    is_for_reg_model]
                bandit_feedback_test[key] = bandit_feedback_test[key][
                    ~is_for_reg_model]
        model_file_name = f"reg_model_mrdr_{b}.pkl" if is_mrdr else f"reg_model_{b}.pkl"
        reg_model = RegressionModel(
            n_actions=obd.n_actions,
            len_list=obd.len_list,
            action_context=bandit_feedback_train["action_context"],
            base_model=base_model_dict[base_model](**hyperparams[base_model]),
            fitting_method=fitting_method,
        )
        # train regression model on logged bandit feedback data
        reg_model.fit(
            context=bandit_feedback_train["context"],
            action=bandit_feedback_train["action"],
            reward=bandit_feedback_train["reward"],
            pscore=bandit_feedback_train["pscore"],
            position=bandit_feedback_train["position"],
            action_dist=np.tile(action_dist_single_round,
                                (bandit_feedback_train["n_rounds"], 1, 1)),
        )
        with open(reg_model_path / model_file_name, "wb") as f:
            pickle.dump(
                reg_model,
                f,
            )
        # evaluate the estimation performance of the regression model by AUC and RCE
        estimated_rewards_by_reg_model = reg_model.predict(
            context=bandit_feedback_test["context"], )
        performance_reg_model_b = evaluate_reg_model(
            bandit_feedback=bandit_feedback_test,
            estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
        )

        return performance_reg_model_b
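Examples #12 and #13 operate on an obd object exposing sample_bootstrap_bandit_feedback, n_actions, and len_list, which matches obp's OpenBanditDataset. It would typically be loaded roughly as follows; the behavior_policy, campaign, and data_path choices are illustrative:

# Illustrative loading of the Open Bandit Dataset object (obd) used above.
from pathlib import Path
from obp.dataset import OpenBanditDataset

obd = OpenBanditDataset(
    behavior_policy="random",  # or "bts"
    campaign="all",            # or "men" / "women"
    data_path=Path("./open_bandit_dataset"),  # omit to fall back to the bundled sample data
)
print(obd.n_actions, obd.len_list, obd.n_rounds)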
Example #14
     metrics[i]: np.zeros(n_boot_samples)
     for i in np.arange(len(metrics))
 }
 for b in np.arange(n_boot_samples):
     # sample bootstrap samples from batch logged bandit feedback
     boot_bandit_feedback = obd.sample_bootstrap_bandit_feedback(
         test_size=test_size,
         is_timeseries_split=is_timeseries_split,
         random_state=b)
     # split data into two folds (data for training reg_model and for ope)
     is_for_reg_model = np.random.binomial(
         n=1, p=0.3, size=boot_bandit_feedback["n_rounds"]).astype(bool)
     # define regression model
     reg_model = RegressionModel(
         n_actions=obd.n_actions,
         len_list=obd.len_list,
         action_context=boot_bandit_feedback["action_context"],
         base_model=base_model_dict[base_model](**hyperparams[base_model]),
     )
     # train regression model on logged bandit feedback data
     reg_model.fit(
         context=boot_bandit_feedback["context"][is_for_reg_model],
         action=boot_bandit_feedback["action"][is_for_reg_model],
         reward=boot_bandit_feedback["reward"][is_for_reg_model],
         position=boot_bandit_feedback["position"][is_for_reg_model],
     )
     # evaluate the estimation performance of the regression model by AUC and RCE
     if is_timeseries_split:
         estimated_reward_by_reg_model = reg_model.predict(
             context=boot_bandit_feedback["context_test"], )
         rewards = boot_bandit_feedback["reward_test"]
         estimated_rewards_ = estimated_reward_by_reg_model[