def test_dr_init_using_valid_input_data(lambda_: float, description: str) -> None:
    _ = DoublyRobust(lambda_=lambda_)
    _ = DoublyRobustWithShrinkage(lambda_=lambda_)
    _ = SwitchDoublyRobust(lambda_=lambda_)
    if lambda_ < np.inf:
        _ = SubGaussianDoublyRobust(lambda_=lambda_)
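# A minimal sketch of how the test above is typically parametrized with pytest;
# the list name and cases below are illustrative assumptions, not the repo's
# actual fixtures.
valid_input_of_dr_init = [
    (0.5, "lambda_=0.5"),
    (1.0, "lambda_=1.0"),
    (np.inf, "lambda_=np.inf"),
]
# applied as:
# @pytest.mark.parametrize("lambda_, description", valid_input_of_dr_init)
# def test_dr_init_using_valid_input_data(lambda_: float, description: str) -> None: ...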
def test_dr_init_using_invalid_inputs(
    lambda_,
    use_estimated_pscore,
    err,
    description,
):
    with pytest.raises(err, match=f"{description}*"):
        _ = DoublyRobust(lambda_=lambda_, use_estimated_pscore=use_estimated_pscore)
    with pytest.raises(err, match=f"{description}*"):
        _ = SwitchDoublyRobust(
            lambda_=lambda_, use_estimated_pscore=use_estimated_pscore
        )
    with pytest.raises(err, match=f"{description}*"):
        _ = DoublyRobustWithShrinkage(
            lambda_=lambda_, use_estimated_pscore=use_estimated_pscore
        )
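# Illustrative invalid cases for the test above; the exact error types and
# match patterns are assumptions here and depend on obp's input validation.
invalid_input_of_dr_init = [
    ("", False, TypeError, "lambda_ must be an instance of"),
    (-1.0, False, ValueError, "lambda_ == -1.0, must be >= 0"),
    (np.nan, False, ValueError, "lambda_ must not be nan"),
    (1.0, "True", TypeError, "use_estimated_pscore must be a bool"),
]
# applied as:
# @pytest.mark.parametrize(
#     "lambda_, use_estimated_pscore, err, description", invalid_input_of_dr_init
# )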
def test_dr_using_invalid_input_data(
    action_dist: np.ndarray,
    action: np.ndarray,
    reward: np.ndarray,
    pscore: np.ndarray,
    position: np.ndarray,
    estimated_rewards_by_reg_model: np.ndarray,
    use_estimated_pscore: bool,
    estimated_pscore: np.ndarray,
    description: str,
) -> None:
    dr = DoublyRobust(use_estimated_pscore=use_estimated_pscore)
    dr_tuning = DoublyRobustTuning(
        lambdas=[1, 100],
        estimator_name="dr_tuning",
        use_estimated_pscore=use_estimated_pscore,
    )
    sndr = SelfNormalizedDoublyRobust(use_estimated_pscore=use_estimated_pscore)
    # both estimate_policy_value and estimate_interval of all estimators raise ValueError
    for estimator in [dr, sndr, dr_tuning]:
        with pytest.raises(ValueError, match=f"{description}*"):
            _ = estimator.estimate_policy_value(
                action_dist=action_dist,
                action=action,
                reward=reward,
                pscore=pscore,
                position=position,
                estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
                estimated_pscore=estimated_pscore,
            )
        with pytest.raises(ValueError, match=f"{description}*"):
            _ = estimator.estimate_interval(
                action_dist=action_dist,
                action=action,
                reward=reward,
                pscore=pscore,
                position=position,
                estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
                estimated_pscore=estimated_pscore,
            )
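# For contrast with the invalid-input cases, a minimal well-formed call,
# assuming obp's expected shapes: action_dist is (n_rounds, n_actions, len_list).
# The sizes and random inputs below are illustrative only.
def example_valid_dr_call() -> float:
    rng = np.random.default_rng(12345)
    n_rounds, n_actions, len_list = 5, 4, 3
    return DoublyRobust().estimate_policy_value(
        action_dist=np.full((n_rounds, n_actions, len_list), 1.0 / n_actions),
        action=rng.integers(n_actions, size=n_rounds),
        reward=rng.binomial(1, 0.5, size=n_rounds),
        pscore=np.full(n_rounds, 1.0 / n_actions),
        position=rng.integers(len_list, size=n_rounds),
        estimated_rewards_by_reg_model=rng.uniform(
            size=(n_rounds, n_actions, len_list)
        ),
    )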
# hyperparameters for the regression model used in model-dependent OPE estimators;
# each key of base_model_dict maps to kwargs for the corresponding sklearn class
with open("./conf/hyperparams.yaml", "rb") as f:
    hyperparams = yaml.safe_load(f)

base_model_dict = dict(
    logistic_regression=LogisticRegression,
    lightgbm=HistGradientBoostingClassifier,
    random_forest=RandomForestClassifier,
)

# compared OPE estimators
ope_estimators = [
    DirectMethod(),
    InverseProbabilityWeighting(),
    SelfNormalizedInverseProbabilityWeighting(),
    DoublyRobust(),
    SelfNormalizedDoublyRobust(),
    SwitchDoublyRobust(tau=1.0, estimator_name="switch-dr (tau=1)"),
    SwitchDoublyRobust(tau=100.0, estimator_name="switch-dr (tau=100)"),
    DoublyRobustWithShrinkage(lambda_=1.0, estimator_name="dr-os (lambda=1)"),
    DoublyRobustWithShrinkage(lambda_=100.0, estimator_name="dr-os (lambda=100)"),
]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="evaluate off-policy estimators with synthetic bandit data."
    )
    parser.add_argument(
        "--n_runs", type=int, default=1, help="number of simulations in the experiment."
    )
    parser.add_argument(
data_path = Path("../open_bandit_dataset")
obd = OpenBanditDataset(
    behavior_policy=behavior_policy, campaign=campaign, data_path=data_path
)
# hyperparameters for counterfactual policies
kwargs = dict(
    n_actions=obd.n_actions, len_list=obd.len_list, random_state=random_state
)
if counterfactual_policy == "bts":
    kwargs["alpha"] = production_prior_for_bts[campaign]["alpha"]
    kwargs["beta"] = production_prior_for_bts[campaign]["beta"]
    kwargs["batch_size"] = production_batch_size_for_bts[campaign]
policy = counterfactual_policy_dict[counterfactual_policy](**kwargs)
# compared OPE estimators
ope_estimators = [DirectMethod(), InverseProbabilityWeighting(), DoublyRobust()]
# a base ML model for the regression model used in Direct Method and Doubly Robust
base_model = CalibratedClassifierCV(LogisticRegression(**hyperparams))
# ground-truth policy value of the counterfactual policy,
# estimated with factual (observed) rewards (on-policy estimation)
ground_truth_policy_value = OpenBanditDataset.calc_on_policy_policy_value_estimate(
    behavior_policy=counterfactual_policy, campaign=campaign, data_path=data_path
)
evaluation_of_ope_results = {
    est.estimator_name: np.zeros(n_boot_samples) for est in ope_estimators
}
for b in np.arange(n_boot_samples):
    # sample a bootstrapped version of the batch logged bandit feedback
    boot_bandit_feedback = obd.sample_bootstrap_bandit_feedback(random_state=b)
    # run a counterfactual bandit algorithm on the logged bandit feedback data
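    # (illustrative sketch, not the repo's exact continuation) the loop body
    # would then simulate the policy on this bootstrap sample and evaluate the
    # estimators against the on-policy ground truth; `run_bandit_simulation`,
    # `regression_model`, and `estimated_rewards_by_reg_model` are assumed here:
    #
    # action_dist = run_bandit_simulation(
    #     bandit_feedback=boot_bandit_feedback, policy=policy
    # )
    # ope = OffPolicyEvaluation(
    #     bandit_feedback=boot_bandit_feedback, ope_estimators=ope_estimators
    # )
    # relative_ee_b = ope.evaluate_performance_of_estimators(
    #     ground_truth_policy_value=ground_truth_policy_value,
    #     action_dist=action_dist,
    #     estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    #     metric="relative-ee",
    # )
    # for est_name, relative_ee in relative_ee_b.items():
    #     evaluation_of_ope_results[est_name][b] = relative_ee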
def process(i: int):
    # synthetic data generator
    dataset = SyntheticBanditDataset(
        n_actions=n_actions,
        dim_context=dim_context,
        reward_function=logistic_reward_function,
        behavior_policy_function=linear_behavior_policy,
        random_state=i,
    )
    # define an ML regression model to estimate the mean reward function
    regression_model = RegressionModel(
        n_actions=dataset.n_actions,
        action_context=dataset.action_context,
        base_model=base_model_dict[base_model_for_reg_model](
            **hyperparams[base_model_for_reg_model]
        ),
    )
    ope_estimator = DoublyRobust()
    # define evaluation policy using NNPolicyLearner
    nn_policy = NNPolicyLearner(
        n_actions=dataset.n_actions,
        dim_context=dim_context,
        off_policy_objective=ope_estimator.estimate_policy_value_tensor,
    )
    # baseline method 1: RandomPolicy
    random_policy = RandomPolicy(n_actions=dataset.n_actions)
    # baseline method 2: UniformSampleWeightLearner
    uniform_sample_weight_policy = UniformSampleWeightLearner(
        n_actions=dataset.n_actions,
        base_classifier=base_model_dict[base_model_for_evaluation_policy](
            **hyperparams[base_model_for_evaluation_policy]
        ),
    )
    # sample new training and test sets of synthetic logged bandit feedback
    bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds)
    # estimate the mean reward function of the training set with the ML model
    estimated_rewards_by_reg_model = regression_model.fit_predict(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        n_folds=3,  # 3-fold cross-fitting
        random_state=12345,
    )
    # train the evaluation policies on the training set
    nn_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
        estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
    )
    uniform_sample_weight_policy.fit(
        context=bandit_feedback_train["context"],
        action=bandit_feedback_train["action"],
        reward=bandit_feedback_train["reward"],
        pscore=bandit_feedback_train["pscore"],
    )
    # predict the action decisions for the test set
    nn_policy_action_dist = nn_policy.predict(
        context=bandit_feedback_test["context"],
    )
    random_action_dist = random_policy.predict(
        context=bandit_feedback_test["context"],
    )
    uniform_sample_weight_action_dist = uniform_sample_weight_policy.predict(
        context=bandit_feedback_test["context"],
    )
    # compute the ground-truth policy value of each learner
    gt_nn_policy_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=nn_policy_action_dist,
    )
    gt_random_policy = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=random_action_dist,
    )
    gt_uniform_sample_weight_learner = dataset.calc_ground_truth_policy_value(
        expected_reward=bandit_feedback_test["expected_reward"],
        action_dist=uniform_sample_weight_action_dist,
    )
    return gt_nn_policy_learner, gt_random_policy, gt_uniform_sample_weight_learner
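# A sketch of driving `process` across seeds in parallel with joblib; `n_runs`
# and `n_jobs` are assumed to come from this script's CLI arguments.
from joblib import Parallel, delayed

processed = Parallel(n_jobs=n_jobs, verbose=50)(
    [delayed(process)(i) for i in np.arange(n_runs)]
)
# per-seed ground-truth policy values of the three evaluated policies
gt_nn, gt_random, gt_uniform = map(np.asarray, zip(*processed))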
import pytest
import numpy as np

from obp.types import BanditFeedback
from obp.ope import (
    DirectMethod,
    DoublyRobust,
    DoublyRobustWithShrinkage,
    SwitchDoublyRobust,
    SelfNormalizedDoublyRobust,
)
from conftest import generate_action_dist

# prepare instances
dm = DirectMethod()
dr = DoublyRobust()
dr_shrink_0 = DoublyRobustWithShrinkage(lambda_=0.0)
dr_shrink_max = DoublyRobustWithShrinkage(lambda_=1e10)
sndr = SelfNormalizedDoublyRobust()
switch_dr_0 = SwitchDoublyRobust(tau=0.0)
switch_dr_max = SwitchDoublyRobust(tau=1e10)
dr_estimators = [dr, dr_shrink_0, sndr, switch_dr_0]  # DR variants, excluding the max-hyperparameter instances

# action_dist, action, reward, pscore, position, estimated_rewards_by_reg_model, description
invalid_input_of_dr = [
    (
        generate_action_dist(5, 4, 3),
        None,
        np.zeros(5, dtype=int),
        lambdas=lambdas,
        tuning_method=tuning_method,
    )
    _ = SwitchDoublyRobustTuning(
        lambdas=lambdas,
        tuning_method=tuning_method,
    )
    _ = SubGaussianDoublyRobustTuning(
        lambdas=lambdas,
        tuning_method=tuning_method,
    )


# prepare instances
dm = DirectMethod()
dr = DoublyRobust()
dr_tuning_mse = DoublyRobustTuning(
    lambdas=[1, 100], tuning_method="mse", estimator_name="dr_tuning_mse"
)
dr_tuning_slope = DoublyRobustTuning(
    lambdas=[1, 100], tuning_method="slope", estimator_name="dr_tuning_slope"
)
dr_os_0 = DoublyRobustWithShrinkage(lambda_=0.0)
dr_os_tuning_mse = DoublyRobustWithShrinkageTuning(
    lambdas=[1, 100], tuning_method="mse", estimator_name="dr_os_tuning_mse"
)
dr_os_tuning_slope = DoublyRobustWithShrinkageTuning(
    lambdas=[1, 100], tuning_method="slope", estimator_name="dr_os_tuning_slope"
)
dr_os_max = DoublyRobustWithShrinkage(lambda_=np.inf)
sndr = SelfNormalizedDoublyRobust()
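# A sketch showing that tuned and fixed-lambda estimators share the same
# estimation API (the tuned classes search their `lambdas` list internally);
# the shapes and random inputs below are illustrative assumptions.
def example_tuned_vs_fixed() -> tuple:
    rng = np.random.default_rng(12345)
    n_rounds, n_actions, len_list = 1000, 4, 1
    inputs = dict(
        action_dist=np.full((n_rounds, n_actions, len_list), 1.0 / n_actions),
        action=rng.integers(n_actions, size=n_rounds),
        reward=rng.binomial(1, 0.5, size=n_rounds),
        pscore=np.full(n_rounds, 1.0 / n_actions),
        position=np.zeros(n_rounds, dtype=int),
        estimated_rewards_by_reg_model=rng.uniform(
            size=(n_rounds, n_actions, len_list)
        ),
    )
    value_fixed = dr_os_0.estimate_policy_value(**inputs)
    value_tuned = dr_os_tuning_mse.estimate_policy_value(**inputs)
    return value_fixed, value_tuned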
    OffPolicyEvaluation,
    InverseProbabilityWeighting,
    SelfNormalizedInverseProbabilityWeighting,
    DirectMethod,
    DoublyRobust,
    SelfNormalizedDoublyRobust,
    SwitchDoublyRobust,
    DoublyRobustWithShrinkage,
)

# OPE estimators compared
ope_estimators = [
    DirectMethod(),
    InverseProbabilityWeighting(),
    SelfNormalizedInverseProbabilityWeighting(),
    DoublyRobust(),
    DoublyRobust(estimator_name="mrdr"),
    SelfNormalizedDoublyRobust(),
    SwitchDoublyRobust(tau=5.0, estimator_name="switch-dr (tau=5)"),
    SwitchDoublyRobust(tau=10.0, estimator_name="switch-dr (tau=10)"),
    SwitchDoublyRobust(tau=50.0, estimator_name="switch-dr (tau=50)"),
    SwitchDoublyRobust(tau=100.0, estimator_name="switch-dr (tau=100)"),
    SwitchDoublyRobust(tau=500.0, estimator_name="switch-dr (tau=500)"),
    SwitchDoublyRobust(tau=1000.0, estimator_name="switch-dr (tau=1000)"),
    DoublyRobustWithShrinkage(lambda_=5.0, estimator_name="dr-os (lambda=5)"),
    DoublyRobustWithShrinkage(lambda_=10.0, estimator_name="dr-os (lambda=10)"),
    DoublyRobustWithShrinkage(lambda_=50.0, estimator_name="dr-os (lambda=50)"),
    DoublyRobustWithShrinkage(lambda_=100.0, estimator_name="dr-os (lambda=100)"),
def main(cfg: DictConfig) -> None:
    print(cfg)
    logger.info(f"The current working directory is {Path().cwd()}")
    start_time = time.time()
    logger.info("initializing experimental condition..")

    # compared OPE estimators
    lambdas = list(dict(cfg.estimator_hyperparams)["lambdas"])
    ope_estimators = [
        InverseProbabilityWeighting(estimator_name="IPW"),
        SelfNormalizedInverseProbabilityWeighting(estimator_name="SNIPW"),
        DirectMethod(estimator_name="DM"),
        DoublyRobust(estimator_name="DR"),
        SelfNormalizedDoublyRobust(estimator_name="SNDR"),
        SwitchDoublyRobustTuning(lambdas=lambdas, estimator_name="Switch-DR"),
        DoublyRobustWithShrinkageTuning(lambdas=lambdas, estimator_name="DRos"),
    ]

    # configurations
    n_seeds = cfg.setting.n_seeds
    sample_size = cfg.setting.sample_size
    reg_model = cfg.setting.reg_model
    campaign = cfg.setting.campaign
    behavior_policy = cfg.setting.behavior_policy
    test_size = cfg.setting.test_size
    is_timeseries_split = cfg.setting.is_timeseries_split
    n_folds = cfg.setting.n_folds
    obd_path = (
        Path().cwd().parents[5] / "open_bandit_dataset"
        if cfg.setting.is_full_obd
        else None
    )
    random_state = cfg.setting.random_state
    np.random.seed(random_state)

    # define dataset
    dataset_ts = OpenBanditDataset(
        behavior_policy="bts", campaign=campaign, data_path=obd_path
    )
    dataset_ur = OpenBanditDataset(
        behavior_policy="random", campaign=campaign, data_path=obd_path
    )

    # prepare logged bandit feedback and evaluation policies
    if behavior_policy == "random":
        if is_timeseries_split:
            bandit_feedback_ur = dataset_ur.obtain_batch_bandit_feedback(
                test_size=test_size,
                is_timeseries_split=True,
            )[0]
        else:
            bandit_feedback_ur = dataset_ur.obtain_batch_bandit_feedback()
        bandit_feedbacks = [bandit_feedback_ur]
        # obtain the ground-truth policy value
        ground_truth_ts = OpenBanditDataset.calc_on_policy_policy_value_estimate(
            behavior_policy="bts",
            campaign=campaign,
            data_path=obd_path,
            test_size=test_size,
            is_timeseries_split=is_timeseries_split,
        )
        # obtain action choice probabilities and define evaluation policies
        policy_ts = BernoulliTS(
            n_actions=dataset_ts.n_actions,
            len_list=dataset_ts.len_list,
            random_state=random_state,
            is_zozotown_prior=True,
            campaign=campaign,
        )
        action_dist_ts = policy_ts.compute_batch_action_dist(n_rounds=1000000)
        evaluation_policies = [(ground_truth_ts, action_dist_ts)]
    else:
        if is_timeseries_split:
            bandit_feedback_ts = dataset_ts.obtain_batch_bandit_feedback(
                test_size=test_size,
                is_timeseries_split=True,
            )[0]
        else:
            bandit_feedback_ts = dataset_ts.obtain_batch_bandit_feedback()
        bandit_feedbacks = [bandit_feedback_ts]
        # obtain the ground-truth policy value
        ground_truth_ur = OpenBanditDataset.calc_on_policy_policy_value_estimate(
            behavior_policy="random",
            campaign=campaign,
            data_path=obd_path,
            test_size=test_size,
            is_timeseries_split=is_timeseries_split,
        )
        # obtain action choice probabilities and define evaluation policies
        policy_ur = Random(
            n_actions=dataset_ur.n_actions,
            len_list=dataset_ur.len_list,
            random_state=random_state,
        )
        action_dist_ur = policy_ur.compute_batch_action_dist(n_rounds=1000000)
        evaluation_policies = [(ground_truth_ur, action_dist_ur)]

    # regression models used in OPE estimators
    hyperparams = dict(cfg.reg_model_hyperparams)[reg_model]
    regression_models = [reg_model_dict[reg_model](**hyperparams)]

    # define an evaluator class
    evaluator = InterpretableOPEEvaluator(
        random_states=np.arange(n_seeds),
        bandit_feedbacks=bandit_feedbacks,
        evaluation_policies=evaluation_policies,
        ope_estimators=ope_estimators,
        regression_models=regression_models,
    )

    # conduct an evaluation of OPE experiment
logger.info("experiment started") _ = evaluator.estimate_policy_value(sample_size=sample_size, n_folds_=n_folds) # calculate statistics mean = evaluator.calculate_mean(root=True) mean_scaled = evaluator.calculate_mean(scale=True, root=True) # save results of the evaluation of off-policy estimators log_path = Path("./outputs") log_path.mkdir(exist_ok=True, parents=True) # save root mse root_mse_df = DataFrame() root_mse_df["estimator"] = list(mean.keys()) root_mse_df["mean"] = list(mean.values()) root_mse_df["mean(scaled)"] = list(mean_scaled.values()) root_mse_df.to_csv(log_path / "root_mse.csv") # conduct pairwise t-tests se_df = DataFrame(evaluator.calculate_squared_error()) se_df = DataFrame(se_df.stack()).reset_index(1) se_df.rename(columns={"level_1": "estimators", 0: "se"}, inplace=True) nonparam_ttests = (pg.pairwise_ttests( data=se_df, dv="se", parametric=False, between="estimators", ).round(4).drop(["Contrast", "Parametric", "Paired"], axis=1)) nonparam_ttests.to_csv(log_path / "nonparam_ttests.csv") # save reg model metrics DataFrame(evaluator.reg_model_metrics).describe().to_csv( log_path / "reg_model_metrics.csv") # print result print(root_mse_df) experiment = f"{campaign}-{behavior_policy}-{sample_size}" elapsed_time = np.round((time.time() - start_time) / 60, 2) logger.info(f"finish experiment {experiment} in {elapsed_time}min")