def train(self, context=None, action=None, reward=None):
    """ Train the model parameters given contexts and the actions taken by the prod policy """
    # Rewards represent the cost of taking actions (e.g., Cost-Sensitive Classification),
    # so we need to compute y given the action taken by the prod policy and the contexts
    feedback = twoD_gather(reward, action)

    if self._model_type == 'ridge':
        self._clf = RidgeCV(alphas=self._alpha, fit_intercept=True, cv=5)
    elif self._model_type == 'lasso':
        self._clf = LassoCV(alphas=self._alpha, tol=1e-3, cv=5, fit_intercept=True)

    self._clf.fit(context, feedback)
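# A minimal sketch of the corresponding estimation step, assuming the Direct Method
# simply predicts the feedback from the test contexts with the fitted regressor
# (the signature mirrors the call `dm.estimate(context=x_test)` in the test script below);
# the actual implementation may differ.
def estimate(self, context=None, **kwargs):
    """ Predict the feedback for the given contexts with the fitted regression model """
    return self._clf.predict(context)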
def single_run(estimators, data_name="ecoli", test_size=0.5): """ See Sec 5.1.2 in the paper :param data_name: a name of a dataset :return reward_est: a dict of estimated rewards by the Estimators of interest :return reward_true: a vector of true rewards """ # load the dataset data = eval("load_{}()".format(data_name)) # (Acronym) prod: Production, targ: Target x_train, x_test, y_train, y_test = train_test_split(data=data, test_size=test_size) # Instantiate and train the prod/targ policies on the training set prod_policy = UniformPolicy(num_action=data.num_label) # prod_policy = DeterministicPolicy2(num_action=data.num_label) # prod_policy = _train_policy(policy=prod_policy, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test) # targ_policy = UniformPolicy(num_action=data.num_label) targ_policy = DeterministicPolicy2(num_action=data.num_label) targ_policy = _train_policy(policy=targ_policy, x_train=x_train, y_train=y_train) # let the policies predict on the test set prod_a_tr, prod_score_tr = prod_policy.select_action(context=x_train) prod_a_te, prod_score_te = prod_policy.select_action(context=x_test) targ_a_te, targ_score_te = targ_policy.select_action(context=x_test) prod_r_te = twoD_gather(y_test, prod_a_te) # reward_true = twoD_gather(y_test, targ_a_te) reward_true = 1 - np.mean(targ_a_te == np.argmax(y_test, axis=-1)) reward_est = dict() for name, estimator in estimators.items(): estimator.train(context=x_train, action=prod_a_tr, reward=y_train) _reward_est = estimator.estimate(context=x_test, prod_r_te=prod_r_te, prod_a_te=prod_a_te, targ_a_te=targ_a_te, prod_score_te=prod_score_te, targ_score_te=targ_score_te) # construct a dict of the estimated rewards reward_est[name] = _reward_est return reward_est, reward_true
def train(self, context=None, action=None, reward=None):
    """ Train the model parameters given contexts and the actions taken by the prod policy """
    # Rewards represent the cost of taking actions (e.g., Cost-Sensitive Classification),
    # so we need to compute y given the action taken by the prod policy and the contexts
    feedback = twoD_gather(reward, action)

    if self._model_type == 'ridge':
        self._clf = RidgeCV(alphas=self._alpha, fit_intercept=True, cv=5)
    elif self._model_type == 'lasso':
        self._clf = LassoCV(alphas=self._alpha, tol=1e-3, cv=5, fit_intercept=True)

    """ This is the part described by the DR paper (Sec 2.1) as follows
    > A problem with this method is that the estimate is formed without the knowledge of a policy
    """
    self._clf.fit(context, feedback)
def estimate(self, context=None, prod_r_te=None, prod_a_te=None, targ_a_te=None,
             prod_score_te=None, targ_score_te=None):
    """ Estimate a reward using the inverse propensity score """
    # Apply the indicator function
    bool_mat = np.asarray(prod_a_te == targ_a_te).astype(np.float32)

    # take the score only for the taken action
    targ_score = twoD_gather(targ_score_te, targ_a_te)
    prod_score = twoD_gather(prod_score_te, targ_a_te)

    # Avoid the division-by-zero error
    targ_score[targ_score == 0.0] = np.spacing(1)
    prod_score[prod_score == 0.0] = np.spacing(1)

    # compute the importance weight
    self.imp_weight = targ_score / prod_score

    if self._if_cap:
        # See Sec 4.2 -> https://arxiv.org/pdf/1801.07030.pdf
        self.imp_weight = np.clip(self.imp_weight, a_min=self._min, a_max=self._max)

    # replace the infinity with an extremely small value
    self.imp_weight[self.imp_weight == np.inf] = np.spacing(1)
    ips = bool_mat * self.imp_weight

    # replace the infinity with an extremely small value
    ips[ips == np.inf] = np.spacing(1)

    if self._if_normalise:  # self-normalised IPS
        if self._if_pointwise:
            """ Midzuno-Sen Rejection Sampling Method

            Under this system of selection of probabilities, the unit in the first draw
            is selected with unequal probabilities of selection and all the remaining
            units are selected with simple random sampling without replacement
            at all subsequent draws.

            [Ref] Midzuno, H. (1951). On the sampling system with probability
            proportional to sum of sizes. Ann. Inst. Stat. Math., 3:99–107.
            """
            # 1. Only the first unit is selected with unequal probability
            dummy_imp_weight = self.imp_weight.copy()
            u = np.random.uniform(low=0.0, high=1.0)  # TODO: I guess we should use max of imp_weight!
            for _id, x in enumerate(dummy_imp_weight):
                if u < np.mean(x):
                    first_unit = x
                    break

            # 2. For the remaining units, we use Simple Random Sampling
            dummy_imp_weight = dummy_imp_weight[_id:]
            size = dummy_imp_weight.shape[0]
            mask = np.random.binomial(1, p=1 / size, size=size).astype(bool)
            samples = [first_unit] + dummy_imp_weight[mask].tolist()
            norm = np.mean(samples)
        else:
            norm = np.mean(self.imp_weight, axis=0)
    else:
        norm = np.ones(self.imp_weight.shape[-1]).astype(np.float32)

    # estimate the feedback based on importance sampling
    est = (self.imp_weight * prod_r_te) / norm
    return est
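# Toy illustration with assumed numbers (not from the paper) of how capping and
# self-normalisation act on the importance weights computed above.
import numpy as np

targ_score = np.array([0.9, 0.05, 0.5])
prod_score = np.array([0.1, 0.5, 0.5])
prod_r_te = np.array([1.0, 0.0, 1.0])

imp_weight = targ_score / prod_score                     # -> [9.0, 0.1, 1.0]
imp_weight = np.clip(imp_weight, a_min=0.0, a_max=5.0)   # capping -> [5.0, 0.1, 1.0]
norm = np.mean(imp_weight, axis=0)                       # self-normalisation constant
est = (imp_weight * prod_r_te) / norm
print(np.mean(est))  # scalar estimate of the target policy's reward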
# prod_policy = DeterministicPolicy2(num_action=data.num_label)
# prod_policy = _train_policy(policy=prod_policy, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

# targ_policy = UniformPolicy(num_action=data.num_label)
targ_policy = DeterministicPolicy2(num_action=data.num_label)
targ_policy = _train_policy(policy=targ_policy, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

# get dummy actions
prod_a_tr, prod_score_tr = prod_policy.select_action(context=x_train)
prod_a_te, prod_score_te = prod_policy.select_action(context=x_test)
targ_a_te, targ_score_te = targ_policy.select_action(context=x_test)
prod_r_te = twoD_gather(y_test, prod_a_te)

# test the estimator
dm = DM(model_type="ridge")
dm.train(context=x_train, action=prod_a_tr, reward=y_train)
dm_est = dm.estimate(context=x_test)
ground_truth = 1 - np.mean(targ_a_te == np.argmax(y_test, axis=-1))
print("[DM] RMSE: {}".format(rmse(a=np.mean(dm_est), b=ground_truth)))

bool_mat = np.asarray(prod_a_te == targ_a_te).astype(np.float32)
# ips_est = prod_r_te * (bool_mat / twoD_gather(prod_score_te, prod_a_te))
# ips_est = prod_r_te * (bool_mat / twoD_gather(prod_score_te, targ_a_te))
imp_weight = (twoD_gather(targ_score_te, targ_a_te) / twoD_gather(prod_score_te, targ_a_te))
ips_est = prod_r_te * (bool_mat * imp_weight)
ips_est = np.mean(ips_est)
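# For symmetry with the DM check above, the IPS estimate can be compared against the
# same ground truth; this assumes rmse accepts scalar inputs, as in the DM call.
print("[IPS] RMSE: {}".format(rmse(a=ips_est, b=ground_truth)))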