Code example #1
    def train(self, context=None, action=None, reward=None):
        """ Train the model parameters given contexts and taken actions by the prod policy """
        # Rewards represent the cost of taking actions(e.g., Cost-Sensitive Classification)
        # So, we need to compute y given the taken action by prod policy and the contexts
        feedback = twoD_gather(reward, action)

        if self._model_type == 'ridge':
            self._clf = RidgeCV(alphas=self._alpha, fit_intercept=True, cv=5)
        elif self._model_type == 'lasso':
            self._clf = LassoCV(alphas=self._alpha,
                                tol=1e-3,
                                cv=5,
                                fit_intercept=True)
        else:
            raise ValueError("unknown model_type: {}".format(self._model_type))
        self._clf.fit(context, feedback)
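The snippet above and the ones below rely on a `twoD_gather` helper that picks, for every row of a 2-D array, the entry in the column given by the corresponding action. A minimal sketch of such a helper (the repository's actual implementation may differ) would be:

import numpy as np

def twoD_gather(matrix, indices):
    """ Sketch of the helper assumed above: pick matrix[i, indices[i]] for every row i """
    matrix = np.asarray(matrix)
    indices = np.asarray(indices).astype(int)
    return matrix[np.arange(matrix.shape[0]), indices]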
Code example #2
def single_run(estimators, data_name="ecoli", test_size=0.5):
    """ See Sec 5.1.2 in the paper

    :param estimators: a dict of the Estimators of interest, keyed by name
    :param data_name: the name of a dataset (e.g., "ecoli")
    :param test_size: the fraction of the data held out as the test set
    :return reward_est: a dict of estimated rewards keyed by Estimator name
    :return reward_true: the ground-truth reward (error rate) of the target policy on the test set
    """
    # load the dataset
    data = globals()["load_{}".format(data_name)]()  # e.g., load_ecoli()

    # (Acronym) prod: Production, targ: Target
    x_train, x_test, y_train, y_test = train_test_split(data=data,
                                                        test_size=test_size)

    # Instantiate and train the prod/targ policies on the training set
    prod_policy = UniformPolicy(num_action=data.num_label)
    # prod_policy = DeterministicPolicy2(num_action=data.num_label)
    # prod_policy = _train_policy(policy=prod_policy, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

    # targ_policy = UniformPolicy(num_action=data.num_label)
    targ_policy = DeterministicPolicy2(num_action=data.num_label)
    targ_policy = _train_policy(policy=targ_policy,
                                x_train=x_train,
                                y_train=y_train)

    # let the policies select actions on the training and test sets
    prod_a_tr, prod_score_tr = prod_policy.select_action(context=x_train)
    prod_a_te, prod_score_te = prod_policy.select_action(context=x_test)
    targ_a_te, targ_score_te = targ_policy.select_action(context=x_test)
    prod_r_te = twoD_gather(y_test, prod_a_te)
    # reward_true = twoD_gather(y_test, targ_a_te)
    reward_true = 1 - np.mean(targ_a_te == np.argmax(y_test, axis=-1))

    reward_est = dict()

    for name, estimator in estimators.items():
        estimator.train(context=x_train, action=prod_a_tr, reward=y_train)
        _reward_est = estimator.estimate(context=x_test,
                                         prod_r_te=prod_r_te,
                                         prod_a_te=prod_a_te,
                                         targ_a_te=targ_a_te,
                                         prod_score_te=prod_score_te,
                                         targ_score_te=targ_score_te)
        # construct a dict of the estimated rewards
        reward_est[name] = _reward_est
    return reward_est, reward_true
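A hypothetical driver for `single_run` might look like the following; the estimator constructors and the `rmse` helper are assumptions based on how those names are used in code example #5, not the repository's exact API:

import numpy as np

# Hypothetical usage of single_run (constructor arguments are guesses)
estimators = {"DM": DM(model_type="ridge"), "IPS": IPS()}
reward_est, reward_true = single_run(estimators, data_name="ecoli", test_size=0.5)
for name, est in reward_est.items():
    print("[{}] RMSE: {}".format(name, rmse(a=np.mean(est), b=reward_true)))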
Code example #3
    def train(self, context=None, action=None, reward=None):
        """ Train the model parameters given contexts and taken actions by the prod policy """
        # Rewards represent the cost of taking actions(e.g., Cost-Sensitive Classification)
        # So, we need to compute y given the taken action by prod policy and the contexts
        feedback = twoD_gather(reward, action)

        if self._model_type == 'ridge':
            self._clf = RidgeCV(alphas=self._alpha, fit_intercept=True, cv=5)
        elif self._model_type == 'lasso':
            self._clf = LassoCV(alphas=self._alpha,
                                tol=1e-3,
                                cv=5,
                                fit_intercept=True)
        else:
            raise ValueError("unknown model_type: {}".format(self._model_type))
        # This is the part described by the DR paper (Sec 2.1) as follows:
        #   > A problem with this method is that the estimate is formed without the knowledge of a policy
        self._clf.fit(context, feedback)
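Code example #5 calls `dm.estimate(context=x_test)`, which is not shown in these excerpts. A minimal sketch of what a Direct Method estimate could look like on top of the regression fitted above (not necessarily the repository's exact implementation):

    def estimate(self, context=None, **kwargs):
        """ Sketch: predict the feedback for each test context with the fitted regression model """
        # The prediction uses only the regression fitted on logged (context, feedback) pairs,
        # i.e. it is formed without knowledge of the target policy, as the DR paper notes.
        return self._clf.predict(context)

The `**kwargs` simply absorb the propensity-related arguments that `single_run` passes to every estimator but that the Direct Method does not need.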
Code example #4
    def estimate(self,
                 context=None,
                 prod_r_te=None,
                 prod_a_te=None,
                 targ_a_te=None,
                 prod_score_te=None,
                 targ_score_te=None):
        """ Estimate a reward using the inverse propensity score """
        # Apply indicator function
        bool_mat = np.asarray(prod_a_te == targ_a_te).astype(np.float32)

        # take each policy's score for the action chosen by the target policy
        targ_score = twoD_gather(targ_score_te, targ_a_te)
        prod_score = twoD_gather(prod_score_te, targ_a_te)

        # Avoid division-by-zero errors
        targ_score[targ_score == 0.0] = np.spacing(1)
        prod_score[prod_score == 0.0] = np.spacing(1)

        # compute the importance weight
        self.imp_weight = targ_score / prod_score

        if self._if_cap:
            # See Sec 4.2 of https://arxiv.org/pdf/1801.07030.pdf
            self.imp_weight = np.clip(self.imp_weight,
                                      a_min=self._min,
                                      a_max=self._max)

        # replace infinities with a tiny value (np.spacing(1))
        self.imp_weight[self.imp_weight == np.inf] = np.spacing(1)
        ips = bool_mat * self.imp_weight

        # replace infinities with a tiny value (np.spacing(1))
        ips[ips == np.inf] = np.spacing(1)

        if self._if_normalise:
            # self normalised IPS

            if self._if_pointwise:
                """ Midzuno-Sen Rejection Sampling Method

                    Under this system of selection of probabilities, the unit in the first draw is selected with
                    unequal probabilities of selection and remaining all the units are selected 
                    with simple random sampling without replacement at all subsequent draws.

                    [Ref]
                        Midzuno, H. (1951). On the sampling system with probability proportional
                        to sum of sizes. Ann. Inst. Stat. Math., 3:99–107.
                """
                # 1. Only the first unit is selected with unequal probability
                dummy_imp_weight = self.imp_weight.copy()
                u = np.random.uniform(
                    low=0.0,
                    high=1.0)  # TODO: I guess we should use max of imp_weight!
                # default to the last unit so that first_unit is always defined
                first_unit = dummy_imp_weight[-1]
                for _id, x in enumerate(dummy_imp_weight):
                    if u < np.mean(x):
                        first_unit = x
                        break

                # 2. For the remaining units, we use Simple Random Sampling
                dummy_imp_weight = dummy_imp_weight[_id:]
                size = dummy_imp_weight.shape[0]
                # binomial draws are 0/1 ints, so cast to bool to use them as a mask
                mask = np.random.binomial(1, p=1 / size, size=size).astype(bool)
                samples = [first_unit] + dummy_imp_weight[mask].tolist()
                norm = np.mean(samples)
                norm = np.mean(samples)
            else:
                norm = np.mean(self.imp_weight, axis=0)
        else:
            norm = np.ones(self.imp_weight.shape[-1]).astype(np.float32)

        # estimate the feedback based on the importance sampling
        est = (ips * prod_r_te) / norm
        return est
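As a quick sanity check of the importance-weighting logic above, here is a toy computation on hand-made scores (the numbers are hypothetical and unrelated to the repository's datasets):

import numpy as np

# 3 samples, 2 actions; scores are the policies' action-selection probabilities
prod_score_te = np.array([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]])
targ_score_te = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0]])
prod_a_te = np.array([0, 1, 1])
targ_a_te = np.array([0, 1, 0])
prod_r_te = np.array([1.0, 0.0, 1.0])  # feedback observed for the prod actions

rows = np.arange(3)
bool_mat = (prod_a_te == targ_a_te).astype(np.float32)
imp_weight = targ_score_te[rows, targ_a_te] / prod_score_te[rows, targ_a_te]
ips_est = np.mean(bool_mat * imp_weight * prod_r_te)  # mean of [2., 0., 0.] -> ~0.667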
Code example #5
    # prod_policy = DeterministicPolicy2(num_action=data.num_label)
    # prod_policy = _train_policy(policy=prod_policy, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

    # targ_policy = UniformPolicy(num_action=data.num_label)
    targ_policy = DeterministicPolicy2(num_action=data.num_label)
    targ_policy = _train_policy(policy=targ_policy,
                                x_train=x_train,
                                y_train=y_train,
                                x_test=x_test,
                                y_test=y_test)

    # let the policies select actions on the training and test sets
    prod_a_tr, prod_score_tr = prod_policy.select_action(context=x_train)
    prod_a_te, prod_score_te = prod_policy.select_action(context=x_test)
    targ_a_te, targ_score_te = targ_policy.select_action(context=x_test)
    prod_r_te = twoD_gather(y_test, prod_a_te)

    # test the estimator
    dm = DM(model_type="ridge")
    dm.train(context=x_train, action=prod_a_tr, reward=y_train)
    dm_est = dm.estimate(context=x_test)
    ground_truth = 1 - np.mean(targ_a_te == np.argmax(y_test, axis=-1))
    print("[DM] RMSE: {}".format(rmse(a=np.mean(dm_est), b=ground_truth)))

    bool_mat = np.asarray(prod_a_te == targ_a_te).astype(np.float32)
    # ips_est = prod_r_te * (bool_mat / twoD_gather(prod_score_te, prod_a_te))
    # ips_est = prod_r_te * (bool_mat / twoD_gather(prod_score_te, targ_a_te))
    imp_weight = (twoD_gather(targ_score_te, targ_a_te) /
                  twoD_gather(prod_score_te, targ_a_te))
    ips_est = prod_r_te * (bool_mat * imp_weight)
    ips_est = np.mean(ips_est)
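The `rmse` helper used in this snippet is not included in the excerpts; assuming it is a plain root-mean-squared error, a minimal sketch and a hypothetical continuation of the script would be:

import numpy as np

def rmse(a, b):
    """ Sketch of the helper assumed above: root mean squared error between a and b """
    return np.sqrt(np.mean((np.asarray(a) - np.asarray(b)) ** 2))

# hypothetical continuation of code example #5
print("[IPS] RMSE: {}".format(rmse(a=ips_est, b=ground_truth)))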