def train(self, context=None, action=None, reward=None):
    """ Train the model parameters given contexts and the actions taken by the prod policy """
    # Rewards represent the cost of taking actions (e.g., Cost-Sensitive Classification),
    # so we need to compute y given the action taken by the prod policy and the contexts
    feedback = twoD_gather(reward, action)

    if self._model_type == 'ridge':
        self._clf = RidgeCV(alphas=self._alpha, fit_intercept=True, cv=5)
    elif self._model_type == 'lasso':
        self._clf = LassoCV(alphas=self._alpha, tol=1e-3, cv=5, fit_intercept=True)

    self._clf.fit(context, feedback)
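# A minimal sketch of the corresponding estimation step, assuming the Direct Method
# simply predicts the feedback from the test contexts with the fitted regressor
# (the signature mirrors the call `dm.estimate(context=x_test)` in the test script below);
# the actual implementation may differ.
def estimate(self, context=None, **kwargs):
    """ Predict the feedback for the given contexts with the fitted regression model """
    return self._clf.predict(context)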
def single_run(estimators, data_name="ecoli", test_size=0.5): """ See Sec 5.1.2 in the paper :param data_name: a name of a dataset :return reward_est: a dict of estimated rewards by the Estimators of interest :return reward_true: a vector of true rewards """ # load the dataset data = eval("load_{}()".format(data_name)) # (Acronym) prod: Production, targ: Target x_train, x_test, y_train, y_test = train_test_split(data=data, test_size=test_size) # Instantiate and train the prod/targ policies on the training set prod_policy = UniformPolicy(num_action=data.num_label) # prod_policy = DeterministicPolicy2(num_action=data.num_label) # prod_policy = _train_policy(policy=prod_policy, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test) # targ_policy = UniformPolicy(num_action=data.num_label) targ_policy = DeterministicPolicy2(num_action=data.num_label) targ_policy = _train_policy(policy=targ_policy, x_train=x_train, y_train=y_train) # let the policies predict on the test set prod_a_tr, prod_score_tr = prod_policy.select_action(context=x_train) prod_a_te, prod_score_te = prod_policy.select_action(context=x_test) targ_a_te, targ_score_te = targ_policy.select_action(context=x_test) prod_r_te = twoD_gather(y_test, prod_a_te) # reward_true = twoD_gather(y_test, targ_a_te) reward_true = 1 - np.mean(targ_a_te == np.argmax(y_test, axis=-1)) reward_est = dict() for name, estimator in estimators.items(): estimator.train(context=x_train, action=prod_a_tr, reward=y_train) _reward_est = estimator.estimate(context=x_test, prod_r_te=prod_r_te, prod_a_te=prod_a_te, targ_a_te=targ_a_te, prod_score_te=prod_score_te, targ_score_te=targ_score_te) # construct a dict of the estimated rewards reward_est[name] = _reward_est return reward_est, reward_true
def train(self, context=None, action=None, reward=None):
    """ Train the model parameters given contexts and the actions taken by the prod policy """
    # Rewards represent the cost of taking actions (e.g., Cost-Sensitive Classification),
    # so we need to compute y given the action taken by the prod policy and the contexts
    feedback = twoD_gather(reward, action)

    if self._model_type == 'ridge':
        self._clf = RidgeCV(alphas=self._alpha, fit_intercept=True, cv=5)
    elif self._model_type == 'lasso':
        self._clf = LassoCV(alphas=self._alpha, tol=1e-3, cv=5, fit_intercept=True)

    """ This is the part described by the DR paper (Sec 2.1) as follows
    > A problem with this method is that the estimate is formed without the knowledge of a policy
    """
    self._clf.fit(context, feedback)
def estimate(self, context=None, prod_r_te=None, prod_a_te=None, targ_a_te=None,
             prod_score_te=None, targ_score_te=None):
    """ Estimate a reward using the inverse propensity score """
    # Apply the indicator function
    bool_mat = np.asarray(prod_a_te == targ_a_te).astype(np.float32)

    # take the score only for the taken action
    targ_score = twoD_gather(targ_score_te, targ_a_te)
    prod_score = twoD_gather(prod_score_te, targ_a_te)

    # Avoid the division-by-zero error
    targ_score[targ_score == 0.0] = np.spacing(1)
    prod_score[prod_score == 0.0] = np.spacing(1)

    # compute the importance weight
    self.imp_weight = targ_score / prod_score

    if self._if_cap:
        # See Sec 4.2 -> https://arxiv.org/pdf/1801.07030.pdf
        self.imp_weight = np.clip(self.imp_weight, a_min=self._min, a_max=self._max)

    # replace the infinity with an extremely small value
    self.imp_weight[self.imp_weight == np.inf] = np.spacing(1)
    ips = bool_mat * self.imp_weight

    # replace the infinity with an extremely small value
    ips[ips == np.inf] = np.spacing(1)

    if self._if_normalise:  # self-normalised IPS
        if self._if_pointwise:
            """ Midzuno-Sen Rejection Sampling Method

            Under this system of selection of probabilities, the unit in the first draw
            is selected with unequal probabilities of selection and all the remaining
            units are selected with simple random sampling without replacement
            at all subsequent draws.

            [Ref] Midzuno, H. (1951). On the sampling system with probability
            proportional to sum of sizes. Ann. Inst. Stat. Math., 3:99–107.
            """
            # 1. Only the first unit is selected with unequal probability
            dummy_imp_weight = self.imp_weight.copy()
            u = np.random.uniform(low=0.0, high=1.0)  # TODO: I guess we should use max of imp_weight!
            for _id, x in enumerate(dummy_imp_weight):
                if u < np.mean(x):
                    first_unit = x
                    break

            # 2. For the remaining units, we use Simple Random Sampling
            dummy_imp_weight = dummy_imp_weight[_id:]
            size = dummy_imp_weight.shape[0]
            mask = np.random.binomial(1, p=1 / size, size=size).astype(bool)
            samples = [first_unit] + dummy_imp_weight[mask].tolist()
            norm = np.mean(samples)
        else:
            norm = np.mean(self.imp_weight, axis=0)
    else:
        norm = np.ones(self.imp_weight.shape[-1]).astype(np.float32)

    # estimate the feedback based on importance sampling
    est = (self.imp_weight * prod_r_te) / norm
    return est
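# Toy illustration with assumed numbers (not from the paper) of how capping and
# self-normalisation act on the importance weights computed above.
import numpy as np

targ_score = np.array([0.9, 0.05, 0.5])
prod_score = np.array([0.1, 0.5, 0.5])
prod_r_te = np.array([1.0, 0.0, 1.0])

imp_weight = targ_score / prod_score                     # -> [9.0, 0.1, 1.0]
imp_weight = np.clip(imp_weight, a_min=0.0, a_max=5.0)   # capping -> [5.0, 0.1, 1.0]
norm = np.mean(imp_weight, axis=0)                       # self-normalisation constant
est = (imp_weight * prod_r_te) / norm
print(np.mean(est))  # scalar estimate of the target policy's reward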
# prod_policy = DeterministicPolicy2(num_action=data.num_label)
# prod_policy = _train_policy(policy=prod_policy, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

# targ_policy = UniformPolicy(num_action=data.num_label)
targ_policy = DeterministicPolicy2(num_action=data.num_label)
targ_policy = _train_policy(policy=targ_policy, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

# get dummy actions
prod_a_tr, prod_score_tr = prod_policy.select_action(context=x_train)
prod_a_te, prod_score_te = prod_policy.select_action(context=x_test)
targ_a_te, targ_score_te = targ_policy.select_action(context=x_test)
prod_r_te = twoD_gather(y_test, prod_a_te)

# test the estimator
dm = DM(model_type="ridge")
dm.train(context=x_train, action=prod_a_tr, reward=y_train)
dm_est = dm.estimate(context=x_test)
ground_truth = 1 - np.mean(targ_a_te == np.argmax(y_test, axis=-1))
print("[DM] RMSE: {}".format(rmse(a=np.mean(dm_est), b=ground_truth)))

bool_mat = np.asarray(prod_a_te == targ_a_te).astype(np.float32)
# ips_est = prod_r_te * (bool_mat / twoD_gather(prod_score_te, prod_a_te))
# ips_est = prod_r_te * (bool_mat / twoD_gather(prod_score_te, targ_a_te))
imp_weight = (twoD_gather(targ_score_te, targ_a_te) / twoD_gather(prod_score_te, targ_a_te))
ips_est = prod_r_te * (bool_mat * imp_weight)
ips_est = np.mean(ips_est)
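# For symmetry with the DM check above, the IPS estimate can be compared against the
# same ground truth; this assumes rmse accepts scalar inputs, as in the DM call.
print("[IPS] RMSE: {}".format(rmse(a=ips_est, b=ground_truth)))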