Code example #1
class Explore_then_commit(object):
    # Constructor only; the full class appears in code example #4.
    def __init__(self, m, means):
        self.m = m                       # exploration pulls per arm
        self.K = len(means)              # number of arms
        self.mu_hat = np.zeros(self.K)   # empirical mean reward of each arm
        self.T = np.zeros(self.K)        # number of times each arm has been pulled
        self.means = means
        self.bandit = BernoulliBandit(means=self.means)
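
Every class in this section builds a BernoulliBandit environment that is not defined in the excerpts shown here (and numpy is assumed to be imported as np). A minimal sketch of what such an environment could look like; the class name and the pull method follow the calls in the code, but this exact definition is an assumption, not taken from this section:

import numpy as np

# Hypothetical sketch of the BernoulliBandit environment used by the classes
# in this section; the real definition is assumed to live elsewhere.
class BernoulliBandit(object):
    def __init__(self, means):
        self.means = means    # true success probability of each arm

    def pull(self, a):
        # Return a Bernoulli reward with mean means[a].
        return np.random.binomial(1, self.means[a])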
Code example #2
class Upper_Confidence_bound(object):
    # Constructor only; the full class appears in code example #6.
    def __init__(self, m, means, delta):
        self.m = m
        self.k = len(means)              # number of arms
        self.mu_hat = np.zeros(self.k)   # empirical mean reward of each arm
        self.UCB = np.zeros(self.k)      # current upper confidence bound of each arm
        self.T = np.zeros(self.k)        # number of times each arm has been pulled
        self.means = means
        self.delta = delta               # confidence level used in the bonus term
        self.bandit = BernoulliBandit(means=self.means)
Code example #3
class KL_UCB(object):
    def __init__(self, m, means):
        self.m = m
        self.k = len(means)                # number of arms
        self.mu_hat = np.zeros(self.k)     # empirical mean reward of each arm
        self.estimate = np.zeros(self.k)   # KL-UCB index of each arm
        self.T = np.zeros(self.k)          # number of times each arm has been pulled
        self.means = means
        self.bandit = BernoulliBandit(means=self.means)

    def run_KL_UCB(self, n):
        for t in range(n):
            # Compute the KL-UCB index of each arm: the largest q on a grid over
            # [0, 1) such that d(mu_hat_i, q) <= log(f(t)) / T_i, where f(t) is the
            # exploration function and d(p, q) the Bernoulli KL divergence, both
            # assumed to be defined elsewhere.
            for i in range(self.k):
                if self.T[i] == 0:
                    self.estimate[i] = 10**10  # force at least one pull of each arm
                else:
                    thresh = np.log(f(t)) / self.T[i]
                    self.estimate[i] = -1
                    for j in range(100):
                        mu_tilde = j / 100
                        if d(self.mu_hat[i], mu_tilde) <= thresh:
                            self.estimate[i] = mu_tilde  # keep the largest feasible q

            a_t = np.argmax(self.estimate)    # pull the arm with the highest index
            reward_t = self.bandit.pull(a_t)

            self.update_mean(a_t, reward_t)   # incremental update of the empirical mean
            self.T[a_t] += 1

    def update_mean(self, a_t, reward_t):
        self.mu_hat[a_t] = (self.mu_hat[a_t] * self.T[a_t] +
                            reward_t) / (self.T[a_t] + 1)
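
run_KL_UCB relies on two helpers, f(t) and d(p, q), that do not appear in this snippet. Under the usual KL-UCB setup they are the exploration function and the Bernoulli KL divergence; the sketch below uses a common choice for f, but these exact definitions are an assumption, not taken from the source:

# Hedged sketch of the helpers assumed by run_KL_UCB.
def f(t):
    # A common exploration function for KL-UCB: f(t) = 1 + t * log(t)^2.
    return 1 + t * np.log(max(t, 1)) ** 2

def d(p, q):
    # KL divergence between Bernoulli(p) and Bernoulli(q), clipped to avoid log(0).
    eps = 1e-12
    p = min(max(p, eps), 1 - eps)
    q = min(max(q, eps), 1 - eps)
    return p * np.log(p / q) + (1 - p) * np.log((1 - p) / (1 - q))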
Code example #4
class Explore_then_commit(object):
    def __init__(self, m, means):
        self.m = m
        self.K = len(means)
        self.mu_hat = np.zeros(self.K)
        self.T = np.zeros(self.K)
        self.means = means
        self.bandit = BernoulliBandit(means=self.means)

    def run_ETC(self, n):
        for t in range(n):
            if t < self.m * self.K:
                # Exploration phase: pull each arm m times in round-robin order.
                a_t = t % self.K
            else:
                # Commitment phase: always pull the empirically best arm.
                a_t = np.argmax(self.mu_hat)
            reward_t = self.bandit.pull(a_t)
            if t < self.m * self.K:
                # Statistics are only updated during the exploration phase.
                self.mu_hat[a_t] = (self.mu_hat[a_t] * self.T[a_t] +
                                    reward_t) / (self.T[a_t] + 1)
                self.T[a_t] += 1
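
A short usage sketch for the class above; the arm means, m, and horizon n below are arbitrary illustration values, not taken from the source:

# Illustrative run of explore-then-commit: 3 Bernoulli arms,
# m = 50 pulls per arm during exploration, horizon n = 10000.
etc = Explore_then_commit(m=50, means=[0.3, 0.5, 0.6])
etc.run_ETC(n=10000)
print(etc.mu_hat, etc.T)  # empirical means and pull counts after the run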
Code example #5
class Minimax_optimal_Strategy_in_stochastic_case(object):
    def __init__(self, m, means):
        self.m = m
        self.K = len(means)
        self.mu_hat = np.zeros(self.K)
        self.estimate = np.zeros(self.K)
        self.T = np.zeros(self.K)
        self.means = means
        self.bandit = BernoulliBandit(means=self.means)

    def run_Moss(self, n):
        for t in range(n):
            for i in range(self.K):
                if self.T[i] == 0:
                    self.estimate[i] = 10 ** 10  # force at least one pull of each arm
                else:
                    # MOSS index: mu_hat_i + sqrt(4 * log+(n / (K * T_i)) / T_i),
                    # where log_star is the truncated logarithm log+,
                    # assumed to be defined elsewhere.
                    self.estimate[i] = self.mu_hat[i] + np.sqrt(
                        4 * log_star(n / (self.K * self.T[i])) / self.T[i])
            a_t = np.argmax(self.estimate)
            reward_t = self.bandit.pull(a_t)

            self.mu_hat[a_t] = (self.mu_hat[a_t] * self.T[a_t] + reward_t) / (self.T[a_t] + 1)
            self.T[a_t] += 1
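
run_Moss uses a helper log_star that is not defined in this snippet. For MOSS it would be the truncated logarithm log+(x) = log(max(1, x)); the definition below is an assumption, not taken from the source:

# Hedged sketch of the truncated logarithm assumed by run_Moss:
# log+(x) = log(max(1, x)), so the exploration bonus is never negative.
def log_star(x):
    return np.log(max(x, 1))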
Code example #6
class Upper_Confidence_bound(object):
    def __init__(self, m, means, delta):
        self.m = m
        self.k = len(means)
        self.mu_hat = np.zeros(self.k)
        self.UCB = np.zeros(self.k)
        self.T = np.zeros(self.k)
        self.means = means
        self.delta = delta
        self.bandit = BernoulliBandit(means=self.means)

    def run_UCB(self, n):
        for t in range(n):
            # Compute the UCB index of every arm from the current statistics,
            # then pull the arm with the largest index.
            for i in range(self.k):
                if self.T[i] == 0:
                    self.UCB[i] = 10**10  # force at least one pull of each arm
                else:
                    # UCB(delta) index: mu_hat_i + sqrt(2 * log(1/delta) / T_i)
                    self.UCB[i] = self.mu_hat[i] + np.sqrt(
                        2 * np.log(1 / self.delta) / self.T[i])
            a_t = np.argmax(self.UCB)
            reward_t = self.bandit.pull(a_t)
            self.mu_hat[a_t] = (self.mu_hat[a_t] * self.T[a_t] +
                                reward_t) / (self.T[a_t] + 1)
            self.T[a_t] += 1
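
A short usage sketch for the class above. The choice delta = 1/n^2 is a common horizon-dependent setting for UCB(delta); the arm means are arbitrary illustration values, and m is passed only because the constructor requires it:

# Illustrative run of UCB(delta) over a horizon of n = 10000 rounds.
n = 10000
ucb = Upper_Confidence_bound(m=0, means=[0.3, 0.5, 0.6], delta=1 / n**2)
ucb.run_UCB(n)
print(ucb.mu_hat, ucb.T)  # empirical means and pull counts after the run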