Example #1
 def choiceWithRank(self, rank=1):
     r""" With probability :math:`1 - \varepsilon(t)`, use a Thompson Sampling step, otherwise use a UCB-Bayes step, to choose one arm of a certain rank."""
     if rank == 1:
         return self.choice()
     else:
         assert rank >= 1, "Error: for AdBandits = {}, in choiceWithRank(rank={}) rank has to be >= 1.".format(
             self, rank)
         # Thompson Exploration
         if with_proba(1 - self.epsilon):  # with proba 1-epsilon
             indexes = [
                 self.posterior[i].sample() for i in range(self.nbArms)
             ]
         # UCB-Bayes
         else:
             expectations = (1.0 + self.rewards) / (2.0 + self.pulls)
             upperbounds = [
                 self.posterior[arm].quantile(1. - 1. / self.t)
                 for arm in range(self.nbArms)
             ]
             indexes = expectations - np.max(upperbounds)
         # We computed the indexes, OK let's use them
         sortedRewards = np.sort(indexes)  # XXX What happens here if two arms have the same index and are both the maximum?
         chosenIndex = sortedRewards[-rank]
         # Uniform choice among the rank-th best arms
         return choice(np.nonzero(indexes == chosenIndex)[0])
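Every example on this page relies on a helper with_proba(p), whose definition is not part of these excerpts. A minimal sketch of what such a helper presumably does (a single Bernoulli draw of parameter p) is:

from random import random

def with_proba(p):
    """ Return True with probability p and False with probability 1 - p (assuming 0 <= p <= 1)."""
    return random() < p

For instance, with_proba(1 - self.epsilon) in the snippet above is True on roughly a fraction 1 - epsilon of the calls, which is what routes the choice to the Thompson Sampling branch rather than the UCB-Bayes branch.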
Example #2
 def choice(self):
     """ Choose an arm, as described by the MEGA algorithm."""
     self.t += 1
     if self.chosenArm is not None:  # We can still exploit that arm
         return self.chosenArm
     else:  # We have to choose a new arm
         # Identify available arms
         availableArms = np.nonzero(self.tnext <= self.t)[0]
         if len(availableArms) == 0:
             print("Error: MEGA.choice() should 'Refrain from transmitting in this round' but my model does not allow this - YET ... Choosing a random arm.")  # DEBUG
             self.chosenArm = rn.randint(self.nbArms)  # XXX Choose a random arm
             # raise ValueError("FIXME MEGA.choice() should 'Refrain from transmitting in this round' but my model does not allow this - YET")
         else:  # There are some available arms
             epsilon = self._epsilon_t()
             if with_proba(epsilon):  # With proba epsilon_t
                 newArm = rn.choice(availableArms)  # Explore valid arms
                 if self.chosenArm != newArm:
                     self.p = self.p0  # Reinitialize proba p
             else:  # Exploit: select the arm with highest meanRewards
                 self.meanRewards[self.pulls != 0] = self.rewards[self.pulls != 0] / self.pulls[self.pulls != 0]
                 # newArm = np.argmax(self.meanRewards)
                 # Uniformly chosen if more than one arm has the highest index, but that's unlikely
                 newArm = np.random.choice(np.nonzero(self.meanRewards == np.max(self.meanRewards))[0])
             self.chosenArm = newArm
         return self.chosenArm
Example #3
    def getReward(self, arm, reward):
        """ Give reward for each child, and then update the trust probabilities."""
        reward = float(reward)
        new_reward = renormalize_reward(reward, lower=self.lower, amplitude=self.amplitude, unbiased=False)
        # print("  A LearnExp player {} received a reward = {:.3g} on arm {} and trust = {:.3g} on that choice = {}, giving {:.3g} ...".format(self, reward, arm, self.trusts[self.last_choice], self.last_choice, new_reward))  # DEBUG

        # 1. First, give rewards to that slave, with probability rate / trusts
        probability = self.rate / self.trusts[self.last_choice]
        assert 0 <= probability <= 1, "Error: 'probability' = {:.3g} = rate = {:.3g} / trust_j^t = {:.3g} should have been in [0, 1]...".format(probability, self.rate, self.trusts[self.last_choice])  # DEBUG
        if with_proba(probability):
            self.children[self.last_choice].getReward(arm, reward)

        # 2. Then compute the loss from the normalized reward (divided by the trust if unbiased)
        assert 0 <= new_reward <= 1, "Error: the normalized reward {:.3g} was NOT in [0, 1] ...".format(new_reward)  # DEBUG
        loss = (1 - new_reward)
        if self.unbiased:
            loss /= self.trusts[self.last_choice]

        # 3. Update weight of that slave
        self.weights[self.last_choice] *= np.exp(- self.rate * loss)

        # 4. Recompute the trusts from the weights
        # add uniform mixing of proportion rate=eta/N
        self.trusts = (1 - self.eta) * (self.weights / np.sum(self.weights)) + self.rate
        # self.trusts = trusts / np.sum(trusts)  # XXX maybe this isn't necessary...

        # print("  The most trusted child policy is the {}th with confidence {}...".format(1 + np.argmax(self.trusts), np.max(self.trusts)))  # DEBUG
        assert np.isclose(np.sum(self.trusts), 1), "Error: 'trusts' do not sum to 1 but to {:.3g} instead...".format(np.sum(self.trusts))  # DEBUG
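The update in step 4 mixes the normalized weights with a uniform distribution of mass rate = eta/N per child, so every trust stays at least rate and the vector still sums to 1. A small stand-alone check of that formula (eta, N and the weights are made-up values, not taken from the snippet):

import numpy as np

eta, N = 0.1, 4                # made-up learning rate and number of children
rate = eta / N                 # uniform mixing mass per child, as in the comment above
weights = np.array([0.5, 1.0, 2.0, 0.1])

trusts = (1 - eta) * (weights / np.sum(weights)) + rate
print(trusts, np.sum(trusts))  # every trust is >= rate, and the sum is 1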
Example #4
 def choice(self):
     """With a probability of epsilon, explore (uniform choice); otherwise exploit based on just the accumulated *rewards* (not the empirical mean rewards)."""
     if with_proba(self.epsilon):  # Proba epsilon : explore
         return rn.randint(0, self.nbArms)
     else:  # Proba 1 - epsilon : exploit
         # Uniform choice among the best arms
         biased_means = self.rewards / (1 + self.pulls)
         return np.random.choice(np.nonzero(biased_means == np.max(biased_means))[0])
Example #5
    def choice(self):
        """ Choose an arm following the different phase of growing lengths according to the AdSwitchNew algorithm."""
        # 1. Add checks for bad arms:
        for bad_arm in self.set_BAD:
            gap_Delta_hat_of_l_a = self.gap_Delta_tilde_of_l[bad_arm]
            for i in range(1, self.find_max_i(gap_Delta_hat_of_l_a) + 1):
                # assert 2**(-i) >= gap_Delta_hat_of_l_a/16  # DEBUG
                # ell, K, T = self.ell, self.nbArms, self.horizon
                probability_to_add_this_triplet = 2**(-i) * np.sqrt(
                    self.ell /
                    (self.nbArms * self.horizon * np.log(self.horizon)))
                print(
                    "AdSwitchNew: for bad_arm = {}, gap Delta = {}, and i = {}, a new triplet can be added to the set S with probability = {}."
                    .format(bad_arm, gap_Delta_hat_of_l_a, i,
                            probability_to_add_this_triplet))  # DEBUG
                if with_proba(probability_to_add_this_triplet):
                    triplet = (2**(-i),
                               np.floor(2**(2 * i + 1) * np.log(self.horizon)),
                               self.t)
                    print(
                        "\nAdSwitchNew: for bad_arm = {}, gap Delta = {}, and i = {}, the triplet = {} was added to the set S with probability = {}."
                        .format(bad_arm, gap_Delta_hat_of_l_a, i, triplet,
                                probability_to_add_this_triplet))  # DEBUG
                    self.set_S[bad_arm].add(triplet)
                    print("    self.set_S[bad_arm] =",
                          self.set_S[bad_arm])  # DEBUG

        # 2. Select an arm:
        these_times_taus = [float('+inf') for arm in range(self.nbArms)]
        for arm in self.set_GOOD | {
                a
                for a in range(self.nbArms) if self.set_S[a]
        }:
            print(
                "AdSwitchNew: for arm = {}, in GOOD_(t) = {} or with set S_t(a) = {} not empty, at time t = {}."
                .format(arm, self.set_GOOD, self.set_S[arm], self.t))  # DEBUG

            look_ahead_in_past = 1
            while look_ahead_in_past < len(
                    self.history_of_plays
            ) and self.history_of_plays[-look_ahead_in_past] != arm:
                look_ahead_in_past += 1
            these_times_taus[arm] = self.t - look_ahead_in_past
            print(
                "\nAdSwitchNew: for arm = {}, this time tau = {}, and t = {}, look ahead in past (t - min t') = {}."
                .format(arm, these_times_taus[arm], self.t,
                        look_ahead_in_past))  # DEBUG

        chosen_arm = np.argmin(these_times_taus)
        self.history_of_plays.append(chosen_arm)
        if not np.all(np.isinf(these_times_taus)):
            print(
                "AdSwitchNew: for time t = {}, choosing {} = arg min {} non all = +inf, adding to history of plays..."
                .format(self.t, chosen_arm, these_times_taus))  # DEBUG

        return chosen_arm
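The probability used in step 1 is 2^(-i) * sqrt(ell / (K * T * log(T))): it halves at each value of i and shrinks with the horizon T. A quick numerical illustration with made-up values of ell, K and T (not taken from the snippet):

import numpy as np

ell, K, T = 3, 5, 10000        # made-up episode index, number of arms and horizon
for i in range(1, 5):
    proba = 2 ** (-i) * np.sqrt(ell / (K * T * np.log(T)))
    print(i, proba)            # the probability halves at each step in i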
Example #6
 def choiceMultiple(self, nb=1):
     """ Choose nb arms: with a probability of epsilon, explore (nb distinct arms chosen uniformly at random), otherwise exploit the nb arms with the highest accumulated *rewards*."""
     if nb == 1:
         return np.array([self.choice()])
     else:
         # FIXME the explore/exploit balance should be for each choice, right?
         if with_proba(self.epsilon):  # Proba epsilon : Explore
             return rn.choice(self.nbArms, size=nb, replace=False)
         else:  # Proba 1 - epsilon : exploit
             sortedRewards = np.sort(self.rewards)
             # Uniform choice among the best arms
             return rn.choice(np.nonzero(self.rewards >= sortedRewards[-nb])[0], size=nb, replace=False)
Example #7
 def choiceWithRank(self, rank=1):
     """With a probability of epsilon, explore (uniform choice); otherwise exploit with the given rank, based on just the accumulated *rewards* (not the empirical mean rewards)."""
     if rank == 1:
         return self.choice()
     else:
         if with_proba(self.epsilon):  # Proba epsilon : explore
             return rn.randint(0, self.nbArms)  # randint excludes the upper bound, so this is uniform over all arms
         else:  # Proba 1 - epsilon : exploit
             sortedRewards = np.sort(self.rewards)
             chosenIndex = sortedRewards[-rank]
             # Uniform choice among the rank-th best arms
             return rn.choice(np.nonzero(self.rewards == chosenIndex)[0])
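The exploitation step above keeps the rank-th largest accumulated reward and picks uniformly among the arms that attain it. A small stand-alone illustration with a made-up rewards vector (assuming rn is numpy.random, as in the snippets above):

import numpy as np
import numpy.random as rn

rewards = np.array([3.0, 7.0, 7.0, 1.0, 5.0])    # made-up accumulated rewards
rank = 2
sortedRewards = np.sort(rewards)
chosenIndex = sortedRewards[-rank]               # 7.0 here: the 2nd largest value (tied with the 1st)
candidates = np.nonzero(rewards == chosenIndex)[0]
print(candidates, rn.choice(candidates))         # arms 1 and 2 are tied, one of them is drawn uniformly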
Example #8
 def handleCollision(self, arm, reward=None):
     """ Get a new fully random rank, and give reward to the algorithm if not None."""
     # rhoRandALOHA UCB indexes learn on the SENSING, not on the successful transmissions!
     if reward is not None:
         # print("Info: rhoRandALOHA UCB internal indexes DOES get updated by reward, in case of collision, learning is done on SENSING, not successful transmissions!")  # DEBUG
         super(oneRhoRandALOHA, self).getReward(arm, reward)
     # 1. With probability 1-p, change the rank
     if with_proba(1. - self.p):
         self.rank = new_rank(self.rank, self.maxRank, self.forceChange)  # New random rank, can be forced to be new or not.
         # print(" - A oneRhoRandALOHA player {} saw a collision, so she had to select a new random rank : {} ...".format(self, self.rank))  # DEBUG
         # print(" - A oneRhoRandALOHA player {} saw a collision, so she reinitialized her probability p from {:.5g} to {:.5g}...".format(self, self.p, self.p0))  # DEBUG
         self.p = self.p0  # Reinitialize the proba p
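The helper new_rank(rank, maxRank, forceChange) is not shown in this excerpt. Based only on the inline comment ("New random rank, can be forced to be new or not"), a plausible sketch, which is an assumption about its behaviour rather than the library's actual code, is:

import numpy.random as rn

def new_rank(current_rank, maxRank, forceChange=False):
    """ Draw a rank uniformly in {1, ..., maxRank}; if forceChange, redraw until it differs from current_rank (hypothetical helper)."""
    rank = 1 + rn.randint(maxRank)
    while forceChange and maxRank > 1 and rank == current_rank:
        rank = 1 + rn.randint(maxRank)
    return rank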
Example #9
def bernoulliBinarization(r_t):
    """ 
    Return a (random) binarization of a reward :math:`r_t`, from the continuous interval :math:`[0, 1]` to an observation in the discrete set :math:`\{0, 1\}`.
    - Useful to allow the use of a Beta posterior for non-Bernoulli experiments,
    - That way, :class:`Thompson` sampling can be used for any bounded continuous-valued rewards.
    """
    if r_t == 0:
        return 0  # Returns an int!
    elif r_t == 1:
        return 1  # Returns an int!
    else:
        assert 0 <= r_t <= 1, "Error: only bounded rewards in [0, 1] are supported by this Beta posterior right now."
        return int(with_proba(r_t))
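A quick sanity check of bernoulliBinarization (using the function above together with the with_proba helper): over many draws, the empirical mean of the binarized observations should approach the original reward r_t; 0.7 below is an arbitrary test value:

import numpy as np

r_t = 0.7
draws = [bernoulliBinarization(r_t) for _ in range(10000)]
print(np.mean(draws))   # close to 0.7: each draw is 1 with probability r_t and 0 otherwise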
Example #10
 def choiceFromSubSet(self, availableArms='all'):
     """ Choose an arm from the subset availableArms: with a probability of epsilon, explore (uniform choice among the available arms), otherwise exploit the available arm(s) with the highest accumulated *rewards*."""
     if (availableArms == 'all') or (len(availableArms) == self.nbArms):
         return self.choice()
     elif len(availableArms) == 0:
         print("WARNING: EpsilonGreedy.choiceFromSubSet({}): the argument availableArms of type {} should not be empty.".format(availableArms, type(availableArms)))  # DEBUG
         # WARNING if no arms are tagged as available, what to do ? choose an arm at random, or call choice() as if available == 'all'
         return self.choice()
         # return np.random.randint(self.nbArms)
     else:
         if with_proba(self.epsilon):  # Proba epsilon : explore
             return rn.choice(availableArms)
         else:  # Proba 1 - epsilon : exploit
             # Uniform choice among the best available arms (map positions in availableArms back to arm indices)
             availableArms = np.asarray(availableArms)
             bestPositions = np.nonzero(self.rewards[availableArms] == np.max(self.rewards[availableArms]))[0]
             return availableArms[rn.choice(bestPositions)]
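Note that np.nonzero applied to the restricted vector self.rewards[availableArms] returns positions within availableArms, not arm indices, hence the mapping back through availableArms in the exploit branch above. A small illustration with made-up values:

import numpy as np

rewards = np.array([1.0, 9.0, 3.0, 7.0])   # made-up accumulated rewards
availableArms = np.array([2, 3])           # only arms 2 and 3 can be played
best = np.nonzero(rewards[availableArms] == np.max(rewards[availableArms]))[0]
print(best)                  # positions within availableArms: [1]
print(availableArms[best])   # actual arm indices: [3]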
Example #11
    def handleCollision(self, arm, reward=None):
        """ Handle a collision, on arm of index 'arm'.

        .. warning:: This method has to be implemented in the collision model, it is NOT implemented in the EvaluatorMultiPlayers.

        .. note:: We do not care on which arm the collision occurred.
        """
        # print(" ---------> A oneALOHA player saw a collision on arm {}, at time t = {} ... Currently, p = {} ...".format(arm, self.t, self.p))  # DEBUG
        # self.getReward(arm, self.mother.lower)  # FIXED should we give a 0 reward ? Not in this model!
        # 1. With proba 1 - p, give up
        if with_proba(1 - self.p):
            # Random time offset until when this arm self.chosenArm is not sampled
            delta_tnext_k = rn.randint(low=0,
                                       high=1 + int(self.ftnext(self.t)))
            self.tnext[self.chosenArm] = self.t + 1 + delta_tnext_k
            # print("   - Reaction to collision on arm {}, at time t = {} : delta_tnext_k = {}, tnext[{}] = {} ...".format(arm, self.t, delta_tnext_k, self.chosenArm, self.tnext[self.chosenArm]))  # DEBUG
            self.p = self.p0  # Reinitialize the proba p
            self.chosenArm = None  # We give up this arm
Example #12
    def handleCollision(self, arm, reward=None):
        """ Handle a collision, on arm of index 'arm'.

        - Warning: this method has to be implemented in the collision model, it is NOT implemented in the EvaluatorMultiPlayers.

        .. note:: We do not care on which arm the collision occurred.

        """
        assert self.chosenArm == arm, "Error: a MEGA player can only see a collision on her chosenArm. Here, arm = {} != chosenArm = {} ...".format(arm, self.chosenArm)  # DEBUG
        # print("- A MEGA player saw a collision on arm {}, and time t = {} ...".format(arm, self.t))  # DEBUG
        # # 1. With proba p, persist  XXX useless code
        # # if with_proba(self.p):
        # #     self.chosenArm = self.chosenArm
        # 2. With proba 1 - p, give up
        if with_proba(1 - self.p):
            # Random time offset until when this arm self.chosenArm is not sampled
            delta_tnext_k = rn.randint(low=0, high=1 + int(self.t**self.beta))
            self.tnext[self.chosenArm] = self.t + delta_tnext_k
            # Reinitialize the proba p
            self.p = self.p0
            # We give up this arm
            self.chosenArm = None
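In the handler above, the back-off delay delta_tnext_k is drawn uniformly from {0, ..., int(t**beta)} (numpy's randint excludes its high bound); Example #11 does the same with ftnext(t) in place of t**beta. A small illustration of how that back-off window grows, with made-up values of t and beta:

for beta in (0.5, 0.8):
    for t in (10, 100, 1000):
        print(beta, t, 1 + int(t ** beta))   # exclusive upper bound of the uniform draw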
Example #13
 def choice(self):
     r""" With probability :math:`1 - \varepsilon(t)`, use a Thompson Sampling step, otherwise use a UCB-Bayes step, to choose one arm."""
     # Thompson Exploration
     if with_proba(1 - self.epsilon):  # with proba 1-epsilon
         upperbounds = [
             self.posterior[i].sample() for i in range(self.nbArms)
         ]
         maxIndex = max(upperbounds)
         bestArms = [
             arm for (arm, index) in enumerate(upperbounds)
             if index == maxIndex
         ]
         arm = choice(bestArms)
     # UCB-Bayes
     else:
         expectations = (1.0 + self.rewards) / (2.0 + self.pulls)
         upperbounds = [
             self.posterior[arm].quantile(1. - 1. / self.t)
             for arm in range(self.nbArms)
         ]
         regret = np.max(upperbounds) - expectations
         admissible = np.nonzero(regret == np.min(regret))[0]
         arm = choice(admissible)
     return arm
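Both branches above assume that each self.posterior[i] exposes sample() and quantile() methods. For Bernoulli rewards this would typically be a Beta posterior; a minimal sketch of such an object (an assumption about the interface, not the library's actual class, using scipy for the quantile) is:

import numpy as np
from scipy.stats import beta

class BetaPosterior(object):
    """ Beta(a, b) posterior of a Bernoulli arm, starting from the uniform prior Beta(1, 1)."""

    def __init__(self):
        self.a, self.b = 1, 1

    def update(self, reward):
        """ Bayesian update after observing a binary reward."""
        if reward:
            self.a += 1
        else:
            self.b += 1

    def sample(self):
        """ One draw from the posterior (used by the Thompson Sampling branch)."""
        return np.random.beta(self.a, self.b)

    def quantile(self, p):
        """ Quantile of order p of the posterior (used by the UCB-Bayes branch)."""
        return beta.ppf(p, self.a, self.b)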
Example #14
def DepRound(weights_p, k=1):
    r""" [[Algorithms for adversarial bandit problems with multiple plays, by T.Uchiya, A.Nakamura and M.Kudo, 2010](http://hdl.handle.net/2115/47057)] Figure 5 (page 15) is a very clean presentation of the algorithm.

    - Inputs: :math:`k < K` and weights_p :math:`= (p_1, \dots, p_K)` such that :math:`\sum_{i=1}^{K} p_i = k` (or :math:`= 1`).
    - Output: A subset of :math:`\{1,\dots,K\}` with exactly :math:`k` elements. Each action :math:`i` is selected with probability exactly :math:`p_i`.

    Example:

    >>> import numpy as np; import random
    >>> np.random.seed(0); random.seed(0)  # for reproducibility!
    >>> K = 5
    >>> k = 2

    >>> weights_p = [ 2, 2, 2, 2, 2 ]  # all equal weights
    >>> DepRound(weights_p, k)
    [3, 4]
    >>> DepRound(weights_p, k)
    [3, 4]
    >>> DepRound(weights_p, k)
    [0, 1]

    >>> weights_p = [ 10, 8, 6, 4, 2 ]  # decreasing weights
    >>> DepRound(weights_p, k)
    [0, 4]
    >>> DepRound(weights_p, k)
    [1, 2]
    >>> DepRound(weights_p, k)
    [3, 4]

    >>> weights_p = [ 3, 3, 0, 0, 3 ]  # some zero weights
    >>> DepRound(weights_p, k)
    [0, 4]
    >>> DepRound(weights_p, k)
    [0, 4]
    >>> DepRound(weights_p, k)
    [0, 4]
    >>> DepRound(weights_p, k)
    [0, 1]

    - See [[Gandhi et al, 2006](http://dl.acm.org/citation.cfm?id=1147956)] for the details.
    """
    p = np.array(weights_p)
    K = len(p)
    # Checks
    assert k < K, "Error: k = {} should be < K = {}.".format(k, K)  # DEBUG
    if not np.isclose(np.sum(p), 1):
        p = p / np.sum(p)
    assert np.all(0 <= p) and np.all(p <= 1), "Error: the weights (p_1, ..., p_K) = {} should all be 0 <= p_i <= 1 ...".format(p)  # DEBUG
    assert np.isclose(np.sum(p), 1), "Error: the sum of weights p_1 + ... + p_K should be = 1 (= {}).".format(np.sum(p))  # DEBUG
    # Main loop
    possible_ij = [a for a in range(K) if 0 < p[a] < 1]
    while possible_ij:
        # Choose distinct i, j with 0 < p_i, p_j < 1
        if len(possible_ij) == 1:
            i = np.random.choice(possible_ij, size=1)
            j = i
        else:
            i, j = np.random.choice(possible_ij, size=2, replace=False)
        pi, pj = p[i], p[j]
        assert 0 < pi < 1, "Error: pi = {} (with i = {}) is not 0 < pi < 1.".format(pi, i)  # DEBUG
        assert 0 < pj < 1, "Error: pj = {} (with j = {}) is not 0 < pj < 1.".format(pj, j)  # DEBUG
        assert i != j, "Error: i = {} should be different from j = {}.".format(i, j)  # DEBUG

        # Set alpha, beta
        alpha, beta = min(1 - pi, pj), min(pi, 1 - pj)
        proba = alpha / (alpha + beta)
        if with_proba(proba):  # with probability = proba = alpha/(alpha+beta)
            pi, pj = pi + alpha, pj - alpha
        else:            # with probability = 1 - proba = beta/(alpha+beta)
            pi, pj = pi - beta, pj + beta

        # Store
        p[i], p[j] = pi, pj
        # And update
        possible_ij = [a for a in range(K) if 0 < p[a] < 1]
        if len([a for a in range(K) if np.isclose(p[a], 0)]) == K - k:
            break
    # Final step
    subset = [a for a in range(K) if np.isclose(p[a], 1)]
    if len(subset) < k:
        subset = [a for a in range(K) if not np.isclose(p[a], 0)]
    assert len(subset) == k, "Error: DepRound({}, {}) is supposed to return a set of size {}, but {} has size {}...".format(weights_p, k, k, subset, len(subset))  # DEBUG
    return subset
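A quick empirical check of the stated guarantee (each action kept with probability proportional to its weight), using the DepRound function above with the equal weights of the first doctest: by symmetry every arm should be kept in a fraction close to k/K = 0.4 of the runs.

import numpy as np

K, k = 5, 2
weights_p = [2, 2, 2, 2, 2]     # equal weights, as in the first doctest above
counts = np.zeros(K)
n_runs = 20000
for _ in range(n_runs):
    for a in DepRound(weights_p, k):
        counts[a] += 1
print(counts / n_runs)          # each entry is close to k/K = 0.4, and every run returns exactly k arms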
Example #15
 def choiceWithRank(self, rank=1):
     r""" With a probability :math:`\alpha`, play uniformly at random, otherwise, pass the call to :meth:`choiceWithRank` of the underlying policy."""
     if with_proba(self.proba_random_exploration):
         return np.random.randint(0, self.nbArms)  # randint excludes the upper bound, so this is uniform over all arms
     return self.policy.choiceWithRank(rank=rank)
Example #16
 def choice(self):
     """ Choose an arm following the different phase of growing lengths according to the AdSwitch algorithm."""
     # print("For a {} policy: t = {}, current_exploration_arm = {}, current_exploitation_arm = {}, batch_number = {}, length_of_current_phase = {}, step_of_current_phase = {}".format(self, self.t, self.current_exploration_arm, self.current_exploitation_arm, self.batch_number, self.length_of_current_phase, self.step_of_current_phase))  # DEBUG
     # 1) exploration
     # --------------
     if self.phase == Phase.Estimation:
         # beginning of exploration phase
         if self.current_exploration_arm is None:
             self.current_exploration_arm = -1
         # Round-Robin phase
         self.current_exploration_arm = (self.current_exploration_arm + 1) % self.nbArms  # go for next arm
         # Test!
         saw_a_change, sigma = self.statistical_test(self.t, self.last_restart_time)
         if saw_a_change:
             mus = [ mymean(self.read_range_of_rewards(a, sigma, self.t)) for a in range(self.nbArms) ]
             self.current_best_arm = np.argmax(mus)
             self.current_worst_arm = np.argmin(mus)
             self.current_estimated_gap = abs(mus[self.current_best_arm] - mus[self.current_worst_arm])
             self.last_restart_time = self.t
             # change of phase
             self.length_of_current_phase = None  # flag to start the next one
             self.phase = Phase.Exploitation
             self.step_of_current_phase = 0
             self.current_exploration_arm = 0
             # note that this last update might force to sample the arm 0 instead of arm K-1, once in a while...
         return self.current_exploration_arm
     # 2) exploitation
     # ---------------
     elif self.phase == Phase.Exploitation:
         # if in a phase, do it
         if self.length_of_current_phase is not None and self.step_of_current_phase < self.length_of_current_phase:
             # beginning of an exploitation phase
             if self.current_exploitation_arm is None:
                 self.current_exploitation_arm = -1
             # Round-Robin phase
             if self.current_exploitation_arm >= self.nbArms:
                 self.step_of_current_phase += 1  # explore each arm, ONE more time!
             self.current_exploitation_arm = (self.current_exploitation_arm + 1) % self.nbArms  # go for next arm
         else:
             if self.current_exploitation_arm is None:
                 self.current_exploitation_arm = 0
             # test for a change of size d_i
             compute_new_di_pi_si = self.last_used_di_pi_si is None
             if not compute_new_di_pi_si:
                 di, pi, si = self.last_used_di_pi_si
                 t1 = self.last_restart_time
                 t2 = self.t + 1
                 mus = [ np.mean(self.read_range_of_rewards(a, t1, t2)) for a in range(self.nbArms) ]
                 current_best_mean = np.max(mus)
                 current_worst_mean = np.min(mus)
                 print("Info: the test |mu_a[t1,t2] - mu_b[t1,t2] - Delta| > di/4 is {} for a = best = {}, b = worst = {}, t1 = {} and t2 = {}, Delta = {} and di = {}...".format(abs(current_best_mean - current_worst_mean - self.current_estimated_gap) > di / 4, np.argmax(mus), np.argmin(mus), t1, t2, self.current_estimated_gap, di))  # DEBUG
                 if abs(current_best_mean - current_worst_mean - self.current_estimated_gap) > di / 4:
                     print("Info: the test |mu_a[t1,t2] - mu_b[t1,t2] - Delta| > di/4 was true for a = best = {}, b = worst = {}, t1 = {} and t2 = {}, Delta = {} and di = {}...".format(np.argmax(mus), np.argmin(mus), t1, t2, self.current_estimated_gap, di))  # DEBUG
                     # go back to Estimation phase
                     self.phase = Phase.Estimation
                     self.length_of_current_phase = None  # flag to start the next one
                     self.step_of_current_phase = 0
                     self.current_exploration_arm = 0
                     self.batch_number += 1
                 else:
                     compute_new_di_pi_si = True
             if compute_new_di_pi_si:
                 di_values, pi_values, si_values = self.compute_di_pi_si()
                 proba_of_checking = np.sum(pi_values)
                 assert 0 <= proba_of_checking < 1, "Error: the sum of pi should be < 1 but it is = {}, impossible to do a Step 5 of Exploitation!".format(proba_of_checking)
                 if proba_of_checking > 0:
                     for di, pi, si in zip(di_values, pi_values, si_values):
                         if with_proba(pi):
                             # Start a checking phase!
                             self.last_used_di_pi_si = (di, pi, si)
                             self.length_of_current_phase = si
                             break
                             # ---
                             # DONE OK I understood correctly this sentence, my implementation is correct!
                             # Then for any i from {1, 2,..., Ik} with probability p_k,i, sample both arms alternatingly for si steps
                             # ---
                             # this will make the test sample each arm alternatingly for s_i steps to check for changes of size d_i
             # if no checking is performed at current time step t, then select best arm, and repeat checking phase.
         return self.current_exploitation_arm
     else:
         raise ValueError("Error: AdSwitch should only be in phase Estimation or Exploitation.")
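The AdSwitch code above compares self.phase against Phase.Estimation and Phase.Exploitation; the Phase type itself is not part of this excerpt. A minimal sketch of what it presumably is (a simple two-valued enum, an assumption rather than the library's actual definition):

from enum import Enum

class Phase(Enum):
    """ The two phases the AdSwitch excerpt above alternates between."""
    Estimation = 1
    Exploitation = 2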