def choiceWithRank(self, rank=1): r""" With probability :math:`1 - \varepsilon(t)`, use a Thompson Sampling step, otherwise use a UCB-Bayes step, to choose one arm of a certain rank.""" if rank == 1: return self.choice() else: assert rank >= 1, "Error: for AdBandits = {}, in choiceWithRank(rank={}) rank has to be >= 1.".format( self, rank) # Thompson Exploration if with_proba(1 - self.epsilon): # with proba 1-epsilon indexes = [ self.posterior[i].sample() for i in range(self.nbArms) ] # UCB-Bayes else: expectations = (1.0 + self.rewards) / (2.0 + self.pulls) upperbounds = [ self.posterior[arm].quantile(1. - 1. / self.t) for arm in range(self.nbArms) ] indexes = expectations - np.max(upperbounds) # We computed the indexes, OK let's use them sortedRewards = np.sort( indexes ) # XXX What happens here if two arms has the same index, being the max? chosenIndex = sortedRewards[-rank] # Uniform choice among the rank-th best arms return choice(np.nonzero(indexes == chosenIndex)[0])
def choice(self): """ Choose an arm, as described by the MEGA algorithm.""" self.t += 1 if self.chosenArm is not None: # We can still exploit that arm return self.chosenArm else: # We have to chose a new arm # Identify available arms availableArms = np.nonzero(self.tnext <= self.t)[0] if len(availableArms) == 0: print("Error: MEGA.choice() should 'Refrain from transmitting in this round' but my model does not allow this - YET ... Choosing a random arm.") # DEBUG self.chosenArm = rn.randint(self.nbArms) # XXX Choose a random arm # raise ValueError("FIXME MEGA.choice() should 'Refrain from transmitting in this round' but my model does not allow this - YET") else: # There is some available arms epsilon = self._epsilon_t() if with_proba(epsilon): # With proba epsilon_t newArm = rn.choice(availableArms) # Explore valid arms if self.chosenArm != newArm: self.p = self.p0 # Reinitialize proba p else: # Exploit: select the arm with highest meanRewards self.meanRewards[self.pulls != 0] = self.rewards[self.pulls != 0] / self.pulls[self.pulls != 0] # newArm = np.argmax(self.meanRewards) # Uniformly chosen if more than one arm has the highest index, but that's unlikely newArm = np.random.choice(np.nonzero(self.meanRewards == np.max(self.meanRewards))[0]) self.chosenArm = newArm return self.chosenArm
def getReward(self, arm, reward): """ Give reward for each child, and then update the trust probabilities.""" reward = float(reward) new_reward = renormalize_reward(reward, lower=self.lower, amplitude=self.amplitude, unbiased=False) # print(" A LearnExp player {} received a reward = {:.3g} on arm {} and trust = {:.3g} on that choice = {}, giving {:.3g} ...".format(self, reward, arm, self.trusts[self.last_choice], self.last_choice, new_reward)) # DEBUG # 1. First, give rewards to that slave, with probability rate / trusts probability = self.rate / self.trusts[self.last_choice] assert 0 <= probability <= 1, "Error: 'probability' = {:.3g} = rate = {:.3g} / trust_j^t = {:.3g} should have been in [0, 1]...".format(probability, self.rate, self.trusts[self.last_choice]) # DEBUG if with_proba(probability): self.children[self.last_choice].getReward(arm, reward) # 2. Then reinitialize this array of losses assert 0 <= new_reward <= 1, "Error: the normalized reward {:.3g} was NOT in [0, 1] ...".format(new_reward) # DEBUG loss = (1 - new_reward) if self.unbiased: loss /= self.trusts[self.last_choice] # 3. Update weight of that slave self.weights[self.last_choice] *= np.exp(- self.rate * loss) # 4. Recomputed the trusts from the weights # add uniform mixing of proportion rate=eta/N self.trusts = (1 - self.eta) * (self.weights / np.sum(self.weights)) + self.rate # self.trusts = trusts / np.sum(trusts) # XXX maybe this isn't necessary... # print(" The most trusted child policy is the {}th with confidence {}...".format(1 + np.argmax(self.trusts), np.max(self.trusts))) # DEBUG assert np.isclose(np.sum(self.trusts), 1), "Error: 'trusts' do not sum to 1 but to {:.3g} instead...".format(np.sum(self.trusts)) # DEBUG
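
# --- Illustration (not part of LearnExp): a minimal standalone sketch of the
# --- exponential-weights update used in steps 3-4 above, on toy values.
# --- The helper name and arguments are hypothetical; the names mirror the
# --- attributes of LearnExp (weights, eta, rate = eta / N) but are plain
# --- local variables here. Assumes numpy is imported as np in this module.
def _sketch_learnexp_update(weights, chosen, loss, eta):
    """Hypothetical helper: update one weight multiplicatively, then rebuild the trusts with a uniform mixing of rate = eta / N."""
    weights = np.array(weights, dtype=float)
    nb_children = len(weights)
    rate = eta / nb_children
    weights[chosen] *= np.exp(- rate * loss)                       # step 3: multiplicative update of the chosen child
    trusts = (1 - eta) * (weights / np.sum(weights)) + rate        # step 4: renormalize and mix with the uniform distribution
    assert np.isclose(np.sum(trusts), 1)                           # trusts always sum to 1 by construction
    return weights, trusts

# Example: _sketch_learnexp_update([1.0, 1.0, 1.0], chosen=0, loss=0.8, eta=0.3)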
def choice(self): """With a probability of epsilon, explore (uniform choice), otherwhise exploit based on just accumulated *rewards* (not empirical mean rewards).""" if with_proba(self.epsilon): # Proba epsilon : explore return rn.randint(0, self.nbArms) else: # Proba 1 - epsilon : exploit # Uniform choice among the best arms biased_means = self.rewards / (1 + self.pulls) return np.random.choice(np.nonzero(biased_means == np.max(biased_means))[0])
def choice(self): """ Choose an arm following the different phase of growing lengths according to the AdSwitchNew algorithm.""" # 1. Add checks for bad arms: for bad_arm in self.set_BAD: gap_Delta_hat_of_l_a = self.gap_Delta_tilde_of_l[bad_arm] for i in range(1, self.find_max_i(gap_Delta_hat_of_l_a) + 1): # assert 2**(-i) >= gap_Delta_hat_of_l_a/16 # DEBUG # ell, K, T = self.ell, self.nbArms, self.horizon probability_to_add_this_triplet = 2**(-i) * np.sqrt( self.ell / (self.nbArms * self.horizon * np.log(self.horizon))) print( "AdSwitchNew: for bad_arm = {}, gap Delta = {}, and i = {}, a new triplet can be added to the set S with probability = {}." .format(bad_arm, gap_Delta_hat_of_l_a, i, probability_to_add_this_triplet)) # DEBUG if with_proba(probability_to_add_this_triplet): triplet = (2**(-i), np.floor(2**(2 * i + 1) * np.log(self.horizon)), self.t) print( "\nAdSwitchNew: for bad_arm = {}, gap Delta = {}, and i = {}, the triplet = {} was added to the set S with probability = {}." .format(bad_arm, gap_Delta_hat_of_l_a, i, triplet, probability_to_add_this_triplet)) # DEBUG self.set_S[bad_arm].add(triplet) print(" self.set_S[bad_arm] =", self.set_S[bad_arm]) # DEBUG # 2. Select an arm: these_times_taus = [float('+inf') for arm in range(self.nbArms)] for arm in self.set_GOOD | { a for a in range(self.nbArms) if self.set_S[a] }: print( "AdSwitchNew: for arm = {}, in GOOD_(t) = {} or with set S_t(a) = {} not empty, at time t = {}." .format(arm, self.set_GOOD, self.set_S[arm], self.t)) # DEBUG look_ahead_in_past = 1 while look_ahead_in_past < len( self.history_of_plays ) and self.history_of_plays[-look_ahead_in_past] != arm: look_ahead_in_past += 1 these_times_taus[arm] = self.t - look_ahead_in_past print( "\nAdSwitchNew: for arm = {}, this time tau = {}, and t = {}, look ahead in past (t - min t') = {}." .format(arm, these_times_taus[arm], self.t, look_ahead_in_past)) # DEBUG chosen_arm = np.argmin(these_times_taus) self.history_of_plays.append(chosen_arm) if not np.all(np.isinf(these_times_taus)): print( "AdSwitchNew: for time t = {}, choosing {} = arg min {} non all = +inf, adding to history of plays..." .format(self.t, chosen_arm, these_times_taus)) # DEBUG return chosen_arm
def choiceMultiple(self, nb=1):
    if nb == 1:
        return np.array([self.choice()])
    else:
        # FIXME the explore/exploit balance should be for each choice, right?
        if with_proba(self.epsilon):  # Proba epsilon : explore
            return rn.choice(self.nbArms, size=nb, replace=False)
        else:  # Proba 1 - epsilon : exploit
            sortedRewards = np.sort(self.rewards)
            # Uniform choice among the best arms
            return rn.choice(np.nonzero(self.rewards >= sortedRewards[-nb])[0], size=nb, replace=False)
def choiceWithRank(self, rank=1): """With a probability of epsilon, explore (uniform choice), otherwhise exploit with the rank, based on just accumulated *rewards* (not empirical mean rewards).""" if rank == 1: return self.choice() else: if with_proba(self.epsilon): # Proba epsilon : explore return rn.randint(0, self.nbArms - 1) else: # Proba 1 - epsilon : exploit sortedRewards = np.sort(self.rewards) chosenIndex = sortedRewards[-rank] # Uniform choice among the rank-th best arms return rn.choice(np.nonzero(self.rewards == chosenIndex)[0])
def handleCollision(self, arm, reward=None): """ Get a new fully random rank, and give reward to the algorithm if not None.""" # rhoRandALOHA UCB indexes learn on the SENSING, not on the successful transmissions! if reward is not None: # print("Info: rhoRandALOHA UCB internal indexes DOES get updated by reward, in case of collision, learning is done on SENSING, not successful transmissions!") # DEBUG super(oneRhoRandALOHA, self).getReward(arm, reward) # 1. With probability 1-p, change the rank if with_proba(1. - self.p): self.rank = new_rank(self.rank, self.maxRank, self.forceChange) # New random rank, can be forced to be new or not. # print(" - A oneRhoRandALOHA player {} saw a collision, so she had to select a new random rank : {} ...".format(self, self.rank)) # DEBUG # print(" - A oneRhoRandALOHA player {} saw a collision, so she reinitialized her probability p from {:.5g} to {:.5g}...".format(self, self.p, self.p0)) # DEBUG self.p = self.p0 # Reinitialize the proba p
def bernoulliBinarization(r_t): """ Return a (random) binarization of a reward :math:`r_t`, in the continuous interval :math:`[0, 1]` as an observation in discrete :math:`{0, 1}`. - Useful to allow to use a Beta posterior for non-Bernoulli experiments, - That way, :class:`Thompson` sampling can be used for any continuous-valued bounded rewards. """ if r_t == 0: return 0 # Returns a int! elif r_t == 1: return 1 # Returns a int! else: assert 0 <= r_t <= 1, "Error: only bounded rewards in [0, 1] are supported by this Beta posterior right now." return int(with_proba(r_t))
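
# --- Illustration (not part of the library): a quick empirical check that the
# --- binarization above is unbiased, i.e. the mean of many binarized draws of a
# --- reward r_t is close to r_t. Hypothetical helper; assumes numpy is imported
# --- as np in this module, and uses bernoulliBinarization defined just above.
def _sketch_check_binarization(r_t=0.37, repetitions=100000):
    """Hypothetical helper: empirical mean of bernoulliBinarization(r_t), which should be close to r_t."""
    return np.mean([bernoulliBinarization(r_t) for _ in range(repetitions)])

# Example: _sketch_check_binarization(0.37) should return a value close to 0.37.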
def choiceFromSubSet(self, availableArms='all'):
    if (availableArms == 'all') or (len(availableArms) == self.nbArms):
        return self.choice()
    elif len(availableArms) == 0:
        print("WARNING: EpsilonGreedy.choiceFromSubSet({}): the argument availableArms of type {} should not be empty.".format(availableArms, type(availableArms)))  # DEBUG
        # WARNING if no arms are tagged as available, what to do? Choose an arm at random, or call choice() as if availableArms == 'all'
        return self.choice()
        # return np.random.randint(self.nbArms)
    else:
        if with_proba(self.epsilon):  # Proba epsilon : explore
            return rn.choice(availableArms)
        else:  # Proba 1 - epsilon : exploit
            # Uniform choice among the best available arms
            availableArms = np.asarray(availableArms)
            bestPositions = np.nonzero(self.rewards[availableArms] == np.max(self.rewards[availableArms]))[0]
            return rn.choice(availableArms[bestPositions])  # map positions in availableArms back to actual arm indexes
def handleCollision(self, arm, reward=None): """ Handle a collision, on arm of index 'arm'. .. warning:: This method has to be implemented in the collision model, it is NOT implemented in the EvaluatorMultiPlayers. .. note:: We do not care on which arm the collision occured. """ # print(" ---------> A oneALOHA player saw a collision on arm {}, at time t = {} ... Currently, p = {} ...".format(arm, self.t, self.p)) # DEBUG # self.getReward(arm, self.mother.lower) # FIXED should we give a 0 reward ? Not in this model! # 1. With proba 1 - p, give up if with_proba(1 - self.p): # Random time offset until when this arm self.chosenArm is not sampled delta_tnext_k = rn.randint(low=0, high=1 + int(self.ftnext(self.t))) self.tnext[self.chosenArm] = self.t + 1 + delta_tnext_k # print(" - Reaction to collision on arm {}, at time t = {} : delta_tnext_k = {}, tnext[{}] = {} ...".format(arm, self.t, delta_tnext_k, self.chosenArm, self.tnext[self.chosenArm])) # DEBUG self.p = self.p0 # Reinitialize the proba p self.chosenArm = None # We give up this arm
def handleCollision(self, arm, reward=None): """ Handle a collision, on arm of index 'arm'. - Warning: this method has to be implemented in the collision model, it is NOT implemented in the EvaluatorMultiPlayers. .. note:: We do not care on which arm the collision occured. """ assert self.chosenArm == arm, "Error: a MEGA player can only see a collision on her chosenArm. Here, arm = {} != chosenArm = {} ...".format(arm, self.chosenArm) # DEBUG # print("- A MEGA player saw a collision on arm {}, and time t = {} ...".format(arm, self.t)) # DEBUG # # 1. With proba p, persist XXX useless code # # if with_proba(self.p): # # self.chosenArm = self.chosenArm # 2. With proba 1 - p, give up if with_proba(1 - self.p): # Random time offset until when this arm self.chosenArm is not sampled delta_tnext_k = rn.randint(low=0, high=1 + int(self.t**self.beta)) self.tnext[self.chosenArm] = self.t + delta_tnext_k # Reinitialize the proba p self.p = self.p0 # We give up this arm self.chosenArm = None
def choice(self): r""" With probability :math:`1 - \varepsilon(t)`, use a Thompson Sampling step, otherwise use a UCB-Bayes step, to choose one arm.""" # Thompson Exploration if with_proba(1 - self.epsilon): # with proba 1-epsilon upperbounds = [ self.posterior[i].sample() for i in range(self.nbArms) ] maxIndex = max(upperbounds) bestArms = [ arm for (arm, index) in enumerate(upperbounds) if index == maxIndex ] arm = choice(bestArms) # UCB-Bayes else: expectations = (1.0 + self.rewards) / (2.0 + self.pulls) upperbounds = [ self.posterior[arm].quantile(1. - 1. / self.t) for arm in range(self.nbArms) ] regret = np.max(upperbounds) - expectations admissible = np.nonzero(regret == np.min(regret))[0] arm = choice(admissible) return arm
def DepRound(weights_p, k=1):
    r""" [[Algorithms for adversarial bandit problems with multiple plays, by T.Uchiya, A.Nakamura and M.Kudo, 2010](http://hdl.handle.net/2115/47057)] Figure 5 (page 15) is a very clean presentation of the algorithm.

    - Inputs: :math:`k < K` and weights_p :math:`= (p_1, \dots, p_K)` such that :math:`\sum_{i=1}^{K} p_i = k` (or :math:`= 1`).
    - Output: A subset of :math:`\{1,\dots,K\}` with exactly :math:`k` elements. Each action :math:`i` is selected with probability exactly :math:`p_i`.

    Example:

    >>> import numpy as np; import random
    >>> np.random.seed(0); random.seed(0)  # for reproducibility!
    >>> K = 5
    >>> k = 2

    >>> weights_p = [ 2, 2, 2, 2, 2 ]  # all equal weights
    >>> DepRound(weights_p, k)
    [3, 4]
    >>> DepRound(weights_p, k)
    [3, 4]
    >>> DepRound(weights_p, k)
    [0, 1]

    >>> weights_p = [ 10, 8, 6, 4, 2 ]  # decreasing weights
    >>> DepRound(weights_p, k)
    [0, 4]
    >>> DepRound(weights_p, k)
    [1, 2]
    >>> DepRound(weights_p, k)
    [3, 4]

    >>> weights_p = [ 3, 3, 0, 0, 3 ]  # some zero weights
    >>> DepRound(weights_p, k)
    [0, 4]
    >>> DepRound(weights_p, k)
    [0, 4]
    >>> DepRound(weights_p, k)
    [0, 4]
    >>> DepRound(weights_p, k)
    [0, 1]

    - See [[Gandhi et al, 2006](http://dl.acm.org/citation.cfm?id=1147956)] for the details.
    """
    p = np.array(weights_p)
    K = len(p)
    # Checks
    assert k < K, "Error: k = {} should be < K = {}.".format(k, K)  # DEBUG
    if not np.isclose(np.sum(p), 1):
        p = p / np.sum(p)
    assert np.all(0 <= p) and np.all(p <= 1), "Error: the weights (p_1, ..., p_K) = {} should all be 0 <= p_i <= 1 ...".format(p)  # DEBUG
    assert np.isclose(np.sum(p), 1), "Error: the sum of weights p_1 + ... + p_K should be = 1 (= {}).".format(np.sum(p))  # DEBUG
    # Main loop
    possible_ij = [a for a in range(K) if 0 < p[a] < 1]
    while possible_ij:
        # Choose distinct i, j with 0 < p_i, p_j < 1
        if len(possible_ij) == 1:
            i = np.random.choice(possible_ij, size=1)
            j = i
        else:
            i, j = np.random.choice(possible_ij, size=2, replace=False)
        pi, pj = p[i], p[j]
        assert 0 < pi < 1, "Error: pi = {} (with i = {}) is not 0 < pi < 1.".format(pi, i)  # DEBUG
        assert 0 < pj < 1, "Error: pj = {} (with j = {}) is not 0 < pj < 1.".format(pj, j)  # DEBUG
        assert i != j, "Error: i = {} should be different from j = {}.".format(i, j)  # DEBUG
        # Set alpha, beta
        alpha, beta = min(1 - pi, pj), min(pi, 1 - pj)
        proba = alpha / (alpha + beta)
        if with_proba(proba):  # with probability = proba = alpha / (alpha + beta)
            pi, pj = pi + alpha, pj - alpha
        else:  # with probability = 1 - proba = beta / (alpha + beta)
            pi, pj = pi - beta, pj + beta
        # Store
        p[i], p[j] = pi, pj
        # And update
        possible_ij = [a for a in range(K) if 0 < p[a] < 1]
        if len([a for a in range(K) if np.isclose(p[a], 0)]) == K - k:
            break
    # Final step
    subset = [a for a in range(K) if np.isclose(p[a], 1)]
    if len(subset) < k:
        subset = [a for a in range(K) if not np.isclose(p[a], 0)]
    assert len(subset) == k, "Error: DepRound({}, {}) is supposed to return a set of size {}, but {} has size {} ...".format(weights_p, k, k, subset, len(subset))  # DEBUG
    return subset
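
# --- Illustration (not part of DepRound): count how often each index is returned
# --- over many calls, to eyeball the empirical inclusion frequencies against the
# --- (normalized) input weights. Hypothetical helper; assumes numpy is imported
# --- as np and uses DepRound defined just above.
def _sketch_depround_frequencies(weights_p, k=2, repetitions=10000):
    """Hypothetical helper: empirical inclusion frequency of each index over repeated calls to DepRound."""
    K = len(weights_p)
    counts = np.zeros(K)
    for _ in range(repetitions):
        for i in DepRound(weights_p, k=k):
            counts[i] += 1
    return counts / repetitions  # one frequency per index; the frequencies sum to k

# Example: _sketch_depround_frequencies([10, 8, 6, 4, 2], k=2)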
def choiceWithRank(self, rank=1): r""" With a probability :math:`\alpha`, play uniformly at random, otherwise, pass the call to :meth:`choiceWithRank` of the underlying policy.""" if with_proba(self.proba_random_exploration): return np.random.randint(0, self.nbArms - 1) return self.policy.choiceWithRank(rank=1)
def choice(self): """ Choose an arm following the different phase of growing lengths according to the AdSwitch algorithm.""" # print("For a {} policy: t = {}, current_exploration_arm = {}, current_exploitation_arm = {}, batch_number = {}, length_of_current_phase = {}, step_of_current_phase = {}".format(self, self.t, self.current_exploration_arm, self.current_exploitation_arm, self.batch_number, self.length_of_current_phase, self.step_of_current_phase)) # DEBUG # 1) exploration # -------------- if self.phase == Phase.Estimation: # beginning of exploration phase if self.current_exploration_arm is None: self.current_exploration_arm = -1 # Round-Robin phase self.current_exploration_arm = (self.current_exploration_arm + 1) % self.nbArms # go for next arm # Test! saw_a_change, sigma = self.statistical_test(self.t, self.last_restart_time) if saw_a_change: mus = [ mymean(self.read_range_of_rewards(a, sigma, self.t)) for a in range(self.nbArms) ] self.current_best_arm = np.argmax(mus) self.current_worst_arm = np.argmin(mus) self.current_estimated_gap = abs(mus[self.current_best_arm] - mus[self.current_worst_arm]) self.last_restart_time = self.t # change of phase self.length_of_current_phase = None # flag to start the next one self.phase = Phase.Exploitation self.step_of_current_phase = 0 self.current_exploration_arm = 0 # note that this last update might force to sample the arm 0 instead of arm K-1, once in a while... return self.current_exploration_arm # 2) exploitation # --------------- elif self.phase == Phase.Exploitation: # if in a phase, do it if self.length_of_current_phase is not None and self.step_of_current_phase < self.length_of_current_phase: # beginning of exploration phase if self.current_exploitation_arm is None: self.current_exploitation_arm = -1 # Round-Robin phase if self.current_exploitation_arm >= self.nbArms: self.step_of_current_phase += 1 # explore each arm, ONE more time! 
self.current_exploitation_arm = (self.current_exploitation_arm + 1) % self.nbArms # go for next arm else: if self.current_exploitation_arm is None: self.current_exploitation_arm = 0 # test for a change of size d_i compute_new_di_pi_si = self.last_used_di_pi_si is None if not compute_new_di_pi_si: di, pi, si = self.last_used_di_pi_si t1 = self.last_restart_time t2 = self.t + 1 mus = [ np.mean(self.read_range_of_rewards(a, t1, t2)) for a in range(self.nbArms) ] current_best_mean = np.max(mus) current_worst_mean = np.min(mus) print("Info: the test |mu_a[t1,t2] - mu_b[t1,t2] - Delta| > di/4 is {} for a = best = {}, b = worst = {}, t1 = {} and t2 = {}, Delta = {} and di = {}...".format(abs(current_best_mean - current_worst_mean - self.current_estimated_gap) > di / 4, np.argmax(mus), np.argmin(mus), t1, t2, self.current_estimated_gap, di)) # DEBUG if abs(current_best_mean - current_worst_mean - self.current_estimated_gap) > di / 4: print("Info: the test |mu_a[t1,t2] - mu_b[t1,t2] - Delta| > di/4 was true for a = best = {}, b = worst = {}, t1 = {} and t2 = {}, Delta = {} and di = {}...".format(np.argmax(mus), np.argmin(mus), t1, t2, self.current_estimated_gap, di)) # DEBUG # go back to Estimation phase self.phase = Phase.Estimation self.length_of_current_phase = None # flag to start the next one self.step_of_current_phase = 0 self.current_exploration_arm = 0 self.batch_number += 1 else: compute_new_di_pi_si = True if compute_new_di_pi_si: di_values, pi_values, si_values = self.compute_di_pi_si() proba_of_checking = np.sum(pi_values) assert 0 <= proba_of_checking < 1, "Error: the sum of pi should be < 1 but it is = {}, impossible to do a Step 5 of Exploitation!".format(proba_of_checking) if proba_of_checking > 0: for di, pi, si in zip(di_values, pi_values, si_values): if with_proba(pi): # Start a checking phase! self.last_used_di_pi_si = (di, pi, si) self.length_of_current_phase = si break # --- # DONE OK I understood correctly this sentence, my implementation is correct! # Then for any i from {1, 2,..., Ik} with probability p_k,i, sample both arms alternatingly for si steps # --- # this will make the test sample each arm alternatingly for s_i steps to check for changes of size d_i # if no checking is performed at current time step t, then select best arm, and repeat checking phase. return self.current_exploitation_arm else: raise ValueError("Error: AdSwitch should only be in phase Exploration or Checking or Exploitation.")