class CorralLearner(Learner):
    """A meta-learner that takes a collection of learners and determines which is best in an environment.

    This is an implementation of the Agarwal et al. (2017) Corral algorithm and requires that the reward is always in [0,1].

    References:
        Agarwal, Alekh, Haipeng Luo, Behnam Neyshabur, and Robert E. Schapire. "Corralling a band of bandit
        algorithms." In Conference on Learning Theory, pp. 12-38. PMLR, 2017.
    """

    def __init__(self, learners: Sequence[Learner], eta: float = 0.075, T: float = math.inf, mode: Literal["importance","rejection","off-policy"] = "importance", seed: int = 1) -> None:
        """Instantiate a CorralLearner.

        Args:
            learners: The collection of base learners.
            eta: The learning rate. This controls how quickly Corral picks a best base_learner.
            T: The number of interactions expected during the learning process. A small T will cause the
                learning rate to shrink towards 0 quickly while a large value for T will cause the learning
                rate to shrink towards 0 slowly. A value of inf means that the learning rate will remain constant.
            mode: Determines the method with which feedback is provided to the base learners. The original
                paper used importance sampling. We also support `off-policy` and `rejection`.
            seed: A seed for random number generation in order to get repeatable results.
        """

        if mode not in ["importance", "off-policy", "rejection"]:
            raise CobaException("The provided `mode` for CorralLearner was unrecognized.")

        self._base_learners = [ SafeLearner(learner) for learner in learners ]

        M = len(self._base_learners)

        self._T     = T
        self._gamma = 1/T
        self._beta  = 1/math.exp(1/math.log(T))

        self._eta_init = eta
        self._etas     = [ eta ] * M
        self._rhos     = [ float(2*M) ] * M
        self._ps       = [ 1/M ] * M
        self._p_bars   = [ 1/M ] * M

        self._mode = mode

        self._random_pick   = CobaRandom(seed)
        self._random_reject = CobaRandom(CobaRandom(seed).randint(0,10000))

    @property
    def params(self) -> Dict[str, Any]:
        return { "family": "corral", "eta": self._eta_init, "mode": self._mode, "T": self._T, "B": [ str(b) for b in self._base_learners ], "seed": self._random_pick._seed }

    def predict(self, context: Context, actions: Sequence[Action]) -> Tuple[Probs, Info]:

        base_predicts = [ base_algorithm.predict(context, actions) for base_algorithm in self._base_learners ]
        base_predicts, base_infos = zip(*base_predicts)

        if self._mode in ["importance"]:
            base_actions = [ self._random_pick.choice(actions, predict) for predict in base_predicts ]
            base_probs   = [ predict[actions.index(action)] for action, predict in zip(base_actions, base_predicts) ]

            predict = [ sum([p_b*int(a==b_a) for p_b, b_a in zip(self._p_bars, base_actions)]) for a in actions ]
            info    = (base_actions, base_probs, base_infos, base_predicts, actions, predict)

        if self._mode in ["off-policy", "rejection"]:
            predict = [ sum([p_b*b_p[i] for p_b, b_p in zip(self._p_bars, base_predicts)]) for i in range(len(actions)) ]
            info    = (None, None, base_infos, base_predicts, actions, predict)

        return (predict, info)

    def learn(self, context: Context, action: Action, reward: float, probability: float, info: Info) -> None:

        assert 0 <= reward and reward <= 1, "This Corral implementation assumes a reward between 0 and 1"

        base_actions = info[0]
        base_probs   = info[1]
        base_infos   = info[2]
        base_preds   = info[3]
        actions      = info[4]
        predict      = info[5]

        if self._mode == "importance":
            # This is what is in the original paper. It has the following characteristics:
            #   > It is able to provide feedback to every base learner on every iteration
            #   > It uses a reward estimator with higher variance and no bias (aka, importance sampling)
            #   > It is "on-policy" with respect to the base learners' prediction distributions
            # The reward, R, supplied to the base learners satisfies E[R|context,A] = E[reward|context,A]

            for learner, A, P, base_info in zip(self._base_learners, base_actions, base_probs, base_infos):
                R = reward * int(A==action)/probability
                learner.learn(context, A, R, P, base_info)

        if self._mode == "off-policy":
            # An alternative variation to the paper is provided below. It has the following characteristics:
            #   > It is able to provide feedback to every base learner on every iteration
            #   > It uses a MVUB reward estimator (aka, the unmodified, observed reward)
            #   > It is "off-policy" (i.e., base learners receive action feedback distributed differently from their predicts).

            for learner, base_info in zip(self._base_learners, base_infos):
                learner.learn(context, action, reward, probability, base_info)

        if self._mode == "rejection":
            # An alternative variation to the paper is provided below. It has the following characteristics:
            #   > It doesn't necessarily provide feedback to every base learner on every iteration
            #   > It uses a MVUB reward estimator (aka, the unmodified, observed reward) when it does provide feedback
            #   > It is "on-policy" (i.e., base learners receive action feedback distributed identically to their predicts).

            p = self._random_reject.random() # a single uniform draw is shared across all base learners' rejection tests
            for learner, base_info, base_predict in zip(self._base_learners, base_infos, base_preds):
                f = lambda a: base_predict[actions.index(a)] # the PMF we want
                g = lambda a: predict[actions.index(a)]      # the PMF we have

                M = max([ f(A)/g(A) for A in actions if g(A) > 0 ])

                if p <= f(action)/(M*g(action)):
                    learner.learn(context, action, reward, f(action), base_info)

        # Instant loss is an unbiased estimate of E[loss|learner] for this iteration.
        # Our estimate differs from the original Corral paper because we have access to the
        # action probabilities of the base learners while the Corral paper did not assume
        # access to this information. This information allows for a loss estimator with the same
        # expectation as the original Corral paper's estimator but with a lower variance.

        loss = 1-reward

        picked_index = actions.index(action)
        instant_loss = [ loss * base_pred[picked_index]/probability for base_pred in base_preds ]
        self._ps     = CorralLearner._log_barrier_omd(self._ps, instant_loss, self._etas)
        self._p_bars = [ (1-self._gamma)*p + self._gamma*1/len(self._base_learners) for p in self._ps ]

        for i in range(len(self._base_learners)):
            if 1/self._p_bars[i] > self._rhos[i]:
                self._rhos[i] = 2/self._p_bars[i]
                self._etas[i] *= self._beta

        base_predict_data = { f"predict_{i}": base_preds[i][picked_index] for i in range(len(self._base_learners)) }
        base_pbar_data    = { f"pbar_{i}"   : self._p_bars[i]             for i in range(len(self._base_learners)) }
        predict_data      = { "predict": probability, **base_predict_data, **base_pbar_data }

        InteractionContext.learner_info.update({**predict_data, **base_predict_data, **base_pbar_data})

    @staticmethod
    def _log_barrier_omd(ps, losses, etas) -> Sequence[float]:

        f  = lambda l: float(sum([ 1/((1/p) + eta*(loss-l))      for p, eta, loss in zip(ps, etas, losses) ]))
        df = lambda l: float(sum([ eta/((1/p) + eta*(loss-l))**2 for p, eta, loss in zip(ps, etas, losses) ]))

        denom_zeros = [ ((-1/p)-(eta*loss))/-eta for p, eta, loss in zip(ps, etas, losses) ]

        min_loss = min(losses)
        max_loss = max(losses)

        precision = 4

        def binary_search(l, r) -> Optional[float]:
            # in theory the bracket check below should guarantee this has a solution
            while True:
                x = (l+r)/2
                y = f(x)

                if round(y, precision) == 1:
                    return x

                if y < 1:
                    l = x

                if y > 1:
                    r = x

        def find_root_of_1():
            brackets = list(sorted(filter(lambda z: min_loss <= z and z <= max_loss, set(denom_zeros + [min_loss, max_loss]))))

            for l_brack, r_brack in zip(brackets[:-1], brackets[1:]):
                if (f(l_brack+.00001)-1) * (f(r_brack-.00001)-1) >= 0:
                    continue
                else:
                    # we use binary search because Newton's
                    # method can overshoot our objective
                    return binary_search(l_brack, r_brack)

        lmbda: Optional[float] = None

        if min_loss == max_loss:
            lmbda = min_loss
        elif min_loss not in denom_zeros and round(f(min_loss), precision) == 1:
            lmbda = min_loss
        elif max_loss not in denom_zeros and round(f(max_loss), precision) == 1:
            lmbda = max_loss
        else:
            lmbda = find_root_of_1()

        if lmbda is None:
            raise Exception(f'Something went wrong in Corral OMD {ps}, {etas}, {losses}')

        new_ps = [ 1/((1/p) + eta*(loss-lmbda)) for p, eta, loss in zip(ps, etas, losses) ]

        assert round(sum(new_ps), precision) == 1, "An invalid update was made by the log barrier in Corral"

        return new_ps
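
# Illustrative sanity check, not part of the library's API (the helper name below is
# hypothetical). It is a minimal sketch exercising the `_log_barrier_omd` step defined
# above: probability mass should move toward the base learner with the lowest estimated
# loss while the result remains (approximately) a valid distribution.
def _example_log_barrier_omd_sanity_check() -> None:
    ps     = [1/3, 1/3, 1/3]       # current sampling probabilities over 3 base learners
    losses = [0.9, 0.1, 0.5]       # importance-weighted loss estimates for this round
    etas   = [0.075, 0.075, 0.075] # per-learner learning rates

    new_ps = CorralLearner._log_barrier_omd(ps, losses, etas)

    assert abs(sum(new_ps) - 1) < 1e-3   # still sums to one up to the method's precision
    assert new_ps[1] == max(new_ps)      # the lowest-loss learner gains the most mass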
def __init__(self, n_interactions: int, n_actions: int = 10, n_context_features: int = 10, n_action_features: int = 10, n_neighborhoods: int = 10, seed: int = 1) -> None:
    """Instantiate a NeighborsSyntheticSimulation.

    Args:
        n_interactions: The number of interactions the simulation should have.
        n_actions: The number of actions each interaction should have.
        n_context_features: The number of features each context should have.
        n_action_features: The number of features each action should have.
        n_neighborhoods: The number of neighborhoods the simulation should have.
        seed: The random number seed used to generate all contexts and action rewards.
    """

    self._args = (n_interactions, n_actions, n_context_features, n_action_features, n_neighborhoods, seed)

    self._n_interactions  = n_interactions
    self._n_actions       = n_actions
    self._n_context_feats = n_context_features
    self._n_action_feats  = n_action_features
    self._n_neighborhoods = n_neighborhoods
    self._seed            = seed

    rng = CobaRandom(self._seed)

    def context_gen():
        return tuple(rng.gausses(n_context_features, 0, 1)) if n_context_features else None

    def actions_gen():
        if not n_action_features:
            return OneHotEncoder().fit_encodes(range(n_actions))
        else:
            return [ tuple(rng.gausses(n_action_features, 0, 1)) for _ in range(n_actions) ]

    contexts               = list(set([ context_gen() for _ in range(self._n_neighborhoods) ]))
    context_actions        = { c: actions_gen() for c in contexts }
    context_action_rewards = { (c,a): rng.random() for c in contexts for a in context_actions[c] }

    context_iter = iter(islice(cycle(contexts), n_interactions))

    def context(index: int):
        return next(context_iter)

    def actions(index: int, context: Tuple[float, ...]):
        return context_actions[context]

    def reward(index: int, context: Tuple[float, ...], action: Tuple[int, ...]):
        return context_action_rewards[(context, action)]

    super().__init__(self._n_interactions, context, actions, reward)
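
# Illustrative sketch, standalone and standard-library only (the helper name and the
# stand-in ids below are hypothetical). It shows the context schedule used above: the
# n_neighborhoods generated contexts are repeated in a fixed cycle, so each neighborhood
# is revisited roughly n_interactions / n_neighborhoods times, and the reward for every
# (context, action) pair stays constant across those revisits.
def _example_neighborhood_cycle() -> None:
    from itertools import cycle, islice

    neighborhood_ids = [0, 1, 2]                                 # stand-in for 3 generated contexts
    schedule         = list(islice(cycle(neighborhood_ids), 8))  # 8 interactions

    assert schedule == [0, 1, 2, 0, 1, 2, 0, 1]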
def __init__(self, n_interactions: int = 500, n_actions: int = 10, n_features: int = 10, context_features: bool = True, action_features: bool = True, sparse: bool = False, seed: int = 1) -> None:
    """Instantiate the simulation.

    Args:
        n_interactions: The number of interactions the simulation should have.
        n_actions: The number of actions each interaction should have.
        n_features: The number of features contexts and actions should have.
        context_features: Whether interactions should have context features.
        action_features: Whether actions should have features.
        sparse: Whether features should be represented in a sparse format.
        seed: The random number seed used to generate all features and rewards.
    """

    self._n_bandits        = n_actions
    self._n_features       = n_features
    self._context_features = context_features
    self._action_features  = action_features
    self._seed             = seed

    r = CobaRandom(seed)

    context: Callable[[int], Context]
    actions: Callable[[int, Context], Sequence[Action]]
    rewards: Callable[[int, Context, Action], float]

    sparsify  = lambda x: (tuple(range(len(x))), tuple(x)) if sparse else tuple(x)
    unsparse  = lambda x: x[1] if sparse else x
    normalize = lambda X: [ x/sum(X) for x in X ]

    if not context_features and not action_features:

        means = [ m/n_actions + 1/(2*n_actions) for m in r.randoms(n_actions) ]

        actions_features = []
        for i in range(n_actions):
            action = [0] * n_actions
            action[i] = 1
            actions_features.append(tuple(action))

        context = lambda i: None
        actions = lambda i, c: sparsify(actions_features)
        rewards = lambda i, c, a: means[unsparse(a).index(1)] + (r.random()-.5)/n_actions

    if context_features and not action_features:
        # normalizing allows us to make sure our reward is in [0,1]
        bandit_thetas = [ r.randoms(n_features) for _ in range(n_actions) ]
        theta_totals  = [ sum(theta) for theta in bandit_thetas ]
        bandit_thetas = [ [ t/norm for t in theta ] for theta, norm in zip(bandit_thetas, theta_totals) ]

        actions_features = []
        for i in range(n_actions):
            action = [0] * n_actions
            action[i] = 1
            actions_features.append(tuple(action))

        context = lambda i: sparsify(r.randoms(n_features))
        actions = lambda i, c: [ sparsify(af) for af in actions_features ]
        rewards = lambda i, c, a: sum([ cc*t for cc, t in zip(unsparse(c), bandit_thetas[unsparse(a).index(1)]) ])

    if not context_features and action_features:

        theta = r.randoms(n_features)

        context = lambda i: None
        actions = lambda i, c: [ sparsify(normalize(r.randoms(n_features))) for _ in range(r.randint(2,10)) ]
        rewards = lambda i, c, a: float(sum([ cc*t for cc, t in zip(theta, unsparse(a)) ]))

    if context_features and action_features:

        context = lambda i: sparsify(r.randoms(n_features))
        actions = lambda i, c: [ sparsify(normalize(r.randoms(n_features))) for _ in range(r.randint(2,10)) ]
        rewards = lambda i, c, a: sum([ cc*t for cc, t in zip(unsparse(c), unsparse(a)) ])

    super().__init__(n_interactions, context, actions, rewards)
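
# Illustrative sketch, standalone (the helper name is hypothetical, and it assumes
# r.randoms draws uniformly from [0,1], as the normalization comment above suggests).
# It demonstrates why normalizing theta keeps the linear reward in [0,1]: with theta
# scaled to sum to 1 and context features in [0,1], the reward sum(c_j * theta_j) is a
# weighted average of the context features.
def _example_normalized_linear_reward() -> None:
    import random

    rng     = random.Random(1)
    theta   = [ rng.random() for _ in range(10) ]
    total   = sum(theta)
    theta   = [ t/total for t in theta ]          # normalize so the weights sum to 1
    context = [ rng.random() for _ in range(10) ] # features in [0,1]

    reward = sum(c*t for c, t in zip(context, theta))

    assert 0.0 <= reward <= 1.0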