def __init__(self, base_learners: Sequence[Learner], eta: float, T: float = math.inf, seed: int = None) -> None:
    """Instantiate a CorralLearner.

    Args:
        base_learners: The collection of algorithms to use as base learners.
        eta: The learning rate. In our experiments a value between 0.05 and .10 often seemed best.
        T: The number of interactions expected during the learning process. In our experiments
            Corral performance seemed relatively insensitive to this value.
        seed: A seed for random number generation in order to get repeatable results.
    """
    self._base_learners = base_learners
    n_base = len(self._base_learners)

    # Both schedule constants are derived from the expected horizon T.
    self._gamma = 1/T
    self._beta = 1/math.exp(1/math.log(T))

    # Every base learner starts with the same learning rate, threshold and weight.
    self._eta_init = eta
    self._etas = [eta for _ in range(n_base)]
    self._rhos = [float(2*n_base) for _ in range(n_base)]
    self._ps = [1/n_base for _ in range(n_base)]
    self._p_bars = [1/n_base for _ in range(n_base)]

    self._random = CobaRandom(seed)

    # Per-interaction bookkeeping, indexed by interaction key.
    self._base_action_picks: Dict[Key, Sequence[Action]] = {}
    self._base_action_probs: Dict[Key, Sequence[float]] = {}
def filter(self, items: Iterable[Any]) -> Sequence[Any]:
    """Randomly sample items, holding at most `self._max_count` of them in memory.

    Args:
        items: The items to sample from. Consumed at most once.

    Returns:
        The result of `Take(self._count).filter(...)` applied to either the fully
        shuffled items (when `self._max_count` is None) or to the sampled reservoir.
    """
    rng = CobaRandom(self._seed)

    if self._max_count == 0:
        return []

    if self._max_count is None:
        return Take(self._count).filter(rng.shuffle(list(items)))

    W = 1
    items = iter(items)
    reservoir = rng.shuffle(list(islice(items, self._max_count)))

    try:
        while True:
            r1, r2, r3 = rng.randoms(3)

            # `self._max_count` is known to be non-zero here because of the early return above.
            W = W * math.exp(math.log(r1) / self._max_count)

            # S is how many upcoming items are skipped before the next replacement.
            S = math.floor(math.log(r2) / math.log(1 - W))

            # The right-hand side is evaluated first, so running out of items raises
            # StopIteration before any reservoir slot is touched. The small offset
            # keeps the index in range should r3 come back as exactly 1.
            reservoir[int(r3 * self._max_count - .001)] = next(islice(items, S, S + 1))
    except StopIteration:
        pass

    return Take(self._count).filter(reservoir)
def filter(self, interactions: Iterable[Interaction]) -> Iterable[Interaction]:
    """Move interactions from the end of the sequence to randomly chosen earlier positions.

    Args:
        interactions: The interactions to rearrange. They are copied into a new list.

    Returns:
        A new list containing the same interactions in their rearranged order.
    """
    rng = CobaRandom(self._seed)
    reordered = list(interactions)

    n_moves = int(len(reordered) / (self._spacing + 1))

    for move in range(n_moves):
        # The destination is drawn before the tail element is removed, matching the
        # left-to-right evaluation order of a single insert(index, pop()) expression.
        destination = move * self._spacing + rng.randint(0, self._spacing)
        tail = reordered.pop()
        reordered.insert(destination, tail)

    return reordered
def filter(self, interactions: Iterable[SimulatedInteraction]) -> Iterable[Interaction]:
    """Lazily convert the first `self._n_warm` interactions into logged interactions.

    Args:
        interactions: The simulated interactions to process.

    Returns:
        A lazy iterable yielding the converted leading interactions followed by the
        untouched remainder of the input.
    """
    # Stored on the instance because `_to_logged_interaction` draws from it.
    self._rng = CobaRandom(self._seed)

    remaining = iter(interactions)
    warm_slice = islice(remaining, self._n_warm)

    # `remaining` is shared, so whatever the slice does not consume is chained afterwards.
    return chain(map(self._to_logged_interaction, warm_slice), remaining)
def filter(self, interactions: Iterable[SimulatedInteraction]) -> Iterable[SimulatedInteraction]:
    """Yield a noisy copy of each simulated interaction.

    Args:
        interactions: The interactions to add noise to.

    Yields:
        A new SimulatedInteraction whose context, actions and (when `rewards` is present
        and reward noise is configured) rewards were passed through `self._noises`.

    Raises:
        CobaException: If a LoggedInteraction is encountered.
    """
    rng = CobaRandom(self._seed)

    for interaction in interactions:

        if isinstance(interaction, LoggedInteraction):
            raise CobaException("We do not currently support adding noise to a LoggedInteraction.")

        # Noise is applied in a fixed order (context, actions, rewards) because
        # all three share one random number generator.
        noisy_context = self._noises(interaction.context, rng, self._context_noise)

        noisy_actions = []
        for action in interaction.actions:
            noisy_actions.append(self._noises(action, rng, self._action_noise))

        extra = {}
        if 'rewards' in interaction.kwargs and self._reward_noise:
            extra['rewards'] = self._noises(interaction.kwargs['rewards'], rng, self._reward_noise)

        yield SimulatedInteraction(noisy_context, noisy_actions, **extra)
def process(self, learner: Learner, interactions: Iterable[SimulatedInteraction]) -> Iterable[Dict[Any, Any]]:
    """Run a learner over simulated interactions and yield one result row per interaction.

    Args:
        learner: The learner to evaluate. It is wrapped in a SafeLearner if it is not one already.
        interactions: The simulated interactions to evaluate the learner on.

    Yields:
        A dict merging, in this order, the interaction's kwargs (sequence values are reduced
        to the entry for the chosen action), the contents of `InteractionContext.learner_info`,
        and, when `self._time` is truthy, the predict and learn wall-clock times.
    """
    rng = CobaRandom(self._seed)

    if not isinstance(learner, SafeLearner):
        learner = SafeLearner(learner)

    if not interactions:
        return

    for interaction in interactions:

        InteractionContext.learner_info.clear()

        context = interaction.context
        actions = interaction.actions

        start_time = time.time()
        probs, info = learner.predict(context, actions)
        predict_time = time.time() - start_time

        action = rng.choice(actions, probs)

        # Locate the chosen action once; every per-action lookup below reuses this index.
        action_index = actions.index(action)

        # "reveals" takes precedence over "rewards" as the feedback shown to the learner.
        reveal = interaction.kwargs.get("reveals", interaction.kwargs.get("rewards"))[action_index]
        prob = probs[action_index]

        start_time = time.time()
        learner.learn(context, action, reveal, prob, info)
        learn_time = time.time() - start_time

        learner_info = InteractionContext.learner_info
        interaction_info = {}

        for k, v in interaction.kwargs.items():
            # Non-string sequences are treated as holding one value per action.
            if isinstance(v, collections.abc.Sequence) and not isinstance(v, str):
                interaction_info[k] = v[action_index]
            else:
                interaction_info[k] = v

        time_info = {"predict_time": predict_time, "learn_time": learn_time} if self._time else {}

        yield {**interaction_info, **learner_info, **time_info}
def __init__(self,
    learners: Sequence[Learner],
    eta : float = 0.075,
    T : float = math.inf,
    mode : Literal["importance","rejection","off-policy"] ="importance",
    seed : int = 1) -> None:
    """Instantiate a CorralLearner.

    Args:
        learners: The collection of base learners.
        eta: The learning rate. This controls how quickly Corral picks a best base_learner.
        T: The number of interactions expected during the learning process. A small T will cause
            the learning rate to shrink towards 0 quickly while a large value for T will cause the
            learning rate to shrink towards 0 slowly. A value of inf means that the learning rate
            will remain constant.
        mode: Determines the method with which feedback is provided to the base learners. The
            original paper used importance sampling. We also support `off-policy` and `rejection`.
        seed: A seed for random number generation in order to get repeatable results.

    Raises:
        CobaException: If `mode` is not one of the supported values.
    """
    supported_modes = ["importance", "off-policy", "rejection"]
    if mode not in supported_modes:
        raise CobaException("The provided `mode` for CorralLearner was unrecognized.")

    wrapped = []
    for learner in learners:
        wrapped.append(SafeLearner(learner))

    self._base_learners = wrapped
    n_base = len(wrapped)

    self._T = T
    self._gamma = 1/T
    self._beta = 1/math.exp(1/math.log(T))

    # Every base learner starts with the same learning rate, threshold and weight.
    self._eta_init = eta
    self._etas = [eta for _ in range(n_base)]
    self._rhos = [float(2*n_base) for _ in range(n_base)]
    self._ps = [1/n_base for _ in range(n_base)]
    self._p_bars = [1/n_base for _ in range(n_base)]

    self._mode = mode

    # The rejection generator is seeded from a draw of a second generator built from `seed`.
    self._random_pick = CobaRandom(seed)
    reject_seed = CobaRandom(seed).randint(0,10000)
    self._random_reject = CobaRandom(reject_seed)
def read(self) -> Iterable[SimulatedInteraction]:
    """Turn (features, label) rows from the source into simulated interactions.

    For label type "R" the labels are rescaled to span at most 1, a bounded set of
    label values is used as actions, and the reward is one minus the absolute distance
    between an action's value and the label. Otherwise every distinct label is an action
    and the reward is 1 when the action equals the label or is contained in it, else 0.

    Yields:
        One SimulatedInteraction per source row, all sharing the same action list.
    """
    items = list(self._source.read())

    # This is a generator, so a bare return is what produces an empty result.
    if not items:
        return

    features, labels = zip(*items)

    if self._label_type == "R":
        max_n_actions = 10

        # Scale the labels so their range is 1. When every label is identical the range
        # is 0, so fall back to 1 rather than dividing by zero.
        min_l, max_l = min(labels), max(labels)
        label_range = (max_l - min_l) or 1
        labels = [float(l)/label_range - (min_l/label_range) for l in labels]

        if len(labels) <= max_n_actions:
            actions = labels
        else:
            actions = percentile(labels, [i/(max_n_actions+1) for i in range(1, max_n_actions+1)])

        # Map each encoded action back to the numeric value it stands for.
        values = dict(zip(OneHotEncoder().fit_encodes(actions), actions))
        actions = list(values.keys())

        reward = lambda action, label: 1 - abs(values[action] - float(label))
    else:
        # How can we tell the difference between featurized labels and multilabels?
        # For now we assume multilabels are passed in as arrays, not tuples.
        if not isinstance(labels[0], collections.abc.Hashable):
            actions = list(chain.from_iterable(labels))
        else:
            actions = list(labels)

        is_label = lambda action, label: action == label
        in_multilabel = lambda action, label: isinstance(label, collections.abc.Sequence) and action in label
        reward = lambda action, label: int(is_label(action, label) or in_multilabel(action, label))

    contexts = features

    # A fixed seed keeps the action order identical across reads.
    actions = CobaRandom(1).shuffle(sorted(set(actions)))
    rewards = [[reward(action, label) for action in actions] for label in labels]

    for c, a, r in zip(contexts, repeat(actions), rewards):
        yield SimulatedInteraction(c, a, rewards=r)
class Warm(EnvironmentFilter):
    """Turn a SimulatedEnvironment into a WarmStartEnvironment."""

    def __init__(self, n_warm: int, seed: int = 1):
        """Instantiate a Warm filter.

        Args:
            n_warm: The number of interactions that should be turned into LoggedInteractions.
            seed: The random number seed that determines the random logging policy for LoggedInteractions.
        """
        self._n_warm = n_warm
        self._seed = seed

    @property
    def params(self) -> Dict[str, Any]:
        """The parameters describing this filter."""
        return dict(n_warm=self._n_warm)

    def filter(self, interactions: Iterable[SimulatedInteraction]) -> Iterable[Interaction]:
        """Lazily convert the first `n_warm` interactions into logged interactions.

        Args:
            interactions: The simulated interactions to process.

        Returns:
            A lazy iterable of the converted leading interactions followed by the rest.
        """
        # Stored on the instance because `_to_logged_interaction` draws from it.
        self._rng = CobaRandom(self._seed)

        remaining = iter(interactions)
        warm_slice = islice(remaining, self._n_warm)

        return chain(map(self._to_logged_interaction, warm_slice), remaining)

    def _to_logged_interaction(self, interaction: SimulatedInteraction) -> LoggedInteraction:
        """Log one simulated interaction under a uniform-random action choice."""
        n_actions = len(interaction.actions)
        uniform = [1/n_actions] * n_actions

        index = self._rng.choice(list(range(n_actions)), uniform)
        chosen_action = interaction.actions[index]

        logged_kwargs = {"probability": uniform[index], "actions": interaction.actions}

        # Per-action feedback lists are reduced to the entry for the chosen action.
        for plural, singular in (("reveals", "reveal"), ("rewards", "reward")):
            if plural in interaction.kwargs:
                logged_kwargs[singular] = interaction.kwargs[plural][index]

        return LoggedInteraction(interaction.context, chosen_action, **logged_kwargs)
def test_rejection_learn(self):
    """In rejection mode each base learner should see actions distributed like its own predictions."""
    actions = [0, 1]
    expected_rates = ([1/2, 1/2], [1/4, 3/4])

    base1 = ReceivedLearnFixedLearner(expected_rates[0], 'a')
    base2 = ReceivedLearnFixedLearner(expected_rates[1], 'b')
    bases = (base1, base2)

    learner = CorralLearner([base1, base2], eta=0.5, mode="rejection")
    predict, info = learner.predict(None, actions)

    reward = 1
    learn_counts = ([0, 0], [0, 0])
    random = CobaRandom(1)

    for _ in range(1000):
        action = random.choice(actions, predict)
        probability = predict[actions.index(action)]

        learner.learn(None, action, reward, probability, info)

        # Count which bases were given feedback for this action, then reset them.
        for base, counts in zip(bases, learn_counts):
            counts[action] += int(base.received_learn is not None)
            base.received_learn = None

    for counts, rates in zip(learn_counts, expected_rates):
        for index, rate in enumerate(rates):
            self.assertLessEqual(abs(counts[index]/sum(counts) - rate), .02)
def read(self) -> Iterable[SimulatedInteraction]:
    """Generate interactions by calling the stored context, actions and reward callables.

    Yields:
        A SimulatedInteraction for each index, stopping after `self._n_interactions`.
    """
    rng = CobaRandom(self._seed) if self._make_rng else None

    # When a generator is in use it is passed as a trailing argument to every callable.
    trailing = (rng,) if rng else ()

    for index in islice(count(), self._n_interactions):
        context = self._context(index, *trailing)
        actions = self._actions(index, context, *trailing)

        rewards = []
        for action in actions:
            rewards.append(self._reward(index, context, action, *trailing))

        yield SimulatedInteraction(context, actions, rewards=rewards)
def test_regression_learning(self):
    """A learner trained on 90% of linear data should reach near-zero error on the held-out 10%."""
    vw = VowpalMediator().init_learner("--quiet", 1)

    n_features = 10
    n_examples = 1000

    rng = CobaRandom(1)

    weights = rng.randoms(n_features)
    rows = [rng.randoms(n_features) for _ in range(n_examples)]
    labels = [sum([w*r for w, r in zip(weights, row)]) for row in rows]

    examples = list(zip(rows, labels))
    n_train = int(.9*n_examples)
    train_examples = examples[0:n_train]
    holdout_examples = examples[n_train:]

    def holdout_mse() -> float:
        # Mean squared error on the held-out examples. True division is required:
        # floor division would truncate the error to a whole number.
        pred_errs = [vw.predict(vw.make_example({"x": row}, None)) - label for row, label in holdout_examples]
        return sum([e**2 for e in pred_errs]) / len(pred_errs)

    self.assertEqual(0, vw.predict(vw.make_example({'x': rows[0]}, None)))

    pre_learn_mse = holdout_mse()

    for row, label in train_examples:
        vw.learn(vw.make_example({"x": row}, str(label)))

    post_learn_mse = holdout_mse()

    self.assertNotAlmostEqual(0, pre_learn_mse, places=2)
    self.assertAlmostEqual(0, post_learn_mse, places=2)
class BenchmarkLearner:
    """Wrap a learner with naming fallbacks and a seeded random action choice."""

    def __init__(self, learner: Learner[Context,Action], seed: Optional[int]) -> None:
        """Store the wrapped learner and create the generator used to pick actions.

        Args:
            learner: The learner to wrap.
            seed: The seed for the generator used in `choose`.
        """
        self._learner = learner
        self._random = CobaRandom(seed)

    @property
    def family(self) -> str:
        """The learner's `family`, or its class name when it has no such attribute."""
        return getattr(self._learner, "family", self._learner.__class__.__name__)

    @property
    def params(self) -> Dict[str, Any]:
        """The learner's `params`, or an empty dict when it has no such attribute."""
        return getattr(self._learner, "params", {})

    @property
    def full_name(self) -> str:
        """The family followed by a parenthesised `key=value` list when params exist."""
        params = self.params

        if len(params) == 0:
            return self.family

        family = self.family
        rendered = ','.join(f'{k}={v}' for k,v in params.items())
        return f"{family}({rendered})"

    def init(self) -> None:
        """Call the learner's `init`, ignoring any AttributeError."""
        try:
            self._learner.init()
        except AttributeError:
            pass

    def choose(self, key: Key, context: Context, actions: Sequence[Action]) -> Tuple[Choice, float]:
        """Pick an action index according to the learner's predicted probabilities.

        Returns:
            The chosen action index and the probability it was chosen with.
        """
        probabilities = self._learner.predict(key, context, actions)
        indexes = list(range(len(actions)))
        chosen = self._random.choice(indexes, probabilities)

        return chosen, probabilities[chosen]

    def learn(self, key: Key, context: Context, action: Action, reward: Reward, probability: float) -> None:
        """Forward the observed outcome to the wrapped learner."""
        self._learner.learn(key, context, action, reward, probability)
def __init__(self,
             n_interactions: int,
             n_actions: int = 10,
             n_context_features: int = 10,
             n_action_features: int = 10,
             n_exemplars: int = 10,
             kernel: Literal['linear', 'polynomial', 'exponential'] = 'exponential',
             degree: int = 2,
             gamma: float = 1,
             seed: int = 1) -> None:
    """Instantiate a KernelSyntheticSimulation.

    Rewards are a bias plus a weighted sum of kernel evaluations between the
    interaction's features and a set of randomly generated exemplars. After the
    exemplars and weights are drawn, 100 sample interactions are generated and
    used to rescale the weights and set the bias.

    Args:
        n_interactions: The number of interactions the simulation should have.
        n_actions: The number of actions each interaction should have.
        n_context_features: The number of features each context should have.
        n_action_features: The number of features each action should have.
        n_exemplars: The number of exemplar action, context pairs.
        kernel: The family of the kernel basis functions.
        degree: This argument is only relevant when using polynomial kernels.
        gamma: This argument is only relevant when using exponential kernels.
        seed: The random number seed used to generate all features, weights and noise in the simulation.
    """
    self._args = (n_interactions, n_actions, n_context_features, n_action_features, n_exemplars, kernel, degree, gamma, seed)

    self._n_actions = n_actions
    self._n_context_features = n_context_features
    self._n_action_features = n_action_features
    # NOTE: this stores the requested value; the local n_exemplars may be reset to 0 below.
    self._n_exemplars = n_exemplars
    self._seed = seed
    self._kernel = kernel
    self._degree = degree
    self._gamma = gamma

    # A single generator feeds every random draw below, so statement order determines the output.
    rng = CobaRandom(seed)

    #if there are no features then we are unable to define exemplars
    if n_action_features + n_context_features == 0:
        n_exemplars = 0

    feat_gen = lambda n: tuple(rng.gausses(n, 0, .75))
    one_hot_acts = OneHotEncoder().fit_encodes(range(n_actions))

    # One exemplar set in total when actions have features, otherwise one set per action.
    self._exemplars = [[
        feat_gen(n_action_features + n_context_features)
        for _ in range(n_exemplars)
    ] for _ in range(1 if n_action_features else n_actions)]

    # Weights are drawn as 1-2w; with no exemplars there is one weight per action instead.
    weight_count = n_actions if n_exemplars == 0 else n_exemplars
    self._weights = [1 - 2 * w for w in rng.randoms(weight_count)]

    self._bias = 0

    if kernel == 'polynomial':
        #this ensures the dot-product between F and an exemplar is in [0,upper_bound]
        #This ensures that higher-order polynomials will remain reasonably well behaved
        upper_bound = (1.5)**(1 / degree) - 1
        self._exemplars = [[[upper_bound * ee / sum(e) for ee in e] for e in E] for E in self._exemplars]

    def context(index: int) -> Context:
        # Random context features, or None when contexts have no features.
        return feat_gen(n_context_features) if n_context_features else None

    def actions(index: int, context: Context) -> Sequence[Action]:
        # Random action features, or the fixed one-hot actions when actions have no features.
        return [feat_gen(n_action_features) for _ in range(n_actions)] if n_action_features else one_hot_acts

    def reward(index: int, context: Context, action: Action) -> float:

        # Without exemplars the reward depends only on which one-hot action was chosen.
        if n_exemplars == 0:
            return self._bias + self._weights[action.index(1)]

        #handles None context
        context = context or []

        if n_action_features:
            f = list(context) + list(action)
            W = self._weights
            E = self._exemplars[0]
        else:
            # One-hot actions select which exemplar set is used.
            f = list(context)
            W = self._weights
            E = self._exemplars[action.index(1)]

        # NOTE(review): K is left unbound if kernel is none of these three values.
        if kernel == "linear":
            K = lambda x1, x2: self._linear_kernel(x1, x2)
        if kernel == "polynomial":
            K = lambda x1, x2: self._polynomial_kernel(x1, x2, self._degree)
        if kernel == "exponential":
            K = lambda x1, x2: self._exponential_kernel(x1, x2, self._gamma)

        return self._bias + sum([w * K(e, f) for w, e in zip(W, E)])

    # Sample 100 interactions to estimate the mean and range of the unscaled rewards.
    rewards = [reward(i, c, a) for i in range(100) for c in [context(i)] for a in actions(i, c)]

    m = mean(rewards)
    # A zero range falls back to 1 so the division below is always defined.
    s = (max(rewards) - min(rewards)) or 1

    # Dividing the weights by the sampled range and shifting by the bias puts the
    # sampled rewards at a mean of 0.5.
    self._bias = 0.5 - m / s
    self._weights = [w / s for w in self._weights]

    super().__init__(n_interactions, context, actions, reward)
def _process_chunk(self, task_group: Iterable[BenchmarkTask]) -> Iterable[Any]:
    """Evaluate every task in a chunk, yielding one transaction per evaluated learner.

    Tasks are grouped by source id so each source is read once, then by simulation id
    so each simulation's interactions are built once and shared by all of its learners.
    Exceptions are logged and processing continues with the remaining work.

    Args:
        task_group: The tasks to evaluate.

    Yields:
        The result of `Transaction.interactions(sim_id, lrn_id, _packed=row_data)` for each
        learner that was evaluated without error.
    """
    # The group is traversed several times below; a one-shot iterator would be
    # exhausted by the first traversal, so materialise it up front.
    task_group = list(task_group)

    source_by_id = {t.src_id: t.simulation.source for t in task_group}
    filter_by_id = {t.sim_id: t.simulation.filter for t in task_group}

    by_src = lambda t: t.src_id
    by_sim = lambda t: t.sim_id

    with CobaConfig.Logger.log(f"Processing chunk..."):

        # groupby only merges adjacent items, so tasks are sorted by the grouping key first.
        for src_id, tasks_by_src in groupby(sorted(task_group, key=by_src), key=by_src):

            try:
                with CobaConfig.Logger.time(f"Creating source {src_id} from {source_by_id[src_id]}..."):
                    # This is not ideal. I'm not sure how it should be improved and leaving this for now.
                    loaded_source = list(source_by_id[src_id].read())

                for sim_id, tasks_by_src_sim in groupby(sorted(tasks_by_src, key=by_sim), key=by_sim):

                    tasks_by_src_sim_list = list(tasks_by_src_sim)
                    learner_ids = [t.lrn_id for t in tasks_by_src_sim_list]
                    learners = [t.learner for t in tasks_by_src_sim_list]
                    seeds = [t.seed for t in tasks_by_src_sim_list]

                    # The lists are reversed and then walked from the highest index down so
                    # that tasks run in their original order while finished entries are
                    # deleted from the tail. All three lists must be reversed together or a
                    # learner would be paired with a different task's seed.
                    learner_ids.reverse()
                    learners.reverse()
                    seeds.reverse()

                    with CobaConfig.Logger.time(f"Creating simulation {sim_id} from source {src_id}..."):
                        interactions = filter_by_id[sim_id].filter(loaded_source)

                    if not interactions:
                        CobaConfig.Logger.log(f"Simulation {sim_id} has nothing to evaluate (likely due to `take` being larger than the simulation).")
                        continue

                    for index in sorted(range(len(learners)), reverse=True):

                        lrn_id = learner_ids[index]
                        # Each evaluation works on its own copy of the learner.
                        learner = deepcopy(learners[index])
                        random = CobaRandom(seeds[index])

                        try:
                            with CobaConfig.Logger.time(f"Evaluating learner {lrn_id} on Simulation {sim_id}..."):

                                row_data = defaultdict(list)

                                for i, interaction in enumerate(interactions):

                                    probs = learner.predict(i, interaction.context, interaction.actions)

                                    assert abs(sum(probs) - 1) < .0001, "The learner returned invalid proabilities for action choices."

                                    action = random.choice(interaction.actions, probs)
                                    action_index = interaction.actions.index(action)

                                    reward = interaction.feedbacks[action_index]
                                    prob = probs[action_index]

                                    info = learner.learn(i, interaction.context, action, reward, prob) or {}

                                    # The reward is recorded alongside whatever the learner reported.
                                    for key, value in info.items() | {('reward', reward)}:
                                        row_data[key].append(value)

                                yield Transaction.interactions(sim_id, lrn_id, _packed=row_data)

                        except Exception as e:
                            CobaConfig.Logger.log_exception(e)

                        finally:
                            # Drop the references so finished learners can be reclaimed.
                            del learner_ids[index]
                            del learners[index]
                            del seeds[index]

            except Exception as e:
                CobaConfig.Logger.log_exception(e)
class CorralLearner(Learner):
    """A meta-learner that takes a collection of learners and determines which is best in an environment.

    This is an implementation of the Agarwal et al. (2017) Corral algorithm and requires that the
    reward is always in [0,1].

    References:
        Agarwal, Alekh, Haipeng Luo, Behnam Neyshabur, and Robert E. Schapire. "Corralling
        a band of bandit algorithms." In Conference on Learning Theory, pp. 12-38. PMLR, 2017.
    """

    def __init__(self,
        learners: Sequence[Learner],
        eta : float = 0.075,
        T : float = math.inf,
        mode : Literal["importance","rejection","off-policy"] ="importance",
        seed : int = 1) -> None:
        """Instantiate a CorralLearner.

        Args:
            learners: The collection of base learners.
            eta: The learning rate. This controls how quickly Corral picks a best base_learner.
            T: The number of interactions expected during the learning process. A small T will cause
                the learning rate to shrink towards 0 quickly while a large value for T will cause the
                learning rate to shrink towards 0 slowly. A value of inf means that the learning rate
                will remain constant.
            mode: Determines the method with which feedback is provided to the base learners. The
                original paper used importance sampling. We also support `off-policy` and `rejection`.
            seed: A seed for random number generation in order to get repeatable results.

        Raises:
            CobaException: If `mode` is not one of the supported values.
        """
        if mode not in ["importance", "off-policy", "rejection"]:
            raise CobaException("The provided `mode` for CorralLearner was unrecognized.")

        self._base_learners = [SafeLearner(learner) for learner in learners]

        M = len(self._base_learners)

        self._T = T
        self._gamma = 1/T
        self._beta = 1/math.exp(1/math.log(T))

        self._eta_init = eta
        self._etas = [eta] * M
        self._rhos = [float(2*M)] * M
        self._ps = [1/M] * M
        self._p_bars = [1/M] * M

        self._mode = mode

        self._random_pick = CobaRandom(seed)
        self._random_reject = CobaRandom(CobaRandom(seed).randint(0,10000))

    @property
    def params(self) -> Dict[str, Any]:
        """The parameters describing this learner and its base learners."""
        return {
            "family": "corral",
            "eta": self._eta_init,
            "mode": self._mode,
            "T": self._T,
            "B": [str(b) for b in self._base_learners],
            "seed": self._random_pick._seed
        }

    def predict(self, context: Context, actions: Sequence[Action]) -> Tuple[Probs, Info]:
        """Combine the base learners' predictions into a single PMF over actions.

        Args:
            context: The context to predict in.
            actions: The actions to choose from.

        Returns:
            The combined probabilities and an info tuple that must be passed back to `learn`.
        """
        base_predicts = [base_algorithm.predict(context, actions) for base_algorithm in self._base_learners]
        base_predicts, base_infos = zip(*base_predicts)

        if self._mode in ["importance"]:
            # Each base learner commits to one action; an action's probability is the
            # total smoothed weight of the base learners that picked it.
            base_actions = [self._random_pick.choice(actions, predict) for predict in base_predicts]
            base_probs = [predict[actions.index(action)] for action, predict in zip(base_actions, base_predicts)]

            predict = [sum([p_b*int(a==b_a) for p_b, b_a in zip(self._p_bars, base_actions)]) for a in actions]
            info = (base_actions, base_probs, base_infos, base_predicts, actions, predict)

        if self._mode in ["off-policy", "rejection"]:
            # The prediction is the mixture of the base PMFs weighted by the smoothed weights.
            predict = [sum([p_b*b_p[i] for p_b, b_p in zip(self._p_bars, base_predicts)]) for i in range(len(actions))]
            info = (None, None, base_infos, base_predicts, actions, predict)

        return (predict, info)

    def learn(self, context: Context, action: Action, reward: float, probability: float, info: Info) -> None:
        """Pass feedback to the base learners and update the weights placed on them.

        Args:
            context: The context the action was taken in.
            action: The action that was taken.
            reward: The observed reward, which must be in [0,1].
            probability: The probability with which `action` was taken.
            info: The info tuple returned by the matching call to `predict`.
        """
        assert 0 <= reward and reward <= 1, "This Corral implementation assumes a loss between 0 and 1"

        base_actions, base_probs, base_infos, base_preds, actions, predict = info[0], info[1], info[2], info[3], info[4], info[5]

        if self._mode == "importance":
            # This is what is in the original paper. It has the following characteristics:
            #   > It is able to provide feedback to every base learner on every iteration
            #   > It uses a reward estimator with higher variance and no bias (aka, importance sampling)
            #   > It is "on-policy" with respect to base learner's prediction distributions
            # The reward, R, supplied to the base learners satisfies E[R|context,A] = E[reward|context,A]
            for learner, A, P, base_info in zip(self._base_learners, base_actions, base_probs, base_infos):
                R = reward * int(A==action)/probability
                learner.learn(context, A, R, P, base_info)

        if self._mode == "off-policy":
            # An alternative variation to the paper is provided below. It has the following characteristics:
            #   > It is able to provide feedback to every base learner on every iteration
            #   > It uses a MVUB reward estimator (aka, the unmodified, observed reward)
            #   > It is "off-policy" (i.e., base learners receive action feedback distributed differently from their predicts).
            for learner, base_info in zip(self._base_learners, base_infos):
                learner.learn(context, action, reward, probability, base_info)

        if self._mode == "rejection":
            # An alternative variation to the paper is provided below. It has the following characteristics:
            #   > It doesn't necessarily provide feedback to every base learner on every iteration
            #   > It uses a MVUB reward estimator (aka, the unmodified, observed reward) when it does provide feedback
            #   > It is "on-policy" (i.e., base learners receive action feedback distributed identically to their predicts).
            p = self._random_reject.random() #can I reuse this across all learners like this??? I think so???

            for learner, base_info, base_predict in zip(self._base_learners, base_infos, base_preds):

                f = lambda a: base_predict[actions.index(a)] #the PMF we want
                g = lambda a: predict[actions.index(a)]      #the PMF we have

                M = max([f(A)/g(A) for A in actions if g(A) > 0])

                if p <= f(action)/(M*g(action)):
                    learner.learn(context, action, reward, f(action), base_info)

        # Instant loss is an unbiased estimate of E[loss|learner] for this iteration.
        # Our estimate differs from the original Corral paper because we have access to the
        # action probabilities of the base learners while the Corral paper did not assume
        # access to this information. This information allows for a loss estimator with the same
        # expectation as the original Corral paper's estimator but with a lower variance.
        loss = 1-reward

        picked_index = actions.index(action)
        instant_loss = [loss * base_pred[picked_index]/probability for base_pred in base_preds]

        self._ps = CorralLearner._log_barrier_omd(self._ps, instant_loss, self._etas)
        self._p_bars = [(1-self._gamma)*p + self._gamma*1/len(self._base_learners) for p in self._ps]

        # When a smoothed weight drops below its threshold, the threshold is raised
        # and that learner's learning rate is scaled by beta.
        for i in range(len(self._base_learners)):
            if 1/self._p_bars[i] > self._rhos[i]:
                self._rhos[i] = 2/self._p_bars[i]
                self._etas[i] *= self._beta

        base_predict_data = {f"predict_{i}": base_preds[i][picked_index] for i in range(len(self._base_learners))}
        base_pbar_data = {f"pbar_{i}": self._p_bars[i] for i in range(len(self._base_learners))}
        predict_data = {"predict": probability, **base_predict_data, **base_pbar_data}

        # predict_data already contains every base entry, so it is published as-is.
        InteractionContext.learner_info.update(predict_data)

    @staticmethod
    def _log_barrier_omd(ps, losses, etas) -> Sequence[float]:
        """Find the new weights 1/((1/p) + eta*(loss-lmbda)), with lmbda chosen so they sum to 1.

        Args:
            ps: The current weight of each base learner.
            losses: The estimated loss of each base learner.
            etas: The learning rate of each base learner.

        Returns:
            The updated weights.

        Raises:
            Exception: If no lmbda could be found for which the weights sum to 1.
        """
        f = lambda l: float(sum([1/((1/p) + eta*(loss-l)) for p, eta, loss in zip(ps, etas, losses)]))

        # The values of l at which one of the denominators in f becomes zero.
        denom_zeros = [((-1/p)-(eta*loss))/-eta for p, eta, loss in zip(ps, etas, losses)]

        min_loss = min(losses)
        max_loss = max(losses)

        precision = 4

        def binary_search(l, r) -> Optional[float]:
            # f is increasing between consecutive poles, so bisection narrows in on f(x) == 1.
            while True:

                x = (l+r)/2
                y = f(x)

                if round(y, precision) == 1:
                    return x

                # Once the midpoint coincides with an endpoint the bracket can no longer
                # shrink, so report failure instead of looping forever.
                if x == l or x == r:
                    return None

                if y < 1:
                    l = x

                if y > 1:
                    r = x

        def find_root_of_1() -> Optional[float]:
            brackets = list(sorted(filter(lambda z: min_loss <= z and z <= max_loss, set(denom_zeros + [min_loss, max_loss]))))

            for l_brack, r_brack in zip(brackets[:-1], brackets[1:]):

                # Only a bracket over which f-1 changes sign can contain the root.
                if (f(l_brack+.00001)-1) * (f(r_brack-.00001)-1) >= 0:
                    continue

                # we use binary search because newtons
                # method can overshoot our objective
                root = binary_search(l_brack, r_brack)

                if root is not None:
                    return root

            return None

        lmbda: Optional[float] = None

        if min_loss == max_loss:
            lmbda = min_loss
        elif min_loss not in denom_zeros and round(f(min_loss), precision) == 1:
            lmbda = min_loss
        elif max_loss not in denom_zeros and round(f(max_loss), precision) == 1:
            lmbda = max_loss
        else:
            lmbda = find_root_of_1()

        if lmbda is None:
            raise Exception(f'Something went wrong in Corral OMD {ps}, {etas}, {losses}')

        new_ps = [1/((1/p) + eta*(loss-lmbda)) for p, eta, loss in zip(ps, etas, losses)]

        assert round(sum(new_ps), precision) == 1, "An invalid update was made by the log barrier in Corral"

        return new_ps
def test_cb_adf_learning(self):
    """Average reward should rise from roughly chance to roughly .78 after learning."""
    learner = VowpalArgsLearner()

    n_actions = 3
    n_features = 10
    n_examples = 2000

    rng = CobaRandom(11111)

    contexts = [rng.randoms(n_features) for _ in range(n_examples)]
    n_train = int(.9*n_examples)

    def sample_actions_and_rewards(context):
        # Draw fresh random actions; only the action(s) with the largest dot product
        # against the context earn a reward of 1.
        actions = [rng.randoms(n_features) for _ in range(n_actions)]
        scores = [sum([a*c for a, c in zip(action, context)]) for action in actions]
        best = max(scores)
        return actions, [int(score == best) for score in scores]

    def sample_reward(context):
        # The reward received when acting according to the learner's current prediction.
        actions, rewards = sample_actions_and_rewards(context)
        return rng.choice(rewards, learner.predict(context, actions)[0])

    pre_learn_rewards = [sample_reward(context) for context in contexts[:n_train]]

    for context in contexts[:n_train]:
        actions, rewards = sample_actions_and_rewards(context)

        probs, info = learner.predict(context, actions)
        choice = rng.choice(list(range(n_actions)), probs)

        learner.learn(context, actions[choice], rewards[choice], probs[choice], info)

    post_learn_rewards = [sample_reward(context) for context in contexts[n_train:]]

    average_pre_learn_reward = sum(pre_learn_rewards)/len(pre_learn_rewards)
    average_post_learn_reward = sum(post_learn_rewards)/len(post_learn_rewards)

    self.assertAlmostEqual(.33, average_pre_learn_reward, places=2)
    self.assertAlmostEqual(.78, average_post_learn_reward, places=2)
class CorralLearner(Learner):
    """This is an implementation of the Agarwal et al. (2017) Corral algorithm.

    This algorithm assumes that the reward distribution has support in [0,1]
    and implements the remark on pg. 8 to improve learning efficiency when
    multiple bandits select the same action.

    References:
        Agarwal, Alekh, Haipeng Luo, Behnam Neyshabur, and Robert E. Schapire. "Corralling
        a band of bandit algorithms." In Conference on Learning Theory, pp. 12-38. PMLR, 2017.
    """

    def __init__(self, base_learners: Sequence[Learner], eta: float, T: float = math.inf, seed: int = None) -> None:
        """Instantiate a CorralLearner.

        Args:
            base_learners: The collection of algorithms to use as base learners.
            eta: The learning rate. In our experiments a value between 0.05 and .10 often seemed best.
            T: The number of interactions expected during the learning process. In our experiments
                Corral performance seemed relatively insensitive to this value.
            seed: A seed for random number generation in order to get repeatable results.
        """
        self._base_learners = base_learners

        M = len(self._base_learners)

        self._gamma = 1/T
        self._beta = 1/math.exp(1/math.log(T))

        self._eta_init = eta
        self._etas = [eta] * M
        self._rhos = [float(2*M)] * M
        self._ps = [1/M] * M
        self._p_bars = [1/M] * M

        self._random = CobaRandom(seed)

        # What each base learner picked, and with what probability, per interaction key.
        self._base_action_picks: Dict[Key, Sequence[Action]] = {}
        self._base_action_probs: Dict[Key, Sequence[float]] = {}

    @property
    def family(self) -> str:
        """The family of the learner.

        See the base class for more information
        """
        return "corral"

    @property
    def params(self) -> Dict[str, Any]:
        """The parameters of the learner.

        See the base class for more information
        """
        return {"eta": self._eta_init, "B": [b.family for b in self._base_learners]}

    def predict(self, key: Key, context: Context, actions: Sequence[Action]) -> Sequence[float]:
        """Determine a PMF with which to select the given actions.

        Args:
            key: The key identifying the interaction we are choosing for.
            context: The context we're currently in. See the base class for more information.
            actions: The actions to choose from. See the base class for more information.

        Returns:
            The probability of taking each action. See the base class for more information.
        """
        base_predicts = [base_algorithm.predict(key, context, actions) for base_algorithm in self._base_learners]

        base_action_picks = [self._random.choice(actions, predict) for predict in base_predicts]
        base_action_probs = [predict[actions.index(action)] for action, predict in zip(base_action_picks, base_predicts)]

        # Remembered until `learn` is called with the same key.
        self._base_action_picks[key] = base_action_picks
        self._base_action_probs[key] = base_action_probs

        # An action's probability is the total smoothed weight of the base learners that picked it.
        return [sum([p_b*int(a==b_a) for p_b, b_a in zip(self._p_bars, base_action_picks)]) for a in actions]

    def learn(self, key: Key, context: Context, action: Action, reward: float, probability: float) -> None:
        """Learn from the given interaction.

        Args:
            key: The key identifying the interaction this observed reward came from.
            context: The context we're learning about. See the base class for more information.
            action: The action that was selected in the context. See the base class for more information.
            reward: The reward that was gained from the action. See the base class for more information.
            probability: The probability that the given action was taken.
        """
        loss = 1-reward

        assert 0 <= loss and loss <= 1, "The current Corral implementation assumes a loss between 0 and 1"

        base_action_picks = self._base_action_picks.pop(key)
        base_action_probs = self._base_action_probs.pop(key)

        # Importance-weighted estimates: non-zero only for base learners that picked the taken action.
        losses = [loss/probability * int(act==action) for act in base_action_picks]
        rewards = [reward/probability * int(act==action) for act in base_action_picks]

        # The loop variable is named so that it does not shadow the `action` parameter.
        for learner, base_action, R, P in zip(self._base_learners, base_action_picks, rewards, base_action_probs):
            learner.learn(key, context, base_action, R, P) # COBA learners assume a reward

        self._ps = list(self._log_barrier_omd(losses))
        self._p_bars = [(1-self._gamma)*p + self._gamma*1/len(self._base_learners) for p in self._ps]

        # When a smoothed weight drops below its threshold, the threshold is raised
        # and that learner's learning rate is scaled by beta.
        for i in range(len(self._base_learners)):
            if 1/self._p_bars[i] > self._rhos[i]:
                self._rhos[i] = 2/self._p_bars[i]
                self._etas[i] *= self._beta

    def _log_barrier_omd(self, losses) -> Sequence[float]:
        """Find the new weights 1/((1/p) + eta*(loss-lmbda)), with lmbda chosen so they sum to 1.

        Args:
            losses: The estimated loss of each base learner.

        Returns:
            The updated weights, each clipped from below at .00001.

        Raises:
            Exception: If the derivative vanishes during the search or no lmbda could be found.
        """
        f = lambda l: float(sum([1/((1/p) + eta*(loss-l)) for p, eta, loss in zip(self._ps, self._etas, losses)]))
        df = lambda l: float(sum([eta/((1/p) + eta*(loss-l))**2 for p, eta, loss in zip(self._ps, self._etas, losses)]))

        # The values of l at which one of the denominators in f becomes zero.
        denom_zeros = [((-1/p)-(eta*loss))/-eta for p, eta, loss in zip(self._ps, self._etas, losses)]

        min_loss = min(losses)
        max_loss = max(losses)

        precision = 4

        # Upper bound on Newton steps per bracket, so a non-converging search fails instead of hanging.
        max_newton_steps = 10000

        def newtons_zero(l, r) -> Optional[float]:
            """Use Newton's method to calculate the root."""

            #depending on scales this check may fail though that seems unlikely
            if (f(l+.0001)-1) * (f(r-.00001)-1) >= 0:
                return None

            x = (l+r)/2

            for _ in range(max_newton_steps):

                slope = df(x)

                if slope == 0:
                    raise Exception(f'Something went wrong in Corral (0) {self._ps}, {self._etas}, {losses}, {x}')

                x -= (f(x)-1)/slope

                if round(f(x), precision) == 1:
                    return x

            return None

        lmbda: Optional[float] = None

        if min_loss == max_loss:
            lmbda = min_loss
        elif min_loss not in denom_zeros and round(f(min_loss), precision) == 1:
            lmbda = min_loss
        elif max_loss not in denom_zeros and round(f(max_loss), precision) == 1:
            lmbda = max_loss
        else:
            brackets = list(sorted(filter(lambda z: min_loss <= z and z <= max_loss, set(denom_zeros + [min_loss, max_loss]))))

            for l_brack, r_brack in zip(brackets[:-1], brackets[1:]):
                lmbda = newtons_zero(l_brack, r_brack)
                if lmbda is not None:
                    break

        if lmbda is None:
            raise Exception(f'Something went wrong in Corral (None) {self._ps}, {self._etas}, {losses}')

        return [max(1/((1/p) + eta*(loss-lmbda)), .00001) for p, eta, loss in zip(self._ps, self._etas, losses)]
def __init__(self,
             n_interactions: int,
             n_actions: int = 10,
             n_context_features: int = 10,
             n_action_features: int = 10,
             reward_features: Sequence[str] = ["a", "xa"],
             seed: int = 1) -> None:
    """Instantiate a LinearSyntheticSimulation.

    Args:
        n_interactions: How many interactions the simulation produces.
        n_actions: How many actions every interaction offers.
        n_context_features: How many features every context has (0 gives a None context).
        n_action_features: How many features every action has (0 gives one-hot actions).
        reward_features: The terms of the linear reward function, written with 'x' for
            context features and 'a' for action features.
        seed: Seed for the generator that draws all features and weights.
    """
    self._args = (n_interactions, n_actions, n_context_features,
                  n_action_features, reward_features, seed)

    self._n_actions = n_actions
    self._n_context_features = n_context_features
    self._n_action_features = n_action_features
    self._reward_features = reward_features
    self._seed = seed

    def without(symbol, terms):
        # Remove one symbol from every term, discard terms that end up empty and
        # collapse duplicates.
        stripped = [term.replace(symbol, '') for term in terms]
        return list(set(term for term in stripped if term))

    # Terms cannot refer to a kind of feature that is not generated.
    if not self._n_context_features:
        reward_features = without('x', reward_features)
    if not self._n_action_features:
        reward_features = without('a', reward_features)

    rng = CobaRandom(seed)
    feat_encoder = InteractionsEncoder(reward_features)

    # Features are drawn around 1 with a spread that shrinks as the longest reward
    # term grows, and then get a random sign. The reward itself is shifted and
    # re-scaled at the end of this method using a sample of rewards.
    max_degree = max(len(term) for term in reward_features) if reward_features else 1
    spread = 1 / (2 * max_degree)

    def feat_gen(n):
        draws = rng.gausses(n, mu=1, sigma=spread)
        return tuple(g * rng.choice([1, -1]) for g in draws)

    one_hot_acts = OneHotEncoder().fit_encodes(range(n_actions))

    probe = feat_encoder.encode(x=[1] * n_context_features, a=[1] * n_action_features)
    feature_count = len(probe)

    # Without action features every action gets its own weight vector.
    weight_parts = 1 if n_action_features else n_actions
    weight_count = feature_count or 1

    self._weights = []
    for _ in range(weight_parts):
        self._weights.append([1 - 2 * w for w in rng.randoms(weight_count)])

    self._bias = 0
    self._clip = False

    def context(index: int) -> Context:
        if not n_context_features:
            return None
        return feat_gen(n_context_features)

    def actions(index: int, context: Context) -> Sequence[Action]:
        if not n_action_features:
            return one_hot_acts
        return [feat_gen(n_action_features) for _ in range(n_actions)]

    def reward(index: int, context: Context, action: Action) -> float:
        # self._weights and self._bias are looked up on every call, so the
        # re-scaling done below applies to all later rewards.
        encoded = feat_encoder.encode(x=context, a=action) or [1]
        which = 0 if n_action_features else action.index(1)
        weights = self._weights[which]
        return self._bias + sum(w * f for w, f in zip(weights, encoded))

    # Sample rewards from 100 generated interactions to estimate centre and range.
    sampled = []
    for i in range(100):
        c = context(i)
        for a in actions(i, c):
            sampled.append(reward(i, c, a))

    m = mean(sampled)
    s = (max(sampled) - min(sampled)) or 1

    self._bias = 0.5 - m / s
    self._weights = [[w / s for w in row] for row in self._weights]
    self._clip = True

    super().__init__(n_interactions, context, actions, reward)
def __init__(self,
             n_interactions: int,
             n_actions: int = 10,
             n_context_features: int = 10,
             n_action_features: int = 10,
             n_neighborhoods: int = 10,
             seed: int = 1) -> None:
    """Instantiate a NeighborsSyntheticSimulation.

    Args:
        n_interactions: How many interactions the simulation produces.
        n_actions: How many actions every interaction offers.
        n_context_features: How many features every context has (0 gives a None context).
        n_action_features: How many features every action has (0 gives one-hot actions).
        n_neighborhoods: How many contexts are generated; duplicates are collapsed.
        seed: Seed for the generator that draws all contexts, actions and rewards.
    """
    self._args = (n_interactions, n_actions, n_context_features,
                  n_action_features, n_neighborhoods, seed)

    self._n_interactions = n_interactions
    self._n_actions = n_actions
    self._n_context_feats = n_context_features
    self._n_action_feats = n_action_features
    self._n_neighborhoods = n_neighborhoods
    self._seed = seed

    rng = CobaRandom(self._seed)

    def context_gen():
        if not n_context_features:
            return None
        return tuple(rng.gausses(n_context_features, 0, 1))

    def actions_gen():
        if n_action_features:
            return [tuple(rng.gausses(n_action_features, 0, 1)) for _ in range(n_actions)]
        return OneHotEncoder().fit_encodes(range(n_actions))

    # Generate every neighborhood's context first, then its actions, then one
    # reward per (context, action) pair, always drawing from the same generator.
    generated = [context_gen() for _ in range(self._n_neighborhoods)]
    contexts = list(set(generated))

    context_actions = {}
    for c in contexts:
        context_actions[c] = actions_gen()

    context_action_rewards = {}
    for c in contexts:
        for a in context_actions[c]:
            context_action_rewards[(c, a)] = rng.random()

    # NOTE(review): this iterator is used up as contexts are requested, so
    # `context` can be called at most n_interactions times in total -- confirm
    # the simulation is never read more than once.
    context_iter = islice(cycle(contexts), n_interactions)

    def context(index: int):
        return next(context_iter)

    def actions(index: int, context: Tuple[float, ...]):
        return context_actions[context]

    def reward(index: int, context: Tuple[float, ...], action: Tuple[int, ...]):
        return context_action_rewards[(context, action)]

    super().__init__(self._n_interactions, context, actions, reward)
def __init__(self, learner: Learner[Context,Action], seed: Optional[int]) -> None:
    """Store a learner together with a seeded random number generator.

    Args:
        learner: The learner kept on this instance.
        seed: The seed given to the CobaRandom kept on this instance.
    """
    self._learner = learner

    generator = CobaRandom(seed)
    self._random = generator
def __init__(self,
             n_interactions: int = 500,
             n_actions: int = 10,
             n_features: int = 10,
             context_features: bool = True,
             action_features: bool = True,
             sparse: bool = False,
             seed: int = 1) -> None:
    """Instantiate a synthetic simulation with linear rewards.

    Args:
        n_interactions: How many interactions the simulation produces.
        n_actions: How many actions there are when actions have no features. When
            action_features is set the number of actions of every interaction is
            instead taken from r.randint(2, 10).
        n_features: The length of every generated feature vector.
        context_features: Whether contexts carry features (otherwise they are None).
        action_features: Whether actions carry features (otherwise they are one-hot).
        sparse: Whether contexts and actions are given as an (indices, values)
            pair of tuples instead of a plain tuple of values.
        seed: Seed for the generator that draws all features, weights and noise.
    """
    self._n_bandits = n_actions
    self._n_features = n_features
    self._context_features = context_features
    self._action_features = action_features
    self._seed = seed

    r = CobaRandom(seed)

    context: Callable[[int], Context]
    actions: Callable[[int, Context], Sequence[Action]]
    rewards: Callable[[int, Context, Action], float]

    def sparsify(x):
        # Encode one feature vector, either as (indices, values) or as plain values.
        return (tuple(range(len(x))), tuple(x)) if sparse else tuple(x)

    def unsparse(x):
        # Recover the plain values from whatever sparsify produced.
        return x[1] if sparse else x

    def normalize(X):
        # Scale a vector so that its entries add up to one.
        total = sum(X)
        return [x / total for x in X]

    def one_hot_actions():
        # One indicator tuple per action, with the 1 at the action's own position.
        return [tuple(int(i == j) for j in range(n_actions)) for i in range(n_actions)]

    if not context_features and not action_features:
        # Every action has a fixed mean reward plus a small amount of noise.
        means = [m / n_actions + 1 / (2 * n_actions) for m in r.randoms(n_actions)]
        actions_features = one_hot_actions()

        context = lambda i: None
        # Each action is sparsified on its own, so the result is always a
        # sequence of actions in the representation selected by `sparse`.
        actions = lambda i, c: tuple(sparsify(af) for af in actions_features)
        rewards = lambda i, c, a: means[unsparse(a).index(1)] + (r.random() - .5) / n_actions

    elif context_features and not action_features:
        # One weight vector per action, each normalized to sum to one.
        # NOTE(review): this presumably keeps the reward in [0, 1], which holds
        # only if r.randoms yields values in [0, 1] -- confirm.
        bandit_thetas = [normalize(r.randoms(n_features)) for _ in range(n_actions)]
        actions_features = one_hot_actions()

        context = lambda i: sparsify(r.randoms(n_features))
        actions = lambda i, c: [sparsify(af) for af in actions_features]
        rewards = lambda i, c, a: sum(
            [cc * t for cc, t in zip(unsparse(c), bandit_thetas[unsparse(a).index(1)])])

    elif not context_features and action_features:
        # A single weight vector is applied to each action's normalized features.
        theta = r.randoms(n_features)

        context = lambda i: None
        actions = lambda i, c: [
            sparsify(normalize(r.randoms(n_features))) for _ in range(r.randint(2, 10))
        ]
        rewards = lambda i, c, a: float(
            sum([cc * t for cc, t in zip(theta, unsparse(a))]))

    else:
        # The reward is the inner product of the context and action features.
        context = lambda i: sparsify(r.randoms(n_features))
        actions = lambda i, c: [
            sparsify(normalize(r.randoms(n_features))) for _ in range(r.randint(2, 10))
        ]
        rewards = lambda i, c, a: sum(
            [cc * t for cc, t in zip(unsparse(c), unsparse(a))])

    super().__init__(n_interactions, context, actions, rewards)
def filter(self, items: Iterable[Any]) -> Sequence[Any]:
    """Return the given items in shuffled order.

    A new CobaRandom is created from self._seed on every call and the items are
    copied into a list before being handed to its shuffle method.
    """
    rng = CobaRandom(self._seed)
    materialized = list(items)
    return rng.shuffle(materialized)
def filter(self, interactions: Iterable[Interaction]) -> Iterable[Interaction]:
    """Return the given interactions in shuffled order.

    A new CobaRandom is created from self._seed on every call and the interactions
    are copied into a list before being handed to its shuffle method.
    """
    rng = CobaRandom(self._seed)
    materialized = list(interactions)
    return rng.shuffle(materialized)
def __init__(self,
             n_interactions: int,
             n_actions: int = 10,
             n_context_features: int = 10,
             n_action_features: int = 10,
             seed: int = 1) -> None:
    """Instantiate a synthetic simulation whose reward comes from a small MLP.

    Args:
        n_interactions: How many interactions the simulation produces.
        n_actions: How many actions every interaction offers.
        n_context_features: How many features every context has (0 gives a None context).
        n_action_features: How many features every action has (0 gives one-hot actions).
        seed: Seed for the generator that draws all features and weights.
    """
    self._args = (n_interactions, n_actions, n_context_features,
                  n_action_features, seed)

    self._n_actions = n_actions
    self._n_context_features = n_context_features
    self._n_action_features = n_action_features
    self._seed = seed

    rng = CobaRandom(seed)

    input_layer_size = n_context_features + n_action_features
    hidden_layer_size = 50

    self._bias = 0

    if input_layer_size:
        # Without action features every action gets its own hidden layer.
        n_hidden_layers = 1 if n_action_features else n_actions

        hidden_weights = []
        for _ in range(n_hidden_layers):
            layer = []
            for _ in range(hidden_layer_size):
                layer.append(rng.gausses(input_layer_size, 0, 1.5))
            hidden_weights.append(layer)

        def hidden_activation(x):
            # sigmoid activation
            return 1 / (1 + math.exp(-x))

        def hidden_output(inputs, weights):
            return hidden_activation(sum(i * w for i, w in zip(inputs, weights)))

        self._output_weights = rng.gausses(hidden_layer_size)
    else:
        # With no inputs at all there is simply one output weight per action.
        self._output_weights = rng.gausses(n_actions)

    def context(index: int) -> Context:
        if not n_context_features:
            return None
        return tuple(rng.gausses(n_context_features))

    def actions(index: int, context: Context) -> Sequence[Action]:
        if not n_action_features:
            return OneHotEncoder().fit_encodes(range(n_actions))
        return [rng.gausses(n_action_features) for _ in range(n_actions)]

    def reward(index: int, context: Context, action: Action) -> float:
        # self._bias and self._output_weights are looked up on every call, so
        # the re-scaling done below applies to all later rewards.
        context = context or []  # handles None context

        if not (n_action_features or n_context_features):
            return self._bias + self._output_weights[action.index(1)]

        if n_action_features:
            inputs = list(context) + list(action)
            layer = hidden_weights[0]
        else:
            inputs = list(context)
            layer = hidden_weights[action.index(1)]

        out_weights = self._output_weights
        hidden_outputs = [hidden_output(inputs, unit) for unit in layer]

        return self._bias + sum(w * hout for w, hout in zip(out_weights, hidden_outputs))

    # Sample rewards from 100 generated interactions to estimate centre and range.
    sampled = []
    for i in range(100):
        c = context(i)
        for a in actions(i, c):
            sampled.append(reward(i, c, a))

    m = mean(sampled)
    s = (max(sampled) - min(sampled)) or 1

    self._bias = 0.5 - m / s
    self._output_weights = [w / s for w in self._output_weights]

    super().__init__(n_interactions, context, actions, reward)