class Warm(EnvironmentFilter):
    """Turn a SimulatedEnvironment into a WarmStartEnvironment."""

    def __init__(self, n_warm: int, seed: int = 1):
        """Instantiate a Warm filter.

        Args:
            n_warm: The number of interactions that should be turned into LoggedInteractions.
            seed: The random number seed that determines the random logging policy for LoggedInteractions.
        """
        self._n_warm = n_warm
        self._seed   = seed

    @property
    def params(self) -> Dict[str, Any]:
        return {"n_warm": self._n_warm}

    def filter(self, interactions: Iterable[SimulatedInteraction]) -> Iterable[Interaction]:
        self._rng = CobaRandom(self._seed)

        underlying_iterable    = iter(interactions)
        logged_interactions    = map(self._to_logged_interaction, islice(underlying_iterable, self._n_warm))
        simulated_interactions = underlying_iterable

        return chain(logged_interactions, simulated_interactions)

    def _to_logged_interaction(self, interaction: SimulatedInteraction) -> LoggedInteraction:
        num_actions   = len(interaction.actions)
        probabilities = [1/num_actions] * num_actions

        selected_index       = self._rng.choice(list(range(num_actions)), probabilities)
        selected_action      = interaction.actions[selected_index]
        selected_probability = probabilities[selected_index]

        kwargs = {"probability": selected_probability, "actions": interaction.actions}

        if "reveals" in interaction.kwargs:
            kwargs["reveal"] = interaction.kwargs["reveals"][selected_index]

        if "rewards" in interaction.kwargs:
            kwargs["reward"] = interaction.kwargs["rewards"][selected_index]

        return LoggedInteraction(interaction.context, selected_action, **kwargs)
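# A minimal standalone sketch (not part of coba) of the warm-start split that Warm.filter
# performs: the first `n_warm` items are consumed by islice and re-labeled as "logged"
# under a uniform-random policy, while everything islice did not consume passes through
# untouched. The names and the dict shape below are illustrative only.

from itertools import islice, chain
import random

def warm_start_split(interactions, n_warm, seed=1):
    rng = random.Random(seed)
    it  = iter(interactions)

    def log_uniformly(actions):
        # pretend each interaction is just a list of candidate actions
        prob = 1 / len(actions)
        return {"action": rng.choice(actions), "probability": prob}

    logged    = map(log_uniformly, islice(it, n_warm))
    simulated = it

    return chain(logged, simulated)

# list(warm_start_split([[0, 1], [0, 1], [0, 1]], n_warm=2))
# -> two logged dicts followed by the untouched [0, 1]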
def process(self, learner: Learner, interactions: Iterable[SimulatedInteraction]) -> Iterable[Dict[Any, Any]]:

    random = CobaRandom(self._seed)

    if not isinstance(learner, SafeLearner):
        learner = SafeLearner(learner)

    if not interactions:
        return

    for interaction in interactions:

        InteractionContext.learner_info.clear()

        context = interaction.context
        actions = interaction.actions

        start_time   = time.time()
        probs, info  = learner.predict(context, actions)
        predict_time = time.time() - start_time

        action = random.choice(actions, probs)
        reveal = interaction.kwargs.get("reveals", interaction.kwargs.get("rewards"))[actions.index(action)]
        prob   = probs[actions.index(action)]

        start_time = time.time()
        learner.learn(context, action, reveal, prob, info)
        learn_time = time.time() - start_time

        learner_info     = InteractionContext.learner_info
        interaction_info = {}

        for k, v in interaction.kwargs.items():
            if isinstance(v, collections.abc.Sequence) and not isinstance(v, str):
                interaction_info[k] = v[actions.index(action)]
            else:
                interaction_info[k] = v

        time_info = {"predict_time": predict_time, "learn_time": learn_time} if self._time else {}

        yield {**interaction_info, **learner_info, **time_info}
class BenchmarkLearner:

    @property
    def family(self) -> str:
        try:
            return self._learner.family
        except AttributeError:
            return self._learner.__class__.__name__

    @property
    def params(self) -> Dict[str, Any]:
        try:
            return self._learner.params
        except AttributeError:
            return {}

    @property
    def full_name(self) -> str:
        if len(self.params) > 0:
            return f"{self.family}({','.join(f'{k}={v}' for k,v in self.params.items())})"
        else:
            return self.family

    def __init__(self, learner: Learner[Context, Action], seed: Optional[int]) -> None:
        self._learner = learner
        self._random  = CobaRandom(seed)

    def init(self) -> None:
        try:
            self._learner.init()
        except AttributeError:
            pass

    def choose(self, key: Key, context: Context, actions: Sequence[Action]) -> Tuple[Choice, float]:
        p = self._learner.predict(key, context, actions)
        c = self._random.choice(list(range(len(actions))), p)
        return c, p[c]

    def learn(self, key: Key, context: Context, action: Action, reward: Reward, probability: float) -> None:
        self._learner.learn(key, context, action, reward, probability)
def test_rejection_learn(self):
    actions = [0, 1]
    base1   = ReceivedLearnFixedLearner([1/2, 1/2], 'a')
    base2   = ReceivedLearnFixedLearner([1/4, 3/4], 'b')
    learner = CorralLearner([base1, base2], eta=0.5, mode="rejection")

    predict, info = learner.predict(None, actions)

    action      = actions[0]
    probability = predict[0]
    reward      = 1

    base1_learn_cnt = [0, 0]
    base2_learn_cnt = [0, 0]

    random = CobaRandom(1)

    for _ in range(1000):
        action      = random.choice(actions, predict)
        probability = predict[actions.index(action)]

        learner.learn(None, action, reward, probability, info)

        base1_learn_cnt[action] += int(base1.received_learn is not None)
        base2_learn_cnt[action] += int(base2.received_learn is not None)

        base1.received_learn = None
        base2.received_learn = None

    self.assertLessEqual(abs(base1_learn_cnt[0]/sum(base1_learn_cnt) - 1/2), .02)
    self.assertLessEqual(abs(base1_learn_cnt[1]/sum(base1_learn_cnt) - 1/2), .02)
    self.assertLessEqual(abs(base2_learn_cnt[0]/sum(base2_learn_cnt) - 1/4), .02)
    self.assertLessEqual(abs(base2_learn_cnt[1]/sum(base2_learn_cnt) - 3/4), .02)
class CorralLearner(Learner):
    """This is an implementation of the Agarwal et al. (2017) Corral algorithm.

    This algorithm assumes that the reward distribution has support in [0,1] and
    implements the remark on pg. 8 to improve learning efficiency when multiple
    bandits select the same action.

    References:
        Agarwal, Alekh, Haipeng Luo, Behnam Neyshabur, and Robert E. Schapire. "Corralling a band
        of bandit algorithms." In Conference on Learning Theory, pp. 12-38. PMLR, 2017.
    """

    def __init__(self, base_learners: Sequence[Learner], eta: float, T: float = math.inf, seed: int = None) -> None:
        """Instantiate a CorralLearner.

        Args:
            base_learners: The collection of algorithms to use as base learners.
            eta: The learning rate. In our experiments a value between 0.05 and .10 often seemed best.
            T: The number of interactions expected during the learning process. In our experiments
                Corral performance seemed relatively insensitive to this value.
            seed: A seed for random number generation in order to get repeatable results.
        """
        self._base_learners = base_learners

        M = len(self._base_learners)

        self._gamma = 1/T
        self._beta  = 1/math.exp(1/math.log(T))

        self._eta_init = eta
        self._etas     = [ eta ] * M
        self._rhos     = [ float(2*M) ] * M
        self._ps       = [ 1/M ] * M
        self._p_bars   = [ 1/M ] * M

        self._random = CobaRandom(seed)

        self._base_action_picks: Dict[Key, Sequence[Action]] = {}
        self._base_action_probs: Dict[Key, Sequence[float]]  = {}

    @property
    def family(self) -> str:
        """The family of the learner.

        See the base class for more information.
        """
        return "corral"

    @property
    def params(self) -> Dict[str, Any]:
        """The parameters of the learner.

        See the base class for more information.
        """
        return {"eta": self._eta_init, "B": [ b.family for b in self._base_learners ]}

    def predict(self, key: Key, context: Context, actions: Sequence[Action]) -> Sequence[float]:
        """Determine a PMF with which to select the given actions.

        Args:
            key: The key identifying the interaction we are choosing for.
            context: The context we're currently in. See the base class for more information.
            actions: The actions to choose from. See the base class for more information.

        Returns:
            The probability of taking each action. See the base class for more information.
        """
        base_predicts = [ base_algorithm.predict(key, context, actions) for base_algorithm in self._base_learners ]

        base_action_picks = [ self._random.choice(actions, predict) for predict in base_predicts ]
        base_action_probs = [ predict[actions.index(action)] for action, predict in zip(base_action_picks, base_predicts) ]

        self._base_action_picks[key] = base_action_picks
        self._base_action_probs[key] = base_action_probs

        return [ sum([p_b*int(a==b_a) for p_b, b_a in zip(self._p_bars, base_action_picks)]) for a in actions ]

    def learn(self, key: Key, context: Context, action: Action, reward: float, probability: float) -> None:
        """Learn from the given interaction.

        Args:
            key: The key identifying the interaction this observed reward came from.
            context: The context we're learning about. See the base class for more information.
            action: The action that was selected in the context. See the base class for more information.
            reward: The reward that was gained from the action. See the base class for more information.
            probability: The probability that the given action was taken.
        """
        loss = 1 - reward

        assert 0 <= loss and loss <= 1, "The current Corral implementation assumes a loss between 0 and 1"

        base_action_picks = self._base_action_picks.pop(key)
        base_action_probs = self._base_action_probs.pop(key)

        losses  = [ loss  /probability * int(act==action) for act in base_action_picks ]
        rewards = [ reward/probability * int(act==action) for act in base_action_picks ]

        for learner, action, R, P in zip(self._base_learners, base_action_picks, rewards, base_action_probs):
            learner.learn(key, context, action, R, P)  # COBA learners assume a reward

        self._ps     = list(self._log_barrier_omd(losses))
        self._p_bars = [ (1-self._gamma)*p + self._gamma*1/len(self._base_learners) for p in self._ps ]

        for i in range(len(self._base_learners)):
            if 1/self._p_bars[i] > self._rhos[i]:
                self._rhos[i] = 2/self._p_bars[i]
                self._etas[i] *= self._beta

    def _log_barrier_omd(self, losses) -> Sequence[float]:

        f  = lambda l: float(sum([ 1/((1/p) + eta*(loss-l))      for p, eta, loss in zip(self._ps, self._etas, losses)]))
        df = lambda l: float(sum([ eta/((1/p) + eta*(loss-l))**2 for p, eta, loss in zip(self._ps, self._etas, losses)]))

        denom_zeros = [ ((-1/p)-(eta*loss))/-eta for p, eta, loss in zip(self._ps, self._etas, losses) ]

        min_loss = min(losses)
        max_loss = max(losses)

        precision = 4

        def newtons_zero(l, r) -> Optional[float]:
            """Use Newton's method to calculate the root."""

            # depending on scales this check may fail though that seems unlikely
            if (f(l+.0001)-1) * (f(r-.00001)-1) >= 0:
                return None

            i = 0
            x = (l+r)/2

            while True:
                i += 1

                if df(x) == 0:
                    raise Exception(f'Something went wrong in Corral (0) {self._ps}, {self._etas}, {losses}, {x}')

                x -= (f(x)-1)/df(x)

                if round(f(x), precision) == 1:
                    return x

                if (i % 30000) == 0:
                    print(i)

        lmbda: Optional[float] = None

        if min_loss == max_loss:
            lmbda = min_loss
        elif min_loss not in denom_zeros and round(f(min_loss), precision) == 1:
            lmbda = min_loss
        elif max_loss not in denom_zeros and round(f(max_loss), precision) == 1:
            lmbda = max_loss
        else:
            brackets = list(sorted(filter(lambda z: min_loss <= z and z <= max_loss, set(denom_zeros + [min_loss, max_loss]))))

            for l_brack, r_brack in zip(brackets[:-1], brackets[1:]):
                lmbda = newtons_zero(l_brack, r_brack)
                if lmbda is not None:
                    break

        if lmbda is None:
            raise Exception(f'Something went wrong in Corral (None) {self._ps}, {self._etas}, {losses}')

        return [ max(1/((1/p) + eta*(loss-lmbda)), .00001) for p, eta, loss in zip(self._ps, self._etas, losses)]
def __init__(self,
    n_interactions: int,
    n_actions: int = 10,
    n_context_features: int = 10,
    n_action_features: int = 10,
    reward_features: Sequence[str] = ["a", "xa"],
    seed: int = 1) -> None:
    """Instantiate a LinearSyntheticSimulation.

    Args:
        n_interactions: The number of interactions the simulation should have.
        n_actions: The number of actions each interaction should have.
        n_context_features: The number of features each context should have.
        n_action_features: The number of features each action should have.
        reward_features: The features in the simulation's linear reward function.
        seed: The random number seed used to generate all features, weights and noise in the simulation.
    """
    self._args = (n_interactions, n_actions, n_context_features, n_action_features, reward_features, seed)

    self._n_actions          = n_actions
    self._n_context_features = n_context_features
    self._n_action_features  = n_action_features
    self._reward_features    = reward_features
    self._seed               = seed

    if not self._n_context_features:
        reward_features = list(set(filter(None, [f.replace('x', '') for f in reward_features])))

    if not self._n_action_features:
        reward_features = list(set(filter(None, [f.replace('a', '') for f in reward_features])))

    rng          = CobaRandom(seed)
    feat_encoder = InteractionsEncoder(reward_features)

    # To try and make sure high-order polynomials are well behaved we center our context
    # and action features on 1 and give them a very small amount of variance. Then, in
    # post processing, we shift and re-scale our reward to center and fill in [0,1].
    max_degree = max([len(f) for f in reward_features]) if reward_features else 1
    feat_gen   = lambda n: tuple([ g*rng.choice([1, -1]) for g in rng.gausses(n, mu=1, sigma=1/(2*max_degree)) ])

    one_hot_acts = OneHotEncoder().fit_encodes(range(n_actions))

    feature_count = len(feat_encoder.encode(x=[1]*n_context_features, a=[1]*n_action_features))
    weight_parts  = 1 if n_action_features else n_actions
    weight_count  = 1 if feature_count == 0 else feature_count

    self._weights = [ [ 1-2*w for w in rng.randoms(weight_count) ] for _ in range(weight_parts) ]

    self._bias = 0
    self._clip = False

    def context(index: int) -> Context:
        return feat_gen(n_context_features) if n_context_features else None

    def actions(index: int, context: Context) -> Sequence[Action]:
        return [ feat_gen(n_action_features) for _ in range(n_actions) ] if n_action_features else one_hot_acts

    def reward(index: int, context: Context, action: Action) -> float:
        F = feat_encoder.encode(x=context, a=action) or [1]
        W = self._weights[0 if n_action_features else action.index(1)]

        return self._bias + sum([ w*f for w, f in zip(W, F) ])

    rewards = [ reward(i, c, a) for i in range(100) for c in [context(i)] for a in actions(i, c) ]

    m = mean(rewards)
    s = (max(rewards) - min(rewards)) or 1

    self._bias    = 0.5 - m/s
    self._weights = [ [ w/s for w in W ] for W in self._weights ]
    self._clip    = True

    super().__init__(n_interactions, context, actions, reward)
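# A standalone arithmetic sketch (not part of the class above) of the post-processing
# rescaling used at the end of __init__: with bias = 0.5 - m/s and the weights divided
# by s, a raw reward r becomes 0.5 + (r - m)/s, which is centered on 0.5 and spans at
# most one unit, so it approximately fills [0,1] before any clipping is applied.

def rescale_demo(raw_rewards):
    m = sum(raw_rewards) / len(raw_rewards)
    s = (max(raw_rewards) - min(raw_rewards)) or 1

    bias = 0.5 - m / s

    return [bias + r / s for r in raw_rewards]

# rescale_demo([2.0, 3.0, 4.0]) -> [0.0, 0.5, 1.0]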
class CorralLearner(Learner):
    """A meta-learner that takes a collection of learners and determines which is best in an environment.

    This is an implementation of the Agarwal et al. (2017) Corral algorithm
    and requires that the reward is always in [0,1].

    References:
        Agarwal, Alekh, Haipeng Luo, Behnam Neyshabur, and Robert E. Schapire. "Corralling a band
        of bandit algorithms." In Conference on Learning Theory, pp. 12-38. PMLR, 2017.
    """

    def __init__(self,
        learners: Sequence[Learner],
        eta     : float = 0.075,
        T       : float = math.inf,
        mode    : Literal["importance","rejection","off-policy"] = "importance",
        seed    : int = 1) -> None:
        """Instantiate a CorralLearner.

        Args:
            learners: The collection of base learners.
            eta: The learning rate. This controls how quickly Corral picks a best base_learner.
            T: The number of interactions expected during the learning process. A small T will
                cause the learning rate to shrink towards 0 quickly while a large value for T
                will cause the learning rate to shrink towards 0 slowly. A value of inf means
                that the learning rate will remain constant.
            mode: Determines the method with which feedback is provided to the base learners.
                The original paper used importance sampling. We also support `off-policy` and
                `rejection`.
            seed: A seed for random number generation in order to get repeatable results.
        """
        if mode not in ["importance", "off-policy", "rejection"]:
            raise CobaException("The provided `mode` for CorralLearner was unrecognized.")

        self._base_learners = [ SafeLearner(learner) for learner in learners ]

        M = len(self._base_learners)

        self._T     = T
        self._gamma = 1/T
        self._beta  = 1/math.exp(1/math.log(T))

        self._eta_init = eta
        self._etas     = [ eta ] * M
        self._rhos     = [ float(2*M) ] * M
        self._ps       = [ 1/M ] * M
        self._p_bars   = [ 1/M ] * M

        self._mode = mode

        self._random_pick   = CobaRandom(seed)
        self._random_reject = CobaRandom(CobaRandom(seed).randint(0, 10000))

    @property
    def params(self) -> Dict[str, Any]:
        return {
            "family": "corral",
            "eta"   : self._eta_init,
            "mode"  : self._mode,
            "T"     : self._T,
            "B"     : [ str(b) for b in self._base_learners ],
            "seed"  : self._random_pick._seed
        }

    def predict(self, context: Context, actions: Sequence[Action]) -> Tuple[Probs, Info]:

        base_predicts = [ base_algorithm.predict(context, actions) for base_algorithm in self._base_learners ]
        base_predicts, base_infos = zip(*base_predicts)

        if self._mode in ["importance"]:
            base_actions = [ self._random_pick.choice(actions, predict) for predict in base_predicts ]
            base_probs   = [ predict[actions.index(action)] for action, predict in zip(base_actions, base_predicts) ]

            predict = [ sum([p_b*int(a==b_a) for p_b, b_a in zip(self._p_bars, base_actions)]) for a in actions ]
            info    = (base_actions, base_probs, base_infos, base_predicts, actions, predict)

        if self._mode in ["off-policy", "rejection"]:
            predict = [ sum([p_b*b_p[i] for p_b, b_p in zip(self._p_bars, base_predicts)]) for i in range(len(actions)) ]
            info    = (None, None, base_infos, base_predicts, actions, predict)

        return (predict, info)

    def learn(self, context: Context, action: Action, reward: float, probability: float, info: Info) -> None:

        assert 0 <= reward and reward <= 1, "This Corral implementation assumes a reward between 0 and 1"

        base_actions = info[0]
        base_probs   = info[1]
        base_infos   = info[2]
        base_preds   = info[3]
        actions      = info[4]
        predict      = info[5]

        if self._mode == "importance":
            # This is what is in the original paper. It has the following characteristics:
            #   > It is able to provide feedback to every base learner on every iteration
            #   > It uses a reward estimator with higher variance and no bias (aka, importance sampling)
            #   > It is "on-policy" with respect to the base learners' prediction distributions
            # The reward, R, supplied to the base learners satisfies E[R|context,A] = E[reward|context,A]
            for learner, A, P, base_info in zip(self._base_learners, base_actions, base_probs, base_infos):
                R = reward * int(A==action)/probability
                learner.learn(context, A, R, P, base_info)

        if self._mode == "off-policy":
            # An alternative variation to the paper is provided below. It has the following characteristics:
            #   > It is able to provide feedback to every base learner on every iteration
            #   > It uses a MVUB reward estimator (aka, the unmodified, observed reward)
            #   > It is "off-policy" (i.e., base learners receive action feedback distributed differently from their predicts)
            for learner, base_info in zip(self._base_learners, base_infos):
                learner.learn(context, action, reward, probability, base_info)

        if self._mode == "rejection":
            # An alternative variation to the paper is provided below. It has the following characteristics:
            #   > It doesn't necessarily provide feedback to every base learner on every iteration
            #   > It uses a MVUB reward estimator (aka, the unmodified, observed reward) when it does provide feedback
            #   > It is "on-policy" (i.e., base learners receive action feedback distributed identically to their predicts)
            p = self._random_reject.random()  # can I reuse this across all learners like this??? I think so???
            for learner, base_info, base_predict in zip(self._base_learners, base_infos, base_preds):
                f = lambda a: base_predict[actions.index(a)]  # the PMF we want
                g = lambda a: predict[actions.index(a)]       # the PMF we have

                M = max([ f(A)/g(A) for A in actions if g(A) > 0 ])

                if p <= f(action)/(M*g(action)):
                    learner.learn(context, action, reward, f(action), base_info)

        # Instant loss is an unbiased estimate of E[loss|learner] for this iteration.
        # Our estimate differs from the original Corral paper because we have access to the
        # action probabilities of the base learners while the Corral paper did not assume
        # access to this information. This information allows for a loss estimator with the
        # same expectation as the original Corral paper's estimator but with a lower variance.

        loss = 1 - reward

        picked_index = actions.index(action)
        instant_loss = [ loss * base_pred[picked_index]/probability for base_pred in base_preds ]

        self._ps     = CorralLearner._log_barrier_omd(self._ps, instant_loss, self._etas)
        self._p_bars = [ (1-self._gamma)*p + self._gamma*1/len(self._base_learners) for p in self._ps ]

        for i in range(len(self._base_learners)):
            if 1/self._p_bars[i] > self._rhos[i]:
                self._rhos[i] = 2/self._p_bars[i]
                self._etas[i] *= self._beta

        base_predict_data = { f"predict_{i}": base_preds[i][picked_index] for i in range(len(self._base_learners)) }
        base_pbar_data    = { f"pbar_{i}"   : self._p_bars[i]             for i in range(len(self._base_learners)) }
        predict_data      = { "predict": probability, **base_predict_data, **base_pbar_data }

        InteractionContext.learner_info.update({**predict_data, **base_predict_data, **base_pbar_data})

    @staticmethod
    def _log_barrier_omd(ps, losses, etas) -> Sequence[float]:

        f  = lambda l: float(sum([ 1/((1/p) + eta*(loss-l))      for p, eta, loss in zip(ps, etas, losses)]))
        df = lambda l: float(sum([ eta/((1/p) + eta*(loss-l))**2 for p, eta, loss in zip(ps, etas, losses)]))

        denom_zeros = [ ((-1/p)-(eta*loss))/-eta for p, eta, loss in zip(ps, etas, losses) ]

        min_loss = min(losses)
        max_loss = max(losses)

        precision = 4

        def binary_search(l, r) -> Optional[float]:
            # in theory the bracketing check in find_root_of_1 guarantees this has a solution
            while True:
                x = (l+r)/2
                y = f(x)

                if round(y, precision) == 1:
                    return x

                if y < 1:
                    l = x

                if y > 1:
                    r = x

        def find_root_of_1():
            brackets = list(sorted(filter(lambda z: min_loss <= z and z <= max_loss, set(denom_zeros + [min_loss, max_loss]))))

            for l_brack, r_brack in zip(brackets[:-1], brackets[1:]):
                if (f(l_brack+.00001)-1) * (f(r_brack-.00001)-1) >= 0:
                    continue
                else:
                    # we use binary search because Newton's method can overshoot our objective
                    return binary_search(l_brack, r_brack)

        lmbda: Optional[float] = None

        if min_loss == max_loss:
            lmbda = min_loss
        elif min_loss not in denom_zeros and round(f(min_loss), precision) == 1:
            lmbda = min_loss
        elif max_loss not in denom_zeros and round(f(max_loss), precision) == 1:
            lmbda = max_loss
        else:
            lmbda = find_root_of_1()

        if lmbda is None:
            raise Exception(f'Something went wrong in Corral OMD {ps}, {etas}, {losses}')

        new_ps = [ 1/((1/p) + eta*(loss-lmbda)) for p, eta, loss in zip(ps, etas, losses)]

        assert round(sum(new_ps), precision) == 1, "An invalid update was made by the log barrier in Corral"

        return new_ps
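# A standalone sketch (not part of CorralLearner) of the rejection-sampling rule used in
# mode="rejection": actions are drawn from the meta PMF g, and a base learner with PMF f
# only receives feedback when u <= f(a)/(M*g(a)) with M = max_a f(a)/g(a). The actions
# that survive the filter are then distributed (approximately) like f, which is what the
# test_rejection_learn test above checks. The PMFs below are hypothetical.

import random
from collections import Counter

def rejection_demo(n=100_000, seed=1):
    rng = random.Random(seed)

    actions = [0, 1]
    f = [0.25, 0.75]   # the base learner's PMF (what we want)
    g = [0.50, 0.50]   # the meta-learner's PMF (what we have)

    M = max(fa/ga for fa, ga in zip(f, g) if ga > 0)

    accepted = Counter()
    for _ in range(n):
        a = rng.choices(actions, weights=g)[0]
        if rng.random() <= f[a] / (M * g[a]):
            accepted[a] += 1

    total = sum(accepted.values())
    return {a: accepted[a]/total for a in actions}   # ~= {0: 0.25, 1: 0.75}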
def _process_chunk(self, task_group: Iterable[BenchmarkTask]) -> Iterable[Any]:

    source_by_id = { t.src_id: t.simulation.source for t in task_group }
    filter_by_id = { t.sim_id: t.simulation.filter for t in task_group }

    srt_src = lambda t: t.src_id
    grp_src = lambda t: t.src_id
    srt_sim = lambda t: t.sim_id
    grp_sim = lambda t: t.sim_id

    with CobaConfig.Logger.log(f"Processing chunk..."):

        for src_id, tasks_by_src in groupby(sorted(task_group, key=srt_src), key=grp_src):

            try:
                with CobaConfig.Logger.time(f"Creating source {src_id} from {source_by_id[src_id]}..."):
                    # This is not ideal. I'm not sure how it should be improved so I'm leaving it for now.
                    loaded_source = list(source_by_id[src_id].read())

                for sim_id, tasks_by_src_sim in groupby(sorted(tasks_by_src, key=srt_sim), key=grp_sim):

                    tasks_by_src_sim_list = list(tasks_by_src_sim)

                    learner_ids = [ t.lrn_id  for t in tasks_by_src_sim_list ]
                    learners    = [ t.learner for t in tasks_by_src_sim_list ]
                    seeds       = [ t.seed    for t in tasks_by_src_sim_list ]

                    learner_ids.reverse()
                    learners.reverse()

                    with CobaConfig.Logger.time(f"Creating simulation {sim_id} from source {src_id}..."):
                        interactions = filter_by_id[sim_id].filter(loaded_source)

                    if not interactions:
                        CobaConfig.Logger.log(f"Simulation {sim_id} has nothing to evaluate (likely due to `take` being larger than the simulation).")
                        continue

                    for index in sorted(range(len(learners)), reverse=True):

                        lrn_id  = learner_ids[index]
                        learner = deepcopy(learners[index])
                        random  = CobaRandom(seeds[index])

                        try:
                            with CobaConfig.Logger.time(f"Evaluating learner {lrn_id} on Simulation {sim_id}..."):

                                row_data = defaultdict(list)

                                for i, interaction in enumerate(interactions):

                                    probs = learner.predict(i, interaction.context, interaction.actions)

                                    assert abs(sum(probs) - 1) < .0001, "The learner returned invalid probabilities for action choices."

                                    action = random.choice(interaction.actions, probs)
                                    reward = interaction.feedbacks[interaction.actions.index(action)]
                                    prob   = probs[interaction.actions.index(action)]

                                    info = learner.learn(i, interaction.context, action, reward, prob) or {}

                                    for key, value in info.items() | { ('reward', reward) }:
                                        row_data[key].append(value)

                                yield Transaction.interactions(sim_id, lrn_id, _packed=row_data)

                        except Exception as e:
                            CobaConfig.Logger.log_exception(e)

                        finally:
                            del learner_ids[index]
                            del learners[index]

            except Exception as e:
                CobaConfig.Logger.log_exception(e)
def test_cb_adf_learning(self):
    learner = VowpalArgsLearner()

    n_actions  = 3
    n_features = 10
    n_examples = 2000

    rng = CobaRandom(11111)

    contexts = [ rng.randoms(n_features) for _ in range(n_examples) ]

    pre_learn_rewards = []
    for context in contexts[:int(.9*n_examples)]:
        actions = [ rng.randoms(n_features) for _ in range(n_actions) ]
        rewards = [ sum([a*c for a, c in zip(action, context)]) for action in actions ]
        rewards = [ int(r == max(rewards)) for r in rewards ]

        pre_learn_rewards.append(rng.choice(rewards, learner.predict(context, actions)[0]))

    for context in contexts[:int(.9*n_examples)]:
        actions = [ rng.randoms(n_features) for _ in range(n_actions) ]
        rewards = [ sum([a*c for a, c in zip(action, context)]) for action in actions ]
        rewards = [ int(r == max(rewards)) for r in rewards ]

        probs, info = learner.predict(context, actions)
        choice      = rng.choice(list(range(3)), probs)

        learner.learn(context, actions[choice], rewards[choice], probs[choice], info)

    post_learn_rewards = []
    for context in contexts[int(.9*n_examples):]:
        actions = [ rng.randoms(n_features) for _ in range(n_actions) ]
        rewards = [ sum([a*c for a, c in zip(action, context)]) for action in actions ]
        rewards = [ int(r == max(rewards)) for r in rewards ]

        post_learn_rewards.append(rng.choice(rewards, learner.predict(context, actions)[0]))

    average_pre_learn_reward  = sum(pre_learn_rewards)/len(pre_learn_rewards)
    average_post_learn_reward = sum(post_learn_rewards)/len(post_learn_rewards)

    self.assertAlmostEqual(.33, average_pre_learn_reward, places=2)
    self.assertAlmostEqual(.78, average_post_learn_reward, places=2)