Python CobaRandom Exemples, coba.random.CobaRandom Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : corral.py Projet : VowpalWabbit/coba

    def __init__(self, base_learners: Sequence[Learner], eta: float, T: float = math.inf, seed: int = None) -> None:
        """Create a CorralLearner over a collection of base learners.

        Args:
            base_learners: The algorithms Corral chooses among.
            eta: The initial learning rate given to every base learner. In our experiments a
                value between 0.05 and .10 often seemed best.
            T: The number of interactions expected during the learning process. In our
                experiments Corral performance seemed relatively insensitive to this value.
            seed: A seed for random number generation in order to get repeatable results.
        """

        self._base_learners = base_learners

        n_learners = len(self._base_learners)

        # Both constants are derived from the expected horizon T.
        self._gamma = 1/T
        self._beta  = 1/math.exp(1/math.log(T))

        # Every base learner starts with the same learning rate and threshold.
        self._eta_init = eta
        self._etas     = [ eta ] * n_learners
        self._rhos     = [ float(2*n_learners) ] * n_learners

        # Every base learner starts with the same selection probability.
        uniform_p      = 1/n_learners
        self._ps       = [ uniform_p ] * n_learners
        self._p_bars   = [ uniform_p ] * n_learners

        self._random   = CobaRandom(seed)

        # Per-key bookkeeping, empty until filled in elsewhere.
        self._base_action_picks : Dict[Key, Sequence[Action]] = {}
        self._base_action_probs: Dict[Key, Sequence[float]]  = {}

Exemple #2

0

Afficher le fichier

Fichier : filters.py Projet : anrath/coba

    def filter(self, items: Iterable[Any]) -> Sequence[Any]:
        """Randomly reorder or sample `items`, then pass the result through `Take(self._count)`.

        Three cases are handled, depending on `self._max_count`:
            * `0`: nothing is read from `items` and an empty list is returned.
            * `None`: every item is materialized, shuffled and handed to `Take`.
            * otherwise: the first `max_count` items seed a shuffled reservoir and later
              items overwrite random reservoir slots after random-length skips, until
              `items` is exhausted.

        Args:
            items: The items to draw from. The iterable is consumed by this call.

        Returns:
            Whatever `Take(self._count).filter(...)` returns for the shuffled items or reservoir.
        """

        rng = CobaRandom(self._seed)

        if self._max_count == 0:
            return []

        # None is a singleton, so test identity rather than equality (PEP 8).
        if self._max_count is None:
            return Take(self._count).filter(rng.shuffle(list(items)))

        W = 1
        items = iter(items)
        reservoir = rng.shuffle(list(islice(items, self._max_count)))

        try:
            while True:
                [r1, r2, r3] = rng.randoms(3)

                # W is updated every round and determines S, the number of items to skip.
                W = W * math.exp(math.log(r1) / (self._max_count or 1))
                S = math.floor(math.log(r2) / math.log(1 - W))

                # The right-hand side is evaluated first: it skips S items and takes the next
                # one, raising StopIteration (which ends the loop) once `items` runs out.
                reservoir[int(r3 * self._max_count - .001)] = next(
                    islice(items, S, S + 1))
        except StopIteration:
            pass

        return Take(self._count).filter(reservoir)

Exemple #3

0

Afficher le fichier

    def filter(self,
               interactions: Iterable[Interaction]) -> Iterable[Interaction]:
        """Move interactions from the end of the sequence to randomized, spaced positions.

        The interactions are copied into a list. Then, once for every `spacing + 1`
        interactions, the current last item is popped and re-inserted at
        `i * spacing + rng.randint(0, spacing)`, where `i` counts the moves made so far.

        Args:
            interactions: The interactions to rearrange.

        Returns:
            A new list holding the same interactions in their rearranged order.
        """

        rng = CobaRandom(self._seed)
        rearranged = list(interactions)

        n_moves = int(len(rearranged) / (self._spacing + 1))

        for move in range(n_moves):
            # The target is drawn before the pop, matching left-to-right argument evaluation.
            target = move * self._spacing + rng.randint(0, self._spacing)
            moved = rearranged.pop()
            rearranged.insert(target, moved)

        return rearranged

Exemple #4

0

Afficher le fichier

    def filter(
            self, interactions: Iterable[SimulatedInteraction]
    ) -> Iterable[Interaction]:
        """Lazily convert the first `n_warm` interactions and pass the rest through untouched.

        Args:
            interactions: The simulated interactions to split into a warm and a regular part.

        Returns:
            A lazy chain of the first `self._n_warm` interactions mapped through
            `self._to_logged_interaction`, followed by the remaining interactions as given.
        """

        # Re-seeded on every call; `_to_logged_interaction` draws from this generator.
        self._rng = CobaRandom(self._seed)

        # A single shared iterator lets the tail resume where the warm slice stops.
        remaining = iter(interactions)
        warm_slice = islice(remaining, self._n_warm)

        return chain(map(self._to_logged_interaction, warm_slice), remaining)

Exemple #5

0

Afficher le fichier

    def filter(
        self, interactions: Iterable[SimulatedInteraction]
    ) -> Iterable[SimulatedInteraction]:
        """Yield a noisy copy of every simulated interaction.

        The context and each action are passed through `self._noises`. Rewards are
        passed through it as well, but only when the interaction has a 'rewards' entry
        and `self._reward_noise` is truthy. Only the noisy rewards (if any) are forwarded
        as keyword arguments to the new interaction.

        Args:
            interactions: The interactions to add noise to.

        Yields:
            A new SimulatedInteraction for every given interaction.

        Raises:
            CobaException: When a LoggedInteraction is encountered.
        """

        rng = CobaRandom(self._seed)

        for original in interactions:

            if isinstance(original, LoggedInteraction):
                raise CobaException(
                    "We do not currently support adding noise to a LoggedInteraction."
                )

            # The order of the calls below fixes the order of draws from the shared rng.
            context = self._noises(original.context, rng, self._context_noise)

            actions = []
            for action in original.actions:
                actions.append(self._noises(action, rng, self._action_noise))

            extras = {}

            if 'rewards' in original.kwargs:
                if self._reward_noise:
                    extras['rewards'] = self._noises(
                        original.kwargs['rewards'], rng, self._reward_noise)

            yield SimulatedInteraction(context, actions, **extras)

Exemple #6

0

Afficher le fichier

Fichier : tasks.py Projet : anrath/coba

    def process(
        self, learner: Learner, interactions: Iterable[SimulatedInteraction]
    ) -> Iterable[Dict[Any, Any]]:
        """Run a learner through the interactions and yield one result row per interaction.

        For every interaction the learner predicts action probabilities, an action is
        sampled from them with a CobaRandom seeded by `self._seed`, and the learner then
        learns from the feedback belonging to the sampled action.

        Args:
            learner: The learner to evaluate. It is wrapped in a SafeLearner if it is not one.
            interactions: The interactions to evaluate the learner on.

        Yields:
            A dict merging the interaction's kwargs (sequence values are reduced to the entry
            of the chosen action), the contents of `InteractionContext.learner_info` and, when
            `self._time` is truthy, the measured "predict_time" and "learn_time".
        """

        random = CobaRandom(self._seed)

        if not isinstance(learner, SafeLearner): learner = SafeLearner(learner)
        if not interactions: return

        for interaction in interactions:

            InteractionContext.learner_info.clear()

            context = interaction.context
            actions = interaction.actions

            start_time = time.time()
            probs, info = learner.predict(context, actions)
            predict_time = time.time() - start_time

            action = random.choice(actions, probs)

            # Look the chosen action up once; it used to be searched for again for the
            # reveal, the probability and every sequence-valued kwarg.
            action_index = actions.index(action)

            # "reveals" take precedence over "rewards" as the feedback given to the learner.
            feedback = interaction.kwargs.get(
                "reveals",
                interaction.kwargs.get("rewards"))
            reveal = feedback[action_index]
            prob = probs[action_index]

            start_time = time.time()
            learner.learn(context, action, reveal, prob, info)
            learn_time = time.time() - start_time

            learner_info = InteractionContext.learner_info
            interaction_info = {}

            for k, v in interaction.kwargs.items():
                # Non-string sequences are treated as per-action values.
                if isinstance(v, collections.abc.Sequence) and not isinstance(
                        v, str):
                    interaction_info[k] = v[action_index]
                else:
                    interaction_info[k] = v

            time_info = {
                "predict_time": predict_time,
                "learn_time": learn_time
            } if self._time else {}

            yield {**interaction_info, **learner_info, **time_info}

Exemple #7

0

Afficher le fichier

Fichier : corral.py Projet : anrath/coba

    def __init__(self, 
        learners: Sequence[Learner], 
        eta     : float = 0.075,
        T       : float = math.inf, 
        mode    : Literal["importance","rejection","off-policy"] ="importance", 
        seed    : int = 1) -> None:
        """Create a CorralLearner.

        Args:
            learners: The collection of base learners.
            eta: The learning rate. This controls how quickly Corral picks a best base_learner.
            T: The number of interactions expected during the learning process. A small T will
                cause the learning rate to shrink towards 0 quickly while a large value for T
                will cause the learning rate to shrink towards 0 slowly. A value of inf means
                that the learning rate will remain constant.
            mode: Determines the method with which feedback is provided to the base learners.
                The original paper used importance sampling. We also support `off-policy`
                and `rejection`.
            seed: A seed for random number generation in order to get repeatable results.

        Raises:
            CobaException: When `mode` is not one of the supported values.
        """
        supported_modes = ("importance", "off-policy", "rejection")

        if mode not in supported_modes:
            raise CobaException("The provided `mode` for CorralLearner was unrecognized.")

        # Every base learner is wrapped in a SafeLearner.
        self._base_learners = []
        for learner in learners:
            self._base_learners.append(SafeLearner(learner))

        n_learners = len(self._base_learners)

        self._T     = T
        self._gamma = 1/T
        self._beta  = 1/math.exp(1/math.log(T))

        # All base learners start out with identical rates, thresholds and probabilities.
        self._eta_init = eta
        self._etas     = [ eta ] * n_learners
        self._rhos     = [ float(2*n_learners) ] * n_learners

        uniform_p      = 1/n_learners
        self._ps       = [ uniform_p ] * n_learners
        self._p_bars   = [ uniform_p ] * n_learners

        self._mode = mode

        # The second generator is seeded with a draw from a fresh generator built on `seed`.
        self._random_pick   = CobaRandom(seed)
        reject_seed         = CobaRandom(seed).randint(0,10000)
        self._random_reject = CobaRandom(reject_seed)

Exemple #8

0

Afficher le fichier

    def read(self) -> Iterable[SimulatedInteraction]:
        """Turn the (features, label) pairs of the source into simulated interactions.

        Every item read from `self._source` becomes one interaction whose context is the
        item's features. All interactions share the same action list, and the rewards of an
        interaction are computed by comparing each action with the item's label.

        Yields:
            One SimulatedInteraction per item in the source. Nothing is yielded when the
            source is empty.
        """

        items = list(self._source.read())

        # This function is a generator, so returning here simply ends it without yielding.
        if not items: return []

        features, labels = zip(*items)

        if self._label_type == "R":
            max_n_actions = 10

            # Min-max scale the labels, (l - min_l) / (max_l - min_l), so their range is 1.
            # NOTE(review): max_l == min_l makes the divisor 0 -- confirm callers never
            # pass labels that are all identical.
            min_l, max_l = min(labels), max(labels)
            labels = [
                float(l) / (max_l - min_l) - (min_l / (max_l - min_l))
                for l in labels
            ]

            # With few labels every label is an action; otherwise `percentile` picks the
            # action values (presumably evenly spaced percentiles -- TODO confirm).
            if len(labels) <= max_n_actions:
                actions = labels
            else:
                actions = percentile(labels, [
                    i / (max_n_actions + 1)
                    for i in range(1, max_n_actions + 1)
                ])

            # Actions are exposed as encodings; `values` maps an encoding back to its value.
            values = dict(zip(OneHotEncoder().fit_encodes(actions), actions))
            actions = list(values.keys())

            # The reward is 1 minus the absolute distance between action value and label.
            reward = lambda action, label: 1 - abs(values[action] - float(label
                                                                          ))
        else:
            # There is no reliable way to tell featurized labels from multilabels, so for now
            # multilabels are assumed to be passed in as arrays (unhashable), not tuples.
            if not isinstance(labels[0], collections.abc.Hashable):
                actions = list(chain.from_iterable(labels))
            else:
                actions = list(labels)

            # The reward is 1 when the action equals the label or is contained in a
            # sequence label, and 0 otherwise.
            is_label = lambda action, label: action == label
            in_multilabel = lambda action, label: isinstance(
                label, collections.abc.Sequence) and action in label
            reward = lambda action, label: int(
                is_label(action, label) or in_multilabel(action, label))

        contexts = features
        # De-duplicate and sort the actions, then shuffle them with a fixed seed of 1.
        actions = CobaRandom(1).shuffle(sorted(set(actions)))
        rewards = [[reward(action, label) for action in actions]
                   for label in labels]

        # Every interaction receives the same action list.
        for c, a, r in zip(contexts, repeat(actions), rewards):
            yield SimulatedInteraction(c, a, rewards=r)

Exemple #9

0

Afficher le fichier

class Warm(EnvironmentFilter):
    """Turn a SimulatedEnvironment into a WarmStartEnvironment."""

    def __init__(self, n_warm: int, seed: int = 1):
        """Create a Warm filter.

        Args:
            n_warm: The number of interactions that should be turned into LoggedInteractions.
            seed: The random number seed that determines the random logging policy for
                LoggedInteractions.
        """
        self._n_warm = n_warm
        self._seed = seed

    @property
    def params(self) -> Dict[str, Any]:
        """The parameters describing this filter (only `n_warm` is reported)."""
        return {"n_warm": self._n_warm}

    def filter(
            self, interactions: Iterable[SimulatedInteraction]
    ) -> Iterable[Interaction]:
        """Lazily convert the first `n_warm` interactions and pass the rest through untouched.

        Args:
            interactions: The simulated interactions to split into a warm and a regular part.

        Returns:
            A lazy chain of logged interactions followed by the remaining interactions.
        """

        # Re-seeded on every call; `_to_logged_interaction` draws from this generator.
        self._rng = CobaRandom(self._seed)

        # A single shared iterator lets the tail resume where the warm slice stops.
        remaining = iter(interactions)
        warm_slice = islice(remaining, self._n_warm)

        return chain(map(self._to_logged_interaction, warm_slice), remaining)

    def _to_logged_interaction(
            self, interaction: SimulatedInteraction) -> LoggedInteraction:
        """Log one simulated interaction under a uniformly random action choice.

        Args:
            interaction: The simulated interaction to convert.

        Returns:
            A LoggedInteraction for the chosen action carrying its probability, the full
            action list and, when present on the input, the chosen action's reveal and reward.
        """
        n_choices = len(interaction.actions)
        uniform = [1 / n_choices] * n_choices

        chosen = self._rng.choice(list(range(n_choices)), uniform)

        logged_kwargs = {
            "probability": uniform[chosen],
            "actions": interaction.actions
        }

        # Only the feedback belonging to the chosen action is kept.
        if "reveals" in interaction.kwargs:
            logged_kwargs["reveal"] = interaction.kwargs["reveals"][chosen]

        if "rewards" in interaction.kwargs:
            logged_kwargs["reward"] = interaction.kwargs["rewards"][chosen]

        return LoggedInteraction(interaction.context,
                                 interaction.actions[chosen], **logged_kwargs)

Exemple #10

0

Afficher le fichier

Fichier : test_learners_corral.py Projet : anrath/coba

    def test_rejection_learn(self):
        """In rejection mode each base learner should learn in proportion to its own policy.

        Over 1000 rounds, the share of learn calls a base learner receives for each action
        must be within .02 of the probability that base learner assigns to the action.
        """

        actions = [0, 1]
        base1_probs = [1 / 2, 1 / 2]
        base2_probs = [1 / 4, 3 / 4]

        base1 = ReceivedLearnFixedLearner(base1_probs, 'a')
        base2 = ReceivedLearnFixedLearner(base2_probs, 'b')
        learner = CorralLearner([base1, base2], eta=0.5, mode="rejection")
        predict, info = learner.predict(None, actions)

        action = actions[0]
        probability = predict[0]
        reward = 1

        # Per-action counts of how often each base learner was asked to learn.
        base1_learn_cnt = [0, 0]
        base2_learn_cnt = [0, 0]

        random = CobaRandom(1)

        for _ in range(1000):

            action = random.choice(actions, predict)
            probability = predict[actions.index(action)]

            learner.learn(None, action, reward, probability, info)

            if base1.received_learn is not None:
                base1_learn_cnt[action] += 1
            if base2.received_learn is not None:
                base2_learn_cnt[action] += 1

            # Reset the markers so the next round starts clean.
            base1.received_learn = None
            base2.received_learn = None

        checks = [(base1_learn_cnt, base1_probs), (base2_learn_cnt, base2_probs)]

        for counts, expected in checks:
            for arm, target in enumerate(expected):
                share = counts[arm] / sum(counts)
                self.assertLessEqual(abs(share - target), .02)

Exemple #11

0

Afficher le fichier

Fichier : synthetics.py Projet : anrath/coba

    def read(self) -> Iterable[SimulatedInteraction]:
        """Generate interactions from the context, actions and reward callables.

        When `self._make_rng` is truthy a CobaRandom seeded with `self._seed` is created
        and passed as the last argument to each callable; otherwise the callables are
        invoked without it.

        Yields:
            `self._n_interactions` SimulatedInteractions, one per index starting at 0.
        """
        rng = CobaRandom(self._seed) if self._make_rng else None

        def make_context(i):
            if rng:
                return self._context(i, rng)
            return self._context(i)

        def make_actions(i, c):
            if rng:
                return self._actions(i, c, rng)
            return self._actions(i, c)

        def make_reward(i, c, a):
            if rng:
                return self._reward(i, c, a, rng)
            return self._reward(i, c, a)

        for index in islice(count(), self._n_interactions):
            context = make_context(index)
            actions = make_actions(index, context)

            rewards = []
            for action in actions:
                rewards.append(make_reward(index, context, action))

            yield SimulatedInteraction(context, actions, rewards=rewards)

Exemple #12

0

Afficher le fichier

    def test_regression_learning(self):
        """A VW regressor should fit a noiseless linear target after seeing training data.

        Rows of random features are labelled with a fixed random linear combination. The
        mean squared error on the last 10% of the examples is measured before and after
        learning on the first 90%. It must be clearly non-zero before and roughly zero after.
        """
        vw = VowpalMediator().init_learner("--quiet", 1)

        n_features = 10
        n_examples = 1000
        n_train = int(.9 * n_examples)

        rng = CobaRandom(1)

        weights = rng.randoms(n_features)
        rows = [rng.randoms(n_features) for _ in range(n_examples)]
        labels = [sum([w * r for w, r in zip(weights, row)]) for row in rows]

        examples = list(zip(rows, labels))

        def holdout_mse() -> float:
            """Return the learner's current mean squared error on the held-out examples."""
            pred_errs = [
                vw.predict(vw.make_example({"x": row}, None)) - label
                for row, label in examples[n_train:]
            ]
            # True division: floor division would truncate the error before it is checked.
            return sum([e**2 for e in pred_errs]) / len(pred_errs)

        # Before any learning the prediction is expected to be exactly 0.
        self.assertEqual(0, vw.predict(vw.make_example({'x': rows[0]}, None)))

        pre_learn_mse = holdout_mse()

        for row, label in examples[0:n_train]:
            vw.learn(vw.make_example({"x": row}, str(label)))

        post_learn_mse = holdout_mse()

        self.assertNotAlmostEqual(0, pre_learn_mse, places=2)
        self.assertAlmostEqual(0, post_learn_mse, places=2)

Exemple #13

0

Afficher le fichier

class BenchmarkLearner:
    """Wrap a learner so it can be driven uniformly.

    The optional members `family`, `params` and `init` are looked up on the wrapped
    learner and replaced with fallbacks when the learner does not provide them.
    """

    @property
    def family(self) -> str:
        """The wrapped learner's `family`, or its class name when it has none."""
        return getattr(self._learner, "family", self._learner.__class__.__name__)

    @property
    def params(self) -> Dict[str, Any]:
        """The wrapped learner's `params`, or an empty dict when it has none."""
        return getattr(self._learner, "params", {})

    @property
    def full_name(self) -> str:
        """The family, followed by `(k=v,...)` for the params when there are any."""
        if len(self.params) == 0:
            return self.family

        rendered = ','.join(f'{k}={v}' for k,v in self.params.items())
        return f"{self.family}({rendered})"

    def __init__(self, learner: Learner[Context,Action], seed: Optional[int]) -> None:
        """Create a BenchmarkLearner.

        Args:
            learner: The learner to wrap.
            seed: The seed for the random generator used to pick actions.
        """
        self._learner = learner
        self._random  = CobaRandom(seed)

    def init(self) -> None:
        """Call the wrapped learner's `init`, ignoring any AttributeError it produces."""
        try:
            self._learner.init()
        except AttributeError:
            pass

    def choose(self, key: Key, context: Context, actions: Sequence[Action]) -> Tuple[Choice, float]:
        """Sample an action index from the wrapped learner's predicted probabilities.

        Args:
            key: The key identifying the interaction.
            context: The context to choose in.
            actions: The actions to choose among.

        Returns:
            The index of the sampled action and the probability the learner gave it.
        """
        probabilities = self._learner.predict(key, context, actions)
        indexes = list(range(len(actions)))
        chosen = self._random.choice(indexes, probabilities)

        return chosen, probabilities[chosen]

    def learn(self, key: Key, context: Context, action: Action, reward: Reward, probability: float) -> None:
        """Forward the observed outcome to the wrapped learner unchanged."""
        self._learner.learn(key, context, action, reward, probability)

Exemple #14

0

Afficher le fichier

Fichier : synthetics.py Projet : anrath/coba

    def __init__(self,
                 n_interactions: int,
                 n_actions: int = 10,
                 n_context_features: int = 10,
                 n_action_features: int = 10,
                 n_exemplars: int = 10,
                 kernel: Literal['linear', 'polynomial',
                                 'exponential'] = 'exponential',
                 degree: int = 2,
                 gamma: float = 1,
                 seed: int = 1) -> None:
        """Instantiate a KernelSyntheticSimulation.

        Rewards are a bias plus a weighted sum of kernel evaluations between a set of
        randomly generated exemplars and the (context, action) features. When there are no
        exemplars the reward is the bias plus a per-action weight.

        Args:
            n_interactions: The number of interactions the simulation should have.
            n_actions: The number of actions each interaction should have.
            n_context_features: The number of features each context should have.
            n_action_features: The number of features each action should have.
            n_exemplars: The number of exemplar action, context pairs.
            kernel: The family of the kernel basis functions.
            degree: This argument is only relevant when using polynomial kernels.
            gamma: This argument is only relevant when using exponential kernels. 
            seed: The random number seed used to generate all features, weights and noise in the simulation.
        """

        # The constructor arguments exactly as they were given.
        self._args = (n_interactions, n_actions, n_context_features,
                      n_action_features, n_exemplars, kernel, degree, gamma,
                      seed)

        self._n_actions = n_actions
        self._n_context_features = n_context_features
        self._n_action_features = n_action_features
        self._n_exemplars = n_exemplars
        self._seed = seed
        self._kernel = kernel
        self._degree = degree
        self._gamma = gamma

        # One generator is shared by everything below, so the order of draws matters.
        rng = CobaRandom(seed)

        # If there are no features then we are unable to define exemplars. Only the local
        # name is changed here; self._n_exemplars keeps the value that was passed in.
        if n_action_features + n_context_features == 0: n_exemplars = 0

        # Draws a tuple of n values from rng.gausses (presumably gaussian with the given
        # mean and spread -- TODO confirm the meaning of the 0 and .75 arguments).
        feat_gen = lambda n: tuple(rng.gausses(n, 0, .75))
        one_hot_acts = OneHotEncoder().fit_encodes(range(n_actions))

        # With action features there is a single shared exemplar set; without them every
        # action gets its own exemplar set.
        self._exemplars = [[
            feat_gen(n_action_features + n_context_features)
            for _ in range(n_exemplars)
        ] for _ in range(1 if n_action_features else n_actions)]
        # One weight per exemplar, or one weight per action when there are no exemplars.
        weight_count = n_actions if n_exemplars == 0 else n_exemplars
        self._weights = [1 - 2 * w for w in rng.randoms(weight_count)]

        # The bias starts at 0 and is calibrated after the sample rewards are computed below.
        self._bias = 0

        if kernel == 'polynomial':
            # This ensures the dot-product between F and an exemplar is in [0,upper_bound],
            # which keeps higher-order polynomials reasonably well behaved.
            # NOTE(review): this divides by sum(e), which could be zero or negative for
            # the values drawn above -- confirm this is acceptable.
            upper_bound = (1.5)**(1 / degree) - 1
            self._exemplars = [[[upper_bound * ee / sum(e) for ee in e]
                                for e in E] for E in self._exemplars]

        def context(index: int) -> Context:
            # A freshly drawn feature tuple, or None when contexts have no features.
            return feat_gen(n_context_features) if n_context_features else None

        def actions(index: int, context: Context) -> Sequence[Action]:
            # Freshly drawn feature tuples, or the fixed one-hot actions when actions have
            # no features.
            return [feat_gen(n_action_features) for _ in range(n_actions)
                    ] if n_action_features else one_hot_acts

        def reward(index: int, context: Context, action: Action) -> float:

            # Without exemplars the reward is the bias plus the weight of the one-hot action.
            if n_exemplars == 0:
                return self._bias + self._weights[action.index(1)]

            # Handles a None context.
            context = context or []

            if n_action_features:
                # Featurized actions: compare context+action features to the shared exemplars.
                f = list(context) + list(action)
                W = self._weights
                E = self._exemplars[0]
            else:
                # One-hot actions: compare the context to the chosen action's own exemplars.
                f = list(context)
                W = self._weights
                E = self._exemplars[action.index(1)]

            # NOTE(review): K is left unbound when `kernel` is none of the three values below.
            if kernel == "linear":
                K = lambda x1, x2: self._linear_kernel(x1, x2)
            if kernel == "polynomial":
                K = lambda x1, x2: self._polynomial_kernel(
                    x1, x2, self._degree)
            if kernel == "exponential":
                K = lambda x1, x2: self._exponential_kernel(
                    x1, x2, self._gamma)

            return self._bias + sum([w * K(e, f) for w, e in zip(W, E)])

        # Sample the rewards of 100 interactions to calibrate the bias and weights. These
        # calls go through context() and actions(), so they draw from the shared rng too.
        rewards = [
            reward(i, c, a) for i in range(100) for c in [context(i)]
            for a in actions(i, c)
        ]

        m = mean(rewards)
        # Falls back to 1 when all sampled rewards are identical, avoiding division by zero.
        s = (max(rewards) - min(rewards)) or 1

        # Shift and rescale using the sampled mean and range.
        self._bias = 0.5 - m / s
        self._weights = [w / s for w in self._weights]

        super().__init__(n_interactions, context, actions, reward)

Exemple #15

0

Afficher le fichier

    def _process_chunk(self,
                       task_group: Iterable[BenchmarkTask]) -> Iterable[Any]:
        """Evaluate every task in a chunk and yield one transaction per evaluated learner.

        Tasks are grouped by source and then by simulation, so each source is read once
        and each simulation is built once. Every learner is deep-copied, run through all
        the interactions of its simulation with a CobaRandom seeded by its task's seed,
        and its per-interaction results are packed into a single transaction.

        Exceptions raised while handling a source or while evaluating a learner are logged
        through `CobaConfig.Logger.log_exception` and do not stop the remaining work.

        Args:
            task_group: The tasks to process.

        Yields:
            The result of `Transaction.interactions(sim_id, lrn_id, _packed=row_data)` for
            every learner that was evaluated without raising.
        """

        # The group is traversed three times below; materializing it keeps a one-shot
        # iterator from being exhausted by the first traversal.
        task_group = list(task_group)

        source_by_id = {t.src_id: t.simulation.source for t in task_group}
        filter_by_id = {t.sim_id: t.simulation.filter for t in task_group}

        srt_src = lambda t: t.src_id
        grp_src = lambda t: t.src_id
        srt_sim = lambda t: t.sim_id
        grp_sim = lambda t: t.sim_id

        with CobaConfig.Logger.log(f"Processing chunk..."):

            # groupby only merges adjacent items, hence the sort on the same key.
            for src_id, tasks_by_src in groupby(sorted(task_group,
                                                       key=srt_src),
                                                key=grp_src):

                try:

                    with CobaConfig.Logger.time(
                            f"Creating source {src_id} from {source_by_id[src_id]}..."
                    ):
                        #This is not ideal. I'm not sure how it should be improved and leaving this for now.
                        loaded_source = list(source_by_id[src_id].read())

                    for sim_id, tasks_by_src_sim in groupby(sorted(
                            tasks_by_src, key=srt_sim),
                                                            key=grp_sim):

                        tasks_by_src_sim_list = list(tasks_by_src_sim)
                        learner_ids = [t.lrn_id for t in tasks_by_src_sim_list]
                        learners = [t.learner for t in tasks_by_src_sim_list]
                        seeds = [t.seed for t in tasks_by_src_sim_list]

                        # The three lists are parallel and must be reversed together.
                        # Leaving `seeds` in task order paired learners with the seed
                        # of a different task.
                        learner_ids.reverse()
                        learners.reverse()
                        seeds.reverse()

                        with CobaConfig.Logger.time(
                                f"Creating simulation {sim_id} from source {src_id}..."
                        ):
                            interactions = filter_by_id[sim_id].filter(
                                loaded_source)

                        # NOTE(review): this check can only detect an empty simulation when
                        # the filter returns a container rather than a lazy iterator.
                        if not interactions:
                            CobaConfig.Logger.log(
                                f"Simulation {sim_id} has nothing to evaluate (likely due to `take` being larger than the simulation)."
                            )
                            continue

                        # Walk from the back so finished entries can be deleted from the
                        # end of the lists once they are no longer needed.
                        for index in sorted(range(len(learners)),
                                            reverse=True):

                            lrn_id = learner_ids[index]
                            learner = deepcopy(learners[index])
                            random = CobaRandom(seeds[index])

                            try:
                                with CobaConfig.Logger.time(
                                        f"Evaluating learner {lrn_id} on Simulation {sim_id}..."
                                ):

                                    row_data = defaultdict(list)

                                    for i, interaction in enumerate(
                                            interactions):
                                        probs = learner.predict(
                                            i, interaction.context,
                                            interaction.actions)

                                        assert abs(
                                            sum(probs) - 1
                                        ) < .0001, "The learner returned invalid proabilities for action choices."

                                        action = random.choice(
                                            interaction.actions, probs)

                                        # Look the chosen action up once for both lookups.
                                        action_index = interaction.actions.index(
                                            action)
                                        reward = interaction.feedbacks[
                                            action_index]
                                        prob = probs[action_index]

                                        info = learner.learn(
                                            i, interaction.context, action,
                                            reward, prob) or {}

                                        # The reward is recorded next to whatever the
                                        # learner reported from learn.
                                        for key, value in info.items() | {
                                            ('reward', reward)
                                        }:
                                            row_data[key].append(value)

                                    yield Transaction.interactions(
                                        sim_id, lrn_id, _packed=row_data)

                            except Exception as e:
                                CobaConfig.Logger.log_exception(e)

                            finally:
                                # Drop the finished entry from all three parallel lists.
                                del learner_ids[index]
                                del learners[index]
                                del seeds[index]

                except Exception as e:
                    CobaConfig.Logger.log_exception(e)

Exemple #16

0

Afficher le fichier

Fichier : corral.py Projet : anrath/coba

class CorralLearner(Learner):
    """A meta-learner that takes a collection of learners and determines
    which is best in an environment.

    This is an implementation of the Agarwal et al. (2017) Corral algorithm
    and requires that the reward is always in [0,1].

    References:
        Agarwal, Alekh, Haipeng Luo, Behnam Neyshabur, and Robert E. Schapire.
        "Corralling a band of bandit algorithms." In Conference on Learning
        Theory, pp. 12-38. PMLR, 2017.
    """

    def __init__(self,
        learners: Sequence[Learner],
        eta     : float = 0.075,
        T       : float = math.inf,
        mode    : Literal["importance","rejection","off-policy"] ="importance",
        seed    : int = 1) -> None:
        """Instantiate a CorralLearner.

        Args:
            learners: The collection of base learners.
            eta: The learning rate. This controls how quickly Corral picks a best base_learner.
            T: The number of interactions expected during the learning process. A small T will cause
                the learning rate to shrink towards 0 quickly while a large value for T will cause the
                learning rate to shrink towards 0 slowly. A value of inf means that the learning rate
                will remain constant.
            mode: Determines the method with which feedback is provided to the base learners. The
                original paper used importance sampling. We also support `off-policy` and `rejection`.
            seed: A seed for random number generation in order to get repeatable results.

        Raises:
            CobaException: If `mode` is not one of "importance", "off-policy" or "rejection".
        """
        if mode not in ["importance", "off-policy", "rejection"]:
            raise CobaException("The provided `mode` for CorralLearner was unrecognized.")

        self._base_learners = [ SafeLearner(learner) for learner in learners]

        M = len(self._base_learners)

        self._T     = T
        self._gamma = 1/T
        self._beta  = 1/math.exp(1/math.log(T))

        self._eta_init = eta
        self._etas     = [ eta ] * M
        self._rhos     = [ float(2*M) ] * M
        self._ps       = [ 1/M ] * M
        self._p_bars   = [ 1/M ] * M

        self._mode = mode

        # Two independent streams: one for sampling base learner actions and
        # one (seeded from the first seed) for the rejection sampling test.
        self._random_pick   = CobaRandom(seed)
        self._random_reject = CobaRandom(CobaRandom(seed).randint(0,10000))

    @property
    def params(self) -> Dict[str, Any]:
        """The parameters describing this learner (family, eta, mode, T, base learners and seed)."""
        return { "family": "corral", "eta": self._eta_init, "mode":self._mode, "T": self._T, "B": [ str(b) for b in self._base_learners ], "seed":self._random_pick._seed }

    def predict(self, context: Context, actions: Sequence[Action]) -> Tuple[Probs, Info]:
        """Determine a PMF over the given actions by mixing the base learners.

        Args:
            context: The context the prediction is made in.
            actions: The actions to choose from.

        Returns:
            A tuple of the PMF over `actions` and an info tuple that `learn` unpacks
            (base actions, base probabilities, base infos, base predictions, actions, PMF).
        """

        base_predicts = [ base_algorithm.predict(context, actions) for base_algorithm in self._base_learners ]
        base_predicts, base_infos = zip(*base_predicts)

        if self._mode in ["importance"]:
            # every base learner samples its own action and the mixture puts
            # each base learner's p_bar mass on the action that it sampled
            base_actions = [ self._random_pick.choice(actions, predict) for predict in base_predicts              ]
            base_probs   = [ predict[actions.index(action)] for action,predict in zip(base_actions,base_predicts) ]

            predict = [ sum([p_b*int(a==b_a) for p_b,b_a in zip(self._p_bars, base_actions)]) for a in actions ]
            info    = (base_actions, base_probs, base_infos, base_predicts, actions, predict)

        if self._mode in ["off-policy", "rejection"]:
            # the mixture is the p_bar weighted average of the base learner PMFs
            predict = [ sum([p_b*b_p[i] for p_b,b_p in zip(self._p_bars, base_predicts)]) for i in range(len(actions)) ]
            info    = (None, None, base_infos, base_predicts, actions, predict)

        return (predict, info)

    def learn(self, context: Context, action: Action, reward: float, probability:float, info: Info) -> None:
        """Update the base learners and the Corral mixture weights from one observed interaction.

        Args:
            context: The context the action was taken in.
            action: The action that was taken.
            reward: The observed reward for the action. Must be in [0,1].
            probability: The probability with which `action` was taken.
            info: The info tuple returned by `predict` for this interaction.
        """

        assert  0 <= reward and reward <= 1, "This Corral implementation assumes a loss between 0 and 1"

        base_actions = info[0]
        base_probs   = info[1]
        base_infos   = info[2]
        base_preds   = info[3]
        actions      = info[4]
        predict      = info[5]

        if self._mode == "importance":
            # This is what is in the original paper. It has the following characteristics:
            #   > It is able to provide feedback to every base learner on every iteration
            #   > It uses a reward estimator with higher variance and no bias (aka, importance sampling)
            #   > It is "on-policy" with respect to base learner's prediction distributions
            # The reward, R, supplied to the base learners satisfies E[R|context,A] = E[reward|context,A]
            for learner, A, P, base_info in zip(self._base_learners, base_actions, base_probs, base_infos):
                R = reward * int(A==action)/probability
                learner.learn(context, A, R, P, base_info)

        if self._mode == "off-policy":
            # An alternative variation to the paper is provided below. It has the following characteristics:
            #   > It is able to provide feedback to every base learner on every iteration
            #   > It uses a MVUB reward estimator (aka, the unmodified, observed reward)
            #   > It is "off-policy" (i.e., base learners receive action feedback distributed differently from their predicts).
            for learner, base_info in zip(self._base_learners, base_infos):
                learner.learn(context, action, reward, probability, base_info)

        if self._mode == "rejection":
            # An alternative variation to the paper is provided below. It has the following characteristics:
            #   > It doesn't necessarily provide feedback to every base learner on every iteration
            #   > It uses a MVUB reward estimator (aka, the unmodified, observed reward) when it does provide feedback
            #   > It is "on-policy" (i.e., base learners receive action feedback distributed identically to their predicts).
            p = self._random_reject.random() #can I reuse this across all learners like this??? I think so???
            for learner, base_info, base_predict in zip(self._base_learners, base_infos, base_preds):
                f = lambda a: base_predict[actions.index(a)] #the PMF we want
                g = lambda a: predict[actions.index(a)]      #the PMF we have

                M = max([f(A)/g(A) for A in actions if g(A) > 0])
                if p <= f(action)/(M*g(action)):
                    learner.learn(context, action, reward, f(action), base_info)

        # Instant loss is an unbiased estimate of E[loss|learner] for this iteration.
        # Our estimate differs from the original Corral paper because we have access to the
        # action probabilities of the base learners while the Corral paper did not assume
        # access to this information. This information allows for a loss estimator with the same
        # expectation as the original Corral paper's estimator but with a lower variance.

        loss = 1-reward

        picked_index = actions.index(action)
        instant_loss = [ loss * base_pred[picked_index]/probability for base_pred in base_preds ]
        self._ps     = CorralLearner._log_barrier_omd(self._ps, instant_loss, self._etas)
        self._p_bars = [ (1-self._gamma)*p + self._gamma*1/len(self._base_learners) for p in self._ps ]

        # whenever a base learner's smoothed weight drops below 1/rho we raise
        # its rho threshold and rescale its learning rate by beta
        for i, p_bar in enumerate(self._p_bars):
            if 1/p_bar > self._rhos[i]:
                self._rhos[i] = 2/p_bar
                self._etas[i] *= self._beta

        base_predict_data = { f"predict_{i}": base_pred[picked_index] for i, base_pred in enumerate(base_preds)   }
        base_pbar_data    = { f"pbar_{i}"   : p_bar                   for i, p_bar     in enumerate(self._p_bars) }
        predict_data      = { "predict"     : probability, **base_predict_data, **base_pbar_data }

        # predict_data already contains every base predict and pbar entry
        InteractionContext.learner_info.update(predict_data)

    @staticmethod
    def _log_barrier_omd(ps, losses, etas) -> Sequence[float]:
        """Perform the log-barrier online mirror descent update on the mixture weights.

        A value `lmbda` in [min(losses), max(losses)] is searched for such that the
        updated weights 1/((1/p) + eta*(loss-lmbda)) sum to 1 (to 4 decimal places).

        Args:
            ps: The current mixture weight of each base learner.
            losses: The estimated loss of each base learner for this iteration.
            etas: The current learning rate of each base learner.

        Returns:
            The updated mixture weights.

        Raises:
            Exception: If no suitable `lmbda` could be found.
        """

        f  = lambda l: float(sum( [ 1/((1/p) + eta*(loss-l)) for p, eta, loss in zip(ps, etas, losses)]))

        # the values of l at which one of the terms in f divides by zero
        denom_zeros = [ ((-1/p)-(eta*loss))/-eta for p, eta, loss in zip(ps, etas, losses) ]

        min_loss = min(losses)
        max_loss = max(losses)

        precision = 4

        def binary_search(l,r) -> Optional[float]:
            # find_root_of_1 only calls this after confirming that f-1 changes
            # sign between l and r. We still bail out with None when the search
            # can make no further progress so that we never loop forever.
            while True:

                x = (l+r)/2
                y = f(x)

                if round(y,precision) == 1:
                    return x

                if x == l or x == r:
                    # the bracket can no longer be halved in floating point
                    return None

                if y < 1:
                    l = x
                elif y > 1:
                    r = x
                else:
                    # y is NaN so neither bound can be moved
                    return None

        def find_root_of_1():
            # split [min_loss, max_loss] at every point where f is undefined
            # and search the first piece on which f-1 changes sign
            brackets = list(sorted(filter(lambda z: min_loss <= z and z <= max_loss, set(denom_zeros + [min_loss, max_loss]))))

            for l_brack, r_brack in zip(brackets[:-1], brackets[1:]):

                if (f(l_brack+.00001)-1) * (f(r_brack-.00001)-1) >= 0:
                    continue
                else:
                    # we use binary search because newtons
                    # method can overshoot our objective
                    return binary_search(l_brack, r_brack)

        lmbda: Optional[float] = None

        if min_loss == max_loss:
            lmbda = min_loss
        elif min_loss not in denom_zeros and round(f(min_loss),precision) == 1:
            lmbda = min_loss
        elif max_loss not in denom_zeros and round(f(max_loss),precision) == 1:
            lmbda = max_loss
        else:
            lmbda = find_root_of_1()

        if lmbda is None:
            raise Exception(f'Something went wrong in Corral OMD {ps}, {etas}, {losses}')

        new_ps = [ 1/((1/p) + eta*(loss-lmbda)) for p, eta, loss in zip(ps, etas, losses)]

        assert round(sum(new_ps),precision) == 1, "An invalid update was made by the log barrier in Corral"

        return new_ps

Exemple #17

0

Afficher le fichier

    def test_cb_adf_learning(self):
        """Check that a VowpalArgsLearner improves on randomly generated interactions.

        The average sampled reward is expected to be about .33 before any call to
        `learn` and about .78 on held out contexts after learning on 90% of them.
        """
        learner = VowpalArgsLearner()

        n_actions = 3
        n_features = 10
        n_examples = 2000
        n_train = int(.9 * n_examples)

        rng = CobaRandom(11111)

        contexts = [rng.randoms(n_features) for _ in range(n_examples)]

        def make_interaction(context):
            # Draw fresh action features and give reward 1 to the action(s)
            # with the largest dot product against the context.
            actions = [rng.randoms(n_features) for _ in range(n_actions)]
            scores = [
                sum([a * c for a, c in zip(action, context)])
                for action in actions
            ]
            best = max(scores)
            return actions, [int(score == best) for score in scores]

        def sample_reward(context):
            # Sample a reward according to the learner's current prediction.
            actions, rewards = make_interaction(context)
            return rng.choice(rewards, learner.predict(context, actions)[0])

        pre_learn_rewards = [sample_reward(c) for c in contexts[:n_train]]

        for context in contexts[:n_train]:
            actions, rewards = make_interaction(context)

            probs, info = learner.predict(context, actions)
            choice = rng.choice(list(range(n_actions)), probs)

            learner.learn(context, actions[choice], rewards[choice],
                          probs[choice], info)

        post_learn_rewards = [sample_reward(c) for c in contexts[n_train:]]

        average_pre_learn_reward = sum(pre_learn_rewards) / len(
            pre_learn_rewards)
        average_post_learn_reward = sum(post_learn_rewards) / len(
            post_learn_rewards)

        self.assertAlmostEqual(.33, average_pre_learn_reward, places=2)
        self.assertAlmostEqual(.78, average_post_learn_reward, places=2)

Exemple #18

0

Afficher le fichier

Fichier : corral.py Projet : VowpalWabbit/coba

class CorralLearner(Learner):
    """This is an implementation of the Agarwal et al. (2017) Corral algorithm.

    This algorithm assumes that the reward distribution has support in [0,1]
    and implements the remark on pg. 8 to improve learning efficiency when
    multiple bandits select the same action.

    References:
        Agarwal, Alekh, Haipeng Luo, Behnam Neyshabur, and Robert E. Schapire.
        "Corralling a band of bandit algorithms." In Conference on Learning
        Theory, pp. 12-38. PMLR, 2017.
    """

    def __init__(self, base_learners: Sequence[Learner], eta: float, T: float = math.inf, seed: Optional[int] = None) -> None:
        """Instantiate a CorralLearner.

        Args:
            base_learners: The collection of algorithms to use as base learners.
            eta: The learning rate. In our experiments a value between 0.05 and .10 often seemed best.
            T: The number of interactions expected during the learning process. In our experiments
                Corral performance seemed relatively insensitive to this value.
            seed: A seed for random number generation in order to get repeatable results.
        """

        self._base_learners = base_learners

        M = len(self._base_learners)

        self._gamma = 1/T
        self._beta  = 1/math.exp(1/math.log(T))

        self._eta_init = eta
        self._etas     = [ eta ] * M
        self._rhos     = [ float(2*M) ] * M
        self._ps       = [ 1/M ] * M
        self._p_bars   = [ 1/M ] * M

        self._random   = CobaRandom(seed)

        # what each base learner picked in `predict`, kept by key until `learn`
        self._base_action_picks : Dict[Key, Sequence[Action]] = {}
        self._base_action_probs: Dict[Key, Sequence[float]]  = {}

    @property
    def family(self) -> str:
        """The family of the learner.

        See the base class for more information
        """
        return "corral"

    @property
    def params(self) -> Dict[str, Any]:
        """The parameters of the learner.

        See the base class for more information
        """
        return {"eta": self._eta_init, "B": [ b.family for b in self._base_learners ] }

    def predict(self, key: Key, context: Context, actions: Sequence[Action]) -> Sequence[float]:
        """Determine a PMF with which to select the given actions.

        Args:
            key: The key identifying the interaction we are choosing for.
            context: The context we're currently in. See the base class for more information.
            actions: The actions to choose from. See the base class for more information.

        Returns:
            The probability of taking each action. See the base class for more information.
        """

        base_predicts = [ base_algorithm.predict(key, context, actions) for base_algorithm in self._base_learners ]

        base_action_picks = [ self._random.choice(actions, predict) for predict in base_predicts                   ]
        base_action_probs = [ predict[actions.index(action)] for action,predict in zip(base_action_picks,base_predicts) ]

        self._base_action_picks[key] = base_action_picks
        self._base_action_probs[key] = base_action_probs

        # each base learner contributes its p_bar mass to the action that it picked
        return [ sum([p_b*int(a==b_a) for p_b,b_a in zip(self._p_bars, base_action_picks)]) for a in actions ]

    def learn(self, key: Key, context: Context, action: Action, reward: float, probability: float) -> None:
        """Learn from the given interaction.

        Args:
            key: The key identifying the interaction this observed reward came from.
            context: The context we're learning about. See the base class for more information.
            action: The action that was selected in the context. See the base class for more information.
            reward: The reward that was gained from the action. See the base class for more information.
            probability: The probability that the given action was taken.
        """

        loss = 1-reward

        assert  0 <= loss and loss <= 1, "The current Corral implementation assumes a loss between 0 and 1"

        base_action_picks = self._base_action_picks.pop(key)
        base_action_probs = self._base_action_probs.pop(key)

        # importance weighted estimates which are only non-zero for the
        # base learners whose pick matches the action that was taken
        losses  = [ loss/probability   * int(act==action) for act in base_action_picks ]
        rewards = [ reward/probability * int(act==action) for act in base_action_picks ]

        # `picked` is deliberately not named `action` so that the parameter is not shadowed
        for learner, picked, R, P in zip(self._base_learners, base_action_picks, rewards, base_action_probs):
            learner.learn(key, context, picked, R, P) # COBA learners assume a reward

        self._ps     = list(self._log_barrier_omd(losses))
        self._p_bars = [ (1-self._gamma)*p + self._gamma*1/len(self._base_learners) for p in self._ps ]

        for i in range(len(self._base_learners)):
            if 1/self._p_bars[i] > self._rhos[i]:
                self._rhos[i] = 2/self._p_bars[i]
                self._etas[i] *= self._beta

    def _log_barrier_omd(self, losses) -> Sequence[float]:
        """Perform the log-barrier online mirror descent update on the mixture weights.

        A value `lmbda` in [min(losses), max(losses)] is searched for such that the
        updated weights 1/((1/p) + eta*(loss-lmbda)) sum to 1 (to 4 decimal places).

        Args:
            losses: The estimated loss of each base learner for this iteration.

        Returns:
            The updated mixture weights, each of which is at least .00001.

        Raises:
            Exception: If no suitable `lmbda` could be found.
        """

        f  = lambda l: float(sum( [ 1/((1/p) + eta*(loss-l)) for p, eta, loss in zip(self._ps, self._etas, losses)]))
        df = lambda l: float(sum( [ eta/((1/p) + eta*(loss-l))**2 for p, eta, loss in zip(self._ps, self._etas, losses)]))

        # the values of l at which one of the terms in f divides by zero
        denom_zeros = [ ((-1/p)-(eta*loss))/-eta for p, eta, loss in zip(self._ps, self._etas, losses) ]

        min_loss = min(losses)
        max_loss = max(losses)

        precision = 4

        def newtons_zero(l,r) -> Optional[float]:
            """Use Newton's method, safeguarded by bisection, to find where f equals 1 in (l,r)."""

            #depending on scales this check may fail though that seems unlikely
            if (f(l+.0001)-1) * (f(r-.00001)-1) >= 0:
                return None

            lo, hi = l, r
            x = (l+r)/2
            y = f(x)

            while round(y,precision) != 1:

                # keep the root bracketed using the side of 1 that f(x) fell on
                if y < 1:
                    lo = x
                else:
                    hi = x

                slope = df(x)

                if slope == 0:
                    raise Exception(f'Something went wrong in Corral (0) {self._ps}, {self._etas}, {losses}, {x}')

                nxt = x - (y-1)/slope

                # a Newton step that leaves the bracket has overshot
                # so we fall back to bisecting the bracket instead
                if not (lo < nxt < hi):
                    nxt = (lo+hi)/2

                if nxt == lo or nxt == hi:
                    # the bracket can no longer be narrowed in floating point
                    return None

                x = nxt
                y = f(x)

            return x

        lmbda: Optional[float] = None

        if min_loss == max_loss:
            lmbda = min_loss
        elif min_loss not in denom_zeros and round(f(min_loss),precision) == 1:
            lmbda = min_loss
        elif max_loss not in denom_zeros and round(f(max_loss),precision) == 1:
            lmbda = max_loss
        else:
            # split [min_loss, max_loss] at every point where f is undefined
            # and use the first piece on which a root can be found
            brackets = list(sorted(filter(lambda z: min_loss <= z and z <= max_loss, set(denom_zeros + [min_loss, max_loss]))))

            for l_brack, r_brack in zip(brackets[:-1], brackets[1:]):
                lmbda = newtons_zero(l_brack, r_brack)
                if lmbda is not None: break

        if lmbda is None:
            raise Exception(f'Something went wrong in Corral (None) {self._ps}, {self._etas}, {losses}')

        return [ max(1/((1/p) + eta*(loss-lmbda)),.00001) for p, eta, loss in zip(self._ps, self._etas, losses)]

Exemple #19

0

Afficher le fichier

Fichier : synthetics.py Projet : anrath/coba

    def __init__(self,
                 n_interactions: int,
                 n_actions: int = 10,
                 n_context_features: int = 10,
                 n_action_features: int = 10,
                 reward_features: Sequence[str] = ["a", "xa"],
                 seed: int = 1) -> None:
        """Instantiate a LinearSyntheticSimulation.

        Args:
            n_interactions: The number of interactions the simulation should have.
            n_actions: The number of actions each interaction should have.
            n_context_features: The number of features each context should have.
            n_action_features: The number of features each action should have.
            reward_features: The features in the simulation's linear reward function.
            seed: The random number seed used to generate all features, weights and noise in the simulation.
        """

        self._args = (n_interactions, n_actions, n_context_features,
                      n_action_features, reward_features, seed)

        self._n_actions = n_actions
        self._n_context_features = n_context_features
        self._n_action_features = n_action_features
        self._reward_features = reward_features
        self._seed = seed

        def without(features, letter):
            # Remove `letter` from every feature term, dropping the terms that
            # become empty and any duplicates this creates.
            stripped = [f.replace(letter, '') for f in features]
            return list(set(filter(None, stripped)))

        if not self._n_context_features:
            reward_features = without(reward_features, 'x')

        if not self._n_action_features:
            reward_features = without(reward_features, 'a')

        rng = CobaRandom(seed)
        feat_encoder = InteractionsEncoder(reward_features)

        # To try and keep high-order polynomials well behaved the context
        # and action features are centered on 1 (with a random sign) and
        # given a very small variance. The reward is then shifted and
        # re-scaled below so that it is centered in and fills [0,1].
        if reward_features:
            max_degree = max([len(f) for f in reward_features])
        else:
            max_degree = 1

        sigma = 1 / (2 * max_degree)

        def feat_gen(n):
            return tuple(g * rng.choice([1, -1])
                         for g in rng.gausses(n, mu=1, sigma=sigma))

        one_hot_acts = OneHotEncoder().fit_encodes(range(n_actions))

        ones_encoding = feat_encoder.encode(x=[1] * n_context_features,
                                            a=[1] * n_action_features)
        feature_count = len(ones_encoding)

        # without action features each action gets its own weight vector
        weight_parts = 1 if n_action_features else n_actions
        weight_count = feature_count if feature_count != 0 else 1

        self._weights = []
        for _ in range(weight_parts):
            self._weights.append(
                [1 - 2 * w for w in rng.randoms(weight_count)])

        self._bias = 0
        self._clip = False

        def context(index: int) -> Context:
            if n_context_features:
                return feat_gen(n_context_features)
            return None

        def actions(index: int, context: Context) -> Sequence[Action]:
            if n_action_features:
                return [feat_gen(n_action_features) for _ in range(n_actions)]
            return one_hot_acts

        def reward(index: int, context: Context, action: Action) -> float:
            encoded = feat_encoder.encode(x=context, a=action) or [1]
            which = 0 if n_action_features else action.index(1)
            weights = self._weights[which]

            return self._bias + sum([w * f for w, f in zip(weights, encoded)])

        # sample the un-normalized rewards of 100 interactions
        sampled = []
        for i in range(100):
            c = context(i)
            for a in actions(i, c):
                sampled.append(reward(i, c, a))

        center = mean(sampled)
        spread = (max(sampled) - min(sampled)) or 1

        self._bias = 0.5 - center / spread
        self._weights = [[w / spread for w in W] for W in self._weights]
        self._clip = True

        super().__init__(n_interactions, context, actions, reward)

Exemple #20

0

Afficher le fichier

Fichier : synthetics.py Projet : anrath/coba

    def __init__(self,
                 n_interactions: int,
                 n_actions: int = 10,
                 n_context_features: int = 10,
                 n_action_features: int = 10,
                 n_neighborhoods: int = 10,
                 seed: int = 1) -> None:
        """Instantiate a NeighborsSyntheticSimulation.

        Args:
            n_interactions: The number of interactions the simulation should have.
            n_actions: The number of actions each interaction should have.
            n_context_features: The number of features each context should have.
            n_action_features: The number of features each action should have.
            n_neighborhoods: The number of neighborhoods the simulation should have.
            seed: The random number seed used to generate all contexts and action rewards.
        """

        self._args = (n_interactions, n_actions, n_context_features,
                      n_action_features, n_neighborhoods, seed)

        self._n_interactions = n_interactions
        self._n_actions = n_actions
        self._n_context_feats = n_context_features
        self._n_action_feats = n_action_features
        self._n_neighborhoods = n_neighborhoods
        self._seed = seed

        rng = CobaRandom(self._seed)

        def context_gen():
            if n_context_features:
                return tuple(rng.gausses(n_context_features, 0, 1))
            return None

        def actions_gen():
            if n_action_features:
                return [
                    tuple(rng.gausses(n_action_features, 0, 1))
                    for _ in range(n_actions)
                ]
            return OneHotEncoder().fit_encodes(range(n_actions))

        # one context per neighborhood with any duplicates removed
        generated = [context_gen() for _ in range(self._n_neighborhoods)]
        contexts = list(set(generated))

        # every context gets a fixed action set
        context_actions = {}
        for c in contexts:
            context_actions[c] = actions_gen()

        # every (context, action) pair gets a fixed random reward
        context_action_rewards = {}
        for c in contexts:
            for a in context_actions[c]:
                context_action_rewards[(c, a)] = rng.random()

        # the interactions cycle through the contexts
        context_iter = islice(cycle(contexts), n_interactions)

        def context(index: int):
            return next(context_iter)

        def actions(index: int, context: Tuple[float, ...]):
            return context_actions[context]

        def reward(index: int, context: Tuple[float, ...],
                   action: Tuple[int, ...]):
            return context_action_rewards[(context, action)]

        super().__init__(self._n_interactions, context, actions, reward)

Exemple #21

0

Afficher le fichier

 def __init__(self, learner: Learner[Context,Action], seed: Optional[int]) -> None:
     """Store the given learner along with a CobaRandom created from `seed`."""
     self._learner, self._random = learner, CobaRandom(seed)

Exemple #22

0

Afficher le fichier

    def __init__(self,
                 n_interactions: int = 500,
                 n_actions: int = 10,
                 n_features: int = 10,
                 context_features: bool = True,
                 action_features: bool = True,
                 sparse: bool = False,
                 seed: int = 1) -> None:
        """Instantiate the simulation from randomly generated context, action and reward functions.

        Args:
            n_interactions: The number of interactions the simulation should have.
            n_actions: The number of actions used when actions have no features. When
                `action_features` is True a random number of actions is drawn with
                `randint(2, 10)` for every interaction instead.
            n_features: The number of features generated for contexts and/or actions.
            context_features: Whether contexts have features (otherwise the context is None).
            action_features: Whether actions have features (otherwise actions are one-hot encoded).
            sparse: Whether features are given as an (indexes, values) pair of tuples
                rather than as a plain tuple of values.
            seed: The seed for the random number generator used by the simulation.
        """

        self._n_bandits = n_actions
        self._n_features = n_features
        self._context_features = context_features
        self._action_features = action_features
        self._seed = seed

        r = CobaRandom(seed)

        context: Callable[[int], Context]
        actions: Callable[[int, Context], Sequence[Action]]
        rewards: Callable[[int, Context, Action], float]

        def sparsify(x):
            return (tuple(range(len(x))), tuple(x)) if sparse else tuple(x)

        def unsparse(x):
            return x[1] if sparse else x

        def normalize(X):
            total = sum(X)  # computed once rather than once per element
            return [x / total for x in X]

        def one_hot_actions():
            return [
                tuple(int(j == i) for j in range(n_actions))
                for i in range(n_actions)
            ]

        if not context_features and not action_features:

            means = [
                m / n_actions + 1 / (2 * n_actions)
                for m in r.randoms(n_actions)
            ]

            actions_features = one_hot_actions()

            context = lambda i: None

            if sparse:
                # Each action has to be sparsified on its own. Sparsifying the
                # list of actions would give a single (indexes, actions) pair
                # rather than a collection of sparse actions.
                actions = lambda i, c: [sparsify(af) for af in actions_features]
            else:
                actions = lambda i, c: tuple(actions_features)

            rewards = lambda i, c, a: means[unsparse(a).index(1)] + (r.random(
            ) - .5) / n_actions

        if context_features and not action_features:
            #normalizing allows us to make sure our reward is in [0,1]
            bandit_thetas = [r.randoms(n_features) for _ in range(n_actions)]
            bandit_thetas = [normalize(theta) for theta in bandit_thetas]

            actions_features = one_hot_actions()

            context = lambda i: sparsify(r.randoms(n_features))
            actions = lambda i, c: [sparsify(af) for af in actions_features]
            rewards = lambda i, c, a: sum([
                cc * t for cc, t in zip(unsparse(c), bandit_thetas[unsparse(a).
                                                                   index(1)])
            ])

        if not context_features and action_features:

            theta = r.randoms(n_features)

            context = lambda i: None
            actions = lambda i, c: [
                sparsify(normalize(r.randoms(n_features)))
                for _ in range(r.randint(2, 10))
            ]
            rewards = lambda i, c, a: float(
                sum([cc * t for cc, t in zip(theta, unsparse(a))]))

        if context_features and action_features:

            context = lambda i: sparsify(r.randoms(n_features))
            actions = lambda i, c: [
                sparsify(normalize(r.randoms(n_features)))
                for _ in range(r.randint(2, 10))
            ]
            rewards = lambda i, c, a: sum(
                [cc * t for cc, t in zip(unsparse(c), unsparse(a))])

        super().__init__(n_interactions, context, actions, rewards)

Exemple #23

0

Afficher le fichier

Fichier : filters.py Projet : anrath/coba

 def filter(self, items: Iterable[Any]) -> Sequence[Any]:
     """Return the result of shuffling the given items with a CobaRandom seeded by `self._seed`."""
     rng = CobaRandom(self._seed)
     materialized = list(items)
     return rng.shuffle(materialized)

Exemple #24

0

Afficher le fichier

 def filter(self,
            interactions: Iterable[Interaction]) -> Iterable[Interaction]:
     """Return the result of shuffling the given interactions with a CobaRandom seeded by `self._seed`."""
     rng = CobaRandom(self._seed)
     materialized = list(interactions)
     return rng.shuffle(materialized)

Exemple #25

0

Afficher le fichier

Fichier : synthetics.py Projet : anrath/coba

    def __init__(self,
                 n_interactions: int,
                 n_actions: int = 10,
                 n_context_features: int = 10,
                 n_action_features: int = 10,
                 seed: int = 1) -> None:
        """Instantiate an MLPSythenticSimulation.

        Args:
            n_interactions: The number of interactions the simulation should have.
            n_actions: The number of actions each interaction should have.
            n_context_features: The number of features each context should have.
            n_action_features: The number of features each action should have.
            seed: The random number seed used to generate all features, weights and noise in the simulation.
        """

        self._args = (n_interactions, n_actions, n_context_features,
                      n_action_features, seed)

        self._n_actions = n_actions
        self._n_context_features = n_context_features
        self._n_action_features = n_action_features
        self._seed = seed

        rng = CobaRandom(seed)

        input_layer_size = n_context_features + n_action_features
        hidden_layer_size = 50

        self._bias = 0

        if input_layer_size:
            # without action features each action gets its own hidden layer
            n_hidden_layers = 1 if n_action_features else n_actions

            hidden_weights = []
            for _ in range(n_hidden_layers):
                hidden_weights.append([
                    rng.gausses(input_layer_size, 0, 1.5)
                    for _ in range(hidden_layer_size)
                ])

            def hidden_output(inputs, weights):
                weighted_sum = sum([i * w for i, w in zip(inputs, weights)])
                return 1 / (1 + math.exp(-weighted_sum))  #sigmoid activation

            self._output_weights = rng.gausses(hidden_layer_size)
        else:
            self._output_weights = rng.gausses(n_actions)

        def context(index: int) -> Context:
            if n_context_features:
                return tuple(rng.gausses(n_context_features))
            return None

        def actions(index: int, context: Context) -> Sequence[Action]:
            if not n_action_features:
                return OneHotEncoder().fit_encodes(range(n_actions))
            return [rng.gausses(n_action_features) for _ in range(n_actions)]

        def reward(index: int, context: Context, action: Action) -> float:

            #handles None context
            inputs = list(context or [])

            if not (n_action_features or n_context_features):
                return self._bias + self._output_weights[action.index(1)]

            if n_action_features:
                inputs = inputs + list(action)
                layer = hidden_weights[0]
            else:
                layer = hidden_weights[action.index(1)]

            hidden_outputs = [hidden_output(inputs, h) for h in layer]

            return self._bias + sum([
                w * hout
                for w, hout in zip(self._output_weights, hidden_outputs)
            ])

        # sample the un-normalized rewards of 100 interactions
        sampled = []
        for i in range(100):
            c = context(i)
            for a in actions(i, c):
                sampled.append(reward(i, c, a))

        center = mean(sampled)
        spread = (max(sampled) - min(sampled)) or 1

        self._bias = 0.5 - center / spread
        self._output_weights = [w / spread for w in self._output_weights]

        super().__init__(n_interactions, context, actions, reward)