class Warm(EnvironmentFilter):
    """Turn a SimulatedEnvironment into a WarmStartEnvironment."""

    def __init__(self, n_warm: int, seed: int = 1):
        """Instantiate a Warm filter.

        Args:
            n_warm: The number of interactions that should be turned into LoggedInteractions.
            seed: The random number seed that determines the random logging policy for LoggedInteractions.
        """
        self._n_warm = n_warm
        self._seed   = seed

    @property
    def params(self) -> Dict[str, Any]:
        return {"n_warm": self._n_warm}

    def filter(self, interactions: Iterable[SimulatedInteraction]) -> Iterable[Interaction]:
        self._rng = CobaRandom(self._seed)

        underlying_iterable    = iter(interactions)
        logged_interactions    = map(self._to_logged_interaction, islice(underlying_iterable, self._n_warm))
        simulated_interactions = underlying_iterable

        return chain(logged_interactions, simulated_interactions)

    def _to_logged_interaction(self, interaction: SimulatedInteraction) -> LoggedInteraction:
        num_actions   = len(interaction.actions)
        probabilities = [1/num_actions] * num_actions

        selected_index       = self._rng.choice(list(range(num_actions)), probabilities)
        selected_action      = interaction.actions[selected_index]
        selected_probability = probabilities[selected_index]

        kwargs = {"probability": selected_probability, "actions": interaction.actions}

        if "reveals" in interaction.kwargs:
            kwargs["reveal"] = interaction.kwargs["reveals"][selected_index]

        if "rewards" in interaction.kwargs:
            kwargs["reward"] = interaction.kwargs["rewards"][selected_index]

        return LoggedInteraction(interaction.context, selected_action, **kwargs)
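# A minimal standalone sketch (not part of coba) of the warm-start split that Warm.filter
# performs: the first `n_warm` items are consumed by islice and re-labeled as "logged"
# under a uniform-random policy, while everything islice did not consume passes through
# untouched. The names and the dict shape below are illustrative only.

from itertools import islice, chain
import random

def warm_start_split(interactions, n_warm, seed=1):
    rng = random.Random(seed)
    it  = iter(interactions)

    def log_uniformly(actions):
        # pretend each interaction is just a list of candidate actions
        prob = 1 / len(actions)
        return {"action": rng.choice(actions), "probability": prob}

    logged    = map(log_uniformly, islice(it, n_warm))
    simulated = it

    return chain(logged, simulated)

# list(warm_start_split([[0, 1], [0, 1], [0, 1]], n_warm=2))
# -> two logged dicts followed by the untouched [0, 1]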
def process(self, learner: Learner, interactions: Iterable[SimulatedInteraction]) -> Iterable[Dict[Any, Any]]:

    random = CobaRandom(self._seed)

    if not isinstance(learner, SafeLearner):
        learner = SafeLearner(learner)

    if not interactions:
        return

    for interaction in interactions:

        InteractionContext.learner_info.clear()

        context = interaction.context
        actions = interaction.actions

        start_time   = time.time()
        probs, info  = learner.predict(context, actions)
        predict_time = time.time() - start_time

        action = random.choice(actions, probs)
        reveal = interaction.kwargs.get("reveals", interaction.kwargs.get("rewards"))[actions.index(action)]
        prob   = probs[actions.index(action)]

        start_time = time.time()
        learner.learn(context, action, reveal, prob, info)
        learn_time = time.time() - start_time

        learner_info     = InteractionContext.learner_info
        interaction_info = {}

        for k, v in interaction.kwargs.items():
            if isinstance(v, collections.abc.Sequence) and not isinstance(v, str):
                interaction_info[k] = v[actions.index(action)]
            else:
                interaction_info[k] = v

        time_info = {"predict_time": predict_time, "learn_time": learn_time} if self._time else {}

        yield {**interaction_info, **learner_info, **time_info}
class BenchmarkLearner:

    @property
    def family(self) -> str:
        try:
            return self._learner.family
        except AttributeError:
            return self._learner.__class__.__name__

    @property
    def params(self) -> Dict[str, Any]:
        try:
            return self._learner.params
        except AttributeError:
            return {}

    @property
    def full_name(self) -> str:
        if len(self.params) > 0:
            return f"{self.family}({','.join(f'{k}={v}' for k,v in self.params.items())})"
        else:
            return self.family

    def __init__(self, learner: Learner[Context, Action], seed: Optional[int]) -> None:
        self._learner = learner
        self._random  = CobaRandom(seed)

    def init(self) -> None:
        try:
            self._learner.init()
        except AttributeError:
            pass

    def choose(self, key: Key, context: Context, actions: Sequence[Action]) -> Tuple[Choice, float]:
        p = self._learner.predict(key, context, actions)
        c = self._random.choice(list(range(len(actions))), p)
        return c, p[c]

    def learn(self, key: Key, context: Context, action: Action, reward: Reward, probability: float) -> None:
        self._learner.learn(key, context, action, reward, probability)
def test_rejection_learn(self):
    actions = [0, 1]
    base1   = ReceivedLearnFixedLearner([1/2, 1/2], 'a')
    base2   = ReceivedLearnFixedLearner([1/4, 3/4], 'b')
    learner = CorralLearner([base1, base2], eta=0.5, mode="rejection")

    predict, info = learner.predict(None, actions)

    action      = actions[0]
    probability = predict[0]
    reward      = 1

    base1_learn_cnt = [0, 0]
    base2_learn_cnt = [0, 0]

    random = CobaRandom(1)

    for _ in range(1000):
        action      = random.choice(actions, predict)
        probability = predict[actions.index(action)]

        learner.learn(None, action, reward, probability, info)

        base1_learn_cnt[action] += int(base1.received_learn is not None)
        base2_learn_cnt[action] += int(base2.received_learn is not None)

        base1.received_learn = None
        base2.received_learn = None

    self.assertLessEqual(abs(base1_learn_cnt[0]/sum(base1_learn_cnt) - 1/2), .02)
    self.assertLessEqual(abs(base1_learn_cnt[1]/sum(base1_learn_cnt) - 1/2), .02)
    self.assertLessEqual(abs(base2_learn_cnt[0]/sum(base2_learn_cnt) - 1/4), .02)
    self.assertLessEqual(abs(base2_learn_cnt[1]/sum(base2_learn_cnt) - 3/4), .02)
class CorralLearner(Learner):
    """This is an implementation of the Agarwal et al. (2017) Corral algorithm.

    This algorithm assumes that the reward distribution has support in [0,1] and
    implements the remark on pg. 8 to improve learning efficiency when multiple
    bandits select the same action.

    References:
        Agarwal, Alekh, Haipeng Luo, Behnam Neyshabur, and Robert E. Schapire. "Corralling a band
        of bandit algorithms." In Conference on Learning Theory, pp. 12-38. PMLR, 2017.
    """

    def __init__(self, base_learners: Sequence[Learner], eta: float, T: float = math.inf, seed: int = None) -> None:
        """Instantiate a CorralLearner.

        Args:
            base_learners: The collection of algorithms to use as base learners.
            eta: The learning rate. In our experiments a value between 0.05 and .10 often seemed best.
            T: The number of interactions expected during the learning process. In our experiments
                Corral performance seemed relatively insensitive to this value.
            seed: A seed for random number generation in order to get repeatable results.
        """
        self._base_learners = base_learners

        M = len(self._base_learners)

        self._gamma = 1/T
        self._beta  = 1/math.exp(1/math.log(T))

        self._eta_init = eta
        self._etas     = [ eta ] * M
        self._rhos     = [ float(2*M) ] * M
        self._ps       = [ 1/M ] * M
        self._p_bars   = [ 1/M ] * M

        self._random = CobaRandom(seed)

        self._base_action_picks: Dict[Key, Sequence[Action]] = {}
        self._base_action_probs: Dict[Key, Sequence[float]]  = {}

    @property
    def family(self) -> str:
        """The family of the learner.

        See the base class for more information.
        """
        return "corral"

    @property
    def params(self) -> Dict[str, Any]:
        """The parameters of the learner.

        See the base class for more information.
        """
        return {"eta": self._eta_init, "B": [ b.family for b in self._base_learners ]}

    def predict(self, key: Key, context: Context, actions: Sequence[Action]) -> Sequence[float]:
        """Determine a PMF with which to select the given actions.

        Args:
            key: The key identifying the interaction we are choosing for.
            context: The context we're currently in. See the base class for more information.
            actions: The actions to choose from. See the base class for more information.

        Returns:
            The probability of taking each action. See the base class for more information.
        """
        base_predicts = [ base_algorithm.predict(key, context, actions) for base_algorithm in self._base_learners ]

        base_action_picks = [ self._random.choice(actions, predict) for predict in base_predicts ]
        base_action_probs = [ predict[actions.index(action)] for action, predict in zip(base_action_picks, base_predicts) ]

        self._base_action_picks[key] = base_action_picks
        self._base_action_probs[key] = base_action_probs

        return [ sum([p_b*int(a==b_a) for p_b, b_a in zip(self._p_bars, base_action_picks)]) for a in actions ]

    def learn(self, key: Key, context: Context, action: Action, reward: float, probability: float) -> None:
        """Learn from the given interaction.

        Args:
            key: The key identifying the interaction this observed reward came from.
            context: The context we're learning about. See the base class for more information.
            action: The action that was selected in the context. See the base class for more information.
            reward: The reward that was gained from the action. See the base class for more information.
            probability: The probability that the given action was taken.
        """
        loss = 1 - reward

        assert 0 <= loss and loss <= 1, "The current Corral implementation assumes a loss between 0 and 1"

        base_action_picks = self._base_action_picks.pop(key)
        base_action_probs = self._base_action_probs.pop(key)

        losses  = [ loss  /probability * int(act==action) for act in base_action_picks ]
        rewards = [ reward/probability * int(act==action) for act in base_action_picks ]

        for learner, action, R, P in zip(self._base_learners, base_action_picks, rewards, base_action_probs):
            learner.learn(key, context, action, R, P)  # COBA learners assume a reward

        self._ps     = list(self._log_barrier_omd(losses))
        self._p_bars = [ (1-self._gamma)*p + self._gamma*1/len(self._base_learners) for p in self._ps ]

        for i in range(len(self._base_learners)):
            if 1/self._p_bars[i] > self._rhos[i]:
                self._rhos[i] = 2/self._p_bars[i]
                self._etas[i] *= self._beta

    def _log_barrier_omd(self, losses) -> Sequence[float]:

        f  = lambda l: float(sum([ 1/((1/p) + eta*(loss-l))      for p, eta, loss in zip(self._ps, self._etas, losses)]))
        df = lambda l: float(sum([ eta/((1/p) + eta*(loss-l))**2 for p, eta, loss in zip(self._ps, self._etas, losses)]))

        denom_zeros = [ ((-1/p)-(eta*loss))/-eta for p, eta, loss in zip(self._ps, self._etas, losses) ]

        min_loss = min(losses)
        max_loss = max(losses)

        precision = 4

        def newtons_zero(l, r) -> Optional[float]:
            """Use Newton's method to calculate the root."""

            # depending on scales this check may fail though that seems unlikely
            if (f(l+.0001)-1) * (f(r-.00001)-1) >= 0:
                return None

            i = 0
            x = (l+r)/2

            while True:
                i += 1

                if df(x) == 0:
                    raise Exception(f'Something went wrong in Corral (0) {self._ps}, {self._etas}, {losses}, {x}')

                x -= (f(x)-1)/df(x)

                if round(f(x), precision) == 1:
                    return x

                if (i % 30000) == 0:
                    print(i)

        lmbda: Optional[float] = None

        if min_loss == max_loss:
            lmbda = min_loss
        elif min_loss not in denom_zeros and round(f(min_loss), precision) == 1:
            lmbda = min_loss
        elif max_loss not in denom_zeros and round(f(max_loss), precision) == 1:
            lmbda = max_loss
        else:
            brackets = list(sorted(filter(lambda z: min_loss <= z and z <= max_loss, set(denom_zeros + [min_loss, max_loss]))))

            for l_brack, r_brack in zip(brackets[:-1], brackets[1:]):
                lmbda = newtons_zero(l_brack, r_brack)
                if lmbda is not None:
                    break

        if lmbda is None:
            raise Exception(f'Something went wrong in Corral (None) {self._ps}, {self._etas}, {losses}')

        return [ max(1/((1/p) + eta*(loss-lmbda)), .00001) for p, eta, loss in zip(self._ps, self._etas, losses)]
def __init__(self,
    n_interactions: int,
    n_actions: int = 10,
    n_context_features: int = 10,
    n_action_features: int = 10,
    reward_features: Sequence[str] = ["a", "xa"],
    seed: int = 1) -> None:
    """Instantiate a LinearSyntheticSimulation.

    Args:
        n_interactions: The number of interactions the simulation should have.
        n_actions: The number of actions each interaction should have.
        n_context_features: The number of features each context should have.
        n_action_features: The number of features each action should have.
        reward_features: The features in the simulation's linear reward function.
        seed: The random number seed used to generate all features, weights and noise in the simulation.
    """
    self._args = (n_interactions, n_actions, n_context_features, n_action_features, reward_features, seed)

    self._n_actions          = n_actions
    self._n_context_features = n_context_features
    self._n_action_features  = n_action_features
    self._reward_features    = reward_features
    self._seed               = seed

    if not self._n_context_features:
        reward_features = list(set(filter(None, [f.replace('x', '') for f in reward_features])))

    if not self._n_action_features:
        reward_features = list(set(filter(None, [f.replace('a', '') for f in reward_features])))

    rng          = CobaRandom(seed)
    feat_encoder = InteractionsEncoder(reward_features)

    # To try and make sure high-order polynomials are well behaved we center our context
    # and action features on 1 and give them a very small amount of variance. Then, in
    # post processing, we shift and re-scale our reward to center and fill in [0,1].
    max_degree = max([len(f) for f in reward_features]) if reward_features else 1
    feat_gen   = lambda n: tuple([ g*rng.choice([1, -1]) for g in rng.gausses(n, mu=1, sigma=1/(2*max_degree)) ])

    one_hot_acts = OneHotEncoder().fit_encodes(range(n_actions))

    feature_count = len(feat_encoder.encode(x=[1]*n_context_features, a=[1]*n_action_features))
    weight_parts  = 1 if n_action_features else n_actions
    weight_count  = 1 if feature_count == 0 else feature_count

    self._weights = [ [ 1-2*w for w in rng.randoms(weight_count) ] for _ in range(weight_parts) ]

    self._bias = 0
    self._clip = False

    def context(index: int) -> Context:
        return feat_gen(n_context_features) if n_context_features else None

    def actions(index: int, context: Context) -> Sequence[Action]:
        return [ feat_gen(n_action_features) for _ in range(n_actions) ] if n_action_features else one_hot_acts

    def reward(index: int, context: Context, action: Action) -> float:
        F = feat_encoder.encode(x=context, a=action) or [1]
        W = self._weights[0 if n_action_features else action.index(1)]

        return self._bias + sum([ w*f for w, f in zip(W, F) ])

    rewards = [ reward(i, c, a) for i in range(100) for c in [context(i)] for a in actions(i, c) ]

    m = mean(rewards)
    s = (max(rewards) - min(rewards)) or 1

    self._bias    = 0.5 - m/s
    self._weights = [ [ w/s for w in W ] for W in self._weights ]
    self._clip    = True

    super().__init__(n_interactions, context, actions, reward)
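# A standalone arithmetic sketch (not part of the class above) of the post-processing
# rescaling used at the end of __init__: with bias = 0.5 - m/s and the weights divided
# by s, a raw reward r becomes 0.5 + (r - m)/s, which is centered on 0.5 and spans at
# most one unit, so it approximately fills [0,1] before any clipping is applied.

def rescale_demo(raw_rewards):
    m = sum(raw_rewards) / len(raw_rewards)
    s = (max(raw_rewards) - min(raw_rewards)) or 1

    bias = 0.5 - m / s

    return [bias + r / s for r in raw_rewards]

# rescale_demo([2.0, 3.0, 4.0]) -> [0.0, 0.5, 1.0]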
class CorralLearner(Learner):
    """A meta-learner that takes a collection of learners and determines which is best in an environment.

    This is an implementation of the Agarwal et al. (2017) Corral algorithm
    and requires that the reward is always in [0,1].

    References:
        Agarwal, Alekh, Haipeng Luo, Behnam Neyshabur, and Robert E. Schapire. "Corralling a band
        of bandit algorithms." In Conference on Learning Theory, pp. 12-38. PMLR, 2017.
    """

    def __init__(self,
        learners: Sequence[Learner],
        eta     : float = 0.075,
        T       : float = math.inf,
        mode    : Literal["importance","rejection","off-policy"] = "importance",
        seed    : int = 1) -> None:
        """Instantiate a CorralLearner.

        Args:
            learners: The collection of base learners.
            eta: The learning rate. This controls how quickly Corral picks a best base_learner.
            T: The number of interactions expected during the learning process. A small T will
                cause the learning rate to shrink towards 0 quickly while a large value for T
                will cause the learning rate to shrink towards 0 slowly. A value of inf means
                that the learning rate will remain constant.
            mode: Determines the method with which feedback is provided to the base learners.
                The original paper used importance sampling. We also support `off-policy` and
                `rejection`.
            seed: A seed for random number generation in order to get repeatable results.
        """
        if mode not in ["importance", "off-policy", "rejection"]:
            raise CobaException("The provided `mode` for CorralLearner was unrecognized.")

        self._base_learners = [ SafeLearner(learner) for learner in learners ]

        M = len(self._base_learners)

        self._T     = T
        self._gamma = 1/T
        self._beta  = 1/math.exp(1/math.log(T))

        self._eta_init = eta
        self._etas     = [ eta ] * M
        self._rhos     = [ float(2*M) ] * M
        self._ps       = [ 1/M ] * M
        self._p_bars   = [ 1/M ] * M

        self._mode = mode

        self._random_pick   = CobaRandom(seed)
        self._random_reject = CobaRandom(CobaRandom(seed).randint(0, 10000))

    @property
    def params(self) -> Dict[str, Any]:
        return {
            "family": "corral",
            "eta"   : self._eta_init,
            "mode"  : self._mode,
            "T"     : self._T,
            "B"     : [ str(b) for b in self._base_learners ],
            "seed"  : self._random_pick._seed
        }

    def predict(self, context: Context, actions: Sequence[Action]) -> Tuple[Probs, Info]:

        base_predicts = [ base_algorithm.predict(context, actions) for base_algorithm in self._base_learners ]
        base_predicts, base_infos = zip(*base_predicts)

        if self._mode in ["importance"]:
            base_actions = [ self._random_pick.choice(actions, predict) for predict in base_predicts ]
            base_probs   = [ predict[actions.index(action)] for action, predict in zip(base_actions, base_predicts) ]

            predict = [ sum([p_b*int(a==b_a) for p_b, b_a in zip(self._p_bars, base_actions)]) for a in actions ]
            info    = (base_actions, base_probs, base_infos, base_predicts, actions, predict)

        if self._mode in ["off-policy", "rejection"]:
            predict = [ sum([p_b*b_p[i] for p_b, b_p in zip(self._p_bars, base_predicts)]) for i in range(len(actions)) ]
            info    = (None, None, base_infos, base_predicts, actions, predict)

        return (predict, info)

    def learn(self, context: Context, action: Action, reward: float, probability: float, info: Info) -> None:

        assert 0 <= reward and reward <= 1, "This Corral implementation assumes a reward between 0 and 1"

        base_actions = info[0]
        base_probs   = info[1]
        base_infos   = info[2]
        base_preds   = info[3]
        actions      = info[4]
        predict      = info[5]

        if self._mode == "importance":
            # This is what is in the original paper. It has the following characteristics:
            #   > It is able to provide feedback to every base learner on every iteration
            #   > It uses a reward estimator with higher variance and no bias (aka, importance sampling)
            #   > It is "on-policy" with respect to the base learners' prediction distributions
            # The reward, R, supplied to the base learners satisfies E[R|context,A] = E[reward|context,A]
            for learner, A, P, base_info in zip(self._base_learners, base_actions, base_probs, base_infos):
                R = reward * int(A==action)/probability
                learner.learn(context, A, R, P, base_info)

        if self._mode == "off-policy":
            # An alternative variation to the paper is provided below. It has the following characteristics:
            #   > It is able to provide feedback to every base learner on every iteration
            #   > It uses a MVUB reward estimator (aka, the unmodified, observed reward)
            #   > It is "off-policy" (i.e., base learners receive action feedback distributed differently from their predicts)
            for learner, base_info in zip(self._base_learners, base_infos):
                learner.learn(context, action, reward, probability, base_info)

        if self._mode == "rejection":
            # An alternative variation to the paper is provided below. It has the following characteristics:
            #   > It doesn't necessarily provide feedback to every base learner on every iteration
            #   > It uses a MVUB reward estimator (aka, the unmodified, observed reward) when it does provide feedback
            #   > It is "on-policy" (i.e., base learners receive action feedback distributed identically to their predicts)
            p = self._random_reject.random()  # can I reuse this across all learners like this??? I think so???
            for learner, base_info, base_predict in zip(self._base_learners, base_infos, base_preds):
                f = lambda a: base_predict[actions.index(a)]  # the PMF we want
                g = lambda a: predict[actions.index(a)]       # the PMF we have

                M = max([ f(A)/g(A) for A in actions if g(A) > 0 ])

                if p <= f(action)/(M*g(action)):
                    learner.learn(context, action, reward, f(action), base_info)

        # Instant loss is an unbiased estimate of E[loss|learner] for this iteration.
        # Our estimate differs from the original Corral paper because we have access to the
        # action probabilities of the base learners while the Corral paper did not assume
        # access to this information. This information allows for a loss estimator with the
        # same expectation as the original Corral paper's estimator but with a lower variance.

        loss = 1 - reward

        picked_index = actions.index(action)
        instant_loss = [ loss * base_pred[picked_index]/probability for base_pred in base_preds ]

        self._ps     = CorralLearner._log_barrier_omd(self._ps, instant_loss, self._etas)
        self._p_bars = [ (1-self._gamma)*p + self._gamma*1/len(self._base_learners) for p in self._ps ]

        for i in range(len(self._base_learners)):
            if 1/self._p_bars[i] > self._rhos[i]:
                self._rhos[i] = 2/self._p_bars[i]
                self._etas[i] *= self._beta

        base_predict_data = { f"predict_{i}": base_preds[i][picked_index] for i in range(len(self._base_learners)) }
        base_pbar_data    = { f"pbar_{i}"   : self._p_bars[i]             for i in range(len(self._base_learners)) }
        predict_data      = { "predict": probability, **base_predict_data, **base_pbar_data }

        InteractionContext.learner_info.update({**predict_data, **base_predict_data, **base_pbar_data})

    @staticmethod
    def _log_barrier_omd(ps, losses, etas) -> Sequence[float]:

        f  = lambda l: float(sum([ 1/((1/p) + eta*(loss-l))      for p, eta, loss in zip(ps, etas, losses)]))
        df = lambda l: float(sum([ eta/((1/p) + eta*(loss-l))**2 for p, eta, loss in zip(ps, etas, losses)]))

        denom_zeros = [ ((-1/p)-(eta*loss))/-eta for p, eta, loss in zip(ps, etas, losses) ]

        min_loss = min(losses)
        max_loss = max(losses)

        precision = 4

        def binary_search(l, r) -> Optional[float]:
            # in theory the bracketing check in find_root_of_1 guarantees this has a solution
            while True:
                x = (l+r)/2
                y = f(x)

                if round(y, precision) == 1:
                    return x

                if y < 1:
                    l = x

                if y > 1:
                    r = x

        def find_root_of_1():
            brackets = list(sorted(filter(lambda z: min_loss <= z and z <= max_loss, set(denom_zeros + [min_loss, max_loss]))))

            for l_brack, r_brack in zip(brackets[:-1], brackets[1:]):
                if (f(l_brack+.00001)-1) * (f(r_brack-.00001)-1) >= 0:
                    continue
                else:
                    # we use binary search because Newton's method can overshoot our objective
                    return binary_search(l_brack, r_brack)

        lmbda: Optional[float] = None

        if min_loss == max_loss:
            lmbda = min_loss
        elif min_loss not in denom_zeros and round(f(min_loss), precision) == 1:
            lmbda = min_loss
        elif max_loss not in denom_zeros and round(f(max_loss), precision) == 1:
            lmbda = max_loss
        else:
            lmbda = find_root_of_1()

        if lmbda is None:
            raise Exception(f'Something went wrong in Corral OMD {ps}, {etas}, {losses}')

        new_ps = [ 1/((1/p) + eta*(loss-lmbda)) for p, eta, loss in zip(ps, etas, losses)]

        assert round(sum(new_ps), precision) == 1, "An invalid update was made by the log barrier in Corral"

        return new_ps
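# A standalone sketch (not part of CorralLearner) of the rejection-sampling rule used in
# mode="rejection": actions are drawn from the meta PMF g, and a base learner with PMF f
# only receives feedback when u <= f(a)/(M*g(a)) with M = max_a f(a)/g(a). The actions
# that survive the filter are then distributed (approximately) like f, which is what the
# test_rejection_learn test above checks. The PMFs below are hypothetical.

import random
from collections import Counter

def rejection_demo(n=100_000, seed=1):
    rng = random.Random(seed)

    actions = [0, 1]
    f = [0.25, 0.75]   # the base learner's PMF (what we want)
    g = [0.50, 0.50]   # the meta-learner's PMF (what we have)

    M = max(fa/ga for fa, ga in zip(f, g) if ga > 0)

    accepted = Counter()
    for _ in range(n):
        a = rng.choices(actions, weights=g)[0]
        if rng.random() <= f[a] / (M * g[a]):
            accepted[a] += 1

    total = sum(accepted.values())
    return {a: accepted[a]/total for a in actions}   # ~= {0: 0.25, 1: 0.75}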
def _process_chunk(self, task_group: Iterable[BenchmarkTask]) -> Iterable[Any]:

    source_by_id = { t.src_id: t.simulation.source for t in task_group }
    filter_by_id = { t.sim_id: t.simulation.filter for t in task_group }

    srt_src = lambda t: t.src_id
    grp_src = lambda t: t.src_id
    srt_sim = lambda t: t.sim_id
    grp_sim = lambda t: t.sim_id

    with CobaConfig.Logger.log(f"Processing chunk..."):

        for src_id, tasks_by_src in groupby(sorted(task_group, key=srt_src), key=grp_src):

            try:
                with CobaConfig.Logger.time(f"Creating source {src_id} from {source_by_id[src_id]}..."):
                    # This is not ideal. I'm not sure how it should be improved so I'm leaving it for now.
                    loaded_source = list(source_by_id[src_id].read())

                for sim_id, tasks_by_src_sim in groupby(sorted(tasks_by_src, key=srt_sim), key=grp_sim):

                    tasks_by_src_sim_list = list(tasks_by_src_sim)

                    learner_ids = [ t.lrn_id  for t in tasks_by_src_sim_list ]
                    learners    = [ t.learner for t in tasks_by_src_sim_list ]
                    seeds       = [ t.seed    for t in tasks_by_src_sim_list ]

                    learner_ids.reverse()
                    learners.reverse()

                    with CobaConfig.Logger.time(f"Creating simulation {sim_id} from source {src_id}..."):
                        interactions = filter_by_id[sim_id].filter(loaded_source)

                    if not interactions:
                        CobaConfig.Logger.log(f"Simulation {sim_id} has nothing to evaluate (likely due to `take` being larger than the simulation).")
                        continue

                    for index in sorted(range(len(learners)), reverse=True):

                        lrn_id  = learner_ids[index]
                        learner = deepcopy(learners[index])
                        random  = CobaRandom(seeds[index])

                        try:
                            with CobaConfig.Logger.time(f"Evaluating learner {lrn_id} on Simulation {sim_id}..."):

                                row_data = defaultdict(list)

                                for i, interaction in enumerate(interactions):

                                    probs = learner.predict(i, interaction.context, interaction.actions)

                                    assert abs(sum(probs) - 1) < .0001, "The learner returned invalid probabilities for action choices."

                                    action = random.choice(interaction.actions, probs)
                                    reward = interaction.feedbacks[interaction.actions.index(action)]
                                    prob   = probs[interaction.actions.index(action)]

                                    info = learner.learn(i, interaction.context, action, reward, prob) or {}

                                    for key, value in info.items() | { ('reward', reward) }:
                                        row_data[key].append(value)

                                yield Transaction.interactions(sim_id, lrn_id, _packed=row_data)

                        except Exception as e:
                            CobaConfig.Logger.log_exception(e)

                        finally:
                            del learner_ids[index]
                            del learners[index]

            except Exception as e:
                CobaConfig.Logger.log_exception(e)
def test_cb_adf_learning(self):
    learner = VowpalArgsLearner()

    n_actions  = 3
    n_features = 10
    n_examples = 2000

    rng = CobaRandom(11111)

    contexts = [ rng.randoms(n_features) for _ in range(n_examples) ]

    pre_learn_rewards = []
    for context in contexts[:int(.9*n_examples)]:
        actions = [ rng.randoms(n_features) for _ in range(n_actions) ]
        rewards = [ sum([a*c for a, c in zip(action, context)]) for action in actions ]
        rewards = [ int(r == max(rewards)) for r in rewards ]

        pre_learn_rewards.append(rng.choice(rewards, learner.predict(context, actions)[0]))

    for context in contexts[:int(.9*n_examples)]:
        actions = [ rng.randoms(n_features) for _ in range(n_actions) ]
        rewards = [ sum([a*c for a, c in zip(action, context)]) for action in actions ]
        rewards = [ int(r == max(rewards)) for r in rewards ]

        probs, info = learner.predict(context, actions)
        choice      = rng.choice(list(range(3)), probs)

        learner.learn(context, actions[choice], rewards[choice], probs[choice], info)

    post_learn_rewards = []
    for context in contexts[int(.9*n_examples):]:
        actions = [ rng.randoms(n_features) for _ in range(n_actions) ]
        rewards = [ sum([a*c for a, c in zip(action, context)]) for action in actions ]
        rewards = [ int(r == max(rewards)) for r in rewards ]

        post_learn_rewards.append(rng.choice(rewards, learner.predict(context, actions)[0]))

    average_pre_learn_reward  = sum(pre_learn_rewards)/len(pre_learn_rewards)
    average_post_learn_reward = sum(post_learn_rewards)/len(post_learn_rewards)

    self.assertAlmostEqual(.33, average_pre_learn_reward, places=2)
    self.assertAlmostEqual(.78, average_post_learn_reward, places=2)