def update_optimal_actions(self):
        """
        :return: dictionary of informations about optimal action
        for each posterior sample of the model parameters
        # keys: actions = sorted tuple of items to propose in the assortment
        # values: (p(action = a*),
        [thetas such that action is optimal for theta]
        """
        posteriors_actions = act_optimally(self.posterior_belief,
                                           self.assortment_size)
        posteriors_actions = [
            tuple(posteriors_actions[ix, :]) for ix in range(self.n_samples)
        ]
        optimal_actions_information = defaultdict(list)
        for ix, action in enumerate(posteriors_actions):
            optimal_actions_information[action].append(ix)

        self.optimal_actions = {
            action: (len(theta_idxs) / self.n_samples, theta_idxs)
            for action, theta_idxs in optimal_actions_information.items()
        }
        self.actions_star = np.array(
            [list(key) for key in optimal_actions_information.keys()])
        self.counts_star = np.array(
            [len(val) for val in optimal_actions_information.values()])
        self.thetas_star = []
        for val in optimal_actions_information.values():
            self.thetas_star += val
        self.thetas_star = np.array(self.thetas_star)
        self.a_star_entropy = sum([
            -p * np.log(p)
            for (action, (p, _)) in self.optimal_actions.items() if p > 0.0
        ])
        self.a_star_entropy = (self.max_entropy * self.a_star_entropy /
                               self.max_s_entropy)
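# Note: act_optimally is not defined on this page.  A minimal sketch consistent
# with how it is called in these examples (top-k items per posterior sample,
# returned as sorted indices so identical assortments map to the same tuple):
import numpy as np

def act_optimally(belief, top_k):
    """Indices of the `top_k` largest entries of `belief`, sorted ascending.

    Accepts a 1-D preference vector or an (n_samples, n_items) matrix,
    matching both call patterns used above.
    """
    if belief.ndim == 1:
        return np.sort(np.argpartition(belief, -top_k)[-top_k:])
    return np.sort(np.argpartition(belief, -top_k, axis=1)[:, -top_k:], axis=1)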
 def act(self):
     posterior_belief = self.sample_from_posterior(n_samples=1)
     action = act_optimally(np.squeeze(posterior_belief),
                            top_k=self.assortment_size)
     self.current_action = action
     if not self.sampling:
         assert 0 in action
     if self.top_item_index is not None:
         assert self.top_item_index in action
     return action
Example No. 3
 def action_selection(self):
     if self.top_item_index is None:
         fallback_taken, n_new = self.optimal_ids_action_parameters()
         action = self.sample_from_params(fallback_taken, n_new)
         return action
     else:
         return act_optimally(
             np.squeeze(self.sample_from_posterior(1)),
             top_k=self.assortment_size,
         )
 def update_r_star(self):
     sorted_beliefs = np.sort(
         self.posterior_belief,
         axis=1)[:, -self.assortment_size:]  # shape (m, k)
     picking_probabilities = sorted_beliefs.sum(1)
     if self.dynamics == "epoch":
         self.r_star = picking_probabilities.mean()
     else:
         self.r_star = (picking_probabilities /
                        (1 + picking_probabilities)).mean()
     a_greedy = act_optimally(self.posterior_belief.mean(0),
                              self.assortment_size)
     greedy_expected_reward = numba_expected_reward(self.posterior_belief,
                                                    a_greedy,
                                                    mode=self.dynamics)
     self.delta_min = self.r_star - greedy_expected_reward
     assert self.delta_min > -1e-12, (
         self.delta_min,
         self.r_star,
         greedy_expected_reward,
     )
     self.delta_min = max(1e-12, self.delta_min)
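# Toy sanity check of the two reward models used in update_r_star (numbers are
# made up for illustration).  The else branch appears to correspond to the MNL
# purchase probability s / (1 + s) with the no-purchase weight normalized to 1,
# while the "epoch" dynamics use the raw sum of preference weights.
import numpy as np

posterior_belief = np.array([[0.8, 0.3, 0.1],   # 2 posterior samples, 3 items
                             [0.5, 0.4, 0.2]])
assortment_size = 2
top = np.sort(posterior_belief, axis=1)[:, -assortment_size:]
s = top.sum(1)                 # sum of the top-k weights per sample
print(s.mean())                # r_star under the "epoch" dynamics
print((s / (1 + s)).mean())    # r_star under the standard MNL dynamics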
 def proposal(self):
     posterior_belief = self.sample_from_posterior(1)
     action = act_optimally(np.squeeze(posterior_belief),
                            top_k=self.assortment_size)
     self.current_action = action
     return action
Example No. 6
    def proposal(self):
        # expected_rewards, stds = params_to_gaussian(self.posterior_parameters)
        # expected_rewards = np.minimum(expected_rewards, 1.0)

        posterior_belief = self.sample_from_posterior(self.n_samples)
        sorted_beliefs = np.sort(posterior_belief, axis=1)
        thresholds = sorted_beliefs[:, -self.assortment_size].reshape(-1, 1)

        best_actions = sorted_beliefs[:, -self.assortment_size:]
        sum_rewards_best = best_actions.sum(1)
        r_star = sum_rewards_best.mean()

        expected_rewards = posterior_belief.mean(0)
        # min_rew = expected_rewards.min() / 1e5
        # expected_rewards += np.random.rand(expected_rewards.shape[0]) * min_rew
        mask = posterior_belief >= thresholds
        p_star = mask.sum(0) / mask.shape[0]
        if_star = (posterior_belief * mask).sum(0) / (mask.sum(0) + 1e-12)
        # else_star = (posterior_belief * (1 - mask)).sum(0) / (
        #     (1 - mask).sum(0) + 1e-12
        # )
        # variances = (
        #     p_star * (if_star - expected_rewards) ** 2
        #     + (1 - p_star) * (else_star - expected_rewards) ** 2
        # )
        variances = p_star * (if_star - expected_rewards)**2
        # posterior_belief = self.sample_from_posterior(self.n_samples)
        # sorted_beliefs = np.sort(posterior_belief, axis=1)
        # thresholds = sorted_beliefs[:, -self.assortment_size].reshape(-1, 1)
        # mask = posterior_belief >= thresholds
        # p_star = mask.sum(0) / mask.shape[0]
        # variances *= p_star
        variances = np.maximum(variances, 1e-12)
        # a_star_t = np.sort(expected_rewards)[-self.assortment_size]
        # a_s = self.posterior_parameters[0]
        # b_s = self.posterior_parameters[1]
        # ps = beta.cdf(1 / (a_star_t + 1), a=a_s, b=b_s)
        # entropies_start = -(
        #     ps * np.log(np.maximum(ps, 1e-12))
        #     + (1 - ps) * np.log(np.maximum(1 - ps, +1e-12))
        # )
        # posterior_samples = 1 / beta.rvs(a=a_s, b=b_s) - 1
        # new_as = np.ones(self.n_items)
        # new_as += a_s
        # new_bs = (geom.rvs(1 / (posterior_samples + 1)) - 1) + b_s
        # new_ps = beta.cdf(1 / (a_star_t + 1), a=new_as, b=new_bs)
        # new_entropies = -(
        #     new_ps * np.log(np.maximum(new_ps, 1e-12))
        #     + (1 - new_ps) * np.log(np.maximum(1 - new_ps, +1e-12))
        # )
        # reductions = np.maximum(entropies_start - new_entropies, 1e-8)

        x = cp.Variable(self.n_items, pos=True)
        # deltas = cp.Parameter(self.n_items, pos=True)
        rewards = cp.Parameter(self.n_items, )
        gains = cp.Parameter(self.n_items, pos=True)
        # exp_regret = r_star - x @ rewards
        deltas = r_star - x @ rewards
        exp_gain = x @ gains
        information_ratio = cp.quad_over_lin(deltas, exp_gain)
        objective = cp.Minimize(information_ratio)
        constraints = [0 <= x, x <= 1, cp.sum(x) == self.assortment_size]
        prob = cp.Problem(
            objective,
            constraints,
        )
        rewards.value = expected_rewards
        gains.value = variances

        try:
            prob.solve(solver="ECOS")
            zeros_index = (x.value < 1e-3)
            ones_index = (x.value > 1 - 1e-3)
            nzeros = zeros_index.sum()
            nones = ones_index.sum()
            nitems = x.value.shape[0]
            logging.debug(
                f"{nitems - nones - nzeros} nstrict, {nones} ones, {nzeros} zeroes, {nitems} total items"
            )
            if (nitems - nones - nzeros) == 2:
                all_items = np.arange(nitems)
                strict_items = all_items[~np.bitwise_or(zeros_index, ones_index)]
                probas = x.value[~np.bitwise_or(zeros_index, ones_index)]
                assert strict_items.shape[0] == 2, strict_items
                assert probas.shape[0] == 2, probas
                # 2 items to randomize the selection over
                logging.debug(
                    f"items: {strict_items}, with probas: {probas}", )
                # Keep the first fractional item with probability probas[0].
                rho = probas[0]
                u = np.random.rand()
                if u <= rho:
                    remaining_item = strict_items[0]
                else:
                    remaining_item = strict_items[1]
                action = np.sort(
                    np.concatenate([
                        act_optimally(x.value, top_k=self.assortment_size - 1),
                        np.array([remaining_item])
                    ]))
            else:
                action = act_optimally(x.value, top_k=self.assortment_size)
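            # Debug block below is effectively disabled: self.c % 5 can never equal 121234.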
            if self.c % 5 == 121234:
                logging.debug(
                    f"a:{action},x:{(100 * x.value).astype(int)},rew:{(100 * expected_rewards).astype(int)},gain:{(100 * np.sqrt(variances)).astype(int)}"
                )
                logging.debug(
                    f"if_optimal: {if_star}, rew:{logar(expected_rewards)}, probas: {logar(p_star)}",
                )
                logging.debug(
                    f"if_optimal: {logar(if_star)}, rew:{logar(expected_rewards)}, probas: {logar(p_star)}",
                )
                logging.debug(
                    f"n{self.posterior_parameters[0]}, v{self.posterior_parameters[1] / self.posterior_parameters[0]},"
                )
                logging.debug(f"obj{prob.value}")
        except (cp.SolverError, TypeError):
            logging.warning("solver error")
            posterior_belief = self.sample_from_posterior(1)
            action = act_optimally(np.squeeze(posterior_belief),
                                   top_k=self.assortment_size)

        self.current_action = action
        self.c += 1
        return action
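# A self-contained toy run of the same convex relaxation used in proposal():
# minimize (r_star - x @ rewards)^2 / (x @ gains) over fractional assortments
# of size k, then keep the k largest coordinates.  All numbers are made up.
import cvxpy as cp
import numpy as np

n_items, k = 5, 2
rewards = np.array([0.30, 0.25, 0.20, 0.10, 0.05])   # posterior mean rewards (toy)
gains = np.array([0.02, 0.05, 0.04, 0.01, 0.01])     # per-item info gains (toy)
r_star = 0.6                                          # estimated optimal reward (toy)

x = cp.Variable(n_items, nonneg=True)
information_ratio = cp.quad_over_lin(r_star - x @ rewards, x @ gains)
prob = cp.Problem(cp.Minimize(information_ratio),
                  [x <= 1, cp.sum(x) == k])
prob.solve(solver="ECOS")
assortment = np.sort(np.argsort(-x.value)[:k])
print(prob.value, np.round(x.value, 3), assortment)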
Example No. 7
 def ts_cs_action(self):
     posterior_belief = self.sample_from_posterior(1)
     return act_optimally(np.squeeze(posterior_belief),
                          top_k=self.assortment_size)
Example No. 8
 def proposal(self):
     self.prior_belief = self.sample_from_posterior(
         self.ids_sampler.n_samples)
     self.ids_sampler.update_belief(self.prior_belief)
     greedy_proposal = (np.sqrt(self.ids_sampler.delta_min) <
                        self.regret_threshold)
     if greedy_proposal:
         assortment = act_optimally(
             np.squeeze(self.prior_belief.mean(0)),
             top_k=self.assortment_size,
         )
         g_approx = info_gain_step(
             action=assortment,
             sampled_preferences=self.prior_belief,
             actions_star=self.ids_sampler.actions_star,
             counts=self.ids_sampler.counts_star,
             thetas=self.ids_sampler.thetas_star,
         )
         g_approx = 1e-12 if g_approx < 1e-12 else g_approx
         d_approx = delta_step(
             action=assortment,
             sampled_preferences=self.prior_belief,
             r_star=self.ids_sampler.r_star,
         )
         rho_policy = 0.5
         ir_assortment = information_ratio(
             rho=rho_policy,
             d1=d_approx,
             d2=d_approx,
             g1=g_approx,
             g2=g_approx,
         )
         self.data_stored["greedy"].append(1)
     elif self.objective == "exact":
         assortment, ir_assortment, rho_policy = ids_exact_action(
             g_=self.ids_sampler.g_,
             d_=self.ids_sampler.d_,
             actions_set=self.all_actions,
             sampled_preferences=self.prior_belief,
             r_star=self.ids_sampler.r_star,
             actions_star=self.ids_sampler.actions_star,
             counts_star=self.ids_sampler.counts_star,
             thetas_star=self.ids_sampler.thetas_star,
         )
     else:
         assert self.objective == "lambda", "Choice of [exact, lambda]."
         if self.scaling == "autoreg":
             lambda_scaler = self.fitted_scaler
         elif self.scaling == "time":
             lambda_scaler = self.ids_sampler.lambda_algo * (
                 self.T - self.current_step)
         else:
             raise ValueError("Scaling: choice of [autoreg, time].")
         # print("check in the epoch setting")
         # print(self.ids_sampler.r_star)
         # print(
         #     self.ids_sampler.d_(
         #         np.arange(self.assortment_size),
         #         self.prior_belief,
         #         self.ids_sampler.r_star,
         #     )
         # )
         assortment, ir_assortment, rho_policy = greedy_ids_action(
             scaling_factor=lambda_scaler,
             g_=self.ids_sampler.g_,
             d_=self.ids_sampler.d_,
             sampled_preferences=self.prior_belief,
             r_star=self.ids_sampler.r_star,
             actions_star=self.ids_sampler.actions_star,
             counts_star=self.ids_sampler.counts_star,
             thetas_star=self.ids_sampler.thetas_star,
         )
         self.data_stored["greedy"].append(0)
     self.current_action = assortment
     self.fitted_scaler = ir_assortment
     self.data_stored["info_ratio"].append(ir_assortment)
     self.data_stored["entropy_a_star"].append(
         self.ids_sampler.a_star_entropy)
     self.data_stored["rho_policy"].append(rho_policy)
     self.data_stored["delta_min_2"].append(self.ids_sampler.delta_min**2)
     return assortment
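# Hypothetical stand-in for the information_ratio helper called above (it is
# not defined on this page).  In the usual IDS formulation, for a policy that
# mixes two candidate actions with weight rho, it is the squared expected
# regret over the expected information gain of the mixture:
def information_ratio(rho, d1, d2, g1, g2):
    delta = rho * d1 + (1 - rho) * d2   # expected regret of the mixture
    gain = rho * g1 + (1 - rho) * g2    # expected information gain
    return delta ** 2 / gain

# With d1 == d2 and g1 == g2, as in the greedy branch above, this reduces to
# d ** 2 / g for any rho.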