def evaluate(self, input: BanditsEstimatorInput,
              **kwargs) -> Optional[EstimatorResult]:
     if not self._train_model(input.samples,
                              0.8) and not input.has_model_outputs:
         return None
     log_avg = RunningAverage()
     tgt_avg = RunningAverage()
     tgt_vals = []
     logged_vals = []
     gt_avg = RunningAverage()
     for sample in input.samples:
         log_avg.add(sample.log_reward)
         logged_vals.append(sample.log_reward)
         _, tgt_reward = self._calc_dm_reward(input.action_space, sample)
         tgt_avg.add(tgt_reward)
         tgt_vals.append(tgt_reward)
         gt_avg.add(sample.ground_truth_reward)
     (
         tgt_score,
         tgt_score_normalized,
         tgt_std_err,
         tgt_std_err_normalized,
     ) = self._compute_metric_data(torch.tensor(tgt_vals),
                                   torch.tensor(logged_vals),
                                   tgt_avg.average)
     return EstimatorResult(
         log_avg.average,
         tgt_score,
         gt_avg.average,
         tgt_avg.count,
         tgt_score_normalized,
         tgt_std_err,
         tgt_std_err_normalized,
     )
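
The target score here comes from _calc_dm_reward, which (as the later DM example spells out) scores each action with the trained reward model and takes the expectation under the target policy's propensities. A minimal sketch of that expectation, using a hypothetical helper name rather than the library's call:

import torch

# Sketch only: the direct-method reward is the model's predicted reward for
# each action, weighted by the target policy's action probabilities.
def dm_expected_reward(tgt_scores: torch.Tensor, tgt_probs: torch.Tensor) -> float:
    # E_{a ~ pi_tgt}[ r_hat(x, a) ] = <scores, probs>
    return torch.dot(tgt_scores.reshape(-1), tgt_probs.reshape(-1)).item()
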
Example #2
 def evaluate(self, input: SlateEstimatorInput,
              **kwargs) -> EstimatorResults:
     input.validate()
     if input.tgt_model is None:
         logging.error("Target model is none, DM is not available")
         return self.results
     for episode in input.episodes:
         log_avg = RunningAverage()
         tgt_avg = RunningAverage()
         gt_avg = RunningAverage()
         tgt_slot_expects = episode.tgt_slot_expectations(
             episode.context.slots)
         if tgt_slot_expects is None:
             logging.warning(f"Target slot expectations not available")
             continue
         gt_slot_rewards = None
         if episode.gt_item_rewards is not None:
             gt_slot_rewards = tgt_slot_expects.expected_rewards(
                 episode.gt_item_rewards)
         for sample in episode.samples:
             log_avg.add(
                 episode.metric(episode.context.slots, sample.log_rewards))
             tgt_item_rewards = input.tgt_model.item_rewards(
                 episode.context)
             tgt_slot_rewards = tgt_slot_expects.expected_rewards(
                 tgt_item_rewards)
             tgt_avg.add(
                 episode.metric(episode.context.slots, tgt_slot_rewards))
             if gt_slot_rewards is not None:
                 gt_avg.add(
                     episode.metric(episode.context.slots, gt_slot_rewards))
         self._append_estimate(log_avg.average, tgt_avg.average,
                               gt_avg.average)
     return self.results
Example #3
    def evaluate(self, input: RLEstimatorInput, **kwargs) -> EstimatorResults:
        stime = time.process_time()
        dataset = self._collect_data(input)
        logging.info(f"Data loading time: {time.process_time() - stime}")

        zeta_optim = torch.optim.Adam(self.zeta_net.parameters(), lr=self.zeta_lr)
        v_optim = torch.optim.Adam(self.v_net.parameters(), lr=self.value_lr)
        avg_zeta_loss = RunningAverage()
        avg_v_loss = RunningAverage()
        sample_time = time.process_time()
        for sampled in range(self.training_samples):
            sample = self._sample_batch(dataset)

            zeta_loss = -(self._compute_loss(input.gamma, sample, False))
            # Populate zeta gradients and optimize
            zeta_optim.zero_grad()
            zeta_loss.backward()
            zeta_optim.step()

            if self.deterministic_env:
                v_loss = self._compute_loss(input.gamma, sample, True)
            else:
                v_loss = self._compute_loss(*sample)
            # Populate value gradients and optimize
            v_optim.zero_grad()
            v_loss.backward()
            v_optim.step()

            avg_zeta_loss.add(zeta_loss.cpu().item())
            avg_v_loss.add(v_loss.cpu().item())
            if sampled % self.reporting_frequency == 0:
                report_time = time.process_time() - sample_time
                callback_time = None
                if self.loss_callback_fn is not None:
                    # Pyre gets angry if we don't make callback local
                    callback = self.loss_callback_fn
                    assert callback is not None
                    stime = time.process_time()
                    callback(avg_zeta_loss.average, avg_v_loss.average, self)
                    callback_time = abs(time.process_time() - stime)
                logging.info(
                    f"Samples {sampled}, "
                    f"Avg Zeta Loss {avg_zeta_loss.average}, "
                    f"Avg Value Loss {avg_v_loss.average},\n"
                    f"Time per {self.reporting_frequency} samples: {report_time}"
                    + (
                        ""
                        if callback_time is None
                        else f", Time for callback: {callback_time}"
                    )
                )
                avg_zeta_loss = RunningAverage()
                avg_v_loss = RunningAverage()
                sample_time = time.process_time()
        return self._compute_estimates(input)
 def _evaluate(
     self,
     input: BanditsEstimatorInput,
     train_samples: Sequence[LogSample],
     eval_samples: Sequence[LogSample],
     force_train: bool = False,
     **kwargs,
 ) -> Optional[EstimatorResult]:
     logger.info("OPE DR Evaluating")
     self._train_model(train_samples, force_train)
     log_avg = RunningAverage()
     tgt_avg = RunningAverage()
     tgt_vals = []
     gt_avg = RunningAverage()
     for sample in eval_samples:
         log_avg.add(sample.log_reward)
         dm_action_reward, dm_scores, dm_probs = self._calc_dm_reward(
             input.action_space, sample
         )
         dm_reward = torch.dot(dm_scores.reshape(-1), dm_probs.reshape(-1)).item()
         tgt_result = 0.0
         weight = 0.0
         if sample.log_action.value is not None:
             weight = (
                 0.0
                 if sample.log_action_probabilities[sample.log_action]
                 < PROPENSITY_THRESHOLD
                 else sample.tgt_action_probabilities[sample.log_action]
                 / sample.log_action_probabilities[sample.log_action]
             )
             weight = self._weight_clamper(weight)
             assert dm_action_reward is not None
             assert dm_reward is not None
             tgt_result += (
                 sample.log_reward - dm_action_reward
             ) * weight + dm_reward
         else:
             tgt_result = dm_reward
         tgt_avg.add(tgt_result)
         tgt_vals.append(tgt_result)
         gt_avg.add(sample.ground_truth_reward)
     (
         tgt_score_normalized,
         tgt_std_err,
         tgt_std_err_normalized,
     ) = self._compute_metric_data(torch.tensor(tgt_vals), log_avg.average)
     return EstimatorResult(
         log_reward=log_avg.average,
         estimated_reward=tgt_avg.average,
         ground_truth_reward=gt_avg.average,
         estimated_weight=tgt_avg.count,
         estimated_reward_normalized=tgt_score_normalized,
         estimated_reward_std_error=tgt_std_err,
         estimated_reward_normalized_std_error=tgt_std_err_normalized,
     )
    def evaluate(
        self, input: BanditsEstimatorInput, **kwargs
    ) -> Optional[EstimatorResult]:
        if input.has_model_outputs:
            return self._evaluate(
                input, input.samples, input.samples, force_train=True, **kwargs
            )
        log_avg = RunningAverage()
        gt_avg = RunningAverage()
        for sample in input.samples:
            log_avg.add(sample.log_reward)
            gt_avg.add(sample.ground_truth_reward)

        # 2-fold cross "validation" as used by https://arxiv.org/pdf/1612.01205.pdf
        shuffled = list(input.samples)
        np.random.shuffle(shuffled)
        lower_half = shuffled[: len(shuffled) // 2]
        upper_half = shuffled[len(shuffled) // 2 :]
        er_lower = self._evaluate(
            input, lower_half, upper_half, force_train=True, **kwargs
        )
        er_upper = self._evaluate(
            input, upper_half, lower_half, force_train=True, **kwargs
        )
        if er_lower is None or er_upper is None:
            return None
        return EstimatorResult(
            log_reward=log_avg.average,
            estimated_reward=(
                (er_lower.estimated_reward + er_upper.estimated_reward) / 2
            ),
            estimated_reward_normalized=(
                DMEstimator._calc_optional_avg(
                    er_lower.estimated_reward_normalized,
                    er_upper.estimated_reward_normalized,
                )
            ),
            estimated_reward_normalized_std_error=(
                DMEstimator._calc_optional_avg(
                    er_lower.estimated_reward_normalized_std_error,
                    er_upper.estimated_reward_normalized_std_error,
                )
            ),
            estimated_reward_std_error=(
                DMEstimator._calc_optional_avg(
                    er_lower.estimated_reward_std_error,
                    er_upper.estimated_reward_std_error,
                )
            ),
            ground_truth_reward=gt_avg.average,
        )
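
The _calc_optional_avg calls above presumably average a metric across the two folds while propagating missing values; the sketch below is inferred from that usage and is not the library's implementation:

from typing import Optional

# Hypothetical stand-in for DMEstimator._calc_optional_avg: average the two
# fold metrics, or return None if either fold failed to produce one.
def calc_optional_avg(a: Optional[float], b: Optional[float]) -> Optional[float]:
    return None if a is None or b is None else (a + b) / 2
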
Example #6
 def evaluate(self, input: SlateEstimatorInput,
              **kwargs) -> EstimatorResults:
     input.validate()
     for episode in input.episodes:
         log_avg = RunningAverage()
         tgt_avg = RunningAverage()
         acc_weight = 0.0
         gt_avg = RunningAverage()
         log_slot_expects = episode.log_slot_item_expectations(
             episode.context.slots)
         if log_slot_expects is None:
             logging.warning(f"Log slot distribution not available")
             continue
         tgt_slot_expects = episode.tgt_slot_expectations(
             episode.context.slots)
         if tgt_slot_expects is None:
             logging.warning(f"Target slot distribution not available")
             continue
         log_indicator = log_slot_expects.values_tensor(self._device)
         tgt_indicator = tgt_slot_expects.values_tensor(self._device)
         lm = len(episode.context.slots) * len(episode.items)
         gamma = torch.pinverse(
             torch.mm(log_indicator.view((lm, 1)),
                      log_indicator.view((1, lm))))
         gt_slot_rewards = None
         if episode.gt_item_rewards is not None:
             gt_slot_rewards = tgt_slot_expects.expected_rewards(
                 episode.gt_item_rewards)
         for sample in episode.samples:
             log_reward = episode.metric(episode.context.slots,
                                         sample.log_rewards)
             log_avg.add(log_reward)
             ones = sample.log_slate.one_hots(episode.items, self._device)
             weight = self._weight_clamper(
                 torch.mm(tgt_indicator.view((1, lm)),
                          torch.mm(gamma, ones.view(lm, 1))))
             tgt_avg.add(log_reward * weight)
             acc_weight += weight
             if gt_slot_rewards is not None:
                 gt_avg.add(
                     episode.metric(episode.context.slots, gt_slot_rewards))
         if tgt_avg.count == 0:
             continue
         if self._weighted:
             self._append_estimate(log_avg.average,
                                   tgt_avg.total / acc_weight,
                                   gt_avg.average)
         else:
             self._append_estimate(log_avg.average, tgt_avg.average,
                                   gt_avg.average)
     return self.results
 def evaluate(self, input: BanditsEstimatorInput,
              **kwargs) -> Optional[EstimatorResult]:
     logger = Estimator.logger()
     if not self._train_model(input.samples, 0.8, logger):
         return None
     log_avg = RunningAverage()
     tgt_avg = RunningAverage()
     gt_avg = RunningAverage()
     for sample in input.samples:
         log_avg.add(sample.log_reward)
         _, tgt_reward = self._calc_dm_reward(input.action_space, sample)
         tgt_avg.add(tgt_reward)
         gt_avg.add(sample.ground_truth_reward)
     return EstimatorResult(log_avg.average, tgt_avg.average,
                            gt_avg.average, tgt_avg.count)
 def evaluate(self, input: BanditsEstimatorInput,
              **kwargs) -> Optional[EstimatorResult]:
     self._train_model(input.samples, 0.8)
     log_avg = RunningAverage()
     logged_vals = []
     tgt_avg = RunningAverage()
     tgt_vals = []
     gt_avg = RunningAverage()
     for sample in input.samples:
         log_avg.add(sample.log_reward)
         logged_vals.append(sample.log_reward)
         dm_action_reward, dm_reward = self._calc_dm_reward(
             input.action_space, sample)
         tgt_result = 0.0
         weight = 0.0
         if sample.log_action is not None:
             weight = (0.0
                       if sample.log_action_probabilities[sample.log_action]
                       < PROPENSITY_THRESHOLD else
                       sample.tgt_action_probabilities[sample.log_action] /
                       sample.log_action_probabilities[sample.log_action])
             weight = self._weight_clamper(weight)
             assert dm_action_reward is not None
             assert dm_reward is not None
             tgt_result += (sample.log_reward -
                            dm_action_reward) * weight + dm_reward
         else:
             tgt_result = dm_reward
         tgt_avg.add(tgt_result)
         tgt_vals.append(tgt_result)
         gt_avg.add(sample.ground_truth_reward)
     (
         tgt_score,
         tgt_score_normalized,
         tgt_std_err,
         tgt_std_err_normalized,
     ) = self._compute_metric_data(torch.tensor(tgt_vals),
                                   torch.tensor(logged_vals),
                                   tgt_avg.average)
     return EstimatorResult(
         log_avg.average,
         tgt_score,
         gt_avg.average,
         tgt_avg.count,
         tgt_score_normalized,
         tgt_std_err,
         tgt_std_err_normalized,
     )
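
Both doubly-robust variants combine the model's estimate with an importance-weighted correction on the logged action. Pulled out as a stand-alone sketch (hypothetical helper, not the library's code):

# Doubly-robust per-sample estimate: the direct-method reward plus a clamped
# importance-weighted correction based on the observed logged reward.
def dr_estimate(log_reward: float, dm_action_reward: float,
                dm_reward: float, weight: float) -> float:
    return (log_reward - dm_action_reward) * weight + dm_reward
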
Example #9
 def evaluate(self, input: SlateEstimatorInput,
              **kwargs) -> EstimatorResults:
     input.validate()
     for episode in input.episodes:
         log_avg = RunningAverage()
         tgt_avg = RunningAverage()
         acc_weight = 0.0
         gt_avg = RunningAverage()
         gt_slot_rewards = None
         if episode.gt_item_rewards is not None:
             tgt_slot_expects = episode.tgt_slot_expectations(
                 episode.context.slots)
             if tgt_slot_expects is not None:
                 gt_slot_rewards = tgt_slot_expects.expected_rewards(
                     episode.gt_item_rewards)
         for sample in episode.samples:
             log_prob = sample.log_slate_probability
             if log_prob <= 0.0:
                 log_prob = episode.log_slate_probability(sample.log_slate)
             if log_prob <= 0.0:
                 logging.warning(
                     f"Invalid log slate probability: {log_prob}")
                 continue
             tgt_prob = sample.tgt_slate_probability
             if tgt_prob <= 0.0:
                 tgt_prob = episode.tgt_slate_probability(sample.log_slate)
             if tgt_prob <= 0.0:
                 logging.warning(f"Invalid target probability: {tgt_prob}")
                 continue
             weight = self._weight_clamper(tgt_prob / log_prob)
             log_reward = episode.metric(episode.context.slots,
                                         sample.log_rewards)
             log_avg.add(log_reward)
             tgt_avg.add(log_reward * weight)
             acc_weight += weight
             if gt_slot_rewards is not None:
                 gt_avg.add(
                     episode.metric(episode.context.slots, gt_slot_rewards))
         if tgt_avg.count == 0:
             continue
         if self._weighted:
             self._append_estimate(log_avg.average,
                                   tgt_avg.total / acc_weight,
                                   gt_avg.average)
         else:
             self._append_estimate(log_avg.average, tgt_avg.average,
                                   gt_avg.average)
     return self.results
Example #10
    def _mdps_value(self, mdps: Sequence[Mdp], gamma: float) -> float:
        self.zeta_net.eval()
        avg = RunningAverage()

        for mdp in mdps:
            discount = 1.0
            r = 0.0
            for t in mdp:
                assert t.last_state is not None, "Expected last_state, got None"
                assert t.action is not None, "Expected action, got None"
                zeta = self.zeta(
                    torch.tensor(t.last_state.value, dtype=torch.float)
                    .reshape(-1, self.state_dim)
                    .to(self.device),
                    torch.nn.functional.one_hot(
                        torch.tensor(t.action.value, dtype=torch.long), self.action_dim
                    )
                    .reshape(-1, self.action_dim)
                    .float()
                    .to(self.device),
                )
                r += discount * t.reward * zeta.cpu().item()
                discount *= gamma
            avg.add(r)
        self.zeta_net.train()
        return avg.average
 def evaluate(self, input: BanditsEstimatorInput, **kwargs) -> EstimatorResults:
     self.reset()
     for log in input.logs:
         log_reward = RunningAverage()
         tgt_reward = RunningAverage()
         gt_reward = RunningAverage()
         for sample in log.samples:
             log_reward.add(sample.logged_reward)
             rewards = input.target_model(sample.context)
             tgt_reward.add(rewards[sample.target_action])
             rewards = input.ground_truth_model(sample.context)
             gt_reward.add(rewards[sample.target_action])
         self._append_estimate(
             log_reward.average, tgt_reward.average, gt_reward.average
         )
     return self.results
Example #12
 def _process_training_queries(self):
     if (self._query_ids is not None and self._query_terms is not None
             and self._position_relevances is not None):
         return
     logging.info("processing training queries...")
     st = time.process_time()
     self._query_ids = {}
     self._query_terms = {}
     self._position_relevances = [
         RunningAverage() for _ in range(MAX_POSITION)
     ]
     for q in self._queries:
         q.unpack()
         self._query_ids[q.query_id] = q
         for t in q.query_terms:
             if t in self._query_terms:
                 self._query_terms[t].merge(q)
             else:
                 mq = ProcessedQuery(0, (t, ))
                 mq.merge(q)
                 self._query_terms[t] = mq
         for ra, r in zip(self._position_relevances, q.position_relevances):
             ra.add(r)
     for q in self._query_terms.values():
         q.finalize()
     self._position_relevances = [
         v.average for v in self._position_relevances
     ]
     logging.info(f"processing time {time.process_time() - st}")
Example #13
 def item_relevances(self, query_id: int, query_terms: Tuple[int],
                     items: Iterable[Tuple[int, int]]) -> SlateItemValues:
     self._process_training_queries()
     if query_id in self._query_ids:
         q = self._query_ids[query_id]
         rels = q.url_relevances
     else:
         ras = {}
         for t in query_terms:
             if t in self._query_terms:
                 q = self._query_terms[t]
                 for i, r in q.url_relevances:
                     if i in ras:
                         ra = ras[i]
                     else:
                         ra = RunningAverage()
                         ras[i] = ra
                     ra.add(r)
         rels = {i: r.average for i, r in ras.items()}
     item_rels = {}
     for i in items:
         if i in rels:
             item_rels[i] = rels[i]
         else:
             item_rels[i] = 0.0
     return SlateItemValues(item_rels)
Example #14
 def merge(self, other: "TrainingQuery"):
     for i, r in other.url_relevances.items():
         if i not in self._url_relevances:
             self._url_relevances[i] = RunningAverage(r)
         else:
             self._url_relevances[i].add(r)
     for i in range(MAX_SLATE_SIZE):
         self._position_relevances[i].add(other.position_relevances[i])
 def evaluate(
     self, input: BanditsEstimatorInput, **kwargs
 ) -> Optional[EstimatorResult]:
     logger.info("OPE IPS Evaluating")
     log_avg = RunningAverage()
     logged_vals = []
     tgt_avg = RunningAverage()
     tgt_vals = []
     acc_weight = RunningAverage()
     gt_avg = RunningAverage()
     for sample in input.samples:
         log_avg.add(sample.log_reward)
         logged_vals.append(sample.log_reward)
         weight = 0.0
         tgt_result = 0.0
         if sample.log_action.value is not None:
             weight = (
                 0.0
                 if sample.log_action_probabilities[sample.log_action]
                 < PROPENSITY_THRESHOLD
                 else sample.tgt_action_probabilities[sample.log_action]
                 / sample.log_action_probabilities[sample.log_action]
             )
             weight = self._weight_clamper(weight)
             tgt_result = sample.log_reward * weight
         tgt_avg.add(tgt_result)
         tgt_vals.append(tgt_result)
         acc_weight.add(weight)
         gt_avg.add(sample.ground_truth_reward)
     (
         tgt_score_normalized,
         tgt_std_err,
         tgt_std_err_normalized,
     ) = self._compute_metric_data(torch.tensor(tgt_vals), log_avg.average)
     return EstimatorResult(
         log_reward=log_avg.average,
         estimated_reward=tgt_avg.average
         if not self._weighted
          else tgt_avg.total / acc_weight.total,
         ground_truth_reward=gt_avg.average,
         estimated_weight=tgt_avg.count,
         estimated_reward_normalized=tgt_score_normalized,
         estimated_reward_std_error=tgt_std_err,
         estimated_reward_normalized_std_error=tgt_std_err_normalized,
     )
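
The _weighted flag switches between a plain IPS average and a self-normalized estimate that divides by the accumulated importance weight. A minimal sketch of the two aggregations, assuming the weights are already clamped:

from typing import Sequence

# Plain IPS averages w_i * r_i over the number of samples; self-normalized
# ("weighted") IPS divides by the sum of the weights instead. No zero-weight
# guard here; the slate estimators above skip such episodes explicitly.
def ips_estimate(weights: Sequence[float], log_rewards: Sequence[float],
                 weighted: bool = False) -> float:
    total = sum(w * r for w, r in zip(weights, log_rewards))
    denom = sum(weights) if weighted else len(weights)
    return total / denom
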
Example #16
 def __init__(self, query_id: int, query_terms: Tuple[int]):
     self._query_id = query_id
     self._query_terms = query_terms
     self._count = 0
     self._url_relevances: MutableMapping[Tuple[int, int],
                                          RunningAverage] = {}
     self._position_relevances = [
         RunningAverage() for _ in range(MAX_SLATE_SIZE)
     ]
Example #17
 def _log_reward(self, gamma: float, mdps: Sequence[Mdp]) -> float:
     avg = RunningAverage()
     for mdp in mdps:
         discount = 1.0
         r = 0.0
         for t in mdp:
             r += discount * t.reward
             discount *= gamma
         avg.add(r)
     return avg.average
 def evaluate(self, input: BanditsEstimatorInput,
              **kwargs) -> Optional[EstimatorResult]:
     logger = Estimator.logger()
     self._train_model(input.samples, 0.8, logger)
     log_avg = RunningAverage()
     tgt_avg = RunningAverage()
     gt_avg = RunningAverage()
     for sample in input.samples:
         log_avg.add(sample.log_reward)
         weight = (sample.tgt_action_probabilities[sample.log_action] /
                   sample.log_action_probabilities[sample.log_action])
         weight = self._weight_clamper(weight)
         dm_action_reward, dm_reward = self._calc_dm_reward(
             input.action_space, sample)
         tgt_avg.add((sample.log_reward - dm_action_reward) * weight +
                     dm_reward)
         gt_avg.add(sample.ground_truth_reward)
     return EstimatorResult(log_avg.average, tgt_avg.average,
                            gt_avg.average, tgt_avg.count)
Example #19
 def add(self, query: LoggedQuery):
     self._count += 1
     urs = query.url_relevances
     for item_id, r in urs.items():
         if item_id not in self._url_relevances:
             self._url_relevances[item_id] = RunningAverage(r)
         else:
             self._url_relevances[item_id].add(r)
     prs = query.position_relevances
     for i in range(MAX_SLATE_SIZE):
         self._position_relevances[i].add(prs[i])
Example #20
    def _calc_weight_reward_tensors(
        self, input: BanditsEstimatorInput, eval_samples: Sequence[LogSample]
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
               torch.Tensor, torch.Tensor, RunningAverage, RunningAverage, ]:
        n = len(eval_samples)
        ws = torch.ones((n, len(input.action_space)))
        rs = torch.zeros((n, 1))
        r_est = torch.zeros((n, len(input.action_space)))
        actions = torch.zeros((n, len(input.action_space)))
        expected_rmax = torch.zeros((n, len(input.action_space)))
        propensities = torch.zeros((n, len(input.action_space)))

        log_avg = RunningAverage()
        gt_avg = RunningAverage()

        priori_rmax = self._estimate_rmax(
            input) if self._rmax is None else self._rmax
        assert priori_rmax is not None

        for i, sample in enumerate(eval_samples):
            _, dm_scores, dm_probs = self._calc_dm_reward(
                input.action_space, sample)
            for a in input.action_space:
                weight = (0.0 if sample.log_action_probabilities[a] <
                          PROPENSITY_THRESHOLD else
                          sample.tgt_action_probabilities[a] /
                          sample.log_action_probabilities[a])
                ws[i, a] = self._weight_clamper(weight)
                propensities[i, a] = sample.tgt_action_probabilities[a]
                expected_rmax[
                    i, a] = sample.tgt_action_probabilities[a] * priori_rmax
                actions[i, a] = float(a == sample.log_action)

            rs[i, 0] = sample.log_reward
            r_est[i] = dm_scores
            log_avg.add(sample.log_reward)
            gt_avg.add(sample.ground_truth_reward)

        return actions, ws, rs, r_est, propensities, expected_rmax, log_avg, gt_avg
Example #21
 def _estimate_value(self, gamma: float, mdps: Sequence[Mdp],
                     value_function: ValueFunction) -> float:
     avg = RunningAverage()
     for mdp in mdps:
         discount = 1.0
         r = 0.0
         for t in mdp:
             if t.last_state is None:
                 break
             r += discount * value_function(t.last_state)
             discount *= gamma
         avg.add(r)
     return avg.average
 def _evaluate(
     self,
     input: BanditsEstimatorInput,
     train_samples: Sequence[LogSample],
     eval_samples: Sequence[LogSample],
     force_train: bool = False,
     **kwargs,
 ) -> Optional[EstimatorResult]:
     logger.info("OPE DM Evaluating")
     if (
         not self._train_model(train_samples, force_train)
         and not input.has_model_outputs
     ):
         return None
     log_avg = RunningAverage()
     tgt_avg = RunningAverage()
     tgt_vals = []
     gt_avg = RunningAverage()
     for sample in eval_samples:
         log_avg.add(sample.log_reward)
         _, tgt_scores, tgt_probs = self._calc_dm_reward(input.action_space, sample)
         tgt_reward = torch.dot(tgt_scores.reshape(-1), tgt_probs.reshape(-1)).item()
         tgt_avg.add(tgt_reward)
         tgt_vals.append(tgt_reward)
         gt_avg.add(sample.ground_truth_reward)
     (
         tgt_score_normalized,
         tgt_std_err,
         tgt_std_err_normalized,
     ) = self._compute_metric_data(torch.tensor(tgt_vals), log_avg.average)
     return EstimatorResult(
         log_reward=log_avg.average,
         estimated_reward=tgt_avg.average,
         ground_truth_reward=gt_avg.average,
         estimated_weight=tgt_avg.count,
         estimated_reward_normalized=tgt_score_normalized,
         estimated_reward_std_error=tgt_std_err,
         estimated_reward_normalized_std_error=tgt_std_err_normalized,
     )
 def evaluate(self, input: BanditsEstimatorInput, **kwargs) -> EstimatorResults:
     self.reset()
     for log in input.logs:
         log_reward = RunningAverage()
         tgt_reward = RunningAverage()
         gt_reward = RunningAverage()
         for sample in log.samples:
             log_reward.add(sample.logged_reward)
             weight = (
                 sample.target_propensities[sample.logged_action]
                 / sample.logged_propensities[sample.logged_action]
             )
             weight = self._weight_clamper(weight)
             rewards = input.target_model(sample.context)
             r1 = rewards[sample.logged_action]
             r2 = rewards[sample.target_action]
             tgt_reward.add((sample.logged_reward - r1) * weight + r2)
             rewards = input.ground_truth_model(sample.context)
             gt_reward.add(rewards[sample.target_action])
         self._append_estimate(
             log_reward.average, tgt_reward.average, gt_reward.average
         )
     return self.results
 def evaluate(self, input: BanditsEstimatorInput,
              **kwargs) -> Optional[EstimatorResult]:
     log_avg = RunningAverage()
     tgt_avg = RunningAverage()
     acc_weight = RunningAverage()
     gt_avg = RunningAverage()
     for sample in input.samples:
         log_avg.add(sample.log_reward)
         weight = (sample.tgt_action_probabilities[sample.log_action] /
                   sample.log_action_probabilities[sample.log_action])
         weight = self._weight_clamper(weight)
         tgt_avg.add(sample.log_reward * weight)
         acc_weight.add(weight)
         gt_avg.add(sample.ground_truth_reward)
     if self._weighted:
         return EstimatorResult(
             log_avg.average,
             tgt_avg.total / acc_weight.total,
             gt_avg.average,
             acc_weight.average,
         )
     else:
         return EstimatorResult(log_avg.average, tgt_avg.average,
                                gt_avg.average, tgt_avg.count)
Example #25
 def predict_item(self, query_id: int,
                  query_terms: Tuple[int]) -> SlateItemValues:
     self._process_training_queries()
     if query_id in self._query_ids:
         q = self._query_ids[query_id]
         return SlateItemValues(dict(q.url_relevances.items()))
     else:
         rels = {}
         for t in query_terms:
             q = self._query_terms[t]
             for i, r in q.url_relevances:
                 if i in rels:
                     ra = rels[i]
                 else:
                      ra = RunningAverage()
                      rels[i] = ra
                  ra.add(r)
         return SlateItemValues({i: r.average for i, r in rels.items()})
    def _estimate_value(self):
        tgt_generator = PolicyLogGenerator(self._env, self._policy)
        log = {}
        for state in self._env.states:
            mdps = []
            for _ in range(self._num_episodes):
                mdps.append(tgt_generator.generate_log(state))
            log[state] = mdps

        for state, mdps in log.items():
            avg = RunningAverage()
            for mdp in mdps:
                discount = 1.0
                r = 0.0
                for t in mdp:
                    r += discount * t.reward
                    discount *= self._gamma
                avg.add(r)
            self._state_values[state] = avg.average
Example #27
def estimate_value(episodes: int, max_horizon: int, policy: RLPolicy, gamma: float):
    avg = RunningAverage()
    env = gym.make("CartPole-v0")
    for _ in range(episodes):
        init_state = env.reset()
        cur_state = init_state
        r = 0.0
        discount = 1.0
        for _ in range(max_horizon):
            action_dist = policy(State(cur_state))
            action = action_dist.sample()[0].value
            next_state, _, done, _ = env.step(action)
            reward = 1.0
            r += reward * discount
            discount *= gamma
            if done:
                break
            cur_state = next_state
        avg.add(r)
    return avg.average
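
The inner loop above is a plain discounted-return accumulation; on its own it looks like this (illustrative helper, not part of the example):

# Discounted return: sum of gamma^t * r_t, the quantity each rollout accumulates.
def discounted_return(rewards, gamma: float) -> float:
    total, discount = 0.0, 1.0
    for reward in rewards:
        total += discount * reward
        discount *= gamma
    return total
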
Example #28
 def test_running_average(self):
     ra = RunningAverage()
     ra.add(1.0).add(2.0).add(3.0).add(4.0)
     self.assertEqual(ra.count, 4)
     self.assertEqual(ra.average, 2.5)
     self.assertEqual(ra.total, 10.0)
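
Taken together, these examples rely on a small RunningAverage surface: an optional initial value, a chainable add(), and count/total/average accessors. A minimal stand-in consistent with that usage (inferred from the snippets here, not the library's implementation):

class RunningAverageSketch:
    """Minimal stand-in matching how RunningAverage is used in these examples."""

    def __init__(self, init=None):
        self.count = 0
        self.total = 0.0
        if init is not None:
            self.add(init)

    def add(self, value):
        self.count += 1
        self.total += value
        return self  # supports chaining: ra.add(1.0).add(2.0)

    @property
    def average(self):
        # Returns 0.0 when empty; the real class may handle this differently.
        return self.total / self.count if self.count > 0 else 0.0
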
Example #29
 def evaluate(self, input: SlateEstimatorInput,
              **kwargs) -> EstimatorResults:
     input.validate()
     for episode in input.episodes:
         log_avg = RunningAverage()
         tgt_avg = RunningAverage()
         acc_weight = 0.0
         gt_avg = RunningAverage()
         log_slot_expects = episode.log_slot_item_expectations(
             episode.context.slots)
         if log_slot_expects is None:
             logging.warning(f"Log slot distribution not available")
             continue
         tgt_slot_expects = episode.tgt_slot_expectations(
             episode.context.slots)
         if tgt_slot_expects is None:
             logging.warning(f"Target slot distribution not available")
             continue
         slate_size = len(episode.context.slots)
         gt_slot_rewards = None
         if episode.gt_item_rewards is not None:
             gt_slot_rewards = tgt_slot_expects.expected_rewards(
                 episode.gt_item_rewards)
         for sample in episode.samples:
             slot_weights = episode.metric.slot_weights(
                 episode.context.slots)
             log_reward = episode.metric.calculate_reward(
                 episode.context.slots, sample.log_rewards, None,
                 slot_weights)
             log_avg.add(log_reward)
             weights = slot_weights.values.to(device=self._device)
             if sample.slot_probabilities is not None:
                 weights *= sample.slot_probabilities.values
             h = torch.zeros(slate_size,
                             dtype=torch.double,
                             device=self._device)
             p = torch.zeros(slate_size,
                             dtype=torch.double,
                             device=self._device)
             i = 0
             for slot, item in sample.log_slate:
                 h[i] = log_slot_expects[slot][item]
                 p[i] = tgt_slot_expects[slot][item]
                 i += 1
             ips = torch.tensordot(h, weights, dims=(
                 [0], [0])) / torch.tensordot(p, weights, dims=([0], [0]))
             ips = self._weight_clamper(ips)
             if ips <= 0.0 or math.isinf(ips) or math.isnan(ips):
                 continue
             tgt_avg.add(log_reward * ips)
             acc_weight += ips
             if gt_slot_rewards is not None:
                 gt_avg.add(
                     episode.metric.calculate_reward(
                         episode.context.slots, gt_slot_rewards))
         if tgt_avg.count == 0:
             continue
         if self._weighted:
             self._append_estimate(log_avg.average,
                                   tgt_avg.total / acc_weight,
                                   gt_avg.average)
         else:
             self._append_estimate(log_avg.average, tgt_avg.average,
                                   gt_avg.average)
     return self.results