def item_relevances(
    self, query_id: int, query_terms: Tuple[int], items: Iterable[Tuple[int, int]]
) -> SlateItemValues:
    self._process_training_queries()
    if query_id in self._query_ids:
        q = self._query_ids[query_id]
        rels = q.url_relevances
    else:
        ras = {}
        for t in query_terms:
            if t in self._query_terms:
                q = self._query_terms[t]
                for i, r in q.url_relevances:
                    if i in ras:
                        ra = ras[i]
                    else:
                        ra = RunningAverage()
                        ras[i] = ra
                    ra.add(r)
        rels = {i: r.average for i, r in ras.items()}
    item_rels = {}
    for i in items:
        if i in rels:
            item_rels[i] = rels[i]
        else:
            item_rels[i] = 0.0
    return SlateItemValues(item_rels)
def _mdps_value(self, mdps: Sequence[Mdp], gamma: float) -> float:
    self.zeta_net.eval()
    avg = RunningAverage()
    for mdp in mdps:
        discount = 1.0
        r = 0.0
        for t in mdp:
            assert t.last_state is not None, "Expected last_state, got None"
            assert t.action is not None, "Expected action, got None"
            zeta = self.zeta(
                torch.tensor(t.last_state.value, dtype=torch.float)
                .reshape(-1, self.state_dim)
                .to(self.device),
                torch.nn.functional.one_hot(
                    torch.tensor(t.action.value, dtype=torch.long), self.action_dim
                )
                .reshape(-1, self.action_dim)
                .float()
                .to(self.device),
            )
            r += discount * t.reward * zeta.cpu().item()
            discount *= gamma
        avg.add(r)
    self.zeta_net.train()
    return avg.average
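# A minimal, self-contained sketch of the correction applied in _mdps_value above:
# each step's reward is scaled by a learned density ratio zeta(s, a) before being
# discounted and summed. `Transition`, `toy_zeta`, and the trajectory below are
# illustrative stand-ins, not the estimator's own types.
from dataclasses import dataclass
from typing import Callable, Sequence


@dataclass
class Transition:
    state: int
    action: int
    reward: float


def zeta_weighted_return(
    trajectory: Sequence[Transition],
    zeta: Callable[[int, int], float],
    gamma: float,
) -> float:
    discount, total = 1.0, 0.0
    for t in trajectory:
        total += discount * t.reward * zeta(t.state, t.action)
        discount *= gamma
    return total


if __name__ == "__main__":
    toy_zeta = lambda s, a: 1.0  # a constant ratio recovers the plain discounted return
    traj = [Transition(0, 1, 1.0), Transition(1, 0, 0.5)]
    print(zeta_weighted_return(traj, toy_zeta, gamma=0.9))  # 1.0 + 0.9 * 0.5 = 1.45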
def _log_reward(self, gamma: float, mdps: Sequence[Mdp]) -> float:
    avg = RunningAverage()
    for mdp in mdps:
        discount = 1.0
        r = 0.0
        for t in mdp:
            r += discount * t.reward
            discount *= gamma
        avg.add(r)
    return avg.average
def evaluate(self, input: RLEstimatorInput, **kwargs) -> EstimatorResults:
    stime = time.process_time()
    dataset = self._collect_data(input)
    logging.info(f"Data loading time: {time.process_time() - stime}")
    zeta_optim = torch.optim.Adam(self.zeta_net.parameters(), lr=self.zeta_lr)
    v_optim = torch.optim.Adam(self.v_net.parameters(), lr=self.value_lr)
    avg_zeta_loss = RunningAverage()
    avg_v_loss = RunningAverage()
    sample_time = time.process_time()
    for sampled in range(self.training_samples):
        sample = self._sample_batch(dataset)

        zeta_loss = -(self._compute_loss(input.gamma, sample, False))
        # Populate zeta gradients and optimize
        zeta_optim.zero_grad()
        zeta_loss.backward()
        zeta_optim.step()

        if self.deterministic_env:
            v_loss = self._compute_loss(input.gamma, sample, True)
        else:
            v_loss = self._compute_loss(*sample)
        # Populate value gradients and optimize
        v_optim.zero_grad()
        v_loss.backward()
        v_optim.step()

        avg_zeta_loss.add(zeta_loss.cpu().item())
        avg_v_loss.add(v_loss.cpu().item())

        if sampled % self.reporting_frequency == 0:
            report_time = time.process_time() - sample_time
            callback_time = None
            if self.loss_callback_fn is not None:
                # Pyre gets angry if we don't make callback local
                callback = self.loss_callback_fn
                assert callback is not None
                stime = time.process_time()
                callback(avg_zeta_loss.average, avg_v_loss.average, self)
                callback_time = abs(time.process_time() - stime)
            logging.info(
                f"Samples {sampled}, "
                f"Avg Zeta Loss {avg_zeta_loss.average}, "
                f"Avg Value Loss {avg_v_loss.average},\n"
                f"Time per {self.reporting_frequency} samples: {report_time}"
                + (
                    ""
                    if callback_time is None
                    else f", Time for callback: {callback_time}"
                )
            )
            avg_zeta_loss = RunningAverage()
            avg_v_loss = RunningAverage()
            sample_time = time.process_time()
    return self._compute_estimates(input)
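# The loop above alternates gradient steps on two networks (zeta_net and v_net),
# each with its own Adam optimizer and its own loss sign. Below is a generic
# sketch of that zero_grad/backward/step pattern on toy linear modules with a
# made-up bilinear loss; none of these names correspond to the estimator's
# internals such as _compute_loss or _sample_batch.
import torch


def alternating_training_sketch(steps: int = 100, lr: float = 1e-2) -> None:
    zeta_net = torch.nn.Linear(4, 1)
    v_net = torch.nn.Linear(4, 1)
    zeta_optim = torch.optim.Adam(zeta_net.parameters(), lr=lr)
    v_optim = torch.optim.Adam(v_net.parameters(), lr=lr)
    for _ in range(steps):
        batch = torch.randn(8, 4)  # stand-in for a sampled mini-batch
        # One player maximizes the shared objective (so we minimize its negation) ...
        zeta_loss = -(zeta_net(batch) * v_net(batch).detach()).mean()
        zeta_optim.zero_grad()
        zeta_loss.backward()
        zeta_optim.step()
        # ... while the other player minimizes it.
        v_loss = (zeta_net(batch).detach() * v_net(batch)).mean()
        v_optim.zero_grad()
        v_loss.backward()
        v_optim.step()


if __name__ == "__main__":
    alternating_training_sketch()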
def _estimate_value(
    self, gamma: float, mdps: Sequence[Mdp], value_function: ValueFunction
) -> float:
    avg = RunningAverage()
    for mdp in mdps:
        discount = 1.0
        r = 0.0
        for t in mdp:
            if t.last_state is None:
                break
            r += discount * value_function(t.last_state)
            discount *= gamma
        avg.add(r)
    return avg.average
def evaluate(
    self, input: BanditsEstimatorInput, **kwargs
) -> Optional[EstimatorResult]:
    if input.has_model_outputs:
        return self._evaluate(
            input, input.samples, input.samples, force_train=True, **kwargs
        )
    log_avg = RunningAverage()
    gt_avg = RunningAverage()
    for sample in input.samples:
        log_avg.add(sample.log_reward)
        gt_avg.add(sample.ground_truth_reward)
    # 2-fold cross "validation" as used by https://arxiv.org/pdf/1612.01205.pdf
    shuffled = list(input.samples)
    np.random.shuffle(shuffled)
    lower_half = shuffled[: len(shuffled) // 2]
    upper_half = shuffled[len(shuffled) // 2 :]
    er_lower = self._evaluate(
        input, lower_half, upper_half, force_train=True, **kwargs
    )
    er_upper = self._evaluate(
        input, upper_half, lower_half, force_train=True, **kwargs
    )
    if er_lower is None or er_upper is None:
        return None
    return EstimatorResult(
        log_reward=log_avg.average,
        estimated_reward=(
            (er_lower.estimated_reward + er_upper.estimated_reward) / 2
        ),
        estimated_reward_normalized=(
            DMEstimator._calc_optional_avg(
                er_lower.estimated_reward_normalized,
                er_upper.estimated_reward_normalized,
            )
        ),
        estimated_reward_normalized_std_error=(
            DMEstimator._calc_optional_avg(
                er_lower.estimated_reward_normalized_std_error,
                er_upper.estimated_reward_normalized_std_error,
            )
        ),
        estimated_reward_std_error=(
            DMEstimator._calc_optional_avg(
                er_lower.estimated_reward_std_error,
                er_upper.estimated_reward_std_error,
            )
        ),
        ground_truth_reward=gt_avg.average,
    )
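# Sketch of the 2-fold scheme used above: shuffle the samples, split them in half,
# train the reward model on one half while scoring the other, then average the two
# estimates. `train_model` and `score` are hypothetical stand-ins for the
# estimator's _train_model / _evaluate internals.
import random
from typing import Callable, List, Sequence


def two_fold_estimate(
    samples: List,
    train_model: Callable[[Sequence], object],
    score: Callable[[object, Sequence], float],
    seed: int = 0,
) -> float:
    shuffled = list(samples)
    random.Random(seed).shuffle(shuffled)
    lower, upper = shuffled[: len(shuffled) // 2], shuffled[len(shuffled) // 2 :]
    est_a = score(train_model(lower), upper)  # train on lower, evaluate on upper
    est_b = score(train_model(upper), lower)  # train on upper, evaluate on lower
    return (est_a + est_b) / 2.0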
def predict_item(self, query_id: int, query_terms: Tuple[int]) -> SlateItemValues:
    self._process_training_queries()
    if query_id in self._query_ids:
        q = self._query_ids[query_id]
        return SlateItemValues(dict(q.url_relevances.items()))
    else:
        rels = {}
        for t in query_terms:
            q = self._query_terms[t]
            for i, r in q.url_relevances:
                if i in rels:
                    ra = rels[i]
                else:
                    ra = RunningAverage()
                    # register the new accumulator; otherwise rels stays empty
                    rels[i] = ra
                ra.add(r)
        return SlateItemValues({i: r.average for i, r in rels.items()})
def _estimate_value(self):
    tgt_generator = PolicyLogGenerator(self._env, self._policy)
    log = {}
    for state in self._env.states:
        mdps = []
        for _ in range(self._num_episodes):
            mdps.append(tgt_generator.generate_log(state))
        log[state] = mdps
    for state, mdps in log.items():
        avg = RunningAverage()
        for mdp in mdps:
            discount = 1.0
            r = 0.0
            for t in mdp:
                r += discount * t.reward
                discount *= self._gamma
            avg.add(r)
        self._state_values[state] = avg.average
def estimate_value(episodes: int, max_horizon: int, policy: RLPolicy, gamma: float):
    avg = RunningAverage()
    env = gym.make("CartPole-v0")
    for _ in range(episodes):
        init_state = env.reset()
        cur_state = init_state
        r = 0.0
        discount = 1.0
        for _ in range(max_horizon):
            action_dist = policy(State(cur_state))
            action = action_dist.sample()[0].value
            next_state, _, done, _ = env.step(action)
            # CartPole gives +1 per step until termination
            reward = 1.0
            r += reward * discount
            discount *= gamma
            if done:
                break
            cur_state = next_state
        avg.add(r)
    return avg.average
def evaluate(
    self, input: BanditsEstimatorInput, **kwargs
) -> Optional[EstimatorResult]:
    if not self._train_model(input.samples, 0.8) and not input.has_model_outputs:
        return None
    log_avg = RunningAverage()
    tgt_avg = RunningAverage()
    tgt_vals = []
    logged_vals = []
    gt_avg = RunningAverage()
    for sample in input.samples:
        log_avg.add(sample.log_reward)
        logged_vals.append(sample.log_reward)
        _, tgt_reward = self._calc_dm_reward(input.action_space, sample)
        tgt_avg.add(tgt_reward)
        tgt_vals.append(tgt_reward)
        gt_avg.add(sample.ground_truth_reward)
    (
        tgt_score,
        tgt_score_normalized,
        tgt_std_err,
        tgt_std_err_normalized,
    ) = self._compute_metric_data(
        torch.tensor(tgt_vals), torch.tensor(logged_vals), tgt_avg.average
    )
    return EstimatorResult(
        log_avg.average,
        tgt_score,
        gt_avg.average,
        tgt_avg.count,
        tgt_score_normalized,
        tgt_std_err,
        tgt_std_err_normalized,
    )
def evaluate(self, input: SlateEstimatorInput, **kwargs) -> EstimatorResults:
    input.validate()
    if input.tgt_model is None:
        logging.error("Target model is None, DM is not available")
        return self.results
    for episode in input.episodes:
        log_avg = RunningAverage()
        tgt_avg = RunningAverage()
        gt_avg = RunningAverage()
        tgt_slot_expects = episode.tgt_slot_expectations(episode.context.slots)
        if tgt_slot_expects is None:
            logging.warning("Target slot expectations not available")
            continue
        gt_slot_rewards = None
        if episode.gt_item_rewards is not None:
            gt_slot_rewards = tgt_slot_expects.expected_rewards(episode.gt_item_rewards)
        for sample in episode.samples:
            log_avg.add(episode.metric(episode.context.slots, sample.log_rewards))
            tgt_item_rewards = input.tgt_model.item_rewards(episode.context)
            tgt_slot_rewards = tgt_slot_expects.expected_rewards(tgt_item_rewards)
            tgt_avg.add(episode.metric(episode.context.slots, tgt_slot_rewards))
            if gt_slot_rewards is not None:
                gt_avg.add(episode.metric(episode.context.slots, gt_slot_rewards))
        self._append_estimate(log_avg.average, tgt_avg.average, gt_avg.average)
    return self.results
def _calc_weight_reward_tensors(
    self, input: BanditsEstimatorInput, eval_samples: Sequence[LogSample]
) -> Tuple[
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    RunningAverage,
    RunningAverage,
]:
    n = len(eval_samples)
    ws = torch.ones((n, len(input.action_space)))
    rs = torch.zeros((n, 1))
    r_est = torch.zeros((n, len(input.action_space)))
    actions = torch.zeros((n, len(input.action_space)))
    expected_rmax = torch.zeros((n, len(input.action_space)))
    propensities = torch.zeros((n, len(input.action_space)))
    log_avg = RunningAverage()
    gt_avg = RunningAverage()
    priori_rmax = self._estimate_rmax(input) if self._rmax is None else self._rmax
    assert priori_rmax is not None
    for i, sample in enumerate(eval_samples):
        _, dm_scores, dm_probs = self._calc_dm_reward(input.action_space, sample)
        for a in input.action_space:
            weight = (
                0.0
                if sample.log_action_probabilities[a] < PROPENSITY_THRESHOLD
                else sample.tgt_action_probabilities[a]
                / sample.log_action_probabilities[a]
            )
            ws[i, a] = self._weight_clamper(weight)
            propensities[i, a] = sample.tgt_action_probabilities[a]
            expected_rmax[i, a] = sample.tgt_action_probabilities[a] * priori_rmax
            actions[i, a] = float(a == sample.log_action)
        rs[i, 0] = sample.log_reward
        r_est[i] = dm_scores
        log_avg.add(sample.log_reward)
        gt_avg.add(sample.ground_truth_reward)
    return actions, ws, rs, r_est, propensities, expected_rmax, log_avg, gt_avg
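# Toy version of the per-action weight construction above: for each sample and
# action, the importance weight is tgt_prob / log_prob, zeroed when the logging
# propensity falls below a threshold and then clamped. The tensors and the
# default constants here are made up for illustration; the real code reads the
# probabilities from LogSample fields and uses its own clamper.
import torch


def build_weight_matrix(
    tgt_probs: torch.Tensor,      # shape (n, num_actions), target policy
    log_probs: torch.Tensor,      # shape (n, num_actions), logging policy
    propensity_threshold: float = 1e-6,
    max_weight: float = 100.0,
) -> torch.Tensor:
    raw = torch.where(
        log_probs < propensity_threshold,
        torch.zeros_like(log_probs),
        tgt_probs / log_probs.clamp_min(propensity_threshold),
    )
    return raw.clamp(max=max_weight)


if __name__ == "__main__":
    tgt = torch.tensor([[0.7, 0.3], [0.1, 0.9]])
    log = torch.tensor([[0.5, 0.5], [0.0, 1.0]])
    print(build_weight_matrix(tgt, log))  # second row's first action is zeroed out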
def evaluate(
    self, input: BanditsEstimatorInput, **kwargs
) -> Optional[EstimatorResult]:
    logger.info("OPE IPS Evaluating")
    log_avg = RunningAverage()
    logged_vals = []
    tgt_avg = RunningAverage()
    tgt_vals = []
    acc_weight = RunningAverage()
    gt_avg = RunningAverage()
    for sample in input.samples:
        log_avg.add(sample.log_reward)
        logged_vals.append(sample.log_reward)
        weight = 0.0
        tgt_result = 0.0
        if sample.log_action.value is not None:
            weight = (
                0.0
                if sample.log_action_probabilities[sample.log_action]
                < PROPENSITY_THRESHOLD
                else sample.tgt_action_probabilities[sample.log_action]
                / sample.log_action_probabilities[sample.log_action]
            )
            weight = self._weight_clamper(weight)
            tgt_result = sample.log_reward * weight
        tgt_avg.add(tgt_result)
        tgt_vals.append(tgt_result)
        acc_weight.add(weight)
        gt_avg.add(sample.ground_truth_reward)
    (
        tgt_score_normalized,
        tgt_std_err,
        tgt_std_err_normalized,
    ) = self._compute_metric_data(torch.tensor(tgt_vals), log_avg.average)
    # When weighted, self-normalize: sum(w * r) / sum(w) == mean(w * r) / mean(w).
    return EstimatorResult(
        log_reward=log_avg.average,
        estimated_reward=tgt_avg.average
        if not self._weighted
        else tgt_avg.average / acc_weight.average,
        ground_truth_reward=gt_avg.average,
        estimated_weight=tgt_avg.count,
        estimated_reward_normalized=tgt_score_normalized,
        estimated_reward_std_error=tgt_std_err,
        estimated_reward_normalized_std_error=tgt_std_err_normalized,
    )
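# Core of the IPS estimate above on plain lists: clip the propensity ratio, then
# either average w * r directly or self-normalize by the summed weights (the
# `_weighted` branch). Purely illustrative numbers and default constants below.
from typing import Sequence


def ips_estimate(
    rewards: Sequence[float],
    tgt_probs: Sequence[float],
    log_probs: Sequence[float],
    max_weight: float = 100.0,
    self_normalized: bool = False,
) -> float:
    weights = [
        min(t / l, max_weight) if l > 0.0 else 0.0 for t, l in zip(tgt_probs, log_probs)
    ]
    weighted_rewards = [w * r for w, r in zip(weights, rewards)]
    if self_normalized:
        total_weight = sum(weights)
        return sum(weighted_rewards) / total_weight if total_weight > 0.0 else 0.0
    return sum(weighted_rewards) / len(rewards)


if __name__ == "__main__":
    print(ips_estimate([1.0, 0.0, 1.0], [0.6, 0.2, 0.9], [0.3, 0.4, 0.3]))  # ~1.667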
def _evaluate(
    self,
    input: BanditsEstimatorInput,
    train_samples: Sequence[LogSample],
    eval_samples: Sequence[LogSample],
    force_train: bool = False,
    **kwargs,
) -> Optional[EstimatorResult]:
    logger.info("OPE DR Evaluating")
    self._train_model(train_samples, force_train)
    log_avg = RunningAverage()
    tgt_avg = RunningAverage()
    tgt_vals = []
    gt_avg = RunningAverage()
    for sample in eval_samples:
        log_avg.add(sample.log_reward)
        dm_action_reward, dm_scores, dm_probs = self._calc_dm_reward(
            input.action_space, sample
        )
        dm_reward = torch.dot(dm_scores.reshape(-1), dm_probs.reshape(-1)).item()
        tgt_result = 0.0
        weight = 0.0
        if sample.log_action.value is not None:
            weight = (
                0.0
                if sample.log_action_probabilities[sample.log_action]
                < PROPENSITY_THRESHOLD
                else sample.tgt_action_probabilities[sample.log_action]
                / sample.log_action_probabilities[sample.log_action]
            )
            weight = self._weight_clamper(weight)
            assert dm_action_reward is not None
            assert dm_reward is not None
            tgt_result += (
                sample.log_reward - dm_action_reward
            ) * weight + dm_reward
        else:
            tgt_result = dm_reward
        tgt_avg.add(tgt_result)
        tgt_vals.append(tgt_result)
        gt_avg.add(sample.ground_truth_reward)
    (
        tgt_score_normalized,
        tgt_std_err,
        tgt_std_err_normalized,
    ) = self._compute_metric_data(torch.tensor(tgt_vals), log_avg.average)
    return EstimatorResult(
        log_reward=log_avg.average,
        estimated_reward=tgt_avg.average,
        ground_truth_reward=gt_avg.average,
        estimated_weight=tgt_avg.count,
        estimated_reward_normalized=tgt_score_normalized,
        estimated_reward_std_error=tgt_std_err,
        estimated_reward_normalized_std_error=tgt_std_err_normalized,
    )
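# The per-sample doubly-robust term used above, on scalars: the model-based
# estimate dm_reward (sum over actions of pi_tgt(a) * q_hat(a)) plus an
# importance-weighted correction on the logged action. All values are
# illustrative stand-ins for what _calc_dm_reward and the propensities provide.
def doubly_robust_term(
    log_reward: float,
    dm_action_reward: float,   # q_hat at the logged action
    dm_reward: float,          # E_{a ~ pi_tgt}[q_hat(a)]
    tgt_prob: float,
    log_prob: float,
    max_weight: float = 100.0,
) -> float:
    weight = min(tgt_prob / log_prob, max_weight) if log_prob > 0.0 else 0.0
    return (log_reward - dm_action_reward) * weight + dm_reward


if __name__ == "__main__":
    # If the reward model is exact for this sample (log_reward == dm_action_reward),
    # the correction vanishes and the estimate falls back to the model's value.
    print(doubly_robust_term(1.0, 1.0, 0.8, tgt_prob=0.6, log_prob=0.3))  # 0.8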
def evaluate(self, input: SlateEstimatorInput, **kwargs) -> EstimatorResults:
    input.validate()
    for episode in input.episodes:
        log_avg = RunningAverage()
        tgt_avg = RunningAverage()
        acc_weight = 0.0
        gt_avg = RunningAverage()
        log_slot_expects = episode.log_slot_item_expectations(episode.context.slots)
        if log_slot_expects is None:
            logging.warning("Log slot distribution not available")
            continue
        tgt_slot_expects = episode.tgt_slot_expectations(episode.context.slots)
        if tgt_slot_expects is None:
            logging.warning("Target slot distribution not available")
            continue
        log_indicator = log_slot_expects.values_tensor(self._device)
        tgt_indicator = tgt_slot_expects.values_tensor(self._device)
        lm = len(episode.context.slots) * len(episode.items)
        gamma = torch.pinverse(
            torch.mm(log_indicator.view((lm, 1)), log_indicator.view((1, lm)))
        )
        gt_slot_rewards = None
        if episode.gt_item_rewards is not None:
            gt_slot_rewards = tgt_slot_expects.expected_rewards(episode.gt_item_rewards)
        for sample in episode.samples:
            log_reward = episode.metric(episode.context.slots, sample.log_rewards)
            log_avg.add(log_reward)
            ones = sample.log_slate.one_hots(episode.items, self._device)
            weight = self._weight_clamper(
                torch.mm(
                    tgt_indicator.view((1, lm)), torch.mm(gamma, ones.view(lm, 1))
                )
            )
            tgt_avg.add(log_reward * weight)
            acc_weight += weight
            if gt_slot_rewards is not None:
                gt_avg.add(episode.metric(episode.context.slots, gt_slot_rewards))
        if tgt_avg.count == 0:
            continue
        if self._weighted:
            self._append_estimate(
                log_avg.average, tgt_avg.total / acc_weight, gt_avg.average
            )
        else:
            self._append_estimate(log_avg.average, tgt_avg.average, gt_avg.average)
    return self.results
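# Toy walk-through of the pseudo-inverse weight computed above: flatten the
# slot-by-item marginals into vectors, take the pseudo-inverse of the outer
# product of the logging marginals, and contract with the target marginals and
# the logged slate's one-hot indicator. This only mirrors the steps in the
# evaluate method; the 2-slot x 3-item marginals are made up for illustration.
import torch


def pseudo_inverse_weight(
    log_marginals: torch.Tensor,   # shape (slots, items), logging policy
    tgt_marginals: torch.Tensor,   # shape (slots, items), target policy
    slate_one_hot: torch.Tensor,   # shape (slots, items), 1 for the logged item per slot
) -> torch.Tensor:
    lm = log_marginals.numel()
    log_vec = log_marginals.reshape(lm, 1)
    tgt_vec = tgt_marginals.reshape(1, lm)
    gamma = torch.pinverse(log_vec @ log_vec.t())
    return tgt_vec @ gamma @ slate_one_hot.reshape(lm, 1)


if __name__ == "__main__":
    log_m = torch.tensor([[0.5, 0.3, 0.2], [0.2, 0.5, 0.3]])
    tgt_m = torch.tensor([[0.2, 0.3, 0.5], [0.4, 0.4, 0.2]])
    slate = torch.tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])  # item 0 at slot 0, item 1 at slot 1
    print(pseudo_inverse_weight(log_m, tgt_m, slate))  # 1x1 tensor, the importance weight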
def evaluate(
    self, input: BanditsEstimatorInput, **kwargs
) -> Optional[EstimatorResult]:
    logger = Estimator.logger()
    if not self._train_model(input.samples, 0.8, logger):
        return None
    log_avg = RunningAverage()
    tgt_avg = RunningAverage()
    gt_avg = RunningAverage()
    for sample in input.samples:
        log_avg.add(sample.log_reward)
        _, tgt_reward = self._calc_dm_reward(input.action_space, sample)
        tgt_avg.add(tgt_reward)
        gt_avg.add(sample.ground_truth_reward)
    return EstimatorResult(
        log_avg.average, tgt_avg.average, gt_avg.average, tgt_avg.count
    )
def evaluate(
    self, input: BanditsEstimatorInput, **kwargs
) -> Optional[EstimatorResult]:
    self._train_model(input.samples, 0.8)
    log_avg = RunningAverage()
    logged_vals = []
    tgt_avg = RunningAverage()
    tgt_vals = []
    gt_avg = RunningAverage()
    for sample in input.samples:
        log_avg.add(sample.log_reward)
        logged_vals.append(sample.log_reward)
        dm_action_reward, dm_reward = self._calc_dm_reward(input.action_space, sample)
        tgt_result = 0.0
        weight = 0.0
        if sample.log_action is not None:
            weight = (
                0.0
                if sample.log_action_probabilities[sample.log_action]
                < PROPENSITY_THRESHOLD
                else sample.tgt_action_probabilities[sample.log_action]
                / sample.log_action_probabilities[sample.log_action]
            )
            weight = self._weight_clamper(weight)
            assert dm_action_reward is not None
            assert dm_reward is not None
            tgt_result += (sample.log_reward - dm_action_reward) * weight + dm_reward
        else:
            tgt_result = dm_reward
        tgt_avg.add(tgt_result)
        tgt_vals.append(tgt_result)
        gt_avg.add(sample.ground_truth_reward)
    (
        tgt_score,
        tgt_score_normalized,
        tgt_std_err,
        tgt_std_err_normalized,
    ) = self._compute_metric_data(
        torch.tensor(tgt_vals), torch.tensor(logged_vals), tgt_avg.average
    )
    return EstimatorResult(
        log_avg.average,
        tgt_score,
        gt_avg.average,
        tgt_avg.count,
        tgt_score_normalized,
        tgt_std_err,
        tgt_std_err_normalized,
    )
def evaluate(self, input: SlateEstimatorInput, **kwargs) -> EstimatorResults:
    input.validate()
    for episode in input.episodes:
        log_avg = RunningAverage()
        tgt_avg = RunningAverage()
        acc_weight = 0.0
        gt_avg = RunningAverage()
        gt_slot_rewards = None
        if episode.gt_item_rewards is not None:
            tgt_slot_expects = episode.tgt_slot_expectations(episode.context.slots)
            if tgt_slot_expects is not None:
                gt_slot_rewards = tgt_slot_expects.expected_rewards(
                    episode.gt_item_rewards
                )
        for sample in episode.samples:
            log_prob = sample.log_slate_probability
            if log_prob <= 0.0:
                log_prob = episode.log_slate_probability(sample.log_slate)
            if log_prob <= 0.0:
                logging.warning(f"Invalid log slate probability: {log_prob}")
                continue
            tgt_prob = sample.tgt_slate_probability
            if tgt_prob <= 0.0:
                tgt_prob = episode.tgt_slate_probability(sample.log_slate)
            if tgt_prob <= 0.0:
                logging.warning(f"Invalid target probability: {tgt_prob}")
                continue
            weight = self._weight_clamper(tgt_prob / log_prob)
            log_reward = episode.metric(episode.context.slots, sample.log_rewards)
            log_avg.add(log_reward)
            tgt_avg.add(log_reward * weight)
            acc_weight += weight
            if gt_slot_rewards is not None:
                gt_avg.add(episode.metric(episode.context.slots, gt_slot_rewards))
        if tgt_avg.count == 0:
            continue
        if self._weighted:
            self._append_estimate(
                log_avg.average, tgt_avg.total / acc_weight, gt_avg.average
            )
        else:
            self._append_estimate(log_avg.average, tgt_avg.average, gt_avg.average)
    return self.results
def evaluate(self, input: BanditsEstimatorInput, **kwargs) -> EstimatorResults:
    self.reset()
    for log in input.logs:
        log_reward = RunningAverage()
        tgt_reward = RunningAverage()
        gt_reward = RunningAverage()
        for sample in log.samples:
            log_reward.add(sample.logged_reward)
            rewards = input.target_model(sample.context)
            tgt_reward.add(rewards[sample.target_action])
            rewards = input.ground_truth_model(sample.context)
            gt_reward.add(rewards[sample.target_action])
        self._append_estimate(
            log_reward.average, tgt_reward.average, gt_reward.average
        )
    return self.results
def evaluate(
    self, input: BanditsEstimatorInput, **kwargs
) -> Optional[EstimatorResult]:
    log_avg = RunningAverage()
    tgt_avg = RunningAverage()
    acc_weight = RunningAverage()
    gt_avg = RunningAverage()
    for sample in input.samples:
        log_avg.add(sample.log_reward)
        weight = (
            sample.tgt_action_probabilities[sample.log_action]
            / sample.log_action_probabilities[sample.log_action]
        )
        weight = self._weight_clamper(weight)
        tgt_avg.add(sample.log_reward * weight)
        acc_weight.add(weight)
        gt_avg.add(sample.ground_truth_reward)
    if self._weighted:
        return EstimatorResult(
            log_avg.average,
            tgt_avg.total / acc_weight.total,
            gt_avg.average,
            acc_weight.average,
        )
    else:
        return EstimatorResult(
            log_avg.average, tgt_avg.average, gt_avg.average, tgt_avg.count
        )
def evaluate(
    self, input: BanditsEstimatorInput, **kwargs
) -> Optional[EstimatorResult]:
    logger = Estimator.logger()
    self._train_model(input.samples, 0.8, logger)
    log_avg = RunningAverage()
    tgt_avg = RunningAverage()
    gt_avg = RunningAverage()
    for sample in input.samples:
        log_avg.add(sample.log_reward)
        weight = (
            sample.tgt_action_probabilities[sample.log_action]
            / sample.log_action_probabilities[sample.log_action]
        )
        weight = self._weight_clamper(weight)
        dm_action_reward, dm_reward = self._calc_dm_reward(input.action_space, sample)
        tgt_avg.add((sample.log_reward - dm_action_reward) * weight + dm_reward)
        gt_avg.add(sample.ground_truth_reward)
    return EstimatorResult(
        log_avg.average, tgt_avg.average, gt_avg.average, tgt_avg.count
    )
def _evaluate(
    self,
    input: BanditsEstimatorInput,
    train_samples: Sequence[LogSample],
    eval_samples: Sequence[LogSample],
    force_train: bool = False,
    **kwargs,
) -> Optional[EstimatorResult]:
    logger.info("OPE DM Evaluating")
    if (
        not self._train_model(train_samples, force_train)
        and not input.has_model_outputs
    ):
        return None
    log_avg = RunningAverage()
    tgt_avg = RunningAverage()
    tgt_vals = []
    gt_avg = RunningAverage()
    for sample in eval_samples:
        log_avg.add(sample.log_reward)
        _, tgt_scores, tgt_probs = self._calc_dm_reward(input.action_space, sample)
        tgt_reward = torch.dot(tgt_scores.reshape(-1), tgt_probs.reshape(-1)).item()
        tgt_avg.add(tgt_reward)
        tgt_vals.append(tgt_reward)
        gt_avg.add(sample.ground_truth_reward)
    (
        tgt_score_normalized,
        tgt_std_err,
        tgt_std_err_normalized,
    ) = self._compute_metric_data(torch.tensor(tgt_vals), log_avg.average)
    return EstimatorResult(
        log_reward=log_avg.average,
        estimated_reward=tgt_avg.average,
        ground_truth_reward=gt_avg.average,
        estimated_weight=tgt_avg.count,
        estimated_reward_normalized=tgt_score_normalized,
        estimated_reward_std_error=tgt_std_err,
        estimated_reward_normalized_std_error=tgt_std_err_normalized,
    )
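# The direct-method score above in isolation: the estimate for one context is the
# dot product of the reward model's per-action scores with the target policy's
# action probabilities. Numbers below are illustrative.
import torch


def dm_reward(action_scores: torch.Tensor, tgt_action_probs: torch.Tensor) -> float:
    # E_{a ~ pi_tgt}[q_hat(x, a)]
    return torch.dot(action_scores.reshape(-1), tgt_action_probs.reshape(-1)).item()


if __name__ == "__main__":
    print(dm_reward(torch.tensor([0.2, 0.8]), torch.tensor([0.5, 0.5])))  # 0.5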
def evaluate(self, input: BanditsEstimatorInput, **kwargs) -> EstimatorResults:
    self.reset()
    for log in input.logs:
        log_reward = RunningAverage()
        tgt_reward = RunningAverage()
        gt_reward = RunningAverage()
        for sample in log.samples:
            log_reward.add(sample.logged_reward)
            weight = (
                sample.target_propensities[sample.logged_action]
                / sample.logged_propensities[sample.logged_action]
            )
            weight = self._weight_clamper(weight)
            rewards = input.target_model(sample.context)
            r1 = rewards[sample.logged_action]
            r2 = rewards[sample.target_action]
            tgt_reward.add((sample.logged_reward - r1) * weight + r2)
            rewards = input.ground_truth_model(sample.context)
            gt_reward.add(rewards[sample.target_action])
        self._append_estimate(
            log_reward.average, tgt_reward.average, gt_reward.average
        )
    return self.results
def evaluate(self, input: SlateEstimatorInput, **kwargs) -> EstimatorResults:
    input.validate()
    for episode in input.episodes:
        log_avg = RunningAverage()
        tgt_avg = RunningAverage()
        acc_weight = 0.0
        gt_avg = RunningAverage()
        log_slot_expects = episode.log_slot_item_expectations(episode.context.slots)
        if log_slot_expects is None:
            logging.warning("Log slot distribution not available")
            continue
        tgt_slot_expects = episode.tgt_slot_expectations(episode.context.slots)
        if tgt_slot_expects is None:
            logging.warning("Target slot distribution not available")
            continue
        slate_size = len(episode.context.slots)
        gt_slot_rewards = None
        if episode.gt_item_rewards is not None:
            gt_slot_rewards = tgt_slot_expects.expected_rewards(episode.gt_item_rewards)
        for sample in episode.samples:
            slot_weights = episode.metric.slot_weights(episode.context.slots)
            log_reward = episode.metric.calculate_reward(
                episode.context.slots, sample.log_rewards, None, slot_weights
            )
            log_avg.add(log_reward)
            weights = slot_weights.values.to(device=self._device)
            if sample.slot_probabilities is not None:
                weights *= sample.slot_probabilities.values
            h = torch.zeros(slate_size, dtype=torch.double, device=self._device)
            p = torch.zeros(slate_size, dtype=torch.double, device=self._device)
            i = 0
            for slot, item in sample.log_slate:
                # numerator: target marginal probability of the logged item at this slot
                h[i] = tgt_slot_expects[slot][item]
                # denominator: logging marginal probability of the same item at this slot
                p[i] = log_slot_expects[slot][item]
                i += 1
            ips = torch.tensordot(h, weights, dims=([0], [0])) / torch.tensordot(
                p, weights, dims=([0], [0])
            )
            ips = self._weight_clamper(ips)
            if ips <= 0.0 or math.isinf(ips) or math.isnan(ips):
                continue
            tgt_avg.add(log_reward * ips)
            acc_weight += ips
            if gt_slot_rewards is not None:
                gt_avg.add(
                    episode.metric.calculate_reward(
                        episode.context.slots, gt_slot_rewards
                    )
                )
        if tgt_avg.count == 0:
            continue
        if self._weighted:
            self._append_estimate(
                log_avg.average, tgt_avg.total / acc_weight, gt_avg.average
            )
        else:
            self._append_estimate(log_avg.average, tgt_avg.average, gt_avg.average)
    return self.results
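# The position-based (PBM) importance weight above, in isolation: per-slot ratios
# of target to logging marginals for the logged items, combined through the
# metric's slot weights. The lists below are made-up stand-ins for the slot
# expectation objects used by the estimator.
from typing import Sequence


def pbm_weight(
    tgt_slot_probs: Sequence[float],   # target marginal prob of the logged item, per slot
    log_slot_probs: Sequence[float],   # logging marginal prob of the logged item, per slot
    slot_weights: Sequence[float],     # e.g. DCG-style position weights
) -> float:
    numerator = sum(w * t for w, t in zip(slot_weights, tgt_slot_probs))
    denominator = sum(w * l for w, l in zip(slot_weights, log_slot_probs))
    return numerator / denominator if denominator > 0.0 else 0.0


if __name__ == "__main__":
    print(pbm_weight([0.4, 0.2, 0.1], [0.5, 0.25, 0.25], [1.0, 0.63, 0.5]))  # ~0.736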
def test_running_average(self):
    ra = RunningAverage()
    ra.add(1.0).add(2.0).add(3.0).add(4.0)
    self.assertEqual(ra.count, 4)
    self.assertEqual(ra.average, 2.5)
    self.assertEqual(ra.total, 10.0)
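# A minimal RunningAverage that satisfies the test above (chained add, count,
# average, total). Sketch only; the library's own implementation may track
# additional state or handle the empty case differently.
class RunningAverage:
    def __init__(self) -> None:
        self._count = 0
        self._total = 0.0

    def add(self, value: float) -> "RunningAverage":
        self._count += 1
        self._total += value
        return self  # allow ra.add(1.0).add(2.0) chaining

    @property
    def count(self) -> int:
        return self._count

    @property
    def total(self) -> float:
        return self._total

    @property
    def average(self) -> float:
        return self._total / self._count if self._count else 0.0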