def estimator_results_to_cpe_estimate(
    estimator_results: EstimatorResults,
) -> CpeEstimate:
    scores = torch.tensor(
        [r.estimated_reward for r in estimator_results.results], dtype=torch.double
    )
    log_scores = torch.tensor(
        [r.log_reward for r in estimator_results.results], dtype=torch.double
    )
    dr_score = float(torch.mean(scores).item())
    dr_score_std_error = bootstrapped_std_error_of_mean(scores)
    log_score = float(torch.mean(log_scores).item())
    if log_score < 1e-6:
        logger.warning(
            "Can't normalize SDR-CPE because of small"
            f" or negative logged_policy_score ({log_score})."
            f" Episode values: {log_scores}."
        )
        return CpeEstimate(
            raw=dr_score,
            normalized=0.0,
            raw_std_error=dr_score_std_error,
            normalized_std_error=0.0,
        )
    return CpeEstimate(
        raw=dr_score,
        normalized=dr_score / log_score,
        raw_std_error=dr_score_std_error,
        normalized_std_error=dr_score_std_error / log_score,
    )
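# The helper bootstrapped_std_error_of_mean is used throughout this module but
# its implementation is not shown here. The following is only a minimal sketch
# of what such a helper could look like, assuming numpy is imported as np as
# elsewhere in the module; the function name, parameter defaults, and
# resampling details are illustrative assumptions, not the library's actual
# implementation.
def _bootstrapped_std_error_of_mean_sketch(
    data, sample_percent: float = 0.5, num_samples: int = 1000
) -> float:
    # Repeatedly resample a fraction of the data with replacement, take the
    # mean of each resample, and report the standard deviation of those means.
    data = np.asarray(data)
    sample_size = max(1, int(sample_percent * len(data)))
    means = [
        np.mean(np.random.choice(data, size=sample_size, replace=True))
        for _ in range(num_samples)
    ]
    return float(np.std(means))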
def estimator_result_to_cpe_estimate(result: EstimatorResult) -> CpeEstimate:
    assert result.estimated_reward_normalized is not None
    assert result.estimated_reward_std_error is not None
    assert result.estimated_reward_normalized_std_error is not None
    return CpeEstimate(
        raw=result.estimated_reward,
        normalized=result.estimated_reward_normalized,
        raw_std_error=result.estimated_reward_std_error,
        normalized_std_error=result.estimated_reward_normalized_std_error,
    )
def estimate(self, edp: EvaluationDataPage) -> CpeEstimate:
    # For details, visit https://arxiv.org/pdf/1511.03722.pdf
    logged_rewards = edp.logged_rewards.squeeze()
    logged_propensities = edp.logged_propensities.squeeze()
    num_examples = edp.logged_rewards.shape[0]

    estimated_state_values = torch.sum(
        edp.model_propensities * edp.model_values, dim=1
    )
    estimated_q_values_for_logged_action = torch.sum(
        edp.model_values * edp.action_mask, dim=1
    )
    target_propensity_for_action = torch.sum(
        edp.model_propensities * edp.action_mask, dim=1
    )

    assert target_propensity_for_action.shape == logged_propensities.shape, (
        "Invalid shape: "
        + str(target_propensity_for_action.shape)
        + " != "
        + str(logged_propensities.shape)
    )
    assert (
        target_propensity_for_action.shape
        == estimated_q_values_for_logged_action.shape
    ), (
        "Invalid shape: "
        + str(target_propensity_for_action.shape)
        + " != "
        + str(estimated_q_values_for_logged_action.shape)
    )
    assert target_propensity_for_action.shape == logged_rewards.shape, (
        "Invalid shape: "
        + str(target_propensity_for_action.shape)
        + " != "
        + str(logged_rewards.shape)
    )
    importance_weight = target_propensity_for_action / logged_propensities

    doubly_robusts: List[float] = []
    episode_values: List[float] = []

    assert edp.mdp_id is not None
    i = 0
    last_episode_end = -1
    while i < num_examples:
        # Calculate the doubly-robust Q-value for one episode
        if i == num_examples - 1 or edp.mdp_id[i] != edp.mdp_id[i + 1]:
            episode_end = i
            episode_value = 0.0
            doubly_robust = 0.0
            for j in range(episode_end, last_episode_end, -1):
                doubly_robust = estimated_state_values[j] + importance_weight[j] * (
                    logged_rewards[j]
                    + self.gamma * doubly_robust
                    - estimated_q_values_for_logged_action[j]
                )
                episode_value *= self.gamma
                episode_value += logged_rewards[j]
            if episode_value > 1e-6 or episode_value < -1e-6:
                doubly_robusts.append(float(doubly_robust))
                episode_values.append(float(episode_value))
            last_episode_end = episode_end
        i += 1

    assert len(doubly_robusts) > 0, (
        f"No valid doubly-robust data was generated. "
        f"Logged_rewards={logged_rewards}, importance_weight={importance_weight},"
        f" estimated_q_values_for_logged_action={estimated_q_values_for_logged_action}"
        f" estimated_state_values={estimated_state_values}, gamma={self.gamma}."
        f" Did you specify the wrong metric names?"
    )

    doubly_robusts = np.array(doubly_robusts)  # type: ignore
    dr_score = float(np.mean(doubly_robusts))
    dr_score_std_error = bootstrapped_std_error_of_mean(doubly_robusts)

    episode_values = np.array(episode_values)  # type: ignore
    logged_policy_score = np.mean(episode_values)
    if logged_policy_score < 1e-6:
        logger.warning(
            "Can't normalize SDR-CPE because of small or negative logged_policy_score"
        )
        return CpeEstimate(
            raw=dr_score,
            normalized=0.0,
            raw_std_error=dr_score_std_error,
            normalized_std_error=0.0,
        )
    return CpeEstimate(
        raw=dr_score,
        normalized=dr_score / logged_policy_score,
        raw_std_error=dr_score_std_error,
        normalized_std_error=dr_score_std_error / logged_policy_score,
    )
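# The backward loop above follows the sequential doubly-robust estimator of
# Jiang & Li (https://arxiv.org/pdf/1511.03722.pdf). Written out, with
# rho_t = pi_target(a_t | s_t) / pi_logged(a_t | s_t) the per-step importance
# weight, V_hat the model state value, and Q_hat the model Q-value, the
# estimate for the tail of an episode starting at step t is
#
#   DR_t = V_hat(s_t) + rho_t * (r_t + gamma * DR_{t+1} - Q_hat(s_t, a_t)),
#
# with DR_{T+1} = 0 past the end of the episode. The loop evaluates this from
# the last step of each episode back to its first step, so `doubly_robust`
# ends up holding DR_0, the doubly-robust value estimate for that episode.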
def estimate(self, edp: EvaluationDataPage) -> CpeEstimate:
    # For details, visit https://arxiv.org/pdf/1511.03722.pdf
    logged_rewards = edp.logged_rewards.squeeze()
    logged_propensities = edp.logged_propensities.squeeze()
    num_examples = edp.logged_rewards.shape[0]

    estimated_state_values = torch.sum(
        edp.model_propensities * edp.model_values, dim=1
    )
    estimated_q_values_for_logged_action = torch.sum(
        edp.model_values * edp.action_mask, dim=1
    )
    target_propensity_for_action = torch.sum(
        edp.model_propensities * edp.action_mask, dim=1
    )

    assert target_propensity_for_action.shape == logged_propensities.shape, (
        "Invalid shape: "
        + str(target_propensity_for_action.shape)
        + " != "
        + str(logged_propensities.shape)
    )
    assert (
        target_propensity_for_action.shape
        == estimated_q_values_for_logged_action.shape
    ), (
        "Invalid shape: "
        + str(target_propensity_for_action.shape)
        + " != "
        + str(estimated_q_values_for_logged_action.shape)
    )
    assert target_propensity_for_action.shape == logged_rewards.shape, (
        "Invalid shape: "
        + str(target_propensity_for_action.shape)
        + " != "
        + str(logged_rewards.shape)
    )
    importance_weight = target_propensity_for_action / logged_propensities

    doubly_robusts: List[float] = []
    episode_values: List[float] = []

    assert edp.mdp_id is not None
    i = 0
    last_episode_end = -1
    while i < num_examples:
        # Calculate the doubly-robust Q-value for one episode
        # pyre-ignore [16]: Optional type has no attribute `__getitem__`
        if i == num_examples - 1 or edp.mdp_id[i] != edp.mdp_id[i + 1]:
            episode_end = i
            episode_value = 0.0
            doubly_robust = 0.0
            for j in range(episode_end, last_episode_end, -1):
                doubly_robust = estimated_state_values[j] + importance_weight[j] * (
                    logged_rewards[j]
                    + self.gamma * doubly_robust
                    - estimated_q_values_for_logged_action[j]
                )
                episode_value *= self.gamma
                episode_value += logged_rewards[j]
            doubly_robusts.append(float(doubly_robust))
            episode_values.append(float(episode_value))
            last_episode_end = episode_end
        i += 1

    if len(doubly_robusts) == 0:
        torch.set_printoptions(profile="full")
        zipped_data = list(
            zip(
                *map(
                    lambda x: x.tolist(),
                    [
                        edp.mdp_id,
                        logged_rewards,
                        estimated_state_values,
                        estimated_q_values_for_logged_action,
                        importance_weight,
                    ],
                )
            )
        )
        raise RuntimeError(
            f"No valid doubly-robust data was generated.\n"
            f"mdp_ids x logged_rewards x estimated_state_values x "
            f"estimated_q_values_for_logged_action x importance_weight:\n"
            f"{zipped_data};\n"
            f"gamma={self.gamma};\n"
            f"Did you specify the wrong metric names?"
        )

    # pyre-fixme[9]: doubly_robusts has type `List[float]`; used as `ndarray`.
    doubly_robusts = np.array(doubly_robusts)
    dr_score = float(np.mean(doubly_robusts))
    dr_score_std_error = bootstrapped_std_error_of_mean(doubly_robusts)

    # pyre-fixme[9]: episode_values has type `List[float]`; used as `ndarray`.
    episode_values = np.array(episode_values)
    logged_policy_score = np.mean(episode_values)
    if logged_policy_score < 1e-6:
        logger.warning(
            "Can't normalize SDR-CPE because of small"
            f" or negative logged_policy_score ({logged_policy_score})."
            f" Episode values: {episode_values}."
        )
        return CpeEstimate(
            raw=dr_score,
            normalized=0.0,
            raw_std_error=dr_score_std_error,
            normalized_std_error=0.0,
        )
    return CpeEstimate(
        raw=dr_score,
        normalized=dr_score / logged_policy_score,
        raw_std_error=dr_score_std_error,
        normalized_std_error=dr_score_std_error / logged_policy_score,
    )
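# A small, self-contained worked example of the backward recursion used by the
# two estimate() variants above, applied to one hypothetical 3-step episode.
# All numbers are made up purely for illustration; this is not library code.
def _toy_sequential_dr_example() -> float:
    gamma = 0.9
    rewards = [1.0, 0.0, 2.0]               # logged rewards r_t
    state_values = [1.5, 1.0, 1.8]          # V_hat(s_t) under the target policy
    q_values_logged = [1.2, 0.8, 2.1]       # Q_hat(s_t, a_t) for the logged action
    importance_weights = [1.1, 0.9, 1.0]    # pi_target / pi_logged per step

    doubly_robust = 0.0
    # Walk the episode from its last step back to its first step.
    for j in reversed(range(len(rewards))):
        doubly_robust = state_values[j] + importance_weights[j] * (
            rewards[j] + gamma * doubly_robust - q_values_logged[j]
        )
    return doubly_robust  # DR_0, the episode's doubly-robust value estimate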
def estimate(
    self,
    edp: EvaluationDataPage,
    num_j_steps,
    whether_self_normalize_importance_weights,
) -> CpeEstimate:
    # For details, visit https://arxiv.org/pdf/1604.00923.pdf Sections 5, 7, 8
    assert edp.model_values is not None
    (
        actions,
        rewards,
        logged_propensities,
        target_propensities,
        estimated_q_values,
    ) = WeightedSequentialDoublyRobustEstimator.transform_to_equal_length_trajectories(
        edp.mdp_id,
        edp.action_mask.cpu().numpy(),
        edp.logged_rewards.cpu().numpy().flatten(),
        edp.logged_propensities.cpu().numpy().flatten(),
        edp.model_propensities.cpu().numpy(),
        # pyre-fixme[16]: Optional type has no attribute `cpu`.
        edp.model_values.cpu().numpy(),
    )

    num_trajectories = actions.shape[0]
    trajectory_length = actions.shape[1]

    j_steps = [float("inf")]
    if num_j_steps > 1:
        j_steps.append(-1)
    if num_j_steps > 2:
        interval = trajectory_length // (num_j_steps - 1)
        j_steps.extend([i * interval for i in range(1, num_j_steps - 1)])

    target_propensity_for_logged_action = np.sum(
        np.multiply(target_propensities, actions), axis=2
    )
    estimated_q_values_for_logged_action = np.sum(
        np.multiply(estimated_q_values, actions), axis=2
    )
    estimated_state_values = np.sum(
        np.multiply(target_propensities, estimated_q_values), axis=2
    )

    importance_weights = target_propensity_for_logged_action / logged_propensities
    importance_weights = np.cumprod(importance_weights, axis=1)
    importance_weights = WeightedSequentialDoublyRobustEstimator.normalize_importance_weights(
        importance_weights, whether_self_normalize_importance_weights
    )
    importance_weights_one_earlier = (
        np.ones([num_trajectories, 1]) * 1.0 / num_trajectories
    )
    importance_weights_one_earlier = np.hstack(
        [importance_weights_one_earlier, importance_weights[:, :-1]]
    )

    discounts = np.logspace(
        start=0, stop=trajectory_length - 1, num=trajectory_length, base=self.gamma
    )

    j_step_return_trajectories = []
    for j_step in j_steps:
        j_step_return_trajectories.append(
            WeightedSequentialDoublyRobustEstimator.calculate_step_return(
                rewards,
                discounts,
                importance_weights,
                importance_weights_one_earlier,
                estimated_state_values,
                estimated_q_values_for_logged_action,
                j_step,
            )
        )
    j_step_return_trajectories = np.array(j_step_return_trajectories)

    j_step_returns = np.sum(j_step_return_trajectories, axis=1)

    if len(j_step_returns) == 1:
        weighted_doubly_robust = j_step_returns[0]
        weighted_doubly_robust_std_error = 0.0
    else:
        # Break trajectories into several subsets to estimate confidence bounds
        infinite_step_returns = []
        num_subsets = int(
            min(
                num_trajectories / 2,
                WeightedSequentialDoublyRobustEstimator.NUM_SUBSETS_FOR_CB_ESTIMATES,
            )
        )
        interval = num_trajectories / num_subsets
        for i in range(num_subsets):
            trajectory_subset = np.arange(int(i * interval), int((i + 1) * interval))

            importance_weights = (
                target_propensity_for_logged_action[trajectory_subset]
                / logged_propensities[trajectory_subset]
            )
            importance_weights = np.cumprod(importance_weights, axis=1)
            importance_weights = WeightedSequentialDoublyRobustEstimator.normalize_importance_weights(
                importance_weights, whether_self_normalize_importance_weights
            )
            importance_weights_one_earlier = (
                np.ones([len(trajectory_subset), 1]) * 1.0 / len(trajectory_subset)
            )
            importance_weights_one_earlier = np.hstack(
                [importance_weights_one_earlier, importance_weights[:, :-1]]
            )
            infinite_step_return = np.sum(
                WeightedSequentialDoublyRobustEstimator.calculate_step_return(
                    rewards[trajectory_subset],
                    discounts,
                    importance_weights,
                    importance_weights_one_earlier,
                    estimated_state_values[trajectory_subset],
                    estimated_q_values_for_logged_action[trajectory_subset],
                    float("inf"),
                )
            )
            infinite_step_returns.append(infinite_step_return)

        # Compute the weighted_doubly_robust point estimate using all the data
        weighted_doubly_robust = self.compute_weighted_doubly_robust_point_estimate(
            j_steps,
            num_j_steps,
            j_step_returns,
            infinite_step_returns,
            j_step_return_trajectories,
        )

        # Use bootstrapping to compute the weighted_doubly_robust standard error
        bootstrapped_means = []
        sample_size = int(
            WeightedSequentialDoublyRobustEstimator.BOOTSTRAP_SAMPLE_PCT * num_subsets
        )
        for _ in range(WeightedSequentialDoublyRobustEstimator.NUM_BOOTSTRAP_SAMPLES):
            random_idxs = np.random.choice(num_j_steps, sample_size, replace=False)
            random_idxs.sort()
            wdr_estimate = self.compute_weighted_doubly_robust_point_estimate(
                j_steps=[j_steps[i] for i in random_idxs],
                num_j_steps=sample_size,
                j_step_returns=j_step_returns[random_idxs],
                infinite_step_returns=infinite_step_returns,
                j_step_return_trajectories=j_step_return_trajectories[random_idxs],
            )
            bootstrapped_means.append(wdr_estimate)
        weighted_doubly_robust_std_error = np.std(bootstrapped_means)

    episode_values = np.sum(np.multiply(rewards, discounts), axis=1)
    logged_policy_score = np.nanmean(episode_values)
    if logged_policy_score < 1e-6:
        logger.warning(
            "Can't normalize WSDR-CPE because of small or negative logged_policy_score"
        )
        return CpeEstimate(
            raw=weighted_doubly_robust,
            normalized=0.0,
            raw_std_error=weighted_doubly_robust_std_error,
            normalized_std_error=0.0,
        )
    return CpeEstimate(
        raw=weighted_doubly_robust,
        normalized=weighted_doubly_robust / logged_policy_score,
        raw_std_error=weighted_doubly_robust_std_error,
        normalized_std_error=weighted_doubly_robust_std_error / logged_policy_score,
    )
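# The j-step returns combined above mix importance sampling with the model,
# following the weighted doubly-robust / MAGIC construction of
# https://arxiv.org/pdf/1604.00923.pdf. For a chosen horizon j, each
# trajectory's first steps are weighted by the cumulative importance weight
#
#   w_t = prod_{tau=0..t} pi_target(a_tau | s_tau) / pi_logged(a_tau | s_tau),
#
# computed by the np.cumprod call (optionally self-normalized across
# trajectories), while the remainder of the trajectory is covered by the
# model's value estimates. In this formulation j = float("inf") corresponds to
# the fully importance-sampled return and j = -1 to a purely model-based
# estimate; compute_weighted_doubly_robust_point_estimate then blends the
# different j-step returns. calculate_step_return itself is not shown in this
# section, so the above is a description of intent rather than of its exact
# implementation.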
def _get_importance_sampling_estimates(
    self, isd: ImportanceSamplingData, hp: DoublyRobustHP
) -> Tuple[CpeEstimate, CpeEstimate, CpeEstimate]:
    # The score we would get if we evaluate the logged policy against itself
    logged_policy_score = float(
        torch.mean(isd.logged_rewards)
    )  # logged_rewards is an N*1 tensor of historical rewards
    if logged_policy_score < 1e-6:
        logger.warning(
            "Can't normalize DR-CPE because of small or negative "
            "logged_policy_score"
        )
        normalizer = 0.0
    else:
        normalizer = 1.0 / logged_policy_score

    if isd.model_rewards is None:
        # Fill with zeros, equivalent to just doing IPS
        direct_method_values = torch.zeros(
            [isd.model_propensities.shape[0], 1], dtype=torch.float32
        )
    else:
        # model_rewards is an (N_samples)*N_actions tensor of predicted
        # counterfactual rewards for each possible action at each
        # historical context
        direct_method_values = torch.sum(
            isd.model_propensities * isd.model_rewards, dim=1, keepdim=True
        )

    direct_method_score = float(torch.mean(direct_method_values))
    direct_method_std_error = bootstrapped_std_error_of_mean(
        direct_method_values.squeeze(),
        sample_percent=hp.bootstrap_sample_percent,
        num_samples=hp.bootstrap_num_samples,
    )
    direct_method_estimate = CpeEstimate(
        raw=direct_method_score,
        normalized=direct_method_score * normalizer,
        raw_std_error=direct_method_std_error,
        normalized_std_error=direct_method_std_error * normalizer,
    )

    ips = isd.importance_weight * isd.logged_rewards  # N*1

    doubly_robust = (
        isd.importance_weight
        * (isd.logged_rewards - isd.model_rewards_for_logged_action)
    ) + direct_method_values
    # model_rewards_for_logged_action is an N*1 tensor of estimated rewards
    # for the logged actions under the target policy's reward model

    ips_score = float(torch.mean(ips))
    ips_score_std_error = bootstrapped_std_error_of_mean(
        ips.squeeze(),
        sample_percent=hp.bootstrap_sample_percent,
        num_samples=hp.bootstrap_num_samples,
    )
    inverse_propensity_estimate = CpeEstimate(
        raw=ips_score,
        normalized=ips_score * normalizer,
        raw_std_error=ips_score_std_error,
        normalized_std_error=ips_score_std_error * normalizer,
    )

    dr_score = float(torch.mean(doubly_robust))
    dr_score_std_error = bootstrapped_std_error_of_mean(
        doubly_robust.squeeze(),
        sample_percent=hp.bootstrap_sample_percent,
        num_samples=hp.bootstrap_num_samples,
    )
    doubly_robust_estimate = CpeEstimate(
        raw=dr_score,
        normalized=dr_score * normalizer,
        raw_std_error=dr_score_std_error,
        normalized_std_error=dr_score_std_error * normalizer,
    )

    return (
        direct_method_estimate,
        inverse_propensity_estimate,
        doubly_robust_estimate,
    )
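# The three estimates computed above correspond to the standard contextual-
# bandit off-policy estimators. With importance weight
# w_i = pi_target(a_i | x_i) / pi_logged(a_i | x_i), logged reward r_i, and
# model-predicted reward r_hat(x_i, a):
#
#   DM  = mean_i( sum_a pi_target(a | x_i) * r_hat(x_i, a) )   # direct_method_values
#   IPS = mean_i( w_i * r_i )                                  # ips
#   DR  = mean_i( DM_i + w_i * (r_i - r_hat(x_i, a_i)) )       # doubly_robust
#
# The doubly-robust estimate keeps the low variance of the direct method while
# correcting its bias with the importance-weighted residual; it is consistent
# if either the reward model or the logged propensities are accurate.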