def _get_importance_sampling_estimates(
    self, isd: ImportanceSamplingData, hp: DoublyRobustHP
) -> Tuple[CpeEstimate, CpeEstimate, CpeEstimate]:
    # The score we would get if we evaluate the logged policy against itself
    logged_policy_score = float(
        torch.mean(isd.logged_rewards)
    )  # logged_rewards is an N*1 tensor of historical rewards
    if logged_policy_score < 1e-6:
        logger.warning(
            "Can't normalize DR-CPE because of small or negative "
            + "logged_policy_score"
        )
        normalizer = 0.0
    else:
        normalizer = 1.0 / logged_policy_score

    if isd.model_rewards is None:
        # Fill with zero, equivalent to just doing IPS
        direct_method_values = torch.zeros(
            [isd.model_propensities.shape[0], 1], dtype=torch.float32
        )
    else:
        # model_rewards is a (N_samples)*N*N_actions tensor of predicted
        # counterfactual rewards for each possible action at each
        # historical context
        direct_method_values = torch.sum(
            isd.model_propensities * isd.model_rewards, dim=1, keepdim=True
        )

    direct_method_score = float(torch.mean(direct_method_values))
    direct_method_std_error = bootstrapped_std_error_of_mean(
        direct_method_values.squeeze(),
        sample_percent=hp.bootstrap_sample_percent,
        num_samples=hp.bootstrap_num_samples,
    )
    direct_method_estimate = CpeEstimate(
        raw=direct_method_score,
        normalized=direct_method_score * normalizer,
        raw_std_error=direct_method_std_error,
        normalized_std_error=direct_method_std_error * normalizer,
    )

    ips = isd.importance_weight * isd.logged_rewards  # N*1

    # model_rewards_for_logged_action is an N*1 tensor of the model's
    # estimated rewards for the actions that were actually logged
    doubly_robust = (
        isd.importance_weight
        * (isd.logged_rewards - isd.model_rewards_for_logged_action)
    ) + direct_method_values

    ips_score = float(torch.mean(ips))
    ips_score_std_error = bootstrapped_std_error_of_mean(
        ips.squeeze(),
        sample_percent=hp.bootstrap_sample_percent,
        num_samples=hp.bootstrap_num_samples,
    )
    inverse_propensity_estimate = CpeEstimate(
        raw=ips_score,
        normalized=ips_score * normalizer,
        raw_std_error=ips_score_std_error,
        normalized_std_error=ips_score_std_error * normalizer,
    )

    dr_score = float(torch.mean(doubly_robust))
    dr_score_std_error = bootstrapped_std_error_of_mean(
        doubly_robust.squeeze(),
        sample_percent=hp.bootstrap_sample_percent,
        num_samples=hp.bootstrap_num_samples,
    )
    doubly_robust_estimate = CpeEstimate(
        raw=dr_score,
        normalized=dr_score * normalizer,
        raw_std_error=dr_score_std_error,
        normalized_std_error=dr_score_std_error * normalizer,
    )

    return (
        direct_method_estimate,
        inverse_propensity_estimate,
        doubly_robust_estimate,
    )
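# --- Illustrative sketch (not part of the original module) -----------------
# A minimal, self-contained example of the doubly-robust combination computed
# above: DR = DM + w * (r - q_logged), where DM is the direct-method value
# E_{a~pi}[Q(s, a)], w is the importance weight pi(a|s)/mu(a|s), r is the
# logged reward, and q_logged is the model's reward for the logged action.
# All tensor names below are hypothetical toy inputs, not fields of
# ImportanceSamplingData.
import torch

r = torch.tensor([[1.0], [0.0], [1.0]])          # logged rewards, N*1
w = torch.tensor([[2.0], [0.5], [1.0]])          # importance weights, N*1
q_logged = torch.tensor([[0.8], [0.2], [0.6]])   # model reward of logged action
dm = torch.tensor([[0.7], [0.3], [0.5]])         # direct-method values, N*1

ips = w * r                                      # plain IPS term
dr = dm + w * (r - q_logged)                     # doubly-robust term
print(float(ips.mean()), float(dr.mean()))       # raw IPS and DR point estimates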
def estimate(self, edp: EvaluationDataPage) -> CpeEstimate:
    # For details, visit https://arxiv.org/pdf/1511.03722.pdf
    logged_rewards = edp.logged_rewards.squeeze()
    logged_propensities = edp.logged_propensities.squeeze()

    num_examples = edp.logged_rewards.shape[0]

    estimated_state_values = torch.sum(
        edp.model_propensities * edp.model_values, dim=1
    )
    estimated_q_values_for_logged_action = torch.sum(
        edp.model_values * edp.action_mask, dim=1
    )
    target_propensity_for_action = torch.sum(
        edp.model_propensities * edp.action_mask, dim=1
    )

    assert target_propensity_for_action.shape == logged_propensities.shape, (
        "Invalid shape: "
        + str(target_propensity_for_action.shape)
        + " != "
        + str(logged_propensities.shape)
    )
    assert (
        target_propensity_for_action.shape
        == estimated_q_values_for_logged_action.shape
    ), (
        "Invalid shape: "
        + str(target_propensity_for_action.shape)
        + " != "
        + str(estimated_q_values_for_logged_action.shape)
    )
    assert target_propensity_for_action.shape == logged_rewards.shape, (
        "Invalid shape: "
        + str(target_propensity_for_action.shape)
        + " != "
        + str(logged_rewards.shape)
    )
    importance_weight = target_propensity_for_action / logged_propensities

    doubly_robusts: List[float] = []
    episode_values: List[float] = []

    i = 0
    last_episode_end = -1
    while i < num_examples:
        # calculate the doubly-robust Q-value for one episode
        if i == num_examples - 1 or edp.mdp_id[i] != edp.mdp_id[i + 1]:
            episode_end = i
            episode_value = 0.0
            doubly_robust = 0.0
            for j in range(episode_end, last_episode_end, -1):
                doubly_robust = estimated_state_values[j] + importance_weight[j] * (
                    logged_rewards[j]
                    + self.gamma * doubly_robust
                    - estimated_q_values_for_logged_action[j]
                )
                episode_value *= self.gamma
                episode_value += logged_rewards[j]
            if episode_value > 1e-6 or episode_value < -1e-6:
                doubly_robusts.append(float(doubly_robust))
                episode_values.append(float(episode_value))
            last_episode_end = episode_end
        i += 1

    doubly_robusts = np.array(doubly_robusts)
    dr_score = float(np.mean(doubly_robusts))
    dr_score_std_error = bootstrapped_std_error_of_mean(doubly_robusts)

    episode_values = np.array(episode_values)
    logged_policy_score = np.mean(episode_values)
    if logged_policy_score < 1e-6:
        logger.warning(
            "Can't normalize SDR-CPE because of small or negative logged_policy_score"
        )
        return CpeEstimate(
            raw=dr_score,
            normalized=0.0,
            raw_std_error=dr_score_std_error,
            normalized_std_error=0.0,
        )
    return CpeEstimate(
        raw=dr_score,
        normalized=dr_score / logged_policy_score,
        raw_std_error=dr_score_std_error,
        normalized_std_error=dr_score_std_error / logged_policy_score,
    )
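# --- Illustrative sketch (not part of the original module) -----------------
# The backward loop above implements the recursive doubly-robust estimator of
# https://arxiv.org/abs/1511.03722:
#   V_DR(t) = V_hat(s_t) + rho_t * (r_t + gamma * V_DR(t+1) - Q_hat(s_t, a_t)),
# evaluated from the last step of an episode back to the first. The lists
# below are hypothetical per-step values for a single 3-step toy episode.
gamma = 0.9
rho = [1.2, 0.8, 1.5]        # per-step importance weights pi/mu
r = [0.0, 1.0, 0.5]          # logged rewards
v_hat = [0.6, 0.7, 0.4]      # estimated state values E_{a~pi}[Q_hat(s, a)]
q_hat = [0.5, 0.9, 0.3]      # estimated Q-values of the logged actions

v_dr = 0.0
for t in reversed(range(len(r))):
    v_dr = v_hat[t] + rho[t] * (r[t] + gamma * v_dr - q_hat[t])
print(v_dr)                  # doubly-robust estimate of the episode's value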
def estimate(
    self,
    edp: EvaluationDataPage,
    num_j_steps,
    whether_self_normalize_importance_weights,
) -> CpeEstimate:
    # For details, visit https://arxiv.org/pdf/1604.00923.pdf Section 5, 7, 8
    (
        actions,
        rewards,
        logged_propensities,
        target_propensities,
        estimated_q_values,
    ) = WeightedSequentialDoublyRobustEstimator.transform_to_equal_length_trajectories(
        edp.mdp_id,
        edp.action_mask.cpu().numpy(),
        edp.logged_rewards.cpu().numpy().flatten(),
        edp.logged_propensities.cpu().numpy().flatten(),
        edp.model_propensities.cpu().numpy(),
        edp.model_values.cpu().numpy(),
    )

    num_trajectories = actions.shape[0]
    trajectory_length = actions.shape[1]

    j_steps = [float("inf")]
    if num_j_steps > 1:
        j_steps.append(-1)
    if num_j_steps > 2:
        interval = trajectory_length // (num_j_steps - 1)
        j_steps.extend([i * interval for i in range(1, num_j_steps - 1)])

    target_propensity_for_logged_action = np.sum(
        np.multiply(target_propensities, actions), axis=2
    )
    estimated_q_values_for_logged_action = np.sum(
        np.multiply(estimated_q_values, actions), axis=2
    )
    estimated_state_values = np.sum(
        np.multiply(target_propensities, estimated_q_values), axis=2
    )

    importance_weights = target_propensity_for_logged_action / logged_propensities
    importance_weights = np.cumprod(importance_weights, axis=1)
    importance_weights = WeightedSequentialDoublyRobustEstimator.normalize_importance_weights(
        importance_weights, whether_self_normalize_importance_weights
    )

    importance_weights_one_earlier = (
        np.ones([num_trajectories, 1]) * 1.0 / num_trajectories
    )
    importance_weights_one_earlier = np.hstack(
        [importance_weights_one_earlier, importance_weights[:, :-1]]
    )

    discounts = np.logspace(
        start=0, stop=trajectory_length - 1, num=trajectory_length, base=self.gamma
    )

    j_step_return_trajectories = []
    for j_step in j_steps:
        j_step_return_trajectories.append(
            WeightedSequentialDoublyRobustEstimator.calculate_step_return(
                rewards,
                discounts,
                importance_weights,
                importance_weights_one_earlier,
                estimated_state_values,
                estimated_q_values_for_logged_action,
                j_step,
            )
        )
    j_step_return_trajectories = np.array(  # type: ignore
        j_step_return_trajectories
    )

    j_step_returns = np.sum(j_step_return_trajectories, axis=1)

    if len(j_step_returns) == 1:
        weighted_doubly_robust = j_step_returns[0]
        weighted_doubly_robust_std_error = 0.0
    else:
        # break trajectories into several subsets to estimate confidence bounds
        infinite_step_returns = []
        num_subsets = int(
            min(
                num_trajectories / 2,
                WeightedSequentialDoublyRobustEstimator.NUM_SUBSETS_FOR_CB_ESTIMATES,
            )
        )
        interval = num_trajectories / num_subsets
        for i in range(num_subsets):
            trajectory_subset = np.arange(
                int(i * interval), int((i + 1) * interval)
            )
            importance_weights = (
                target_propensity_for_logged_action[trajectory_subset]
                / logged_propensities[trajectory_subset]
            )
            importance_weights = np.cumprod(importance_weights, axis=1)
            importance_weights = WeightedSequentialDoublyRobustEstimator.normalize_importance_weights(
                importance_weights, whether_self_normalize_importance_weights
            )
            importance_weights_one_earlier = (
                np.ones([len(trajectory_subset), 1]) * 1.0 / len(trajectory_subset)
            )
            importance_weights_one_earlier = np.hstack(
                [importance_weights_one_earlier, importance_weights[:, :-1]]
            )
            infinite_step_return = np.sum(
                WeightedSequentialDoublyRobustEstimator.calculate_step_return(
                    rewards[trajectory_subset],
                    discounts,
                    importance_weights,
                    importance_weights_one_earlier,
                    estimated_state_values[trajectory_subset],
                    estimated_q_values_for_logged_action[trajectory_subset],
                    float("inf"),
                )
            )
            infinite_step_returns.append(infinite_step_return)

        # Compute the weighted_doubly_robust point estimate using all data
        weighted_doubly_robust = self.compute_weighted_doubly_robust_point_estimate(
            j_steps,
            num_j_steps,
            j_step_returns,
            infinite_step_returns,
            j_step_return_trajectories,
        )

        # Use bootstrapping to compute the weighted_doubly_robust standard error
        bootstrapped_means = []
        sample_size = int(
            WeightedSequentialDoublyRobustEstimator.BOOTSTRAP_SAMPLE_PCT * num_subsets
        )
        for _ in range(
            WeightedSequentialDoublyRobustEstimator.NUM_BOOTSTRAP_SAMPLES
        ):
            random_idxs = np.random.choice(num_j_steps, sample_size, replace=False)
            random_idxs.sort()

            wdr_estimate = self.compute_weighted_doubly_robust_point_estimate(
                j_steps=[j_steps[i] for i in random_idxs],
                num_j_steps=sample_size,
                j_step_returns=j_step_returns[random_idxs],
                infinite_step_returns=infinite_step_returns,
                j_step_return_trajectories=j_step_return_trajectories[  # type: ignore
                    random_idxs
                ],
            )
            bootstrapped_means.append(wdr_estimate)
        weighted_doubly_robust_std_error = np.std(bootstrapped_means)

    episode_values = np.sum(np.multiply(rewards, discounts), axis=1)
    logged_policy_score = np.nanmean(episode_values)
    if logged_policy_score < 1e-6:
        logger.warning(
            "Can't normalize WSDR-CPE because of small or negative logged_policy_score"
        )
        return CpeEstimate(
            raw=weighted_doubly_robust,
            normalized=0.0,
            raw_std_error=weighted_doubly_robust_std_error,
            normalized_std_error=0.0,
        )
    return CpeEstimate(
        raw=weighted_doubly_robust,
        normalized=weighted_doubly_robust / logged_policy_score,
        raw_std_error=weighted_doubly_robust_std_error,
        normalized_std_error=weighted_doubly_robust_std_error / logged_policy_score,
    )
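# --- Illustrative sketch (not part of the original module) -----------------
# The per-step weights used above are cumulative products of the step-wise
# ratios pi(a_t|s_t)/mu(a_t|s_t), optionally self-normalized across
# trajectories at each time step (the "weighted" part of WSDR). One simple
# convention is shown here (dividing each column by its sum); the library's
# normalize_importance_weights helper may use a different scaling. The arrays
# below are hypothetical toy trajectories.
import numpy as np

target_p = np.array([[0.9, 0.5, 0.7], [0.4, 0.8, 0.6]])   # pi(a_t|s_t)
logged_p = np.array([[0.6, 0.5, 0.9], [0.5, 0.4, 0.8]])   # mu(a_t|s_t)

w = np.cumprod(target_p / logged_p, axis=1)               # shape: (num_traj, T)
w_self_normalized = w / w.sum(axis=0, keepdims=True)      # each column sums to 1
print(w_self_normalized.sum(axis=0))                      # -> [1. 1. 1.]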
def estimate(
    self,
    edp: EvaluationDataPage,
    num_j_steps,
    whether_self_normalize_importance_weights,
) -> CpeEstimate:
    # For details, visit https://arxiv.org/pdf/1604.00923.pdf Section 5, 7, 8
    (
        actions,
        rewards,
        logged_propensities,
        target_propensities,
        estimated_q_values,
    ) = WeightedSequentialDoublyRobustEstimator.transform_to_equal_length_trajectories(
        edp.mdp_id,
        edp.action_mask.cpu().numpy(),
        edp.logged_rewards.cpu().numpy().flatten(),
        edp.logged_propensities.cpu().numpy().flatten(),
        edp.model_propensities.cpu().numpy(),
        edp.model_values.cpu().numpy(),
    )

    num_trajectories = actions.shape[0]
    trajectory_length = actions.shape[1]

    j_steps = [float("inf")]
    if num_j_steps > 1:
        j_steps.append(-1)
    if num_j_steps > 2:
        interval = trajectory_length // (num_j_steps - 1)
        j_steps.extend([i * interval for i in range(1, num_j_steps - 1)])

    target_propensity_for_logged_action = np.sum(
        np.multiply(target_propensities, actions), axis=2
    )
    estimated_q_values_for_logged_action = np.sum(
        np.multiply(estimated_q_values, actions), axis=2
    )
    estimated_state_values = np.sum(
        np.multiply(target_propensities, estimated_q_values), axis=2
    )

    importance_weights = target_propensity_for_logged_action / logged_propensities
    importance_weights = np.cumprod(importance_weights, axis=1)
    importance_weights = WeightedSequentialDoublyRobustEstimator.normalize_importance_weights(
        importance_weights, whether_self_normalize_importance_weights
    )

    importance_weights_one_earlier = (
        np.ones([num_trajectories, 1]) * 1.0 / num_trajectories
    )
    importance_weights_one_earlier = np.hstack(
        [importance_weights_one_earlier, importance_weights[:, :-1]]
    )

    discounts = np.logspace(
        start=0, stop=trajectory_length - 1, num=trajectory_length, base=self.gamma
    )

    j_step_return_trajectories = []
    for j_step in j_steps:
        j_step_return_trajectories.append(
            WeightedSequentialDoublyRobustEstimator.calculate_step_return(
                rewards,
                discounts,
                importance_weights,
                importance_weights_one_earlier,
                estimated_state_values,
                estimated_q_values_for_logged_action,
                j_step,
            )
        )
    j_step_return_trajectories = np.array(j_step_return_trajectories)

    j_step_returns = np.sum(j_step_return_trajectories, axis=1)

    if len(j_step_returns) == 1:
        weighted_doubly_robust = j_step_returns[0]
    else:
        # break trajectories into several subsets to estimate confidence bounds
        infinite_step_returns = []
        num_subsets = int(
            min(
                num_trajectories / 2,
                WeightedSequentialDoublyRobustEstimator.NUM_SUBSETS_FOR_CB_ESTIMATES,
            )
        )
        interval = num_trajectories / num_subsets
        for i in range(num_subsets):
            trajectory_subset = np.arange(
                int(i * interval), int((i + 1) * interval)
            )
            importance_weights = (
                target_propensity_for_logged_action[trajectory_subset]
                / logged_propensities[trajectory_subset]
            )
            importance_weights = np.cumprod(importance_weights, axis=1)
            importance_weights = WeightedSequentialDoublyRobustEstimator.normalize_importance_weights(
                importance_weights, whether_self_normalize_importance_weights
            )
            importance_weights_one_earlier = (
                np.ones([len(trajectory_subset), 1]) * 1.0 / len(trajectory_subset)
            )
            importance_weights_one_earlier = np.hstack(
                [importance_weights_one_earlier, importance_weights[:, :-1]]
            )
            infinite_step_return = np.sum(
                WeightedSequentialDoublyRobustEstimator.calculate_step_return(
                    rewards[trajectory_subset],
                    discounts,
                    importance_weights,
                    importance_weights_one_earlier,
                    estimated_state_values[trajectory_subset],
                    estimated_q_values_for_logged_action[trajectory_subset],
                    float("inf"),
                )
            )
            infinite_step_returns.append(infinite_step_return)

        low_bound, high_bound = WeightedSequentialDoublyRobustEstimator.confidence_bounds(
            infinite_step_returns,
            WeightedSequentialDoublyRobustEstimator.CONFIDENCE_INTERVAL,
        )

        # decompose error into bias + variance
        j_step_bias = np.zeros([num_j_steps])
        where_lower = np.where(j_step_returns < low_bound)[0]
        j_step_bias[where_lower] = low_bound - j_step_returns[where_lower]
        where_higher = np.where(j_step_returns > high_bound)[0]
        j_step_bias[where_higher] = j_step_returns[where_higher] - high_bound

        covariance = np.cov(j_step_return_trajectories)
        error = covariance + j_step_bias.T * j_step_bias

        # minimize mse error
        def mse_loss(x, error):
            return np.dot(np.dot(x, error), x.T)

        constraint = {"type": "eq", "fun": lambda x: np.sum(x) - 1.0}
        x = np.zeros([len(j_steps)])
        res = sp.optimize.minimize(
            mse_loss,
            x,
            args=error,
            constraints=constraint,
            bounds=[(0, 1) for _ in range(x.shape[0])],
        )
        x = np.array(res.x)

        weighted_doubly_robust = float(np.dot(x, j_step_returns))

    episode_values = np.sum(np.multiply(rewards, discounts), axis=1)
    denominator = np.nanmean(episode_values)
    if abs(denominator) < 1e-6:
        return CpeEstimate(raw=0.0, normalized=0.0)
    return CpeEstimate(
        raw=weighted_doubly_robust, normalized=weighted_doubly_robust / denominator
    )
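# --- Illustrative sketch (not part of the original module) -----------------
# The optimization above is the MAGIC-style blending step of
# https://arxiv.org/abs/1604.00923: find weights x on the probability simplex
# (x >= 0, sum(x) = 1) that minimize x^T Omega x, where Omega combines the
# covariance and bias of the j-step returns; the blended estimate is then
# x . j_step_returns. The matrix and returns below are small hypothetical
# values, and the starting point is uniform rather than zero.
import numpy as np
import scipy.optimize

omega = np.array([[2.0, 0.3], [0.3, 0.5]])   # hypothetical error matrix
j_step_returns = np.array([1.0, 1.4])        # hypothetical j-step estimates

def mse_loss(x, error):
    return float(np.dot(np.dot(x, error), x.T))

res = scipy.optimize.minimize(
    mse_loss,
    np.full(2, 0.5),                          # uniform starting weights
    args=(omega,),
    constraints={"type": "eq", "fun": lambda x: np.sum(x) - 1.0},
    bounds=[(0, 1), (0, 1)],
)
blend = np.array(res.x)
print(blend, float(np.dot(blend, j_step_returns)))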
def estimate(
    self, edp: EvaluationDataPage
) -> Tuple[CpeEstimate, CpeEstimate, CpeEstimate]:
    # The score we would get if we evaluate the logged policy against itself
    logged_policy_score = float(torch.mean(edp.logged_rewards))
    if logged_policy_score < 1e-6:
        logger.warning(
            "Can't normalize DR-CPE because of small or negative logged_policy_score"
        )
        normalizer = 0.0
    else:
        normalizer = 1.0 / logged_policy_score

    # For details, visit https://arxiv.org/pdf/1612.01205.pdf
    num_examples = edp.model_propensities.shape[0]

    if edp.model_rewards is None:
        # Fill with zero, equivalent to just doing IPS
        model_rewards = torch.zeros(edp.model_propensities.shape).float()
        direct_method_values = torch.zeros([num_examples, 1], dtype=torch.float32)
    else:
        model_rewards = edp.model_rewards
        direct_method_values = torch.sum(
            edp.model_propensities * model_rewards, dim=1, keepdim=True
        )

    direct_method_score = float(torch.mean(direct_method_values))
    direct_method_std_error = bootstrapped_std_error_of_mean(
        direct_method_values.squeeze()
    )
    direct_method_estimate = CpeEstimate(
        raw=direct_method_score,
        normalized=direct_method_score * normalizer,
        raw_std_error=direct_method_std_error,
        normalized_std_error=direct_method_std_error * normalizer,
    )

    target_propensity_for_action = torch.sum(
        edp.model_propensities * edp.action_mask, dim=1, keepdim=True
    )

    importance_weight = (
        target_propensity_for_action / edp.logged_propensities
    ).float()

    ips = importance_weight * edp.logged_rewards
    doubly_robust = (
        importance_weight
        * (edp.logged_rewards - edp.model_rewards_for_logged_action)
    ) + direct_method_values

    ips_score = float(torch.mean(ips))
    ips_score_std_error = bootstrapped_std_error_of_mean(ips.squeeze())
    inverse_propensity_estimate = CpeEstimate(
        raw=ips_score,
        normalized=ips_score * normalizer,
        raw_std_error=ips_score_std_error,
        normalized_std_error=ips_score_std_error * normalizer,
    )

    dr_score = float(torch.mean(doubly_robust))
    dr_score_std_error = bootstrapped_std_error_of_mean(doubly_robust.squeeze())
    doubly_robust_estimate = CpeEstimate(
        raw=dr_score,
        normalized=dr_score * normalizer,
        raw_std_error=dr_score_std_error,
        normalized_std_error=dr_score_std_error * normalizer,
    )

    return (
        direct_method_estimate,
        inverse_propensity_estimate,
        doubly_robust_estimate,
    )
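# --- Illustrative sketch (not part of the original module) -----------------
# bootstrapped_std_error_of_mean is called throughout but its body is not
# shown here. A minimal version, assuming it resamples the data with
# replacement, takes the mean of each resample, and reports the standard
# deviation of those means. The function name carries a "_sketch" suffix to
# make clear this is an illustration, not the library's implementation.
import numpy as np

def bootstrapped_std_error_of_mean_sketch(data, sample_percent=0.5, num_samples=1000):
    data = np.asarray(data, dtype=float)
    sample_size = max(1, int(sample_percent * len(data)))
    means = [
        np.mean(np.random.choice(data, size=sample_size, replace=True))
        for _ in range(num_samples)
    ]
    return float(np.std(means))

print(bootstrapped_std_error_of_mean_sketch(np.random.randn(200)))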