Code example #1
File: ope_adapter.py  Project: vishalbelsare/Horizon
    def estimator_results_to_cpe_estimate(
        estimator_results: EstimatorResults, ) -> CpeEstimate:
        scores = torch.tensor(
            [r.estimated_reward for r in estimator_results.results],
            dtype=torch.double)
        log_scores = torch.tensor(
            [r.log_reward for r in estimator_results.results],
            dtype=torch.double)

        dr_score = float(torch.mean(scores).item())
        dr_score_std_error = bootstrapped_std_error_of_mean(scores)

        log_score = float(torch.mean(log_scores).item())
        if log_score < 1e-6:
            logger.warning("Can't normalize SDR-CPE because of small"
                           f" or negative logged_policy_score ({log_score})."
                           f" Episode values: {log_scores}.")
            return CpeEstimate(
                raw=dr_score,
                normalized=0.0,
                raw_std_error=dr_score_std_error,
                normalized_std_error=0.0,
            )
        return CpeEstimate(
            raw=dr_score,
            normalized=dr_score / log_score,
            raw_std_error=dr_score_std_error,
            normalized_std_error=dr_score_std_error / log_score,
        )
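
Both ope_adapter.py examples rely on a helper named bootstrapped_std_error_of_mean that is defined elsewhere in the project (code example #6 below shows it also accepting sample_percent and num_samples arguments). As a rough sketch of what such a helper does, assuming a plain bootstrap over the per-sample scores (an illustration only, not Horizon's actual implementation; the default argument values here are made up):

import numpy as np


def bootstrapped_std_error_of_mean_sketch(data, sample_percent=0.5, num_samples=1000):
    # data: 1-D array-like of per-sample scores (e.g. a torch double tensor on CPU)
    data = np.asarray(data, dtype=np.float64).ravel()
    subsample_size = max(1, int(len(data) * sample_percent))
    # Resample with replacement, take the mean of each resample, and report the
    # spread of those means as the standard error of the mean.
    resampled_means = [
        float(np.mean(np.random.choice(data, size=subsample_size, replace=True)))
        for _ in range(num_samples)
    ]
    return float(np.std(resampled_means))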
Code example #2
File: ope_adapter.py  Project: vishalbelsare/Horizon
    def estimator_result_to_cpe_estimate(
        result: EstimatorResult) -> CpeEstimate:
        assert result.estimated_reward is not None
        assert result.estimated_reward_normalized is not None
        assert result.estimated_reward_std_error is not None
        assert result.estimated_reward_normalized_std_error is not None
        return CpeEstimate(
            raw=result.estimated_reward,
            normalized=result.estimated_reward_normalized,
            raw_std_error=result.estimated_reward_std_error,
            normalized_std_error=result.estimated_reward_normalized_std_error,
        )
Code example #3
    def estimate(self, edp: EvaluationDataPage) -> CpeEstimate:
        # For details, visit https://arxiv.org/pdf/1511.03722.pdf
        logged_rewards = edp.logged_rewards.squeeze()
        logged_propensities = edp.logged_propensities.squeeze()

        num_examples = edp.logged_rewards.shape[0]

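        # The three reductions below sum over the action dimension:
        #   estimated_state_values:               sum_a pi_model(a|s) * Q(s, a)
        #   estimated_q_values_for_logged_action: Q(s, a_logged)
        #   target_propensity_for_action:         pi_model(a_logged | s)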
        estimated_state_values = torch.sum(edp.model_propensities *
                                           edp.model_values,
                                           dim=1)

        estimated_q_values_for_logged_action = torch.sum(edp.model_values *
                                                         edp.action_mask,
                                                         dim=1)

        target_propensity_for_action = torch.sum(edp.model_propensities *
                                                 edp.action_mask,
                                                 dim=1)

        assert target_propensity_for_action.shape == logged_propensities.shape, (
            "Invalid shape: " + str(target_propensity_for_action.shape) +
            " != " + str(logged_propensities.shape))
        assert (target_propensity_for_action.shape ==
                estimated_q_values_for_logged_action.shape), (
                    "Invalid shape: " +
                    str(target_propensity_for_action.shape) + " != " +
                    str(estimated_q_values_for_logged_action.shape))
        assert target_propensity_for_action.shape == logged_rewards.shape, (
            "Invalid shape: " + str(target_propensity_for_action.shape) +
            " != " + str(logged_rewards.shape))
        importance_weight = target_propensity_for_action / logged_propensities
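        # (importance_weight is the per-step ratio of target to logged propensity
        # for the action that was actually taken)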

        doubly_robusts: List[float] = []
        episode_values: List[float] = []

        assert edp.mdp_id is not None
        i = 0
        last_episode_end = -1
        while i < num_examples:
            # calculate the doubly-robust Q-value for one episode
            if i == num_examples - 1 or edp.mdp_id[i] != edp.mdp_id[i + 1]:
                episode_end = i
                episode_value = 0.0
                doubly_robust = 0.0
                for j in range(episode_end, last_episode_end, -1):
                    doubly_robust = estimated_state_values[
                        j] + importance_weight[j] * (
                            logged_rewards[j] + self.gamma * doubly_robust -
                            estimated_q_values_for_logged_action[j])
                    episode_value *= self.gamma
                    episode_value += logged_rewards[j]
                if episode_value > 1e-6 or episode_value < -1e-6:
                    doubly_robusts.append(float(doubly_robust))
                    episode_values.append(float(episode_value))
                last_episode_end = episode_end
            i += 1

        assert len(doubly_robusts) > 0, (
            f"No valid doubly robust data was generated. "
            f"logged_rewards={logged_rewards}, importance_weight={importance_weight},"
            f" estimated_q_values_for_logged_action={estimated_q_values_for_logged_action},"
            f" estimated_state_values={estimated_state_values}, gamma={self.gamma}."
            f" Did you specify the wrong metric names?")

        doubly_robusts = np.array(doubly_robusts)  # type: ignore
        dr_score = float(np.mean(doubly_robusts))
        dr_score_std_error = bootstrapped_std_error_of_mean(doubly_robusts)

        episode_values = np.array(episode_values)  # type: ignore
        logged_policy_score = np.mean(episode_values)
        if logged_policy_score < 1e-6:
            logger.warning(
                "Can't normalize SDR-CPE because of small or negative logged_policy_score"
            )
            return CpeEstimate(
                raw=dr_score,
                normalized=0.0,
                raw_std_error=dr_score_std_error,
                normalized_std_error=0.0,
            )
        return CpeEstimate(
            raw=dr_score,
            normalized=dr_score / logged_policy_score,
            raw_std_error=dr_score_std_error,
            normalized_std_error=dr_score_std_error / logged_policy_score,
        )
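
A note on the backward loop in code examples #3 and #4: it implements the step-wise doubly robust recursion from the paper linked in the comment (https://arxiv.org/pdf/1511.03722.pdf), which in the notation of the code reads

    doubly_robust(t) = estimated_state_values[t] + importance_weight[t] * (
        logged_rewards[t] + gamma * doubly_robust(t + 1)
        - estimated_q_values_for_logged_action[t])

with doubly_robust initialized to 0 past the end of the episode. The value left in doubly_robust after the loop is the estimate for the first step of the episode, while episode_value accumulates the plain discounted return of the logged rewards; the ratio of their means is what produces the normalized CPE score.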
Code example #4
    def estimate(self, edp: EvaluationDataPage) -> CpeEstimate:
        # For details, visit https://arxiv.org/pdf/1511.03722.pdf
        logged_rewards = edp.logged_rewards.squeeze()
        logged_propensities = edp.logged_propensities.squeeze()

        num_examples = edp.logged_rewards.shape[0]

        estimated_state_values = torch.sum(edp.model_propensities *
                                           edp.model_values,
                                           dim=1)

        estimated_q_values_for_logged_action = torch.sum(edp.model_values *
                                                         edp.action_mask,
                                                         dim=1)

        target_propensity_for_action = torch.sum(edp.model_propensities *
                                                 edp.action_mask,
                                                 dim=1)

        assert target_propensity_for_action.shape == logged_propensities.shape, (
            "Invalid shape: " + str(target_propensity_for_action.shape) +
            " != " + str(logged_propensities.shape))
        assert (target_propensity_for_action.shape ==
                estimated_q_values_for_logged_action.shape), (
                    "Invalid shape: " +
                    str(target_propensity_for_action.shape) + " != " +
                    str(estimated_q_values_for_logged_action.shape))
        assert target_propensity_for_action.shape == logged_rewards.shape, (
            "Invalid shape: " + str(target_propensity_for_action.shape) +
            " != " + str(logged_rewards.shape))
        importance_weight = target_propensity_for_action / logged_propensities

        doubly_robusts: List[float] = []
        episode_values: List[float] = []

        assert edp.mdp_id is not None
        i = 0
        last_episode_end = -1
        while i < num_examples:
            # calculate the doubly-robust Q-value for one episode
            # pyre-ignore [16]: Optional type has no attribute `__getitem__`
            if i == num_examples - 1 or edp.mdp_id[i] != edp.mdp_id[i + 1]:
                episode_end = i
                episode_value = 0.0
                doubly_robust = 0.0
                for j in range(episode_end, last_episode_end, -1):
                    doubly_robust = estimated_state_values[
                        j] + importance_weight[j] * (
                            logged_rewards[j] + self.gamma * doubly_robust -
                            estimated_q_values_for_logged_action[j])
                    episode_value *= self.gamma
                    episode_value += logged_rewards[j]

                doubly_robusts.append(float(doubly_robust))
                episode_values.append(float(episode_value))
                last_episode_end = episode_end
            i += 1

        if len(doubly_robusts) == 0:
            torch.set_printoptions(profile="full")
            zipped_data = list(
                zip(*map(
                    lambda x: x.tolist(),
                    [
                        edp.mdp_id,
                        logged_rewards,
                        estimated_state_values,
                        estimated_q_values_for_logged_action,
                        importance_weight,
                    ],
                )))
            raise RuntimeError(
                f"No valid doubly robust data was generated.\n"
                f"mdp_ids x logged_rewards x estimated_state_values x "
                f"estimated_q_values_for_logged_action x importance_weight:\n"
                f"{zipped_data};\n"
                f"gamma={self.gamma};\n"
                f"Did you specify the wrong metric names?")

        # pyre-fixme[9]: doubly_robusts has type `List[float]`; used as `ndarray`.
        doubly_robusts = np.array(doubly_robusts)
        dr_score = float(np.mean(doubly_robusts))
        dr_score_std_error = bootstrapped_std_error_of_mean(doubly_robusts)

        # pyre-fixme[9]: episode_values has type `List[float]`; used as `ndarray`.
        episode_values = np.array(episode_values)
        logged_policy_score = np.mean(episode_values)
        if logged_policy_score < 1e-6:
            logger.warning(
                "Can't normalize SDR-CPE because of small"
                f" or negative logged_policy_score ({logged_policy_score})."
                f" Episode values: {episode_values}.")
            return CpeEstimate(
                raw=dr_score,
                normalized=0.0,
                raw_std_error=dr_score_std_error,
                normalized_std_error=0.0,
            )
        return CpeEstimate(
            raw=dr_score,
            normalized=dr_score / logged_policy_score,
            raw_std_error=dr_score_std_error,
            normalized_std_error=dr_score_std_error / logged_policy_score,
        )
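
To make the per-episode loop concrete, here is a small standalone toy calculation (made-up numbers, not Horizon code) that mirrors the recursion and the discounted logged return accumulated above:

gamma = 0.9
rewards = [1.0, 0.0]      # logged_rewards for a 2-step episode
rho = [1.2, 0.8]          # importance_weight per step
v_hat = [0.5, 0.4]        # estimated_state_values
q_hat = [0.6, 0.3]        # estimated_q_values_for_logged_action

doubly_robust = 0.0
episode_value = 0.0
for t in reversed(range(len(rewards))):
    doubly_robust = v_hat[t] + rho[t] * (
        rewards[t] + gamma * doubly_robust - q_hat[t])
    episode_value = gamma * episode_value + rewards[t]

print(doubly_robust)   # ~1.1528: doubly robust estimate for this episode
print(episode_value)   # 1.0: discounted return of the logged rewards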
Code example #5
    def estimate(
        self,
        edp: EvaluationDataPage,
        num_j_steps,
        whether_self_normalize_importance_weights,
    ) -> CpeEstimate:
        # For details, visit https://arxiv.org/pdf/1604.00923.pdf Section 5, 7, 8
        assert edp.model_values is not None
        (
            actions,
            rewards,
            logged_propensities,
            target_propensities,
            estimated_q_values,
        ) = WeightedSequentialDoublyRobustEstimator.transform_to_equal_length_trajectories(
            edp.mdp_id,
            edp.action_mask.cpu().numpy(),
            edp.logged_rewards.cpu().numpy().flatten(),
            edp.logged_propensities.cpu().numpy().flatten(),
            edp.model_propensities.cpu().numpy(),
            # pyre-fixme[16]: Optional type has no attribute `cpu`.
            edp.model_values.cpu().numpy(),
        )

        num_trajectories = actions.shape[0]
        trajectory_length = actions.shape[1]

        j_steps = [float("inf")]

        if num_j_steps > 1:
            j_steps.append(-1)
        if num_j_steps > 2:
            interval = trajectory_length // (num_j_steps - 1)
            j_steps.extend([i * interval for i in range(1, num_j_steps - 1)])

        target_propensity_for_logged_action = np.sum(np.multiply(
            target_propensities, actions),
                                                     axis=2)
        estimated_q_values_for_logged_action = np.sum(np.multiply(
            estimated_q_values, actions),
                                                      axis=2)
        estimated_state_values = np.sum(np.multiply(target_propensities,
                                                    estimated_q_values),
                                        axis=2)

        importance_weights = target_propensity_for_logged_action / logged_propensities
        importance_weights = np.cumprod(importance_weights, axis=1)
        importance_weights = WeightedSequentialDoublyRobustEstimator.normalize_importance_weights(
            importance_weights, whether_self_normalize_importance_weights)

        importance_weights_one_earlier = (np.ones([num_trajectories, 1]) *
                                          1.0 / num_trajectories)
        importance_weights_one_earlier = np.hstack(
            [importance_weights_one_earlier, importance_weights[:, :-1]])

        discounts = np.logspace(start=0,
                                stop=trajectory_length - 1,
                                num=trajectory_length,
                                base=self.gamma)

        j_step_return_trajectories = []
        for j_step in j_steps:
            j_step_return_trajectories.append(
                WeightedSequentialDoublyRobustEstimator.calculate_step_return(
                    rewards,
                    discounts,
                    importance_weights,
                    importance_weights_one_earlier,
                    estimated_state_values,
                    estimated_q_values_for_logged_action,
                    j_step,
                ))
        j_step_return_trajectories = np.array(j_step_return_trajectories)

        j_step_returns = np.sum(j_step_return_trajectories, axis=1)

        if len(j_step_returns) == 1:
            weighted_doubly_robust = j_step_returns[0]
            weighted_doubly_robust_std_error = 0.0
        else:
            # break trajectories into several subsets to estimate confidence bounds
            infinite_step_returns = []
            num_subsets = int(
                min(
                    num_trajectories / 2,
                    WeightedSequentialDoublyRobustEstimator.
                    NUM_SUBSETS_FOR_CB_ESTIMATES,
                ))
            interval = num_trajectories / num_subsets
            for i in range(num_subsets):
                trajectory_subset = np.arange(int(i * interval),
                                              int((i + 1) * interval))
                importance_weights = (
                    target_propensity_for_logged_action[trajectory_subset] /
                    logged_propensities[trajectory_subset])
                importance_weights = np.cumprod(importance_weights, axis=1)
                importance_weights = WeightedSequentialDoublyRobustEstimator.normalize_importance_weights(
                    importance_weights,
                    whether_self_normalize_importance_weights)
                importance_weights_one_earlier = (
                    np.ones([len(trajectory_subset), 1]) * 1.0 /
                    len(trajectory_subset))
                importance_weights_one_earlier = np.hstack([
                    importance_weights_one_earlier, importance_weights[:, :-1]
                ])
                infinite_step_return = np.sum(
                    WeightedSequentialDoublyRobustEstimator.
                    calculate_step_return(
                        rewards[trajectory_subset],
                        discounts,
                        importance_weights,
                        importance_weights_one_earlier,
                        estimated_state_values[trajectory_subset],
                        estimated_q_values_for_logged_action[
                            trajectory_subset],
                        float("inf"),
                    ))
                infinite_step_returns.append(infinite_step_return)

            # Compute weighted_doubly_robust mean point estimate using all data
            weighted_doubly_robust = self.compute_weighted_doubly_robust_point_estimate(
                j_steps,
                num_j_steps,
                j_step_returns,
                infinite_step_returns,
                j_step_return_trajectories,
            )

            # Use bootstrapping to compute weighted_doubly_robust standard error
            bootstrapped_means = []
            sample_size = int(
                WeightedSequentialDoublyRobustEstimator.BOOTSTRAP_SAMPLE_PCT *
                num_subsets)
            for _ in range(WeightedSequentialDoublyRobustEstimator.
                           NUM_BOOTSTRAP_SAMPLES):
                random_idxs = np.random.choice(num_j_steps,
                                               sample_size,
                                               replace=False)
                random_idxs.sort()
                wdr_estimate = self.compute_weighted_doubly_robust_point_estimate(
                    j_steps=[j_steps[i] for i in random_idxs],
                    num_j_steps=sample_size,
                    j_step_returns=j_step_returns[random_idxs],
                    infinite_step_returns=infinite_step_returns,
                    j_step_return_trajectories=j_step_return_trajectories[
                        random_idxs],
                )
                bootstrapped_means.append(wdr_estimate)
            weighted_doubly_robust_std_error = np.std(bootstrapped_means)

        episode_values = np.sum(np.multiply(rewards, discounts), axis=1)
        logged_policy_score = np.nanmean(episode_values)
        if logged_policy_score < 1e-6:
            logger.warning(
                "Can't normalize WSDR-CPE because of small or negative logged_policy_score"
            )
            return CpeEstimate(
                raw=weighted_doubly_robust,
                normalized=0.0,
                raw_std_error=weighted_doubly_robust_std_error,
                normalized_std_error=0.0,
            )

        return CpeEstimate(
            raw=weighted_doubly_robust,
            normalized=weighted_doubly_robust / logged_policy_score,
            raw_std_error=weighted_doubly_robust_std_error,
            normalized_std_error=weighted_doubly_robust_std_error /
            logged_policy_score,
        )
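
For orientation, a small standalone sketch (toy numbers, not Horizon code) of what the j_steps list and the discounts vector built above look like; per the weighted doubly robust construction in the linked paper (https://arxiv.org/pdf/1604.00923.pdf), a j-step return uses importance sampling for the first j steps and the model afterwards, with inf standing for the fully importance-sampled return and -1 for the purely model-based one:

import numpy as np

gamma, num_j_steps, trajectory_length = 0.9, 5, 20

j_steps = [float("inf")]
if num_j_steps > 1:
    j_steps.append(-1)
if num_j_steps > 2:
    interval = trajectory_length // (num_j_steps - 1)
    j_steps.extend(i * interval for i in range(1, num_j_steps - 1))
print(j_steps)        # [inf, -1, 5, 10, 15]

discounts = np.logspace(start=0, stop=trajectory_length - 1,
                        num=trajectory_length, base=gamma)
print(discounts[:3])  # [1.0, 0.9, 0.81], i.e. gamma ** t for t = 0 .. T-1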
Code example #6
    def _get_importance_sampling_estimates(
            self, isd: ImportanceSamplingData, hp: DoublyRobustHP
    ) -> Tuple[CpeEstimate, CpeEstimate, CpeEstimate]:
        # The score we would get if we evaluate the logged policy against itself
        # logged_rewards is an N*1 tensor of historical rewards
        logged_policy_score = float(torch.mean(isd.logged_rewards))
        if logged_policy_score < 1e-6:
            logger.warning(
                "Can't normalize DR-CPE because of small or negative " +
                "logged_policy_score")
            normalizer = 0.0
        else:
            normalizer = 1.0 / logged_policy_score

        if isd.model_rewards is None:
            # Fill with zero, equivalent to just doing IPS
            direct_method_values = torch.zeros(
                [isd.model_propensities.shape[0], 1], dtype=torch.float32)
        else:
            # model rewards is (N_samples)*N_actions tensor of predicted
            # counterfactual rewards for each possible action at each
            # historical context
            direct_method_values = torch.sum(isd.model_propensities *
                                             isd.model_rewards,
                                             dim=1,
                                             keepdim=True)

        direct_method_score = float(torch.mean(direct_method_values))
        direct_method_std_error = bootstrapped_std_error_of_mean(
            direct_method_values.squeeze(),
            sample_percent=hp.bootstrap_sample_percent,
            num_samples=hp.bootstrap_num_samples,
        )
        direct_method_estimate = CpeEstimate(
            raw=direct_method_score,
            normalized=direct_method_score * normalizer,
            raw_std_error=direct_method_std_error,
            normalized_std_error=direct_method_std_error * normalizer,
        )

        ips = isd.importance_weight * isd.logged_rewards  # N*1

        # model_rewards_for_logged_action is an N*1 tensor of estimated rewards
        # for the logged actions
        doubly_robust = (
            isd.importance_weight *
            (isd.logged_rewards - isd.model_rewards_for_logged_action)
        ) + direct_method_values

        ips_score = float(torch.mean(ips))
        ips_score_std_error = bootstrapped_std_error_of_mean(
            ips.squeeze(),
            sample_percent=hp.bootstrap_sample_percent,
            num_samples=hp.bootstrap_num_samples,
        )
        inverse_propensity_estimate = CpeEstimate(
            raw=ips_score,
            normalized=ips_score * normalizer,
            raw_std_error=ips_score_std_error,
            normalized_std_error=ips_score_std_error * normalizer,
        )

        dr_score = float(torch.mean(doubly_robust))
        dr_score_std_error = bootstrapped_std_error_of_mean(
            doubly_robust.squeeze(),
            sample_percent=hp.bootstrap_sample_percent,
            num_samples=hp.bootstrap_num_samples,
        )
        doubly_robust_estimate = CpeEstimate(
            raw=dr_score,
            normalized=dr_score * normalizer,
            raw_std_error=dr_score_std_error,
            normalized_std_error=dr_score_std_error * normalizer,
        )

        return (
            direct_method_estimate,
            inverse_propensity_estimate,
            doubly_robust_estimate,
        )
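
To make the three estimates concrete, here is a small standalone toy calculation (made-up numbers, not Horizon code) that reproduces the direct method, inverse propensity, and doubly robust point estimates computed above:

import torch

importance_weight = torch.tensor([[2.0], [0.5]])   # target / logged propensity
logged_rewards = torch.tensor([[1.0], [0.0]])
model_propensities = torch.tensor([[0.5, 0.5], [0.2, 0.8]])
model_rewards = torch.tensor([[0.6, 0.2], [0.1, 0.3]])
model_rewards_for_logged_action = torch.tensor([[0.6], [0.3]])

direct_method_values = torch.sum(
    model_propensities * model_rewards, dim=1, keepdim=True)   # [[0.40], [0.26]]
ips = importance_weight * logged_rewards                       # [[2.00], [0.00]]
doubly_robust = importance_weight * (
    logged_rewards - model_rewards_for_logged_action) + direct_method_values
# doubly_robust is [[1.20], [0.11]]

print(float(torch.mean(direct_method_values)))  # ~0.33  (direct method raw score)
print(float(torch.mean(ips)))                   # 1.0    (IPS raw score)
print(float(torch.mean(doubly_robust)))         # ~0.655 (doubly robust raw score)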