Example #1
    def _get_importance_sampling_estimates(
        self, isd: ImportanceSamplingData, hp: DoublyRobustHP
    ) -> Tuple[CpeEstimate, CpeEstimate, CpeEstimate]:
        # The score we would get if we evaluate the logged policy against itself
        logged_policy_score = float(
            torch.mean(isd.logged_rewards)
        )  # logged_rewards is N*1 tensor of historical rewards
        if logged_policy_score < 1e-6:
            logger.warning(
                "Can't normalize DR-CPE because of small or negative "
                + "logged_policy_score"
            )
            normalizer = 0.0
        else:
            normalizer = 1.0 / logged_policy_score

        if isd.model_rewards is None:
            # Fill with zero, equivalent to just doing IPS
            direct_method_values = torch.zeros(
                [isd.model_propensities.shape[0], 1], dtype=torch.float32
            )
        else:
            # model_rewards is an N*N_actions tensor of predicted counterfactual
            # rewards for each possible action at each historical context
            direct_method_values = torch.sum(
                isd.model_propensities * isd.model_rewards, dim=1, keepdim=True
            )

        direct_method_score = float(torch.mean(direct_method_values))
        direct_method_std_error = bootstrapped_std_error_of_mean(
            direct_method_values.squeeze(),
            sample_percent=hp.bootstrap_sample_percent,
            num_samples=hp.bootstrap_num_samples,
        )
        direct_method_estimate = CpeEstimate(
            raw=direct_method_score,
            normalized=direct_method_score * normalizer,
            raw_std_error=direct_method_std_error,
            normalized_std_error=direct_method_std_error * normalizer,
        )

        ips = isd.importance_weight * isd.logged_rewards  # N*1

        # model_rewards_for_logged_action is an N*1 tensor of the model's
        # estimated reward for the action that was actually logged
        doubly_robust = (
            isd.importance_weight
            * (isd.logged_rewards - isd.model_rewards_for_logged_action)
        ) + direct_method_values

        ips_score = float(torch.mean(ips))
        ips_score_std_error = bootstrapped_std_error_of_mean(
            ips.squeeze(),
            sample_percent=hp.bootstrap_sample_percent,
            num_samples=hp.bootstrap_num_samples,
        )
        inverse_propensity_estimate = CpeEstimate(
            raw=ips_score,
            normalized=ips_score * normalizer,
            raw_std_error=ips_score_std_error,
            normalized_std_error=ips_score_std_error * normalizer,
        )

        dr_score = float(torch.mean(doubly_robust))
        dr_score_std_error = bootstrapped_std_error_of_mean(
            doubly_robust.squeeze(),
            sample_percent=hp.bootstrap_sample_percent,
            num_samples=hp.bootstrap_num_samples,
        )
        doubly_robust_estimate = CpeEstimate(
            raw=dr_score,
            normalized=dr_score * normalizer,
            raw_std_error=dr_score_std_error,
            normalized_std_error=dr_score_std_error * normalizer,
        )

        return (
            direct_method_estimate,
            inverse_propensity_estimate,
            doubly_robust_estimate,
        )
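A minimal, self-contained sketch of the same three point estimates on toy tensors. All values and variable names below are illustrative only and are not part of the class above:

    import torch

    # Toy batch: N=4 contexts, 2 actions (all numbers made up for illustration).
    logged_rewards = torch.tensor([[1.0], [0.0], [1.0], [0.5]])
    importance_weight = torch.tensor([[0.8], [1.2], [0.5], [2.0]])  # target/logged propensity ratio
    model_propensities = torch.tensor([[0.6, 0.4], [0.3, 0.7], [0.5, 0.5], [0.9, 0.1]])
    model_rewards = torch.tensor([[0.9, 0.2], [0.1, 0.4], [0.8, 0.6], [0.4, 0.3]])
    model_rewards_for_logged_action = torch.tensor([[0.9], [0.4], [0.8], [0.4]])

    # Direct method: expected model reward under the target policy at each context.
    dm = torch.sum(model_propensities * model_rewards, dim=1, keepdim=True)
    # IPS: re-weight the logged rewards by the propensity ratio.
    ips = importance_weight * logged_rewards
    # Doubly robust: DM baseline plus an importance-weighted residual correction.
    dr = importance_weight * (logged_rewards - model_rewards_for_logged_action) + dm
    print(float(dm.mean()), float(ips.mean()), float(dr.mean()))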
Example #2
    def estimate(self, edp: EvaluationDataPage) -> CpeEstimate:
        # For details, visit https://arxiv.org/pdf/1511.03722.pdf
        logged_rewards = edp.logged_rewards.squeeze()
        logged_propensities = edp.logged_propensities.squeeze()

        num_examples = edp.logged_rewards.shape[0]

        estimated_state_values = torch.sum(
            edp.model_propensities * edp.model_values, dim=1
        )

        estimated_q_values_for_logged_action = torch.sum(
            edp.model_values * edp.action_mask, dim=1
        )

        target_propensity_for_action = torch.sum(
            edp.model_propensities * edp.action_mask, dim=1
        )

        assert target_propensity_for_action.shape == logged_propensities.shape, (
            "Invalid shape: "
            + str(target_propensity_for_action.shape)
            + " != "
            + str(logged_propensities.shape)
        )
        assert (
            target_propensity_for_action.shape
            == estimated_q_values_for_logged_action.shape
        ), (
            "Invalid shape: "
            + str(target_propensity_for_action.shape)
            + " != "
            + str(estimated_q_values_for_logged_action.shape)
        )
        assert target_propensity_for_action.shape == logged_rewards.shape, (
            "Invalid shape: "
            + str(target_propensity_for_action.shape)
            + " != "
            + str(logged_rewards.shape)
        )
        importance_weight = target_propensity_for_action / logged_propensities

        doubly_robusts: List[float] = []
        episode_values: List[float] = []

        i = 0
        last_episode_end = -1
        while i < num_examples:
            # calculate the doubly-robust Q-value for one episode
            if i == num_examples - 1 or edp.mdp_id[i] != edp.mdp_id[i + 1]:
                episode_end = i
                episode_value = 0.0
                doubly_robust = 0.0
                for j in range(episode_end, last_episode_end, -1):
                    # Backward recursion from the end of the episode:
                    # V_DR = V_hat(s) + rho * (r + gamma * V_DR_next - Q_hat(s, a))
                    doubly_robust = estimated_state_values[j] + importance_weight[j] * (
                        logged_rewards[j]
                        + self.gamma * doubly_robust
                        - estimated_q_values_for_logged_action[j]
                    )
                    episode_value *= self.gamma
                    episode_value += logged_rewards[j]
                # Skip episodes whose discounted logged return is (near) zero
                if abs(episode_value) > 1e-6:
                    doubly_robusts.append(float(doubly_robust))
                    episode_values.append(float(episode_value))
                last_episode_end = episode_end
            i += 1

        dr_values = np.array(doubly_robusts)
        dr_score = float(np.mean(dr_values))
        dr_score_std_error = bootstrapped_std_error_of_mean(dr_values)

        episode_value_array = np.array(episode_values)
        logged_policy_score = float(np.mean(episode_value_array))
        if logged_policy_score < 1e-6:
            logger.warning(
                "Can't normalize SDR-CPE because of small or negative logged_policy_score"
            )
            return CpeEstimate(
                raw=dr_score,
                normalized=0.0,
                raw_std_error=dr_score_std_error,
                normalized_std_error=0.0,
            )
        return CpeEstimate(
            raw=dr_score,
            normalized=dr_score / logged_policy_score,
            raw_std_error=dr_score_std_error,
            normalized_std_error=dr_score_std_error / logged_policy_score,
        )
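The backward loop above implements the sequential doubly-robust recursion from https://arxiv.org/pdf/1511.03722.pdf: V_DR(t) = V_hat(s_t) + rho_t * (r_t + gamma * V_DR(t+1) - Q_hat(s_t, a_t)). A minimal numpy sketch of that recursion for a single toy episode (all values illustrative):

    import numpy as np

    gamma = 0.9
    # One toy episode of length 3 (numbers made up for illustration).
    rewards = np.array([1.0, 0.0, 1.0])
    importance_weight = np.array([1.1, 0.8, 1.3])  # per-step target/logged propensity ratio
    estimated_state_values = np.array([2.1, 1.4, 0.9])  # V_hat(s_t) under the target policy
    estimated_q_for_logged_action = np.array([2.0, 1.2, 1.0])  # Q_hat(s_t, a_t)

    doubly_robust = 0.0
    for t in reversed(range(len(rewards))):
        doubly_robust = estimated_state_values[t] + importance_weight[t] * (
            rewards[t] + gamma * doubly_robust - estimated_q_for_logged_action[t]
        )
    print(doubly_robust)  # sequential DR value estimate for this one episode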
Example #3
    def estimate(
        self,
        edp: EvaluationDataPage,
        num_j_steps,
        whether_self_normalize_importance_weights,
    ) -> CpeEstimate:
        # For details, visit https://arxiv.org/pdf/1604.00923.pdf Section 5, 7, 8
        (
            actions,
            rewards,
            logged_propensities,
            target_propensities,
            estimated_q_values,
        ) = WeightedSequentialDoublyRobustEstimator.transform_to_equal_length_trajectories(
            edp.mdp_id,
            edp.action_mask.cpu().numpy(),
            edp.logged_rewards.cpu().numpy().flatten(),
            edp.logged_propensities.cpu().numpy().flatten(),
            edp.model_propensities.cpu().numpy(),
            edp.model_values.cpu().numpy(),
        )

        num_trajectories = actions.shape[0]
        trajectory_length = actions.shape[1]

        j_steps = [float("inf")]

        if num_j_steps > 1:
            j_steps.append(-1)
        if num_j_steps > 2:
            interval = trajectory_length // (num_j_steps - 1)
            j_steps.extend([i * interval for i in range(1, num_j_steps - 1)])

        target_propensity_for_logged_action = np.sum(
            np.multiply(target_propensities, actions), axis=2
        )
        estimated_q_values_for_logged_action = np.sum(
            np.multiply(estimated_q_values, actions), axis=2
        )
        estimated_state_values = np.sum(
            np.multiply(target_propensities, estimated_q_values), axis=2
        )

        importance_weights = target_propensity_for_logged_action / logged_propensities
        importance_weights = np.cumprod(importance_weights, axis=1)
        importance_weights = WeightedSequentialDoublyRobustEstimator.normalize_importance_weights(
            importance_weights, whether_self_normalize_importance_weights
        )

        importance_weights_one_earlier = (
            np.ones([num_trajectories, 1]) * 1.0 / num_trajectories
        )
        importance_weights_one_earlier = np.hstack(
            [importance_weights_one_earlier, importance_weights[:, :-1]]
        )

        # discounts = [1, gamma, gamma^2, ..., gamma^(trajectory_length - 1)]
        discounts = np.logspace(
            start=0, stop=trajectory_length - 1, num=trajectory_length, base=self.gamma
        )

        j_step_return_trajectories = []
        for j_step in j_steps:
            j_step_return_trajectories.append(
                WeightedSequentialDoublyRobustEstimator.calculate_step_return(
                    rewards,
                    discounts,
                    importance_weights,
                    importance_weights_one_earlier,
                    estimated_state_values,
                    estimated_q_values_for_logged_action,
                    j_step,
                )
            )
        j_step_return_trajectories = np.array(  # type: ignore
            j_step_return_trajectories
        )  # type: ignore

        j_step_returns = np.sum(j_step_return_trajectories, axis=1)

        if len(j_step_returns) == 1:
            weighted_doubly_robust = j_step_returns[0]
            weighted_doubly_robust_std_error = 0.0
        else:
            # break trajectories into several subsets to estimate confidence bounds
            infinite_step_returns = []
            num_subsets = int(
                min(
                    num_trajectories / 2,
                    WeightedSequentialDoublyRobustEstimator.NUM_SUBSETS_FOR_CB_ESTIMATES,
                )
            )
            interval = num_trajectories / num_subsets
            for i in range(num_subsets):
                trajectory_subset = np.arange(
                    int(i * interval), int((i + 1) * interval)
                )
                importance_weights = (
                    target_propensity_for_logged_action[trajectory_subset]
                    / logged_propensities[trajectory_subset]
                )
                importance_weights = np.cumprod(importance_weights, axis=1)
                importance_weights = WeightedSequentialDoublyRobustEstimator.normalize_importance_weights(
                    importance_weights, whether_self_normalize_importance_weights
                )
                importance_weights_one_earlier = (
                    np.ones([len(trajectory_subset), 1]) * 1.0 / len(trajectory_subset)
                )
                importance_weights_one_earlier = np.hstack(
                    [importance_weights_one_earlier, importance_weights[:, :-1]]
                )
                infinite_step_return = np.sum(
                    WeightedSequentialDoublyRobustEstimator.calculate_step_return(
                        rewards[trajectory_subset],
                        discounts,
                        importance_weights,
                        importance_weights_one_earlier,
                        estimated_state_values[trajectory_subset],
                        estimated_q_values_for_logged_action[trajectory_subset],
                        float("inf"),
                    )
                )
                infinite_step_returns.append(infinite_step_return)

            # Compute weighted_doubly_robust mean point estimate using all data
            weighted_doubly_robust = self.compute_weighted_doubly_robust_point_estimate(
                j_steps,
                num_j_steps,
                j_step_returns,
                infinite_step_returns,
                j_step_return_trajectories,
            )

            # Use bootstrapping to compute weighted_doubly_robust standard error
            bootstrapped_means = []
            sample_size = int(
                WeightedSequentialDoublyRobustEstimator.BOOTSTRAP_SAMPLE_PCT
                * num_subsets
            )
            for _ in range(
                WeightedSequentialDoublyRobustEstimator.NUM_BOOTSTRAP_SAMPLES
            ):
                random_idxs = np.random.choice(num_j_steps, sample_size, replace=False)
                random_idxs.sort()
                wdr_estimate = self.compute_weighted_doubly_robust_point_estimate(
                    j_steps=[j_steps[i] for i in random_idxs],
                    num_j_steps=sample_size,
                    j_step_returns=j_step_returns[random_idxs],
                    infinite_step_returns=infinite_step_returns,
                    j_step_return_trajectories=j_step_return_trajectories[  # type: ignore
                        random_idxs
                    ],  # type: ignore
                )
                bootstrapped_means.append(wdr_estimate)
            weighted_doubly_robust_std_error = np.std(bootstrapped_means)

        episode_values = np.sum(np.multiply(rewards, discounts), axis=1)
        logged_policy_score = np.nanmean(episode_values)
        if logged_policy_score < 1e-6:
            logger.warning(
                "Can't normalize WSDR-CPE because of small or negative logged_policy_score"
            )
            return CpeEstimate(
                raw=weighted_doubly_robust,
                normalized=0.0,
                raw_std_error=weighted_doubly_robust_std_error,
                normalized_std_error=0.0,
            )

        return CpeEstimate(
            raw=weighted_doubly_robust,
            normalized=weighted_doubly_robust / logged_policy_score,
            raw_std_error=weighted_doubly_robust_std_error,
            normalized_std_error=weighted_doubly_robust_std_error / logged_policy_score,
        )
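The per-step cumulative importance weights and the discount vector are the two main ingredients of the j-step returns above. A small numpy sketch of how they are built; the column-wise self-normalization shown is only one plausible reading of normalize_importance_weights, which is not reproduced here:

    import numpy as np

    gamma = 0.95
    # Two toy trajectories of length 4: per-step target/logged propensity ratios.
    step_ratios = np.array([[1.2, 0.9, 1.1, 0.8],
                            [0.7, 1.3, 1.0, 1.1]])

    # Cumulative product over time: the weight of steps 0..t for each trajectory.
    importance_weights = np.cumprod(step_ratios, axis=1)

    # Hypothetical self-normalization: divide each time step's column by its sum
    # over trajectories (an assumption, not taken from the estimator above).
    importance_weights = importance_weights / importance_weights.sum(axis=0, keepdims=True)

    # Discount vector [1, gamma, gamma^2, ...] via logspace with base gamma.
    trajectory_length = step_ratios.shape[1]
    discounts = np.logspace(
        start=0, stop=trajectory_length - 1, num=trajectory_length, base=gamma
    )
    print(importance_weights)
    print(discounts)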
Example #4
    def estimate(
        self,
        edp: EvaluationDataPage,
        num_j_steps,
        whether_self_normalize_importance_weights,
    ) -> CpeEstimate:
        # For details, visit https://arxiv.org/pdf/1604.00923.pdf Section 5, 7, 8
        (
            actions,
            rewards,
            logged_propensities,
            target_propensities,
            estimated_q_values,
        ) = WeightedSequentialDoublyRobustEstimator.transform_to_equal_length_trajectories(
            edp.mdp_id,
            edp.action_mask.cpu().numpy(),
            edp.logged_rewards.cpu().numpy().flatten(),
            edp.logged_propensities.cpu().numpy().flatten(),
            edp.model_propensities.cpu().numpy(),
            edp.model_values.cpu().numpy(),
        )

        num_trajectories = actions.shape[0]
        trajectory_length = actions.shape[1]

        j_steps = [float("inf")]

        if num_j_steps > 1:
            j_steps.append(-1)
        if num_j_steps > 2:
            interval = trajectory_length // (num_j_steps - 1)
            j_steps.extend([i * interval for i in range(1, num_j_steps - 1)])

        target_propensity_for_logged_action = np.sum(
            np.multiply(target_propensities, actions), axis=2
        )
        estimated_q_values_for_logged_action = np.sum(
            np.multiply(estimated_q_values, actions), axis=2
        )
        estimated_state_values = np.sum(
            np.multiply(target_propensities, estimated_q_values), axis=2
        )

        importance_weights = target_propensity_for_logged_action / logged_propensities
        importance_weights = np.cumprod(importance_weights, axis=1)
        importance_weights = WeightedSequentialDoublyRobustEstimator.normalize_importance_weights(
            importance_weights, whether_self_normalize_importance_weights
        )

        importance_weights_one_earlier = (
            np.ones([num_trajectories, 1]) * 1.0 / num_trajectories
        )
        importance_weights_one_earlier = np.hstack(
            [importance_weights_one_earlier, importance_weights[:, :-1]]
        )

        # discounts = [1, gamma, gamma^2, ..., gamma^(trajectory_length - 1)]
        discounts = np.logspace(
            start=0, stop=trajectory_length - 1, num=trajectory_length, base=self.gamma
        )

        j_step_return_trajectories = []
        for j_step in j_steps:
            j_step_return_trajectories.append(
                WeightedSequentialDoublyRobustEstimator.calculate_step_return(
                    rewards,
                    discounts,
                    importance_weights,
                    importance_weights_one_earlier,
                    estimated_state_values,
                    estimated_q_values_for_logged_action,
                    j_step,
                )
            )
        j_step_return_trajectories = np.array(j_step_return_trajectories)

        j_step_returns = np.sum(j_step_return_trajectories, axis=1)

        if len(j_step_returns) == 1:
            weighted_doubly_robust = j_step_returns[0]

        else:
            # break trajectories into several subsets to estimate confidence bounds
            infinite_step_returns = []
            num_subsets = int(
                min(
                    num_trajectories / 2,
                    WeightedSequentialDoublyRobustEstimator.NUM_SUBSETS_FOR_CB_ESTIMATES,
                )
            )
            interval = num_trajectories / num_subsets
            for i in range(num_subsets):
                trajectory_subset = np.arange(
                    int(i * interval), int((i + 1) * interval)
                )
                importance_weights = (
                    target_propensity_for_logged_action[trajectory_subset]
                    / logged_propensities[trajectory_subset]
                )
                importance_weights = np.cumprod(importance_weights, axis=1)
                importance_weights = WeightedSequentialDoublyRobustEstimator.normalize_importance_weights(
                    importance_weights, whether_self_normalize_importance_weights
                )
                importance_weights_one_earlier = (
                    np.ones([len(trajectory_subset), 1]) * 1.0 / len(trajectory_subset)
                )
                importance_weights_one_earlier = np.hstack(
                    [importance_weights_one_earlier, importance_weights[:, :-1]]
                )
                infinite_step_return = np.sum(
                    WeightedSequentialDoublyRobustEstimator.calculate_step_return(
                        rewards[trajectory_subset],
                        discounts,
                        importance_weights,
                        importance_weights_one_earlier,
                        estimated_state_values[trajectory_subset],
                        estimated_q_values_for_logged_action[trajectory_subset],
                        float("inf"),
                    )
                )
                infinite_step_returns.append(infinite_step_return)

            low_bound, high_bound = WeightedSequentialDoublyRobustEstimator.confidence_bounds(
                infinite_step_returns,
                WeightedSequentialDoublyRobustEstimator.CONFIDENCE_INTERVAL,
            )

            # decompose error into bias + variance
            j_step_bias = np.zeros([num_j_steps])
            where_lower = np.where(j_step_returns < low_bound)[0]
            j_step_bias[where_lower] = low_bound - j_step_returns[where_lower]
            where_higher = np.where(j_step_returns > high_bound)[0]
            j_step_bias[where_higher] = j_step_returns[where_higher] - high_bound

            covariance = np.cov(j_step_return_trajectories)

            # error matrix: covariance plus the outer product of the bias vector
            error = covariance + np.outer(j_step_bias, j_step_bias)

            # minimize mse error
            def mse_loss(x, error):
                return np.dot(np.dot(x, error), x.T)

            constraint = {"type": "eq", "fun": lambda x: np.sum(x) - 1.0}

            x = np.zeros([len(j_steps)])
            res = sp.optimize.minimize(
                mse_loss,
                x,
                args=error,
                constraints=constraint,
                bounds=[(0, 1) for _ in range(x.shape[0])],
            )
            x = np.array(res.x)

            weighted_doubly_robust = float(np.dot(x, j_step_returns))

        episode_values = np.sum(np.multiply(rewards, discounts), axis=1)

        denominator = np.nanmean(episode_values)
        if abs(denominator) < 1e-6:
            return CpeEstimate(raw=0.0, normalized=0.0)

        return CpeEstimate(
            raw=weighted_doubly_robust, normalized=weighted_doubly_robust / denominator
        )
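The final blending step solves a small simplex-constrained quadratic program over the j-step returns. A standalone sketch of just that step, with a made-up 3x3 error matrix and made-up j-step returns:

    import numpy as np
    import scipy.optimize

    # Toy inputs for 3 j-step estimators (numbers made up for illustration).
    error = np.array([[0.30, 0.05, 0.02],
                      [0.05, 0.20, 0.04],
                      [0.02, 0.04, 0.10]])  # covariance-plus-bias error matrix
    j_step_returns = np.array([1.8, 2.1, 2.0])

    def mse_loss(x, error):
        # Quadratic form x^T * error * x: expected squared error of the blend.
        return np.dot(np.dot(x, error), x.T)

    # Weights must be non-negative and sum to one.
    constraint = {"type": "eq", "fun": lambda x: np.sum(x) - 1.0}
    res = scipy.optimize.minimize(
        mse_loss,
        np.zeros(len(j_step_returns)),
        args=(error,),
        constraints=constraint,
        bounds=[(0, 1) for _ in range(len(j_step_returns))],
    )
    weights = np.array(res.x)
    print(weights, float(np.dot(weights, j_step_returns)))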
Example #5
    def estimate(
        self, edp: EvaluationDataPage
    ) -> Tuple[CpeEstimate, CpeEstimate, CpeEstimate]:
        # The score we would get if we evaluate the logged policy against itself
        logged_policy_score = float(torch.mean(edp.logged_rewards))
        if logged_policy_score < 1e-6:
            logger.warning(
                "Can't normalize DR-CPE because of small or negative logged_policy_score"
            )
            normalizer = 0.0
        else:
            normalizer = 1.0 / logged_policy_score

        # For details, visit https://arxiv.org/pdf/1612.01205.pdf
        num_examples = edp.model_propensities.shape[0]

        if edp.model_rewards is None:
            # Fill with zero, equivalent to just doing IPS
            model_rewards = torch.zeros(edp.model_propensities.shape).float()
            direct_method_values = torch.zeros([num_examples, 1], dtype=torch.float32)
        else:
            model_rewards = edp.model_rewards
            direct_method_values = torch.sum(
                edp.model_propensities * model_rewards, dim=1, keepdim=True
            )

        direct_method_score = float(torch.mean(direct_method_values))
        direct_method_std_error = bootstrapped_std_error_of_mean(
            direct_method_values.squeeze()
        )
        direct_method_estimate = CpeEstimate(
            raw=direct_method_score,
            normalized=direct_method_score * normalizer,
            raw_std_error=direct_method_std_error,
            normalized_std_error=direct_method_std_error * normalizer,
        )

        target_propensity_for_action = torch.sum(
            edp.model_propensities * edp.action_mask, dim=1, keepdim=True
        )

        importance_weight = (
            target_propensity_for_action / edp.logged_propensities
        ).float()

        ips = importance_weight * edp.logged_rewards

        doubly_robust = (
            importance_weight
            * (edp.logged_rewards - edp.model_rewards_for_logged_action)
        ) + direct_method_values

        ips_score = float(torch.mean(ips))
        ips_score_std_error = bootstrapped_std_error_of_mean(ips.squeeze())
        inverse_propensity_estimate = CpeEstimate(
            raw=ips_score,
            normalized=ips_score * normalizer,
            raw_std_error=ips_score_std_error,
            normalized_std_error=ips_score_std_error * normalizer,
        )

        dr_score = float(torch.mean(doubly_robust))
        dr_score_std_error = bootstrapped_std_error_of_mean(doubly_robust.squeeze())
        doubly_robust_estimate = CpeEstimate(
            raw=dr_score,
            normalized=dr_score * normalizer,
            raw_std_error=dr_score_std_error,
            normalized_std_error=dr_score_std_error * normalizer,
        )

        return (
            direct_method_estimate,
            inverse_propensity_estimate,
            doubly_robust_estimate,
        )
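A small sketch of how the one-step importance weight above is formed from the target policy's propensity for the logged action (toy tensors, illustrative only):

    import torch

    # Toy batch: N=3 contexts, 2 actions (numbers made up for illustration).
    model_propensities = torch.tensor([[0.7, 0.3], [0.2, 0.8], [0.5, 0.5]])
    action_mask = torch.tensor([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0]])  # one-hot logged action
    logged_propensities = torch.tensor([[0.5], [0.6], [0.4]])

    # Target policy's probability of the action that was actually logged.
    target_propensity_for_action = torch.sum(
        model_propensities * action_mask, dim=1, keepdim=True
    )
    # Importance weight: target propensity over logged (behavior) propensity.
    importance_weight = (target_propensity_for_action / logged_propensities).float()
    print(importance_weight)  # N*1 tensor of propensity ratios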