Code example #1
    def evaluate(
        self,
        evaluator: Evaluator,
        logged_actions: Optional[np.ndarray],
        logged_propensities: Optional[np.ndarray],
        logged_rewards: Optional[np.ndarray],
        logged_values: Optional[np.ndarray],
    ):
        self.model_propensities, model_values_on_logged_actions, maxq_action_idxs = (
            None,
            None,
            None,
        )
        if self.all_action_scores is not None:
            self.all_action_scores = self.all_action_scores.cpu().numpy()
            self.model_propensities = Evaluator.softmax(
                self.all_action_scores, self.rl_temperature)
            maxq_action_idxs = self.all_action_scores.argmax(axis=1)
            if logged_actions is not None:
                model_values_on_logged_actions = np.sum(
                    (logged_actions * self.all_action_scores),
                    axis=1,
                    keepdims=True)

        evaluator.report(
            self.loss.cpu().numpy(),
            logged_actions,
            logged_propensities,
            logged_rewards,
            logged_values,
            self.model_propensities,
            self.all_action_scores,
            model_values_on_logged_actions,
            maxq_action_idxs,
        )
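
The evaluate() method above turns the Q-scores into model propensities with Evaluator.softmax and an RL temperature. As a point of reference, here is a minimal NumPy sketch of a temperature-scaled softmax; it assumes Evaluator.softmax follows the standard formulation and may differ from the Horizon helper in details such as numerical safeguards.

import numpy as np

def softmax_with_temperature(scores: np.ndarray, temperature: float) -> np.ndarray:
    # Scale the scores by the temperature, then normalize each row to a distribution.
    scaled = scores / temperature
    scaled = scaled - scaled.max(axis=1, keepdims=True)  # for numerical stability
    exp_scores = np.exp(scaled)
    return exp_scores / exp_scores.sum(axis=1, keepdims=True)

# Example: two states, three actions; a lower temperature sharpens the distribution.
scores = np.array([[1.0, 2.0, 3.0], [0.5, 0.5, 0.5]], dtype=np.float32)
print(softmax_with_temperature(scores, temperature=0.35))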
Code example #2
 def policy(self, states):
     with core.DeviceScope(self.c2_device):
         if isinstance(self.trainer, DiscreteActionTrainer):
             workspace.FeedBlob("states", states)
         elif isinstance(self.trainer, ContinuousActionDQNTrainer):
             num_actions = len(self.trainer.action_normalization_parameters)
             actions = np.eye(num_actions, dtype=np.float32)
             actions = np.tile(actions, reps=(len(states), 1))
             states = np.repeat(states, repeats=num_actions, axis=0)
             workspace.FeedBlob("states", states)
             workspace.FeedBlob("actions", actions)
         else:
             raise NotImplementedError(
                 "Invalid trainer passed to GymPredictor")
         workspace.RunNetOnce(self.trainer.internal_policy_model.net)
         policy_output_blob = self.trainer.internal_policy_output
         q_scores = workspace.FetchBlob(policy_output_blob)
         if isinstance(self.trainer, DiscreteActionTrainer):
             assert q_scores.shape[0] == 1
             q_scores = q_scores[0]
         q_scores_softmax = Evaluator.softmax(q_scores.reshape(
             1, -1), self.trainer.rl_temperature)[0]
         if np.isnan(
                 q_scores_softmax).any() or np.max(q_scores_softmax) < 1e-3:
             q_scores_softmax[:] = 1.0 / q_scores_softmax.shape[0]
         policies = [
             np.argmax(q_scores),
             np.random.choice(q_scores.shape[0], p=q_scores_softmax),
         ]
         return policies
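
policy() returns a two-element list: the greedy (argmax) action and an action sampled from the temperature softmax. A hypothetical usage sketch, assuming a predictor instance and a single state vector (the act helper is not part of Horizon):

import numpy as np

def act(predictor, state, explore: bool):
    # policy() expects a batch of states, so wrap the single state vector.
    greedy_action, sampled_action = predictor.policy(np.expand_dims(state, axis=0))
    return sampled_action if explore else greedy_action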
Code example #3
    def evaluate(
        self,
        evaluator: Evaluator,
        logged_actions: Optional[np.ndarray],
        logged_propensities: Optional[np.ndarray],
        logged_values: Optional[np.ndarray],
    ):
        workspace.RunNet(self.all_q_score_model.net)
        all_action_scores = workspace.FetchBlob(self.all_q_score_output)
        maxq_action_idxs = workspace.FetchBlob(self.maxq_action_idxs)
        model_values_on_logged_actions = np.sum(
            (logged_actions * all_action_scores), axis=1, keepdims=True)
        model_propensities = Evaluator.softmax(all_action_scores,
                                               self.rl_temperature)
        logged_rewards = workspace.FetchBlob("rewards")

        evaluator.report(
            workspace.FetchBlob(self.loss_blob),
            logged_actions,
            logged_propensities,
            logged_rewards,
            logged_values,
            model_propensities,
            all_action_scores,
            model_values_on_logged_actions,
            maxq_action_idxs,
        )
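
In both evaluate() variants, model_values_on_logged_actions is computed by multiplying the one-hot logged actions with the full score matrix and summing over the action axis. A small sketch showing that this is equivalent to indexing the score of the logged action:

import numpy as np

all_action_scores = np.array([[0.1, 0.9], [0.7, 0.3]], dtype=np.float32)
logged_actions = np.array([[0.0, 1.0], [1.0, 0.0]], dtype=np.float32)  # one-hot

masked_sum = np.sum(logged_actions * all_action_scores, axis=1, keepdims=True)
indexed = all_action_scores[np.arange(2), logged_actions.argmax(axis=1)][:, None]
assert np.allclose(masked_sum, indexed)  # both yield [[0.9], [0.7]]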
Code example #4
File: dqn_trainer.py  Project: houseroad/Horizon
    def evaluate(
        self,
        evaluator: Evaluator,
        logged_actions: torch.Tensor,
        logged_propensities: Optional[torch.Tensor],
        logged_rewards: torch.Tensor,
        logged_values: Optional[torch.Tensor],
    ):
        self.model_propensities, model_values_on_logged_actions, maxq_action_idxs = (
            None,
            None,
            None,
        )
        if self.all_action_scores is not None:
            self.model_propensities = Evaluator.softmax(
                self.all_action_scores.cpu().numpy(), self.rl_temperature
            )
            maxq_action_idxs = self.all_action_scores.argmax(dim=1, keepdim=True)
            if logged_actions is not None:
                model_values_on_logged_actions = (
                    torch.sum(
                        (logged_actions * self.all_action_scores), dim=1, keepdim=True
                    )
                    .cpu()
                    .numpy()
                )

        evaluator.report(
            self.loss.cpu().numpy(),
            logged_actions.cpu().numpy(),
            logged_propensities.cpu().numpy()
            if logged_propensities is not None
            else None,
            logged_rewards.cpu().numpy(),
            logged_values.cpu().numpy() if logged_values is not None else None,
            self.model_propensities,
            self.reward_estimates.cpu().numpy(),
            self.all_action_scores.cpu().numpy(),
            model_values_on_logged_actions,
            maxq_action_idxs,
        )
Code example #5
File: gym_predictor.py  Project: wycharry/Horizon
 def policy(self, states):
     with core.DeviceScope(self.c2_device):
         if isinstance(self.trainer, DiscreteActionTrainer):
             workspace.FeedBlob("states", states)
         else:
             raise NotImplementedError(
                 "Invalid trainer passed to GymPredictor")
         workspace.RunNetOnce(self.trainer.internal_policy_model.net)
         policy_output_blob = self.trainer.internal_policy_output
         q_scores = workspace.FetchBlob(policy_output_blob)
         if isinstance(self.trainer, DiscreteActionTrainer):
             assert q_scores.shape[0] == 1
             q_scores = q_scores[0]
         q_scores_softmax = Evaluator.softmax(q_scores.reshape(
             1, -1), self.trainer.rl_temperature)[0]
         if np.isnan(
                 q_scores_softmax).any() or np.max(q_scores_softmax) < 1e-3:
             q_scores_softmax[:] = 1.0 / q_scores_softmax.shape[0]
         policies = [
             np.argmax(q_scores),
             np.random.choice(q_scores.shape[0], p=q_scores_softmax),
         ]
         return policies
Code example #6
 def policy(self, states):
     if isinstance(self.trainer, DQNTrainer):
         input = states
     elif isinstance(self.trainer, ParametricDQNTrainer):
         num_actions = len(self.trainer.action_normalization_parameters)
         actions = np.eye(num_actions, dtype=np.float32)
         actions = np.tile(actions, reps=(len(states), 1))
         states = np.repeat(states, repeats=num_actions, axis=0)
         input = np.hstack((states, actions))
     else:
         raise NotImplementedError("Invalid trainer passed to GymPredictor")
     q_scores = self.trainer.internal_prediction(input)
     if isinstance(self.trainer, DQNTrainer):
         assert q_scores.shape[0] == 1
         q_scores = q_scores[0]
     q_scores_softmax = Evaluator.softmax(q_scores.reshape(1, -1),
                                          self.trainer.rl_temperature)[0]
     if np.isnan(q_scores_softmax).any() or np.max(q_scores_softmax) < 1e-3:
         q_scores_softmax[:] = 1.0 / q_scores_softmax.shape[0]
     policies = [
         np.argmax(q_scores),
         np.random.choice(q_scores.shape[0], p=q_scores_softmax),
     ]
     return policies
Code example #7
    def evaluate(self, predictor):
        # test only float features
        predictions = predictor.predict(self.logged_states)
        estimated_reward_values = predictor.estimate_reward(self.logged_states)
        if isinstance(predictor.trainer, ParametricDQNTrainer):
            predictions = predictions.reshape([-1, self._env.action_dim])
            estimated_reward_values = estimated_reward_values.reshape(
                [-1, self._env.action_dim])

        value_error_sum = 0.0
        reward_error_sum = 0.0
        for i in range(len(self.logged_states)):
            logged_action = self.logged_actions[i]
            logged_value = self.logged_values[i][0]
            target_value = predictions[i][logged_action]
            value_error_sum += abs(logged_value - target_value)
            logged_reward = self.logged_rewards[i][0]
            estimated_reward = estimated_reward_values[i][logged_action]
            reward_error_sum += abs(logged_reward - estimated_reward)
        value_error_mean = value_error_sum / np.sum(np.abs(self.logged_values))
        reward_error_mean = reward_error_sum / np.sum(
            np.abs(self.logged_rewards))

        logger.info("EVAL Q-Value MAE ERROR: {0:.3f}".format(value_error_mean))
        self.mc_loss.append(value_error_mean)
        logger.info("EVAL REWARD MAE ERROR: {0:.3f}".format(reward_error_mean))
        self.reward_loss.append(reward_error_mean)

        target_propensities = Evaluator.softmax(
            predictions, GymEvaluator.SOFTMAX_TEMPERATURE)

        reward_inverse_propensity_score, reward_direct_method, reward_doubly_robust = self.doubly_robust_one_step_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_propensities,
            target_propensities,
            estimated_reward_values,
        )
        self.reward_inverse_propensity_score.append(
            reward_inverse_propensity_score)
        self.reward_direct_method.append(reward_direct_method)
        self.reward_doubly_robust.append(reward_doubly_robust)

        logger.info(
            "Reward Inverse Propensity Score              : normalized {0:.3f} raw {1:.3f}"
            .format(
                reward_inverse_propensity_score.normalized,
                reward_inverse_propensity_score.raw,
            ))
        logger.info(
            "Reward Direct Method                         : normalized {0:.3f} raw {1:.3f}"
            .format(reward_direct_method.normalized, reward_direct_method.raw))
        logger.info(
            "Reward Doubly Robust P.E.                    : normalized {0:.3f} raw {1:.3f}"
            .format(reward_doubly_robust.normalized, reward_doubly_robust.raw))

        value_inverse_propensity_score, value_direct_method, value_doubly_robust = self.doubly_robust_one_step_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_values,
            self.logged_propensities,
            target_propensities,
            predictions,
        )
        self.value_inverse_propensity_score.append(
            value_inverse_propensity_score)
        self.value_direct_method.append(value_direct_method)
        self.value_doubly_robust.append(value_doubly_robust)

        logger.info(
            "Value Inverse Propensity Score               : normalized {0:.3f} raw {1:.3f}"
            .format(
                value_inverse_propensity_score.normalized,
                value_inverse_propensity_score.raw,
            ))
        logger.info(
            "Value Direct Method                          : normalized {0:.3f} raw {1:.3f}"
            .format(value_direct_method.normalized, value_direct_method.raw))
        logger.info(
            "Value One-Step Doubly Robust P.E.            : normalized {0:.3f} raw {1:.3f}"
            .format(value_doubly_robust.normalized, value_doubly_robust.raw))

        sequential_doubly_robust = self.doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            predictions,
        )
        self.value_sequential_doubly_robust.append(sequential_doubly_robust)
        logger.info(
            "Value Sequential Doubly Robust P.E.          : normalized {0:.3f} raw {1:.3f}"
            .format(sequential_doubly_robust.normalized,
                    sequential_doubly_robust.raw))

        weighted_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            predictions,
            num_j_steps=1,
            whether_self_normalize_importance_weights=True,
        )
        self.value_weighted_doubly_robust.append(weighted_doubly_robust)

        logger.info(
            "Value Weighted Sequential Doubly Robust P.E. : noramlized {0:.3f} raw {1:.3f}"
            .format(weighted_doubly_robust.normalized,
                    weighted_doubly_robust.raw))

        magic_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            predictions,
            num_j_steps=GymEvaluator.NUM_J_STEPS_FOR_MAGIC_ESTIMATOR,
            whether_self_normalize_importance_weights=True,
        )
        self.value_magic_doubly_robust.append(magic_doubly_robust)

        logger.info(
            "Value Magic Doubly Robust P.E.               : normalized {0:.3f} raw {1:.3f}"
            .format(magic_doubly_robust.normalized, magic_doubly_robust.raw))

        avg_rewards, avg_discounted_rewards = self._env.run_ep_n_times(
            100, predictor, test=True)

        episode_starts = np.nonzero(self.logged_terminals.squeeze())[0] + 1
        logged_discounted_performance = (self.logged_values[0][0] + np.sum(
            self.logged_values[episode_starts[:-1]])) / np.sum(
                self.logged_terminals)

        true_discounted_value_PE = (avg_discounted_rewards /
                                    logged_discounted_performance)
        self.true_discounted_value_PE.append(true_discounted_value_PE)

        logger.info(
            "True Discounted Value P.E                    : normalized {0:.3f} raw {1:.3f}"
            .format(true_discounted_value_PE, avg_discounted_rewards))

        logged_performance = np.sum(self.logged_rewards) / np.sum(
            self.logged_terminals)

        true_value_PE = avg_rewards / logged_performance
        self.true_value_PE.append(true_value_PE)

        logger.info(
            "True Value P.E                               : normalized {0:.3f} raw {1:.3f}"
            .format(true_value_PE, avg_rewards))
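
The one-step estimators reported above combine inverse propensity scoring with a direct method. A minimal sketch of the usual IPS / direct-method / doubly-robust estimates, assuming one-hot logged actions and per-example logged propensities; the Horizon implementation additionally returns normalized variants, which are not reproduced here.

import numpy as np

def one_step_estimates(actions_one_hot, rewards, logged_propensities,
                       target_propensities, estimated_values):
    # Importance weight: target probability of the logged action over the logged propensity.
    target_p_logged = np.sum(target_propensities * actions_one_hot, axis=1, keepdims=True)
    importance_weights = target_p_logged / logged_propensities

    ips = np.mean(importance_weights * rewards)
    # Direct method: expected model value under the target policy.
    dm = np.mean(np.sum(target_propensities * estimated_values, axis=1, keepdims=True))
    # Doubly robust: direct method plus an importance-weighted residual on logged actions.
    model_value_logged = np.sum(estimated_values * actions_one_hot, axis=1, keepdims=True)
    dr = dm + np.mean(importance_weights * (rewards - model_value_logged))
    return ips, dm, dr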
Code example #8
    def evaluate(self, predictor):
        # Test feeding float features & int features
        if self.use_int_features:
            float_features, int_features = self._split_int_and_float_features(
                self.logged_states)
            # Since all gridworld features are float types, swap these so
            # all inputs are now int_features for testing purpose
            float_features, int_features = int_features, float_features
            prediction_string = predictor.predict(float_features, int_features)
        # Test only feeding float features
        else:
            prediction_string = predictor.predict(self.logged_states)

        # Convert action string to integer
        prediction = np.zeros([len(prediction_string),
                               len(self._env.ACTIONS)],
                              dtype=np.float32)
        for x in range(len(self.logged_states)):
            for action_index, action in enumerate(self._env.ACTIONS):
                prediction[x][action_index] = prediction_string[x].get(
                    action, 1e-9)

        # Print out scores using all states
        all_states = []
        for x in self._env.STATES:
            all_states.append({x: 1.0})
        if self.use_int_features:
            all_states_float, all_states_int = self._split_int_and_float_features(
                all_states)
            all_states_prediction_string = predictor.predict(
                all_states_float, all_states_int)
        else:
            all_states_prediction_string = predictor.predict(all_states)
        all_states_prediction = np.zeros(
            [len(all_states_prediction_string),
             len(self._env.ACTIONS)],
            dtype=np.float32,
        )
        for x in range(len(all_states)):
            for action_index, action in enumerate(self._env.ACTIONS):
                all_states_prediction[x][
                    action_index] = all_states_prediction_string[x].get(
                        action, 1e-9)
        print(all_states_prediction[:, 0].reshape(5, 5), "\n")
        print(all_states_prediction[:, 1].reshape(5, 5), "\n")
        print(all_states_prediction[:, 2].reshape(5, 5), "\n")
        print(all_states_prediction[:, 3].reshape(5, 5), "\n")

        error_sum = 0.0
        num_error_prints = 0
        for x in range(len(self.logged_states)):
            logged_value = self.logged_values[x][0]
            target_value = prediction_string[x].get(self.logged_actions[x],
                                                    1e-9)
            error = abs(logged_value - target_value)
            if num_error_prints < 10 and error > 0.2:
                print(
                    "GOT THIS STATE WRONG: ",
                    x,
                    self._env._pos(list(self.logged_states[x].keys())[0]),
                    self.logged_actions[x],
                    logged_value,
                    target_value,
                )
                num_error_prints += 1
                if num_error_prints == 10:
                    print("MAX ERRORS PRINTED")
            error_sum += error
        error_mean = error_sum / float(len(self.logged_states))

        logger.info("EVAL ERROR: {0:.3f}".format(error_mean))
        self.mc_loss.append(error_mean)

        target_propensities = Evaluator.softmax(
            prediction, GridworldEvaluator.SOFTMAX_TEMPERATURE)

        reward_inverse_propensity_score, reward_direct_method, reward_doubly_robust = self.doubly_robust_one_step_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_propensities,
            target_propensities,
            self.estimated_reward_values,
        )
        self.reward_inverse_propensity_score.append(
            reward_inverse_propensity_score)
        self.reward_direct_method.append(reward_direct_method)
        self.reward_doubly_robust.append(reward_doubly_robust)

        logger.info(
            "Reward Inverse Propensity Score              : normalized {0:.3f} raw {1:.3f}"
            .format(
                reward_inverse_propensity_score.normalized,
                reward_inverse_propensity_score.raw,
            ))
        logger.info(
            "Reward Direct Method                         : normalized {0:.3f} raw {1:.3f}"
            .format(reward_direct_method.normalized, reward_direct_method.raw))
        logger.info(
            "Reward Doubly Robust P.E.                    : normalized {0:.3f} raw {1:.3f}"
            .format(reward_doubly_robust.normalized, reward_doubly_robust.raw))

        value_inverse_propensity_score, value_direct_method, value_doubly_robust = self.doubly_robust_one_step_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_values,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
        )
        self.value_inverse_propensity_score.append(
            value_inverse_propensity_score)
        self.value_direct_method.append(value_direct_method)
        self.value_doubly_robust.append(value_doubly_robust)

        logger.info(
            "Value Inverse Propensity Score               : normalized {0:.3f} raw {1:.3f}"
            .format(
                value_inverse_propensity_score.normalized,
                value_inverse_propensity_score.raw,
            ))
        logger.info(
            "Value Direct Method                          : normalized {0:.3f} raw {1:.3f}"
            .format(value_direct_method.normalized, value_direct_method.raw))
        logger.info(
            "Value One-Step Doubly Robust P.E.            : normalized {0:.3f} raw {1:.3f}"
            .format(value_doubly_robust.normalized, value_doubly_robust.raw))

        sequential_doubly_robust = self.doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
        )
        self.value_sequential_doubly_robust.append(sequential_doubly_robust)

        logger.info(
            "Value Sequential Doubly Robust P.E.          : normalized {0:.3f} raw {1:.3f}"
            .format(sequential_doubly_robust.normalized,
                    sequential_doubly_robust.raw))

        weighted_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
            num_j_steps=1,
            whether_self_normalize_importance_weights=True,
        )
        self.value_weighted_doubly_robust.append(weighted_doubly_robust)

        logger.info(
            "Value Weighted Sequential Doubly Robust P.E. : noramlized {0:.3f} raw {1:.3f}"
            .format(weighted_doubly_robust.normalized,
                    weighted_doubly_robust.raw))

        magic_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
            num_j_steps=GridworldEvaluator.NUM_J_STEPS_FOR_MAGIC_ESTIMATOR,
            whether_self_normalize_importance_weights=True,
        )
        self.value_magic_doubly_robust.append(magic_doubly_robust)

        logger.info(
            "Value Magic Doubly Robust P.E.               : normalized {0:.3f} raw {1:.3f}"
            .format(magic_doubly_robust.normalized, magic_doubly_robust.raw))
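
The sequential and weighted estimators above extend the one-step version across whole trajectories. As a reference point, here is a minimal single-episode sketch of the doubly-robust recursion of Jiang & Li (2016); the Horizon estimators vectorize this across episodes and support weighted (MAGIC-style) variants, which are not reproduced here.

def sequential_doubly_robust(rewards, importance_weights, q_hat_logged, v_hat, gamma):
    # Backward recursion: DR_t = V_hat(s_t) + rho_t * (r_t + gamma * DR_{t+1} - Q_hat(s_t, a_t)).
    dr = 0.0
    for t in reversed(range(len(rewards))):
        dr = v_hat[t] + importance_weights[t] * (rewards[t] + gamma * dr - q_hat_logged[t])
    return dr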
Code example #9
    def evaluate_predictions(self, prediction, all_states_prediction):
        print(all_states_prediction[:, 0].reshape(5, 5), "\n")
        print(all_states_prediction[:, 1].reshape(5, 5), "\n")
        print(all_states_prediction[:, 2].reshape(5, 5), "\n")
        print(all_states_prediction[:, 3].reshape(5, 5), "\n")

        error_sum = 0.0
        num_error_prints = 0
        for x in range(len(self.logged_states)):
            int_action = self._env.action_to_index(self.logged_actions[x])
            logged_value = self.logged_values[x][0]
            target_value = prediction[x][int_action]
            error = abs(logged_value - target_value)
            if num_error_prints < 10 and error > 0.2:
                print(
                    "GOT THIS STATE WRONG: ",
                    x,
                    self._env._pos(list(self.logged_states[x].keys())[0]),
                    self.logged_actions[x],
                    logged_value,
                    target_value,
                )
                num_error_prints += 1
                if num_error_prints == 10:
                    print("MAX ERRORS PRINTED")
            error_sum += error
        error_mean = error_sum / float(len(self.logged_states))

        logger.info("EVAL ERROR: {0:.3f}".format(error_mean))
        self.mc_loss.append(error_mean)

        target_propensities = Evaluator.softmax(
            prediction, GridworldEvaluator.SOFTMAX_TEMPERATURE
        )

        reward_inverse_propensity_score, reward_direct_method, reward_doubly_robust = self.doubly_robust_one_step_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_propensities,
            target_propensities,
            self.estimated_reward_values,
        )
        self.reward_inverse_propensity_score.append(reward_inverse_propensity_score)
        self.reward_direct_method.append(reward_direct_method)
        self.reward_doubly_robust.append(reward_doubly_robust)

        logger.info(
            "Reward Inverse Propensity Score              : normalized {0:.3f} raw {1:.3f}".format(
                reward_inverse_propensity_score.normalized,
                reward_inverse_propensity_score.raw,
            )
        )
        logger.info(
            "Reward Direct Method                         : normalized {0:.3f} raw {1:.3f}".format(
                reward_direct_method.normalized, reward_direct_method.raw
            )
        )
        logger.info(
            "Reward Doubly Robust P.E.                    : normalized {0:.3f} raw {1:.3f}".format(
                reward_doubly_robust.normalized, reward_doubly_robust.raw
            )
        )

        value_inverse_propensity_score, value_direct_method, value_doubly_robust = self.doubly_robust_one_step_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_values,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
        )
        self.value_inverse_propensity_score.append(value_inverse_propensity_score)
        self.value_direct_method.append(value_direct_method)
        self.value_doubly_robust.append(value_doubly_robust)

        logger.info(
            "Value Inverse Propensity Score               : normalized {0:.3f} raw {1:.3f}".format(
                value_inverse_propensity_score.normalized,
                value_inverse_propensity_score.raw,
            )
        )
        logger.info(
            "Value Direct Method                          : normalized {0:.3f} raw {1:.3f}".format(
                value_direct_method.normalized, value_direct_method.raw
            )
        )
        logger.info(
            "Value One-Step Doubly Robust P.E.            : normalized {0:.3f} raw {1:.3f}".format(
                value_doubly_robust.normalized, value_doubly_robust.raw
            )
        )

        sequential_doubly_robust = self.doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
        )
        self.value_sequential_doubly_robust.append(sequential_doubly_robust)

        logger.info(
            "Value Sequential Doubly Robust P.E.          : normalized {0:.3f} raw {1:.3f}".format(
                sequential_doubly_robust.normalized, sequential_doubly_robust.raw
            )
        )

        weighted_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
            num_j_steps=1,
            whether_self_normalize_importance_weights=True,
        )
        self.value_weighted_doubly_robust.append(weighted_doubly_robust)

        logger.info(
            "Value Weighted Sequential Doubly Robust P.E. : noramlized {0:.3f} raw {1:.3f}".format(
                weighted_doubly_robust.normalized, weighted_doubly_robust.raw
            )
        )

        magic_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
            num_j_steps=GridworldEvaluator.NUM_J_STEPS_FOR_MAGIC_ESTIMATOR,
            whether_self_normalize_importance_weights=True,
        )
        self.value_magic_doubly_robust.append(magic_doubly_robust)

        logger.info(
            "Value Magic Doubly Robust P.E.               : normalized {0:.3f} raw {1:.3f}".format(
                magic_doubly_robust.normalized, magic_doubly_robust.raw
            )
        )
Code example #10
    def train(self,
              training_samples: TrainingDataPage,
              evaluator: Optional[Evaluator] = None):

        if self.minibatch == 0:
            # Assume that the tensors are the right shape after the first minibatch
            assert (training_samples.states.shape[0] == self.minibatch_size
                    ), "Invalid shape: " + str(training_samples.states.shape)
            assert training_samples.actions.shape == torch.Size([
                self.minibatch_size, len(self._actions)
            ]), "Invalid shape: " + str(training_samples.actions.shape)
            assert training_samples.rewards.shape == torch.Size(
                [self.minibatch_size,
                 1]), "Invalid shape: " + str(training_samples.rewards.shape)
            assert (training_samples.next_states.shape ==
                    training_samples.states.shape), "Invalid shape: " + str(
                        training_samples.next_states.shape)
            assert (training_samples.not_terminals.shape ==
                    training_samples.rewards.shape), "Invalid shape: " + str(
                        training_samples.not_terminals.shape)
            if training_samples.possible_next_actions is not None:
                assert (
                    training_samples.possible_next_actions.shape ==
                    training_samples.actions.shape), "Invalid shape: " + str(
                        training_samples.possible_next_actions.shape)
            if training_samples.propensities is not None:
                assert (training_samples.propensities.shape == training_samples
                        .rewards.shape), "Invalid shape: " + str(
                            training_samples.propensities.shape)

        # Apply reward boost if specified
        reward_boosts = torch.sum(training_samples.actions.float() *
                                  self.reward_boosts,
                                  dim=1,
                                  keepdim=True)
        boosted_rewards = training_samples.rewards + reward_boosts

        self.minibatch += 1
        states = training_samples.states.detach().requires_grad_(True)
        actions = training_samples.actions
        rewards = boosted_rewards
        next_states = training_samples.next_states
        discount_tensor = torch.full(training_samples.time_diffs.shape,
                                     self.gamma).type(self.dtype)
        not_done_mask = training_samples.not_terminals

        if self.use_seq_num_diff_as_time_diff:
            discount_tensor = discount_tensor.pow(training_samples.time_diffs)

        if self.maxq_learning:
            # Compute max a' Q(s', a') over all possible actions using target network
            possible_next_actions = training_samples.possible_next_actions
            next_q_values = self.get_max_q_values(next_states,
                                                  possible_next_actions,
                                                  self.double_q_learning)
        else:
            # SARSA
            next_actions = training_samples.next_actions
            next_q_values = self.get_next_action_q_values(
                next_states, next_actions)

        filtered_next_q_vals = next_q_values * not_done_mask

        if self.minibatch < self.reward_burnin:
            target_q_values = rewards
        else:
            target_q_values = rewards + (discount_tensor *
                                         filtered_next_q_vals)

        # Get Q-value of action taken
        all_q_values = self.q_network(states)
        self.all_action_scores = all_q_values.detach()
        q_values = torch.sum(all_q_values * actions, 1, keepdim=True)

        loss = self.q_network_loss(q_values, target_q_values)
        self.loss = loss.detach()

        self.q_network_optimizer.zero_grad()
        loss.backward()
        if self.gradient_handler:
            self.gradient_handler(self.q_network.parameters())
        self.q_network_optimizer.step()

        if self.minibatch < self.reward_burnin:
            # Reward burnin: force target network
            self._soft_update(self.q_network, self.q_network_target, 1.0)
        else:
            # Use the soft update rule to update target network
            self._soft_update(self.q_network, self.q_network_target, self.tau)

        # get reward estimates
        reward_estimates = self.reward_network(states)
        self.reward_estimates = reward_estimates.detach()
        reward_estimates_for_logged_actions = reward_estimates.gather(
            1, actions.argmax(dim=1, keepdim=True))
        reward_loss = F.mse_loss(reward_estimates_for_logged_actions, rewards)
        self.reward_network_optimizer.zero_grad()
        reward_loss.backward()
        self.reward_network_optimizer.step()

        self.loss_reporter.report(td_loss=float(self.loss),
                                  reward_loss=float(reward_loss))

        training_metadata = {}
        if evaluator is not None:

            model_propensities = torch.from_numpy(
                Evaluator.softmax(self.all_action_scores.cpu().numpy(),
                                  self.rl_temperature))

            cpe_stats = BatchStatsForCPE(
                logged_actions=training_samples.actions,
                logged_propensities=training_samples.propensities,
                logged_rewards=rewards,
                logged_values=None,  # Compute at end of each epoch for CPE
                model_propensities=model_propensities,
                model_rewards=self.reward_estimates,
                model_values=self.all_action_scores,
                model_values_on_logged_actions=None,  # Compute at end of each epoch for CPE
                model_action_idxs=self.all_action_scores.argmax(dim=1,
                                                                keepdim=True),
            )
            evaluator.report(cpe_stats)
            training_metadata["model_rewards"] = self.reward_estimates.cpu(
            ).numpy()

        return training_metadata
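
Both trainers call self._soft_update to move the target network toward the online network. A sketch of that update, assuming it implements the usual Polyak averaging with blending factor tau (passing tau=1.0 during reward burn-in copies the weights outright):

import torch

def soft_update(source: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    # target <- tau * source + (1 - tau) * target, parameter by parameter.
    with torch.no_grad():
        for src_param, tgt_param in zip(source.parameters(), target.parameters()):
            tgt_param.data.copy_(tau * src_param.data + (1.0 - tau) * tgt_param.data)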
Code example #11
    def train(self, training_samples: TrainingDataPage):

        if self.minibatch == 0:
            # Assume that the tensors are the right shape after the first minibatch
            assert (training_samples.states.shape[0] == self.minibatch_size
                    ), "Invalid shape: " + str(training_samples.states.shape)
            assert training_samples.actions.shape == torch.Size([
                self.minibatch_size, len(self._actions)
            ]), "Invalid shape: " + str(training_samples.actions.shape)
            assert training_samples.rewards.shape == torch.Size(
                [self.minibatch_size,
                 1]), "Invalid shape: " + str(training_samples.rewards.shape)
            assert (training_samples.next_states.shape ==
                    training_samples.states.shape), "Invalid shape: " + str(
                        training_samples.next_states.shape)
            assert (training_samples.not_terminal.shape ==
                    training_samples.rewards.shape), "Invalid shape: " + str(
                        training_samples.not_terminal.shape)
            if training_samples.possible_next_actions_mask is not None:
                assert (
                    training_samples.possible_next_actions_mask.shape ==
                    training_samples.actions.shape), (
                        "Invalid shape: " +
                        str(training_samples.possible_next_actions_mask.shape))
            if training_samples.propensities is not None:
                assert (training_samples.propensities.shape == training_samples
                        .rewards.shape), "Invalid shape: " + str(
                            training_samples.propensities.shape)
            if training_samples.metrics is not None:
                assert (
                    training_samples.metrics.shape[0] == self.minibatch_size
                ), "Invalid shape: " + str(training_samples.metrics.shape)

        boosted_rewards = self.boost_rewards(training_samples.rewards,
                                             training_samples.actions)

        self.minibatch += 1
        states = training_samples.states.detach().requires_grad_(True)
        actions = training_samples.actions
        rewards = boosted_rewards
        discount_tensor = torch.full(training_samples.time_diffs.shape,
                                     self.gamma).type(self.dtype)
        not_done_mask = training_samples.not_terminal

        if self.use_seq_num_diff_as_time_diff:
            discount_tensor = discount_tensor.pow(training_samples.time_diffs)

        all_next_q_values, all_next_q_values_target = self.get_detached_q_values(
            training_samples.next_states)
        if self.maxq_learning:
            # Compute max a' Q(s', a') over all possible actions using target network
            next_q_values, max_q_action_idxs = self.get_max_q_values(
                all_next_q_values,
                all_next_q_values_target,
                training_samples.possible_next_actions_mask,
            )
        else:
            # SARSA
            next_q_values, max_q_action_idxs = self.get_max_q_values(
                all_next_q_values,
                all_next_q_values_target,
                training_samples.next_actions,
            )

        filtered_next_q_vals = next_q_values * not_done_mask

        if self.minibatch < self.reward_burnin:
            target_q_values = rewards
        else:
            target_q_values = rewards + (discount_tensor *
                                         filtered_next_q_vals)

        # Get Q-value of action taken
        all_q_values = self.q_network(states)
        self.all_action_scores = all_q_values.detach()
        q_values = torch.sum(all_q_values * actions, 1, keepdim=True)

        loss = self.q_network_loss(q_values, target_q_values)
        self.loss = loss.detach()

        self.q_network_optimizer.zero_grad()
        loss.backward()
        if self.gradient_handler:
            self.gradient_handler(self.q_network.parameters())
        self.q_network_optimizer.step()

        if self.minibatch < self.reward_burnin:
            # Reward burnin: force target network
            self._soft_update(self.q_network, self.q_network_target, 1.0)
        else:
            # Use the soft update rule to update target network
            self._soft_update(self.q_network, self.q_network_target, self.tau)

        if training_samples.metrics is None:
            metrics_reward_concat_real_vals = training_samples.rewards
        else:
            metrics_reward_concat_real_vals = torch.cat(
                (training_samples.metrics, training_samples.rewards), dim=1)

        ######### Train separate reward network for CPE evaluation #############
        reward_estimates = self.reward_network(states)
        logged_action_idxs = actions.argmax(dim=1, keepdim=True)
        reward_estimates_for_logged_actions = reward_estimates.gather(
            1, self.reward_idx_offsets + logged_action_idxs)
        reward_loss = F.mse_loss(reward_estimates_for_logged_actions,
                                 metrics_reward_concat_real_vals)
        self.reward_network_optimizer.zero_grad()
        reward_loss.backward()
        self.reward_network_optimizer.step()

        ######### Train separate q-network for CPE evaluation #############
        metric_q_values = self.q_network_cpe(states).gather(
            1, self.reward_idx_offsets + logged_action_idxs)
        metric_target_q_values = self.q_network_cpe_target(states).detach()
        max_q_values_metrics = metric_target_q_values.gather(
            1, self.reward_idx_offsets + max_q_action_idxs)
        filtered_max_q_values_metrics = max_q_values_metrics * not_done_mask
        if self.minibatch < self.reward_burnin:
            target_metric_q_values = metrics_reward_concat_real_vals
        else:
            target_metric_q_values = metrics_reward_concat_real_vals + (
                discount_tensor * filtered_max_q_values_metrics)
        metric_q_value_loss = self.q_network_loss(metric_q_values,
                                                  target_metric_q_values)
        self.q_network_cpe.zero_grad()
        metric_q_value_loss.backward()
        self.q_network_cpe_optimizer.step()

        if self.minibatch < self.reward_burnin:
            # Reward burnin: force target network
            self._soft_update(self.q_network_cpe, self.q_network_cpe_target,
                              1.0)
        else:
            # Use the soft update rule to update target network
            self._soft_update(self.q_network_cpe, self.q_network_cpe_target,
                              self.tau)

        model_propensities = torch.from_numpy(
            Evaluator.softmax(self.all_action_scores.cpu().numpy(),
                              self.rl_temperature))
        self.loss_reporter.report(
            td_loss=self.loss,
            reward_loss=reward_loss,
            logged_actions=logged_action_idxs,
            logged_propensities=training_samples.propensities,
            logged_rewards=rewards,
            logged_values=None,  # Compute at end of each epoch for CPE
            model_propensities=model_propensities,
            model_rewards=reward_estimates[
                :,
                torch.arange(
                    self.reward_idx_offsets[0],
                    self.reward_idx_offsets[0] + self.num_actions,
                ),
            ],
            model_values=self.all_action_scores,
            model_values_on_logged_actions=None,  # Compute at end of each epoch for CPE
            model_action_idxs=self.all_action_scores.argmax(dim=1,
                                                            keepdim=True),
        )

        training_metadata = {}
        training_metadata["model_rewards"] = reward_estimates.detach().cpu(
        ).numpy()
        return training_metadata
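
Code example #11 delegates the reward boost to self.boost_rewards, while code example #10 computes it inline. A sketch of a helper consistent with that inline computation; the actual Horizon method may differ in its signature:

import torch

def boost_rewards(rewards: torch.Tensor, actions: torch.Tensor,
                  reward_boosts: torch.Tensor) -> torch.Tensor:
    # actions are one-hot; reward_boosts holds one additive boost per action.
    boosts = torch.sum(actions.float() * reward_boosts, dim=1, keepdim=True)
    return rewards + boosts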
Code example #12
    def evaluate(self, predictor):
        # Test feeding float features & int features
        if self.use_int_features:
            float_features, int_features = self._split_int_and_float_features(
                self.logged_states)
            # Since all gridworld features are float types, swap these so
            # all inputs are now int_features for testing purpose
            float_features, int_features = int_features, float_features
            prediction_string = predictor.predict(float_features, int_features)
        # Test only feeding float features
        else:
            prediction_string = predictor.predict(self.logged_states)

        # Convert action string to integer
        prediction = np.zeros([len(prediction_string),
                               len(self._env.ACTIONS)],
                              dtype=np.float32)
        for x in range(len(self.logged_states)):
            for action_index, action in enumerate(self._env.ACTIONS):
                prediction[x][action_index] = prediction_string[x][action]

        error_sum = 0.0
        for x in range(len(self.logged_states)):
            logged_value = self.logged_values[x][0]
            target_value = prediction_string[x][self.logged_actions[x]]
            error_sum += abs(logged_value - target_value)
        error_mean = error_sum / float(len(self.logged_states))

        logger.info("EVAL ERROR: {0:.3f}".format(error_mean))
        self.mc_loss.append(error_mean)

        target_propensities = Evaluator.softmax(
            prediction, GridworldEvaluator.SOFTMAX_TEMPERATURE)

        value_inverse_propensity_score, value_direct_method, value_doubly_robust = self.doubly_robust_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_values,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
        )
        self.value_inverse_propensity_score.append(
            value_inverse_propensity_score)
        self.value_direct_method.append(value_direct_method)
        self.value_doubly_robust.append(value_doubly_robust)

        logger.info("Value Inverse Propensity Score : {0:.3f}".format(
            value_inverse_propensity_score))
        logger.info("Value Direct Method            : {0:.3f}".format(
            value_direct_method))
        logger.info("Value Doubly Robust P.E.       : {0:.3f}".format(
            value_doubly_robust))

        reward_inverse_propensity_score, reward_direct_method, reward_doubly_robust = self.doubly_robust_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_propensities,
            target_propensities,
            self.estimated_reward_values,
        )
        self.reward_inverse_propensity_score.append(
            reward_inverse_propensity_score)
        self.reward_direct_method.append(reward_direct_method)
        self.reward_doubly_robust.append(reward_doubly_robust)

        logger.info("Reward Inverse Propensity Score: {0:.3f}".format(
            reward_inverse_propensity_score))
        logger.info("Reward Direct Method           : {0:.3f}".format(
            reward_direct_method))
        logger.info("Reward Doubly Robust P.E.      : {0:.3f}".format(
            reward_doubly_robust))