Example 1
    def evaluate(
        self,
        evaluator: Evaluator,
        logged_actions: Optional[np.ndarray],
        logged_propensities: Optional[np.ndarray],
        logged_values: Optional[np.ndarray],
    ):
        workspace.RunNet(self.all_q_score_model.net)
        all_action_scores = workspace.FetchBlob(self.all_q_score_output)
        maxq_action_idxs = workspace.FetchBlob(self.maxq_action_idxs)
        model_values_on_logged_actions = np.sum(
            (logged_actions * all_action_scores), axis=1, keepdims=True)
        model_propensities = Evaluator.softmax(all_action_scores,
                                               self.rl_temperature)
        logged_rewards = workspace.FetchBlob("rewards")

        evaluator.report(
            workspace.FetchBlob(self.loss_blob),
            logged_actions,
            logged_propensities,
            logged_rewards,
            logged_values,
            model_propensities,
            all_action_scores,
            model_values_on_logged_actions,
            maxq_action_idxs,
        )
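
The evaluate method above, like several later examples, turns raw Q-scores into model propensities with Evaluator.softmax(all_action_scores, self.rl_temperature). A minimal sketch of the temperature-scaled softmax it is assumed to compute (function name and layout here are illustrative, not the library's exact implementation):

import numpy as np

def temperature_softmax(scores, temperature):
    # Row-wise softmax over action scores, scaled by a temperature:
    # a high temperature flattens the distribution, a low one sharpens it.
    scaled = scores / temperature
    scaled = scaled - scaled.max(axis=1, keepdims=True)  # numerical stability
    exp_scores = np.exp(scaled)
    return exp_scores / exp_scores.sum(axis=1, keepdims=True)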
Example 2
    def evaluate(
        self,
        evaluator: Evaluator,
        logged_actions: Optional[np.ndarray],
        logged_propensities: Optional[np.ndarray],
        logged_rewards: Optional[np.ndarray],
        logged_values: Optional[np.ndarray],
    ):
        self.model_propensities, model_values_on_logged_actions, maxq_action_idxs = (
            None,
            None,
            None,
        )
        if self.all_action_scores is not None:
            self.all_action_scores = self.all_action_scores.cpu().numpy()
            self.model_propensities = Evaluator.softmax(
                self.all_action_scores, self.rl_temperature)
            maxq_action_idxs = self.all_action_scores.argmax(axis=1)
            if logged_actions is not None:
                model_values_on_logged_actions = np.sum(
                    (logged_actions * self.all_action_scores),
                    axis=1,
                    keepdims=True)

        evaluator.report(
            self.loss.cpu().numpy(),
            logged_actions,
            logged_propensities,
            logged_rewards,
            logged_values,
            self.model_propensities,
            self.all_action_scores,
            model_values_on_logged_actions,
            maxq_action_idxs,
        )
Example 3
    def __init__(self,
                 env,
                 assume_optimal_policy: bool,
                 use_int_features: bool = False) -> None:
        Evaluator.__init__(self, 1)

        self._env = env

        samples = env.generate_samples(200000, 1.0)
        self.logged_states = samples.states
        self.logged_actions = samples.actions
        self.logged_propensities = np.array(samples.propensities).reshape(
            -1, 1)
        # Create integer logged actions
        self.logged_actions_int: List[int] = []
        for action in self.logged_actions:
            self.logged_actions_int.append(self._env.action_to_index(action))

        self.logged_actions_one_hot = np.zeros(
            [len(self.logged_actions),
             len(env.ACTIONS)], dtype=np.float32)
        for i, action in enumerate(self.logged_actions):
            self.logged_actions_one_hot[i, env.action_to_index(action)] = 1

        self.logged_values = env.true_values_for_sample(
            self.logged_states, self.logged_actions, assume_optimal_policy)
        self.logged_rewards = env.true_rewards_for_sample(
            self.logged_states, self.logged_actions)

        self.estimated_ltv_values = np.zeros(
            [len(self.logged_states),
             len(self._env.ACTIONS)],
            dtype=np.float32)
        for action in range(len(self._env.ACTIONS)):
            self.estimated_ltv_values[:,
                                      action] = self._env.true_values_for_sample(
                                          self.logged_states,
                                          [self._env.index_to_action(action)] *
                                          len(self.logged_states),
                                          True,
                                      ).flatten()

        self.estimated_reward_values = np.zeros(
            [len(self.logged_states),
             len(self._env.ACTIONS)],
            dtype=np.float32)
        for action in range(len(self._env.ACTIONS)):
            self.estimated_reward_values[:,
                                         action] = self._env.true_rewards_for_sample(
                                             self.logged_states,
                                             [
                                                 self._env.index_to_action(
                                                     action)
                                             ] * len(self.logged_states),
                                         ).flatten()

        self.use_int_features = use_int_features
Example 4
    def evaluate(self, evaluator: Evaluator, logged_value: Optional[torch.Tensor]):
        evaluator.report(
            self.loss.cpu().numpy(),
            None,
            None,
            None,
            logged_value.cpu().numpy() if logged_value is not None else None,
            None,
            None,
            None,
            self.all_action_scores.cpu().numpy(),
            None,
        )
Example 5
    def evaluate(self, evaluator: Evaluator):
        # FIXME
        evaluator.report(
            self.loss.cpu().numpy(),
            None,
            None,
            None,
            None,
            None,
            None,
            None,
            self.all_action_scores.cpu().numpy(),
            None,
        )
Example 6
    def test_evaluator_ground_truth(self):
        environment = GridworldContinuous()
        states, actions, rewards, next_states, next_actions, is_terminal,\
            possible_next_actions, _ = environment.generate_samples(100000, 1.0)
        true_values = environment.true_values_for_sample(
            states, actions, False)
        # Hijack the reward timeline to insert the ground truth
        reward_timelines = []
        for tv in true_values:
            reward_timelines.append({0: tv})
        trainer = self.get_sarsa_trainer(environment)
        evaluator = Evaluator(trainer, DISCOUNT)
        tdps = environment.preprocess_samples(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
            self.minibatch_size,
        )

        for tdp in tdps:
            trainer.train_numpy(tdp, evaluator)

        self.assertLess(evaluator.td_loss[-1], 0.05)
        self.assertLess(evaluator.mc_loss[-1], 0.12)
Example 7
    def policy(self, states):
        with core.DeviceScope(self.c2_device):
            if isinstance(self.trainer, DiscreteActionTrainer):
                workspace.FeedBlob("states", states)
            elif isinstance(self.trainer, ContinuousActionDQNTrainer):
                num_actions = len(self.trainer.action_normalization_parameters)
                actions = np.eye(num_actions, dtype=np.float32)
                actions = np.tile(actions, reps=(len(states), 1))
                states = np.repeat(states, repeats=num_actions, axis=0)
                workspace.FeedBlob("states", states)
                workspace.FeedBlob("actions", actions)
            else:
                raise NotImplementedError(
                    "Invalid trainer passed to GymPredictor")
            workspace.RunNetOnce(self.trainer.internal_policy_model.net)
            policy_output_blob = self.trainer.internal_policy_output
            q_scores = workspace.FetchBlob(policy_output_blob)
            if isinstance(self.trainer, DiscreteActionTrainer):
                assert q_scores.shape[0] == 1
                q_scores = q_scores[0]
            q_scores_softmax = Evaluator.softmax(
                q_scores.reshape(1, -1), self.trainer.rl_temperature)[0]
            if np.isnan(q_scores_softmax).any() or np.max(q_scores_softmax) < 1e-3:
                q_scores_softmax[:] = 1.0 / q_scores_softmax.shape[0]
            policies = [
                np.argmax(q_scores),
                np.random.choice(q_scores.shape[0], p=q_scores_softmax),
            ]
            return policies
Example 8
    def evaluate(
        self,
        evaluator: Evaluator,
        logged_actions: Optional[np.ndarray],
        logged_propensities: Optional[np.ndarray],
        logged_values: Optional[np.ndarray],
    ):
        evaluator.report(
            self.loss.cpu().numpy(),
            None,
            None,
            None,
            logged_values,
            None,
            None,
            self.all_action_scores.cpu().numpy(),
            None,
        )
Example 9
    def evaluate(
        self,
        evaluator: Evaluator,
        logged_actions: torch.Tensor,
        logged_propensities: Optional[torch.Tensor],
        logged_rewards: torch.Tensor,
        logged_values: Optional[torch.Tensor],
    ):
        self.model_propensities, model_values_on_logged_actions, maxq_action_idxs = (
            None,
            None,
            None,
        )
        if self.all_action_scores is not None:
            self.all_action_scores = self.all_action_scores
            self.model_propensities = Evaluator.softmax(
                self.all_action_scores.cpu().numpy(), self.rl_temperature
            )
            maxq_action_idxs = self.all_action_scores.argmax(dim=1, keepdim=True)
            if logged_actions is not None:
                model_values_on_logged_actions = (
                    torch.sum(
                        (logged_actions * self.all_action_scores), dim=1, keepdim=True
                    )
                    .cpu()
                    .numpy()
                )

        evaluator.report(
            self.loss.cpu().numpy(),
            logged_actions.cpu().numpy(),
            logged_propensities.cpu().numpy()
            if logged_propensities is not None
            else None,
            logged_rewards.cpu().numpy(),
            logged_values.cpu().numpy() if logged_values is not None else None,
            self.model_propensities,
            self.reward_estimates.cpu().numpy(),
            self.all_action_scores.cpu().numpy(),
            model_values_on_logged_actions,
            maxq_action_idxs,
        )
Example 10
    def evaluate(
        self,
        evaluator: Evaluator,
        logged_actions: Optional[np.ndarray],
        logged_propensities: Optional[np.ndarray],
        logged_values: Optional[np.ndarray],
    ):
        workspace.RunNet(self.q_score_model.net)
        model_values_on_logged_actions = workspace.FetchBlob(self.q_score_output)

        evaluator.report(
            workspace.FetchBlob(self.loss_blob),
            None,
            None,
            None,
            logged_values,
            None,
            None,
            model_values_on_logged_actions,
        )
Example 11
    def test_evaluator_timeline(self):
        environment = Gridworld()
        samples = environment.generate_samples(100000, 1.0)
        trainer = self.get_sarsa_trainer(environment)
        evaluator = Evaluator(1)

        tdps = environment.preprocess_samples(samples, self.minibatch_size)
        for tdp in tdps:
            trainer.train_numpy(tdp, evaluator)

        self.assertLess(evaluator.td_loss[-1], 0.2)
        self.assertLess(evaluator.mc_loss[-1], 0.2)
Example 12
    def test_evaluator_ground_truth(self):
        environment = GridworldContinuous()
        samples = environment.generate_samples(500000, 1.0, DISCOUNT)
        # Hijack the reward timeline to insert the ground truth
        samples.episode_values = environment.true_values_for_sample(
            samples.states, samples.actions, False)
        trainer = self.get_sarsa_trainer(environment)
        evaluator = Evaluator(None, 10, DISCOUNT, None, None)
        tdps = environment.preprocess_samples(samples, self.minibatch_size)

        for tdp in tdps:
            trainer.train(tdp, evaluator)

        self.assertLess(evaluator.mc_loss[-1], 0.15)
Example 13
    def test_evaluator_timeline(self, environment):
        states, actions, rewards, next_states, next_actions, is_terminal,\
            possible_next_actions, reward_timelines = \
            environment.generate_samples(100000, 1.0)
        trainer = self.get_sarsa_trainer(environment)
        evaluator = Evaluator(trainer, DISCOUNT)

        tdp = environment.preprocess_samples(states, actions, rewards,
                                             next_states, next_actions,
                                             is_terminal,
                                             possible_next_actions,
                                             reward_timelines)
        trainer.stream_tdp(tdp, evaluator)

        self.assertLess(evaluator.td_loss[-1], 0.2)
        self.assertLess(evaluator.mc_loss[-1], 0.2)
Example 14
    def test_compute_episode_value_from_samples(self):
        samples = [
            MockSample("1", 3, 1),
            MockSample("1", 5, 2),
            MockSample("1", 6, 1),
            MockSample("3", 10, 2),
            MockSample("3", 11, 1),
            MockSample("6", 2, 3),
            MockSample("6", 4, 2),
            MockSample("6", 5, 0),
            MockSample("6", 8, 1),
        ]
        logged_values = Evaluator.compute_episode_value_from_samples(
            samples, 0.5)
        expected_values = [1.625, 2.5, 1, 2.5, 1, 3.515625, 2.0625, 0.125, 1]

        for i, val in enumerate(logged_values):
            self.assertEqual(val, expected_values[i])
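
The expected values above are per-MDP discounted returns with a discount of 0.5, where gaps in the sequence numbers raise the exponent. A short sketch that reproduces the first two expected values under that assumption (MockSample is taken to carry mdp_id, sequence_number and reward, in that order; plain tuples are used here for illustration):

def discounted_return(samples, gamma, start_idx):
    # samples: (mdp_id, sequence_number, reward) tuples sorted by sequence number
    mdp_id, start_seq, _ = samples[start_idx]
    total = 0.0
    for sid, seq, reward in samples[start_idx:]:
        if sid != mdp_id:
            break
        total += reward * gamma ** (seq - start_seq)
    return total

samples = [("1", 3, 1), ("1", 5, 2), ("1", 6, 1)]
assert discounted_return(samples, 0.5, 0) == 1.625  # 1 + 2 * 0.5**2 + 1 * 0.5**3
assert discounted_return(samples, 0.5, 1) == 2.5    # 2 + 1 * 0.5**1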
Example 15
    def test_evaluator_ground_truth(self):
        environment = Gridworld()
        samples = environment.generate_samples(200000, 1.0)
        true_values = environment.true_values_for_sample(
            samples.states, samples.actions, False)
        # Hijack the reward timeline to insert the ground truth
        samples.reward_timelines = []
        for tv in true_values:
            samples.reward_timelines.append({0: tv})
        trainer = self.get_sarsa_trainer(environment)
        evaluator = Evaluator(environment.ACTIONS, 10, DISCOUNT, None, None)
        tdps = environment.preprocess_samples(samples, self.minibatch_size)

        for _ in range(2):
            for tdp in tdps:
                trainer.train_numpy(tdp, evaluator)

        self.assertLess(evaluator.mc_loss[-1], 0.1)
Example 16
    def test_evaluator_ground_truth(self):
        environment = GridworldContinuous()
        samples = environment.generate_samples(200000, 1.0)
        true_values = environment.true_values_for_sample(
            samples.states, samples.actions, False)
        # Hijack the reward timeline to insert the ground truth
        samples.reward_timelines = []
        for tv in true_values:
            samples.reward_timelines.append({0: tv})
        trainer = self.get_sarsa_trainer(environment)
        evaluator = Evaluator(None, 10, DISCOUNT)
        tdps = environment.preprocess_samples(samples, self.minibatch_size)

        for tdp in tdps:
            tdp.rewards = tdp.rewards.flatten()
            tdp.not_terminals = tdp.not_terminals.flatten()
            trainer.train(tdp, evaluator)

        self.assertLess(evaluator.mc_loss[-1], 0.15)
Example 17
    def test_evaluator_ground_truth(self, environment):
        states, actions, rewards, next_states, next_actions, is_terminal,\
            possible_next_actions, _ = environment.generate_samples(100000, 1.0)
        true_values = environment.true_values_for_sample(
            states, actions, False)
        # Hijack the reward timeline to insert the ground truth
        reward_timelines = []
        for tv in true_values:
            reward_timelines.append({0: tv})
        trainer = self.get_sarsa_trainer(environment)
        evaluator = Evaluator(trainer, DISCOUNT)
        tdp = environment.preprocess_samples(states, actions, rewards,
                                             next_states, next_actions,
                                             is_terminal,
                                             possible_next_actions,
                                             reward_timelines)

        trainer.stream_tdp(tdp, evaluator)

        self.assertLess(evaluator.td_loss[-1], 0.05)
        self.assertLess(evaluator.mc_loss[-1], 0.12)
Example 18
    def _test_evaluator_ground_truth_dueling(self,
                                             use_gpu=False,
                                             use_all_avail_gpus=False):
        environment = Gridworld()
        samples = environment.generate_samples(100000, 1.0, DISCOUNT)
        true_values = environment.true_values_for_sample(
            samples.states, samples.actions, False)
        # Hijack the reward timeline to insert the ground truth
        samples.episode_values = true_values
        trainer = self.get_sarsa_trainer(environment,
                                         True,
                                         use_gpu=use_gpu,
                                         use_all_avail_gpus=use_all_avail_gpus)
        evaluator = Evaluator(environment.ACTIONS, 10, DISCOUNT, None, None)
        tdps = environment.preprocess_samples(samples,
                                              self.minibatch_size,
                                              use_gpu=use_gpu)

        for tdp in tdps:
            trainer.train(tdp, evaluator)

        self.assertLess(evaluator.mc_loss[-1], 0.1)
Example 19
    def policy(self, states):
        with core.DeviceScope(self.c2_device):
            if isinstance(self.trainer, DiscreteActionTrainer):
                workspace.FeedBlob("states", states)
            else:
                raise NotImplementedError(
                    "Invalid trainer passed to GymPredictor")
            workspace.RunNetOnce(self.trainer.internal_policy_model.net)
            policy_output_blob = self.trainer.internal_policy_output
            q_scores = workspace.FetchBlob(policy_output_blob)
            if isinstance(self.trainer, DiscreteActionTrainer):
                assert q_scores.shape[0] == 1
                q_scores = q_scores[0]
            q_scores_softmax = Evaluator.softmax(
                q_scores.reshape(1, -1), self.trainer.rl_temperature)[0]
            if np.isnan(q_scores_softmax).any() or np.max(q_scores_softmax) < 1e-3:
                q_scores_softmax[:] = 1.0 / q_scores_softmax.shape[0]
            policies = [
                np.argmax(q_scores),
                np.random.choice(q_scores.shape[0], p=q_scores_softmax),
            ]
            return policies
Example 20
    def test_evaluator_timeline(self):
        environment = GridworldContinuous()
        states, actions, rewards, next_states, next_actions, is_terminal,\
            possible_next_actions, reward_timelines = \
            environment.generate_samples(100000, 1.0)
        trainer = self.get_sarsa_trainer(environment)
        evaluator = Evaluator(trainer, DISCOUNT)

        tdps = environment.preprocess_samples(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_next_actions,
            reward_timelines,
            self.minibatch_size,
        )
        for tdp in tdps:
            trainer.train_numpy(tdp, evaluator)

        self.assertLess(evaluator.td_loss[-1], 0.2)
        self.assertLess(evaluator.mc_loss[-1], 0.2)
Example 21
    def policy(self, states):
        if isinstance(self.trainer, DQNTrainer):
            input = states
        elif isinstance(self.trainer, ParametricDQNTrainer):
            num_actions = len(self.trainer.action_normalization_parameters)
            actions = np.eye(num_actions, dtype=np.float32)
            actions = np.tile(actions, reps=(len(states), 1))
            states = np.repeat(states, repeats=num_actions, axis=0)
            input = np.hstack((states, actions))
        else:
            raise NotImplementedError("Invalid trainer passed to GymPredictor")
        q_scores = self.trainer.internal_prediction(input)
        if isinstance(self.trainer, DQNTrainer):
            assert q_scores.shape[0] == 1
            q_scores = q_scores[0]
        q_scores_softmax = Evaluator.softmax(q_scores.reshape(1, -1),
                                             self.trainer.rl_temperature)[0]
        if np.isnan(q_scores_softmax).any() or np.max(q_scores_softmax) < 1e-3:
            q_scores_softmax[:] = 1.0 / q_scores_softmax.shape[0]
        policies = [
            np.argmax(q_scores),
            np.random.choice(q_scores.shape[0], p=q_scores_softmax),
        ]
        return policies
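
The GymPredictor policy methods (Examples 7, 19 and 21) all return a pair of candidate actions: the greedy argmax and a sample from the temperature softmax, with a uniform fallback when the softmax degenerates. A compact sketch of that selection pattern for a single state, using illustrative names rather than the library's API:

import numpy as np

def select_actions(q_scores, temperature, rng=np.random):
    # Return (greedy_action, sampled_action) for one state's Q-scores.
    q_scores = np.asarray(q_scores, dtype=np.float64)
    scaled = q_scores / temperature
    scaled = scaled - scaled.max()
    probs = np.exp(scaled)
    probs = probs / probs.sum()
    # Fall back to a uniform distribution when the softmax degenerates,
    # mirroring the NaN / near-zero guard in the examples above.
    if np.isnan(probs).any() or probs.max() < 1e-3:
        probs[:] = 1.0 / probs.shape[0]
    return int(np.argmax(q_scores)), int(rng.choice(q_scores.shape[0], p=probs))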
Example 22
    def evaluate(self, predictor):
        # test only float features
        predictions = predictor.predict(self.logged_states)
        estimated_reward_values = predictor.estimate_reward(self.logged_states)
        if isinstance(predictor.trainer, ParametricDQNTrainer):
            predictions = predictions.reshape([-1, self._env.action_dim])
            estimated_reward_values = estimated_reward_values.reshape(
                [-1, self._env.action_dim])

        value_error_sum = 0.0
        reward_error_sum = 0.0
        for i in range(len(self.logged_states)):
            logged_action = self.logged_actions[i]
            logged_value = self.logged_values[i][0]
            target_value = predictions[i][logged_action]
            value_error_sum += abs(logged_value - target_value)
            logged_reward = self.logged_rewards[i][0]
            estimated_reward = estimated_reward_values[i][logged_action]
            reward_error_sum += abs(logged_reward - estimated_reward)
        value_error_mean = value_error_sum / np.sum(np.abs(self.logged_values))
        reward_error_mean = reward_error_sum / np.sum(
            np.abs(self.logged_rewards))

        logger.info("EVAL Q-Value MAE ERROR: {0:.3f}".format(value_error_mean))
        self.mc_loss.append(value_error_mean)
        logger.info("EVAL REWARD MAE ERROR: {0:.3f}".format(reward_error_mean))
        self.reward_loss.append(reward_error_mean)

        target_propensities = Evaluator.softmax(
            predictions, GymEvaluator.SOFTMAX_TEMPERATURE)

        reward_inverse_propensity_score, reward_direct_method, reward_doubly_robust = self.doubly_robust_one_step_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_propensities,
            target_propensities,
            estimated_reward_values,
        )
        self.reward_inverse_propensity_score.append(
            reward_inverse_propensity_score)
        self.reward_direct_method.append(reward_direct_method)
        self.reward_doubly_robust.append(reward_doubly_robust)

        logger.info(
            "Reward Inverse Propensity Score              : normalized {0:.3f} raw {1:.3f}"
            .format(
                reward_inverse_propensity_score.normalized,
                reward_inverse_propensity_score.raw,
            ))
        logger.info(
            "Reward Direct Method                         : normalized {0:.3f} raw {1:.3f}"
            .format(reward_direct_method.normalized, reward_direct_method.raw))
        logger.info(
            "Reward Doubly Robust P.E.                    : normalized {0:.3f} raw {1:.3f}"
            .format(reward_doubly_robust.normalized, reward_doubly_robust.raw))

        value_inverse_propensity_score, value_direct_method, value_doubly_robust = self.doubly_robust_one_step_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_values,
            self.logged_propensities,
            target_propensities,
            predictions,
        )
        self.value_inverse_propensity_score.append(
            value_inverse_propensity_score)
        self.value_direct_method.append(value_direct_method)
        self.value_doubly_robust.append(value_doubly_robust)

        logger.info(
            "Value Inverse Propensity Score               : normalized {0:.3f} raw {1:.3f}"
            .format(
                value_inverse_propensity_score.normalized,
                value_inverse_propensity_score.raw,
            ))
        logger.info(
            "Value Direct Method                          : normalized {0:.3f} raw {1:.3f}"
            .format(value_direct_method.normalized, value_direct_method.raw))
        logger.info(
            "Value One-Step Doubly Robust P.E.            : normalized {0:.3f} raw {1:.3f}"
            .format(value_doubly_robust.normalized, value_doubly_robust.raw))

        sequential_doubly_robust = self.doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            predictions,
        )
        self.value_sequential_doubly_robust.append(sequential_doubly_robust)
        logger.info(
            "Value Sequential Doubly Robust P.E.          : normalized {0:.3f} raw {1:.3f}"
            .format(sequential_doubly_robust.normalized,
                    sequential_doubly_robust.raw))

        weighted_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            predictions,
            num_j_steps=1,
            whether_self_normalize_importance_weights=True,
        )
        self.value_weighted_doubly_robust.append(weighted_doubly_robust)

        logger.info(
            "Value Weighted Sequential Doubly Robust P.E. : noramlized {0:.3f} raw {1:.3f}"
            .format(weighted_doubly_robust.normalized,
                    weighted_doubly_robust.raw))

        magic_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            predictions,
            num_j_steps=GymEvaluator.NUM_J_STEPS_FOR_MAGIC_ESTIMATOR,
            whether_self_normalize_importance_weights=True,
        )
        self.value_magic_doubly_robust.append(magic_doubly_robust)

        logger.info(
            "Value Magic Doubly Robust P.E.               : normalized {0:.3f} raw {1:.3f}"
            .format(magic_doubly_robust.normalized, magic_doubly_robust.raw))

        avg_rewards, avg_discounted_rewards = self._env.run_ep_n_times(
            100, predictor, test=True)

        episode_starts = np.nonzero(self.logged_terminals.squeeze())[0] + 1
        logged_discounted_performance = (self.logged_values[0][0] + np.sum(
            self.logged_values[episode_starts[:-1]])) / np.sum(
                self.logged_terminals)

        true_discounted_value_PE = (avg_discounted_rewards /
                                    logged_discounted_performance)
        self.true_discounted_value_PE.append(true_discounted_value_PE)

        logger.info(
            "True Discounted Value P.E                    : normalized {0:.3f} raw {1:.3f}"
            .format(true_discounted_value_PE, avg_discounted_rewards))

        logged_performance = np.sum(self.logged_rewards) / np.sum(
            self.logged_terminals)

        true_value_PE = avg_rewards / logged_performance
        self.true_value_PE.append(true_value_PE)

        logger.info(
            "True Value P.E                               : normalized {0:.3f} raw {1:.3f}"
            .format(true_value_PE, avg_rewards))
Example 23
    def evaluate(self, predictor):
        # Test feeding float features & int features
        if self.use_int_features:
            float_features, int_features = self._split_int_and_float_features(
                self.logged_states)
            # Since all gridworld features are float types, swap these so
            # all inputs are now int_features for testing purpose
            float_features, int_features = int_features, float_features
            prediction_string = predictor.predict(float_features, int_features)
        # Test only feeding float features
        else:
            prediction_string = predictor.predict(self.logged_states)

        # Convert action string to integer
        prediction = np.zeros([len(prediction_string),
                               len(self._env.ACTIONS)],
                              dtype=np.float32)
        for x in range(len(self.logged_states)):
            for action_index, action in enumerate(self._env.ACTIONS):
                prediction[x][action_index] = prediction_string[x].get(
                    action, 1e-9)

        # Print out scores using all states
        all_states = []
        for x in self._env.STATES:
            all_states.append({x: 1.0})
        if self.use_int_features:
            all_states_float, all_states_int = self._split_int_and_float_features(
                all_states)
            all_states_prediction_string = predictor.predict(
                all_states_float, all_states_int)
        else:
            all_states_prediction_string = predictor.predict(all_states)
        all_states_prediction = np.zeros(
            [len(all_states_prediction_string),
             len(self._env.ACTIONS)],
            dtype=np.float32,
        )
        for x in range(len(all_states)):
            for action_index, action in enumerate(self._env.ACTIONS):
                all_states_prediction[x][
                    action_index] = all_states_prediction_string[x].get(
                        action, 1e-9)
        print(all_states_prediction[:, 0].reshape(5, 5), "\n")
        print(all_states_prediction[:, 1].reshape(5, 5), "\n")
        print(all_states_prediction[:, 2].reshape(5, 5), "\n")
        print(all_states_prediction[:, 3].reshape(5, 5), "\n")

        error_sum = 0.0
        num_error_prints = 0
        for x in range(len(self.logged_states)):
            logged_value = self.logged_values[x][0]
            target_value = prediction_string[x].get(self.logged_actions[x],
                                                    1e-9)
            error = abs(logged_value - target_value)
            if num_error_prints < 10 and error > 0.2:
                print(
                    "GOT THIS STATE WRONG: ",
                    x,
                    self._env._pos(list(self.logged_states[x].keys())[0]),
                    self.logged_actions[x],
                    logged_value,
                    target_value,
                )
                num_error_prints += 1
                if num_error_prints == 10:
                    print("MAX ERRORS PRINTED")
            error_sum += error
        error_mean = error_sum / float(len(self.logged_states))

        logger.info("EVAL ERROR: {0:.3f}".format(error_mean))
        self.mc_loss.append(error_mean)

        target_propensities = Evaluator.softmax(
            prediction, GridworldEvaluator.SOFTMAX_TEMPERATURE)

        reward_inverse_propensity_score, reward_direct_method, reward_doubly_robust = self.doubly_robust_one_step_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_propensities,
            target_propensities,
            self.estimated_reward_values,
        )
        self.reward_inverse_propensity_score.append(
            reward_inverse_propensity_score)
        self.reward_direct_method.append(reward_direct_method)
        self.reward_doubly_robust.append(reward_doubly_robust)

        logger.info(
            "Reward Inverse Propensity Score              : normalized {0:.3f} raw {1:.3f}"
            .format(
                reward_inverse_propensity_score.normalized,
                reward_inverse_propensity_score.raw,
            ))
        logger.info(
            "Reward Direct Method                         : normalized {0:.3f} raw {1:.3f}"
            .format(reward_direct_method.normalized, reward_direct_method.raw))
        logger.info(
            "Reward Doubly Robust P.E.                    : normalized {0:.3f} raw {1:.3f}"
            .format(reward_doubly_robust.normalized, reward_doubly_robust.raw))

        value_inverse_propensity_score, value_direct_method, value_doubly_robust = self.doubly_robust_one_step_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_values,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
        )
        self.value_inverse_propensity_score.append(
            value_inverse_propensity_score)
        self.value_direct_method.append(value_direct_method)
        self.value_doubly_robust.append(value_doubly_robust)

        logger.info(
            "Value Inverse Propensity Score               : normalized {0:.3f} raw {1:.3f}"
            .format(
                value_inverse_propensity_score.normalized,
                value_inverse_propensity_score.raw,
            ))
        logger.info(
            "Value Direct Method                          : normalized {0:.3f} raw {1:.3f}"
            .format(value_direct_method.normalized, value_direct_method.raw))
        logger.info(
            "Value One-Step Doubly Robust P.E.            : normalized {0:.3f} raw {1:.3f}"
            .format(value_doubly_robust.normalized, value_doubly_robust.raw))

        sequential_doubly_robust = self.doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
        )
        self.value_sequential_doubly_robust.append(sequential_doubly_robust)

        logger.info(
            "Value Sequential Doubly Robust P.E.          : normalized {0:.3f} raw {1:.3f}"
            .format(sequential_doubly_robust.normalized,
                    sequential_doubly_robust.raw))

        weighted_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
            num_j_steps=1,
            whether_self_normalize_importance_weights=True,
        )
        self.value_weighted_doubly_robust.append(weighted_doubly_robust)

        logger.info(
            "Value Weighted Sequential Doubly Robust P.E. : noramlized {0:.3f} raw {1:.3f}"
            .format(weighted_doubly_robust.normalized,
                    weighted_doubly_robust.raw))

        magic_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
            num_j_steps=GridworldEvaluator.NUM_J_STEPS_FOR_MAGIC_ESTIMATOR,
            whether_self_normalize_importance_weights=True,
        )
        self.value_magic_doubly_robust.append(magic_doubly_robust)

        logger.info(
            "Value Magic Doubly Robust P.E.               : normalized {0:.3f} raw {1:.3f}"
            .format(magic_doubly_robust.normalized, magic_doubly_robust.raw))
Example 24
    def evaluate_predictions(self, prediction, all_states_prediction):
        print(all_states_prediction[:, 0].reshape(5, 5), "\n")
        print(all_states_prediction[:, 1].reshape(5, 5), "\n")
        print(all_states_prediction[:, 2].reshape(5, 5), "\n")
        print(all_states_prediction[:, 3].reshape(5, 5), "\n")

        error_sum = 0.0
        num_error_prints = 0
        for x in range(len(self.logged_states)):
            int_action = self._env.action_to_index(self.logged_actions[x])
            logged_value = self.logged_values[x][0]
            target_value = prediction[x][int_action]
            error = abs(logged_value - target_value)
            if num_error_prints < 10 and error > 0.2:
                print(
                    "GOT THIS STATE WRONG: ",
                    x,
                    self._env._pos(list(self.logged_states[x].keys())[0]),
                    self.logged_actions[x],
                    logged_value,
                    target_value,
                )
                num_error_prints += 1
                if num_error_prints == 10:
                    print("MAX ERRORS PRINTED")
            error_sum += error
        error_mean = error_sum / float(len(self.logged_states))

        logger.info("EVAL ERROR: {0:.3f}".format(error_mean))
        self.mc_loss.append(error_mean)

        target_propensities = Evaluator.softmax(
            prediction, GridworldEvaluator.SOFTMAX_TEMPERATURE
        )

        reward_inverse_propensity_score, reward_direct_method, reward_doubly_robust = self.doubly_robust_one_step_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_propensities,
            target_propensities,
            self.estimated_reward_values,
        )
        self.reward_inverse_propensity_score.append(reward_inverse_propensity_score)
        self.reward_direct_method.append(reward_direct_method)
        self.reward_doubly_robust.append(reward_doubly_robust)

        logger.info(
            "Reward Inverse Propensity Score              : normalized {0:.3f} raw {1:.3f}".format(
                reward_inverse_propensity_score.normalized,
                reward_inverse_propensity_score.raw,
            )
        )
        logger.info(
            "Reward Direct Method                         : normalized {0:.3f} raw {1:.3f}".format(
                reward_direct_method.normalized, reward_direct_method.raw
            )
        )
        logger.info(
            "Reward Doubly Robust P.E.                    : normalized {0:.3f} raw {1:.3f}".format(
                reward_doubly_robust.normalized, reward_doubly_robust.raw
            )
        )

        value_inverse_propensity_score, value_direct_method, value_doubly_robust = self.doubly_robust_one_step_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_values,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
        )
        self.value_inverse_propensity_score.append(value_inverse_propensity_score)
        self.value_direct_method.append(value_direct_method)
        self.value_doubly_robust.append(value_doubly_robust)

        logger.info(
            "Value Inverse Propensity Score               : normalized {0:.3f} raw {1:.3f}".format(
                value_inverse_propensity_score.normalized,
                value_inverse_propensity_score.raw,
            )
        )
        logger.info(
            "Value Direct Method                          : normalized {0:.3f} raw {1:.3f}".format(
                value_direct_method.normalized, value_direct_method.raw
            )
        )
        logger.info(
            "Value One-Step Doubly Robust P.E.            : normalized {0:.3f} raw {1:.3f}".format(
                value_doubly_robust.normalized, value_doubly_robust.raw
            )
        )

        sequential_doubly_robust = self.doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
        )
        self.value_sequential_doubly_robust.append(sequential_doubly_robust)

        logger.info(
            "Value Sequential Doubly Robust P.E.          : normalized {0:.3f} raw {1:.3f}".format(
                sequential_doubly_robust.normalized, sequential_doubly_robust.raw
            )
        )

        weighted_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
            num_j_steps=1,
            whether_self_normalize_importance_weights=True,
        )
        self.value_weighted_doubly_robust.append(weighted_doubly_robust)

        logger.info(
            "Value Weighted Sequential Doubly Robust P.E. : noramlized {0:.3f} raw {1:.3f}".format(
                weighted_doubly_robust.normalized, weighted_doubly_robust.raw
            )
        )

        magic_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_terminals,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
            num_j_steps=GridworldEvaluator.NUM_J_STEPS_FOR_MAGIC_ESTIMATOR,
            whether_self_normalize_importance_weights=True,
        )
        self.value_magic_doubly_robust.append(magic_doubly_robust)

        logger.info(
            "Value Magic Doubly Robust P.E.               : normalized {0:.3f} raw {1:.3f}".format(
                magic_doubly_robust.normalized, magic_doubly_robust.raw
            )
        )
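
Examples 22, 23 and 24 all report metrics from doubly_robust_one_step_policy_estimation. A minimal sketch of the standard one-step doubly robust estimator those calls are assumed to implement, returning per-batch means for the IPS, direct method and DR scores (names and the return convention are illustrative):

import numpy as np

def doubly_robust_one_step(logged_actions_one_hot, logged_values,
                           logged_propensities, target_propensities,
                           estimated_values):
    # Direct method: expected estimated value under the target policy.
    direct_method = np.sum(target_propensities * estimated_values,
                           axis=1, keepdims=True)
    # Importance weight for the logged action: target propensity / logged propensity.
    target_prop_logged = np.sum(target_propensities * logged_actions_one_hot,
                                axis=1, keepdims=True)
    importance_weight = target_prop_logged / logged_propensities
    # Estimated value of the action that was actually logged.
    estimated_logged = np.sum(estimated_values * logged_actions_one_hot,
                              axis=1, keepdims=True)
    ips = importance_weight * logged_values
    dr = direct_method + importance_weight * (logged_values - estimated_logged)
    return ips.mean(), direct_method.mean(), dr.mean()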
Example 25
    def train(self,
              training_samples: TrainingDataPage,
              evaluator: Optional[Evaluator] = None):

        if self.minibatch == 0:
            # Assume that the tensors are the right shape after the first minibatch
            assert (training_samples.states.shape[0] == self.minibatch_size
                    ), "Invalid shape: " + str(training_samples.states.shape)
            assert training_samples.actions.shape == torch.Size([
                self.minibatch_size, len(self._actions)
            ]), "Invalid shape: " + str(training_samples.actions.shape)
            assert training_samples.rewards.shape == torch.Size(
                [self.minibatch_size,
                 1]), "Invalid shape: " + str(training_samples.rewards.shape)
            assert (training_samples.next_states.shape ==
                    training_samples.states.shape), "Invalid shape: " + str(
                        training_samples.next_states.shape)
            assert (training_samples.not_terminals.shape ==
                    training_samples.rewards.shape), "Invalid shape: " + str(
                        training_samples.not_terminals.shape)
            if training_samples.possible_next_actions is not None:
                assert (
                    training_samples.possible_next_actions.shape ==
                    training_samples.actions.shape), "Invalid shape: " + str(
                        training_samples.possible_next_actions.shape)
            if training_samples.propensities is not None:
                assert (training_samples.propensities.shape == training_samples
                        .rewards.shape), "Invalid shape: " + str(
                            training_samples.propensities.shape)

        # Apply reward boost if specified
        reward_boosts = torch.sum(training_samples.actions.float() *
                                  self.reward_boosts,
                                  dim=1,
                                  keepdim=True)
        boosted_rewards = training_samples.rewards + reward_boosts

        self.minibatch += 1
        states = training_samples.states.detach().requires_grad_(True)
        actions = training_samples.actions
        rewards = boosted_rewards
        next_states = training_samples.next_states
        discount_tensor = torch.full(training_samples.time_diffs.shape,
                                     self.gamma).type(self.dtype)
        not_done_mask = training_samples.not_terminals

        if self.use_seq_num_diff_as_time_diff:
            discount_tensor = discount_tensor.pow(training_samples.time_diffs)

        if self.maxq_learning:
            # Compute max a' Q(s', a') over all possible actions using target network
            possible_next_actions = training_samples.possible_next_actions
            next_q_values = self.get_max_q_values(next_states,
                                                  possible_next_actions,
                                                  self.double_q_learning)
        else:
            # SARSA
            next_actions = training_samples.next_actions
            next_q_values = self.get_next_action_q_values(
                next_states, next_actions)

        filtered_next_q_vals = next_q_values * not_done_mask

        if self.minibatch < self.reward_burnin:
            target_q_values = rewards
        else:
            target_q_values = rewards + (discount_tensor *
                                         filtered_next_q_vals)

        # Get Q-value of action taken
        all_q_values = self.q_network(states)
        self.all_action_scores = all_q_values.detach()
        q_values = torch.sum(all_q_values * actions, 1, keepdim=True)

        loss = self.q_network_loss(q_values, target_q_values)
        self.loss = loss.detach()

        self.q_network_optimizer.zero_grad()
        loss.backward()
        if self.gradient_handler:
            self.gradient_handler(self.q_network.parameters())
        self.q_network_optimizer.step()

        if self.minibatch < self.reward_burnin:
            # Reward burnin: force target network
            self._soft_update(self.q_network, self.q_network_target, 1.0)
        else:
            # Use the soft update rule to update target network
            self._soft_update(self.q_network, self.q_network_target, self.tau)

        # get reward estimates
        reward_estimates = self.reward_network(states)
        self.reward_estimates = reward_estimates.detach()
        reward_estimates_for_logged_actions = reward_estimates.gather(
            1, actions.argmax(dim=1, keepdim=True))
        reward_loss = F.mse_loss(reward_estimates_for_logged_actions, rewards)
        self.reward_network_optimizer.zero_grad()
        reward_loss.backward()
        self.reward_network_optimizer.step()

        self.loss_reporter.report(td_loss=float(self.loss),
                                  reward_loss=float(reward_loss))

        training_metadata = {}
        if evaluator is not None:

            model_propensities = torch.from_numpy(
                Evaluator.softmax(self.all_action_scores.cpu().numpy(),
                                  self.rl_temperature))

            cpe_stats = BatchStatsForCPE(
                logged_actions=training_samples.actions,
                logged_propensities=training_samples.propensities,
                logged_rewards=rewards,
                logged_values=None,  # Compute at end of each epoch for CPE
                model_propensities=model_propensities,
                model_rewards=self.reward_estimates,
                model_values=self.all_action_scores,
                model_values_on_logged_actions=None,  # Compute at end of each epoch for CPE
                model_action_idxs=self.all_action_scores.argmax(dim=1,
                                                                keepdim=True),
            )
            evaluator.report(cpe_stats)
            training_metadata["model_rewards"] = self.reward_estimates.cpu().numpy()

        return training_metadata
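
The reward burn-in branch above forces a hard copy of the online network into the target network (tau = 1.0) and then switches to a soft update. A sketch of the Polyak-style blend that _soft_update is assumed to perform (a minimal stand-in, not the trainer's exact method):

import torch

def soft_update(source, target, tau):
    # Blend each target parameter toward the corresponding source parameter.
    # tau = 1.0 reproduces the hard copy used during reward burn-in.
    with torch.no_grad():
        for src_param, tgt_param in zip(source.parameters(), target.parameters()):
            tgt_param.copy_(tau * src_param + (1.0 - tau) * tgt_param)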
Example 26
def train_network(params):
    logger.info("Running Parametric DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"])

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"])
    else:
        in_training_cpe_parameters = None

    trainer_params = ContinuousActionModelParameters(
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(params["training_data_path"],
                          batch_size=training_parameters.minibatch_size)
    state_normalization = read_norm_file(params["state_norm_data_path"])
    action_normalization = read_norm_file(params["action_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info("Read in batch data set {} of size {} examples. Data split "
                "into {} batches of size {}.".format(
                    params["training_data_path"],
                    len(dataset),
                    num_batches,
                    training_parameters.minibatch_size,
                ))

    trainer = ParametricDQNTrainer(
        trainer_params,
        state_normalization,
        action_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    state_preprocessor = Preprocessor(state_normalization, params["use_gpu"])
    action_preprocessor = Preprocessor(action_normalization, params["use_gpu"])

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            None,
            100,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
        )
    else:
        evaluator = Evaluator(
            None,
            100,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
        )

    start_time = time.time()
    for epoch in range(params["epochs"]):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch,
                                   params["epochs"])
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor,
                batch,
                action_preprocessor=action_preprocessor)

            tdp.set_type(trainer.dtype)
            trainer.train(tdp, evaluator)

            evaluator.collect_parametric_action_samples(
                mdp_ids=tdp.mdp_ids,
                sequence_numbers=tdp.sequence_numbers.cpu().numpy(),
                logged_state_actions=np.concatenate(
                    (tdp.states.cpu().numpy(), tdp.actions.cpu().numpy()),
                    axis=1),
                logged_rewards=tdp.rewards.cpu().numpy(),
                logged_propensities=tdp.propensities.cpu().numpy(),
                logged_terminals=(1.0 - tdp.not_terminals),
                possible_state_actions=tdp.state_pas_concat.cpu().numpy(),
                pas_lens=tdp.possible_actions_lengths.cpu().numpy(),
            )

        cpe_start_time = time.time()
        evaluator.recover_samples_to_be_unshuffled()
        evaluator.score_cpe(trainer_params.rl.gamma)
        evaluator.clear_collected_samples()
        logger.info("CPE evaluation took {} seconds.".format(time.time() -
                                                             cpe_start_time))

    through_put = (len(dataset) * params["epochs"]) / (time.time() -
                                                       start_time)
    logger.info("Training finished. Processed ~{} examples / s.".format(
        round(through_put)))

    return export_trainer_and_predictor(trainer, params["model_output_path"])
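
train_network reads its configuration from a flat params dictionary. A hedged sketch of the keys the function above actually dereferences; the concrete values and paths are placeholders, and the nested dicts stand in for whatever kwargs RLParameters, TrainingParameters, RainbowDQNParameters and InTrainingCPEParameters accept:

params = {
    "training_data_path": "/path/to/training_data.json",
    "state_norm_data_path": "/path/to/state_norm.json",
    "action_norm_data_path": "/path/to/action_norm.json",
    "model_output_path": "/path/to/output",
    "use_gpu": False,
    "use_all_avail_gpus": False,
    "epochs": 2,
    "rl": {},                               # kwargs for RLParameters
    "training": {"minibatch_size": 1024},   # kwargs for TrainingParameters
    "rainbow": {},                          # kwargs for RainbowDQNParameters
    "in_training_cpe": None,                # or kwargs for InTrainingCPEParameters
}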
Example 27
def train_network(params):
    writer = None
    if params["model_output_path"] is not None:
        writer = SummaryWriter(
            log_dir=os.path.join(
                os.path.expanduser(params["model_output_path"]), "training_data"
            )
        )

    logger.info("Running DQN workflow with params:")
    logger.info(params)

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"]
        )
    else:
        in_training_cpe_parameters = None

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    state_normalization = read_norm_file(params["state_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = DQNTrainer(trainer_params, state_normalization, params["use_gpu"])
    trainer = update_model_for_warm_start(trainer)
    preprocessor = Preprocessor(state_normalization, params["use_gpu"])

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            trainer_params.actions,
            10,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
        )
    else:
        evaluator = Evaluator(
            trainer_params.actions,
            10,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
        )

    start_time = time.time()
    for epoch in range(int(params["epochs"])):
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch, int(params["epochs"]))
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)

            trainer.train(tdp)

            trainer.evaluate(
                evaluator, tdp.actions, None, tdp.rewards, tdp.episode_values
            )

            evaluator.collect_discrete_action_samples(
                mdp_ids=tdp.mdp_ids,
                sequence_numbers=tdp.sequence_numbers.cpu().numpy(),
                states=tdp.states.cpu().numpy(),
                logged_actions=tdp.actions.cpu().numpy(),
                logged_rewards=tdp.rewards.cpu().numpy(),
                logged_propensities=tdp.propensities.cpu().numpy(),
                logged_terminals=np.invert(
                    tdp.not_terminals.cpu().numpy().astype(bool)
                ),
            )

        cpe_start_time = time.time()
        evaluator.recover_samples_to_be_unshuffled()
        evaluator.score_cpe()
        if writer is not None:
            evaluator.log_to_tensorboard(writer, epoch)
        evaluator.clear_collected_samples()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * int(params["epochs"])) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    if writer is not None:
        writer.close()

    return export_trainer_and_predictor(trainer, params["model_output_path"])
Example 28
    def evaluate(self, predictor):
        # Test feeding float features & int features
        if self.use_int_features:
            float_features, int_features = self._split_int_and_float_features(
                self.logged_states)
            # Since all gridworld features are float types, swap these so
            # all inputs are now int_features for testing purpose
            float_features, int_features = int_features, float_features
            prediction_string = predictor.predict(float_features, int_features)
        # Test only feeding float features
        else:
            prediction_string = predictor.predict(self.logged_states)

        # Convert action string to integer
        prediction = np.zeros([len(prediction_string),
                               len(self._env.ACTIONS)],
                              dtype=np.float32)
        for x in range(len(self.logged_states)):
            for action_index, action in enumerate(self._env.ACTIONS):
                prediction[x][action_index] = prediction_string[x][action]

        error_sum = 0.0
        for x in range(len(self.logged_states)):
            logged_value = self.logged_values[x][0]
            target_value = prediction_string[x][self.logged_actions[x]]
            error_sum += abs(logged_value - target_value)
        error_mean = error_sum / float(len(self.logged_states))

        logger.info("EVAL ERROR: {0:.3f}".format(error_mean))
        self.mc_loss.append(error_mean)

        target_propensities = Evaluator.softmax(
            prediction, GridworldEvaluator.SOFTMAX_TEMPERATURE)

        value_inverse_propensity_score, value_direct_method, value_doubly_robust = self.doubly_robust_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_values,
            self.logged_propensities,
            target_propensities,
            self.estimated_ltv_values,
        )
        self.value_inverse_propensity_score.append(
            value_inverse_propensity_score)
        self.value_direct_method.append(value_direct_method)
        self.value_doubly_robust.append(value_doubly_robust)

        logger.info("Value Inverse Propensity Score : {0:.3f}".format(
            value_inverse_propensity_score))
        logger.info("Value Direct Method            : {0:.3f}".format(
            value_direct_method))
        logger.info("Value Doubly Robust P.E.       : {0:.3f}".format(
            value_doubly_robust))

        reward_inverse_propensity_score, reward_direct_method, reward_doubly_robust = self.doubly_robust_policy_estimation(
            self.logged_actions_one_hot,
            self.logged_rewards,
            self.logged_propensities,
            target_propensities,
            self.estimated_reward_values,
        )
        self.reward_inverse_propensity_score.append(
            reward_inverse_propensity_score)
        self.reward_direct_method.append(reward_direct_method)
        self.reward_doubly_robust.append(reward_doubly_robust)

        logger.info("Reward Inverse Propensity Score: {0:.3f}".format(
            reward_inverse_propensity_score))
        logger.info("Reward Direct Method           : {0:.3f}".format(
            reward_direct_method))
        logger.info("Reward Doubly Robust P.E.      : {0:.3f}".format(
            reward_doubly_robust))
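
The three estimates logged above come from `doubly_robust_policy_estimation`. A minimal NumPy sketch of the standard inverse-propensity-scoring, direct-method, and doubly-robust estimators it is assumed to follow; the in-repo version may additionally normalize by the average logged value, so treat this as illustrative:

import numpy as np

def doubly_robust_sketch(actions_one_hot, logged_values, logged_propensities,
                         target_propensities, estimated_values):
    # Probability the target policy assigns to the logged action, and the
    # resulting importance weight against the logging policy.
    target_prop_on_logged = np.sum(target_propensities * actions_one_hot,
                                   axis=1, keepdims=True)
    importance_weight = target_prop_on_logged / logged_propensities

    # Inverse propensity scoring: reweight the logged outcome.
    ips = importance_weight * logged_values

    # Direct method: expected model value under the target policy.
    dm = np.sum(target_propensities * estimated_values, axis=1, keepdims=True)

    # Doubly robust: direct method plus an importance-weighted residual.
    model_value_on_logged = np.sum(estimated_values * actions_one_hot,
                                   axis=1, keepdims=True)
    dr = dm + importance_weight * (logged_values - model_value_on_logged)

    return float(ips.mean()), float(dm.mean()), float(dr.mean())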
Example n. 29
    def get_recent_reward_loss(self):
        return Evaluator.calculate_recent_window_average(
            self.reward_loss, Evaluator.RECENT_WINDOW_SIZE, num_entries=1)
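
`calculate_recent_window_average` is assumed to average the tail of a metric history over a fixed window; a hypothetical sketch that ignores the `num_entries` argument:

import numpy as np

def recent_window_average(history, window_size):
    # Mean of the most recent `window_size` entries; NaN if nothing was recorded yet.
    if len(history) == 0:
        return float("nan")
    return float(np.mean(history[-window_size:]))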
Example n. 30
    def train(self, training_samples: TrainingDataPage):

        if self.minibatch == 0:
            # Only check tensor shapes on the first minibatch; assume they stay consistent afterwards
            assert (training_samples.states.shape[0] == self.minibatch_size
                    ), "Invalid shape: " + str(training_samples.states.shape)
            assert training_samples.actions.shape == torch.Size([
                self.minibatch_size, len(self._actions)
            ]), "Invalid shape: " + str(training_samples.actions.shape)
            assert training_samples.rewards.shape == torch.Size(
                [self.minibatch_size,
                 1]), "Invalid shape: " + str(training_samples.rewards.shape)
            assert (training_samples.next_states.shape ==
                    training_samples.states.shape), "Invalid shape: " + str(
                        training_samples.next_states.shape)
            assert (training_samples.not_terminal.shape ==
                    training_samples.rewards.shape), "Invalid shape: " + str(
                        training_samples.not_terminal.shape)
            if training_samples.possible_next_actions_mask is not None:
                assert (
                    training_samples.possible_next_actions_mask.shape ==
                    training_samples.actions.shape), (
                        "Invalid shape: " +
                        str(training_samples.possible_next_actions_mask.shape))
            if training_samples.propensities is not None:
                assert (training_samples.propensities.shape == training_samples
                        .rewards.shape), "Invalid shape: " + str(
                            training_samples.propensities.shape)
            if training_samples.metrics is not None:
                assert (
                    training_samples.metrics.shape[0] == self.minibatch_size
                ), "Invalid shape: " + str(training_samples.metrics.shape)

        boosted_rewards = self.boost_rewards(training_samples.rewards,
                                             training_samples.actions)

        self.minibatch += 1
        states = training_samples.states.detach().requires_grad_(True)
        actions = training_samples.actions
        rewards = boosted_rewards
        discount_tensor = torch.full(training_samples.time_diffs.shape,
                                     self.gamma).type(self.dtype)
        not_done_mask = training_samples.not_terminal

        if self.use_seq_num_diff_as_time_diff:
            discount_tensor = discount_tensor.pow(training_samples.time_diffs)

        all_next_q_values, all_next_q_values_target = self.get_detached_q_values(
            training_samples.next_states)
        if self.maxq_learning:
            # Compute max a' Q(s', a') over all possible actions using target network
            next_q_values, max_q_action_idxs = self.get_max_q_values(
                all_next_q_values,
                all_next_q_values_target,
                training_samples.possible_next_actions_mask,
            )
        else:
            # SARSA: use the Q-value of the next action actually taken (on-policy target)
            next_q_values, max_q_action_idxs = self.get_max_q_values(
                all_next_q_values,
                all_next_q_values_target,
                training_samples.next_actions,
            )

        filtered_next_q_vals = next_q_values * not_done_mask

        if self.minibatch < self.reward_burnin:
            target_q_values = rewards
        else:
            target_q_values = rewards + (discount_tensor *
                                         filtered_next_q_vals)

        # Get Q-value of action taken
        all_q_values = self.q_network(states)
        self.all_action_scores = all_q_values.detach()
        q_values = torch.sum(all_q_values * actions, 1, keepdim=True)

        loss = self.q_network_loss(q_values, target_q_values)
        self.loss = loss.detach()

        self.q_network_optimizer.zero_grad()
        loss.backward()
        if self.gradient_handler:
            self.gradient_handler(self.q_network.parameters())
        self.q_network_optimizer.step()

        if self.minibatch < self.reward_burnin:
            # Reward burn-in: hard-copy the online network into the target network
            self._soft_update(self.q_network, self.q_network_target, 1.0)
        else:
            # Use the soft update rule to update target network
            self._soft_update(self.q_network, self.q_network_target, self.tau)

        if training_samples.metrics is None:
            metrics_reward_concat_real_vals = training_samples.rewards
        else:
            metrics_reward_concat_real_vals = torch.cat(
                (training_samples.metrics, training_samples.rewards), dim=1)

        ######### Train separate reward network for CPE evaluation #############
        reward_estimates = self.reward_network(states)
        logged_action_idxs = actions.argmax(dim=1, keepdim=True)
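        # reward_estimates has one output column per (metric, action) pair, laid out in
        # contiguous blocks of num_actions columns per metric (see the model_rewards
        # slice passed to the loss reporter below). Adding the per-metric offsets to the
        # logged action index gathers, for every metric, the estimate of the action that
        # was actually taken.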
        reward_estimates_for_logged_actions = reward_estimates.gather(
            1, self.reward_idx_offsets + logged_action_idxs)
        reward_loss = F.mse_loss(reward_estimates_for_logged_actions,
                                 metrics_reward_concat_real_vals)
        self.reward_network_optimizer.zero_grad()
        reward_loss.backward()
        self.reward_network_optimizer.step()

        ######### Train separate q-network for CPE evaluation #############
        metric_q_values = self.q_network_cpe(states).gather(
            1, self.reward_idx_offsets + logged_action_idxs)
        metric_target_q_values = self.q_network_cpe_target(states).detach()
        max_q_values_metrics = metric_target_q_values.gather(
            1, self.reward_idx_offsets + max_q_action_idxs)
        filtered_max_q_values_metrics = max_q_values_metrics * not_done_mask
        if self.minibatch < self.reward_burnin:
            target_metric_q_values = metrics_reward_concat_real_vals
        else:
            target_metric_q_values = metrics_reward_concat_real_vals + (
                discount_tensor * filtered_max_q_values_metrics)
        metric_q_value_loss = self.q_network_loss(metric_q_values,
                                                  target_metric_q_values)
        self.q_network_cpe.zero_grad()
        metric_q_value_loss.backward()
        self.q_network_cpe_optimizer.step()

        if self.minibatch < self.reward_burnin:
            # Reward burn-in: hard-copy the online network into the target network
            self._soft_update(self.q_network_cpe, self.q_network_cpe_target,
                              1.0)
        else:
            # Use the soft update rule to update target network
            self._soft_update(self.q_network_cpe, self.q_network_cpe_target,
                              self.tau)

        model_propensities = torch.from_numpy(
            Evaluator.softmax(self.all_action_scores.cpu().numpy(),
                              self.rl_temperature))
        self.loss_reporter.report(
            td_loss=self.loss,
            reward_loss=reward_loss,
            logged_actions=logged_action_idxs,
            logged_propensities=training_samples.propensities,
            logged_rewards=rewards,
            logged_values=None,  # Compute at end of each epoch for CPE
            model_propensities=model_propensities,
            model_rewards=reward_estimates[
                :,
                torch.arange(
                    self.reward_idx_offsets[0],
                    self.reward_idx_offsets[0] + self.num_actions,
                ),
            ],
            model_values=self.all_action_scores,
            model_values_on_logged_actions=None,  # Compute at end of each epoch for CPE
            model_action_idxs=self.all_action_scores.argmax(dim=1, keepdim=True),
        )

        training_metadata = {}
        training_metadata["model_rewards"] = reward_estimates.detach().cpu(
        ).numpy()
        return training_metadata
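
`_soft_update(network, target_network, tau)` is used above both for the hard copy during reward burn-in (tau=1.0) and for the gradual target updates afterwards. A minimal sketch of the conventional Polyak-averaging rule it presumably implements:

import torch

def soft_update_sketch(network: torch.nn.Module,
                       target_network: torch.nn.Module,
                       tau: float) -> None:
    # target <- tau * online + (1 - tau) * target; tau == 1.0 is a hard copy.
    with torch.no_grad():
        for param, target_param in zip(network.parameters(),
                                       target_network.parameters()):
            target_param.copy_(tau * param + (1.0 - tau) * target_param)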