    def gather_static_shared_stats(
            self, evaluation_dataset_as_transitions: List[Transition],
            batch_size: int, reward_model: Architecture,
            network_keys: List) -> None:
        all_reward_model_rewards = []
        all_old_policy_probs = []
        all_rewards = []
        all_actions = []

        for i in range(
                math.ceil(len(evaluation_dataset_as_transitions) /
                          batch_size)):
            batch = evaluation_dataset_as_transitions[i * batch_size:(i + 1) *
                                                      batch_size]
            batch_for_inference = Batch(batch)

            all_reward_model_rewards.append(
                reward_model.predict(batch_for_inference.states(network_keys)))
            all_rewards.append(batch_for_inference.rewards())
            all_actions.append(batch_for_inference.actions())
            all_old_policy_probs.append(
                batch_for_inference.info('all_action_probabilities')[
                    range(len(batch_for_inference.actions())),
                    batch_for_inference.actions()])

        self.all_reward_model_rewards = np.concatenate(
            all_reward_model_rewards, axis=0)
        self.all_old_policy_probs = np.concatenate(all_old_policy_probs,
                                                   axis=0)
        self.all_rewards = np.concatenate(all_rewards, axis=0)
        self.all_actions = np.concatenate(all_actions, axis=0)

        # mark that the static shared data was collected and is ready to be used
        self.is_gathered_static_shared_data = True
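
Once `is_gathered_static_shared_data` is set, the cached NumPy buffers can be consumed directly by downstream estimators. A minimal sketch, assuming a hypothetical `new_policy_probs` array holding the evaluated policy's probabilities for the logged actions, in the same order as the cached dataset:

import numpy as np

def ips_estimate_from_static_stats(agent, new_policy_probs: np.ndarray) -> float:
    # Hypothetical helper: inverse-propensity-scoring (IPS) estimate of the
    # average per-step reward, built from the buffers cached above.
    assert agent.is_gathered_static_shared_data
    rho = new_policy_probs / agent.all_old_policy_probs  # importance weights
    return float(np.mean(rho * agent.all_rewards))
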
    def improve_reward_model(self, epochs: int):
        """
        Train a reward model to be used by the doubly-robust estimator

        :param epochs: The total number of epochs to use for training a reward model
        :return: None
        """
        batch_size = self.ap.network_wrappers['reward_model'].batch_size
        network_keys = self.ap.network_wrappers[
            'reward_model'].input_embedders_parameters.keys()

        # the reward model is fitted on the training dataset
        for epoch in range(epochs):
            loss = 0
            total_transitions_processed = 0
            for i, batch in enumerate(
                    self.call_memory('get_shuffled_data_generator',
                                     batch_size)):
                batch = Batch(batch)
                current_rewards_prediction_for_all_actions = self.networks[
                    'reward_model'].online_network.predict(
                        batch.states(network_keys))
                current_rewards_prediction_for_all_actions[
                    range(batch.size), batch.actions()] = batch.rewards()
                loss += self.networks['reward_model'].train_and_sync_networks(
                    batch.states(network_keys),
                    current_rewards_prediction_for_all_actions)[0]
                total_transitions_processed += batch.size

            log = OrderedDict()
            log['Epoch'] = epoch
            log['loss'] = loss / total_transitions_processed
            screen.log_dict(log, prefix='Training Reward Model')
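
The training target above is the network's own prediction everywhere except at the logged action, where it is replaced by the observed reward, so the regression loss only penalizes the reward predicted for the action that was actually taken. A standalone NumPy illustration of that target construction (toy values, not part of the agent):

import numpy as np

# Toy batch: 3 transitions, 4 discrete actions.
predictions = np.array([[0.1, 0.2, 0.3, 0.4],
                        [0.5, 0.6, 0.7, 0.8],
                        [0.9, 1.0, 1.1, 1.2]])
actions = np.array([2, 0, 3])
rewards = np.array([1.0, -1.0, 0.5])

targets = predictions.copy()
targets[np.arange(len(actions)), actions] = rewards
# `targets` differs from `predictions` only at the logged actions, so an MSE
# loss against it leaves the other action entries with zero error.
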
    def get_reward_model_loss(self, batch: Batch):
        network_keys = self.ap.network_wrappers[
            'reward_model'].input_embedders_parameters.keys()
        current_rewards_prediction_for_all_actions = self.networks[
            'reward_model'].online_network.predict(batch.states(network_keys))
        current_rewards_prediction_for_all_actions[
            range(batch.size), batch.actions()] = batch.rewards()

        # note: train_and_sync_networks also runs a training step on the batch;
        # the first returned element is the loss
        return self.networks['reward_model'].train_and_sync_networks(
            batch.states(network_keys),
            current_rewards_prediction_for_all_actions)[0]
    def fill_advantages(self, batch):
        batch = Batch(batch)
        network_keys = self.ap.network_wrappers[
            'critic'].input_embedders_parameters.keys()

        # * Found not to have any impact *
        # current_states_with_timestep = self.concat_state_and_timestep(batch)

        current_state_values = self.networks['critic'].online_network.predict(
            batch.states(network_keys)).squeeze()
        total_returns = batch.n_step_discounted_rewards()
        # calculate advantages
        advantages = []
        if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
            advantages = total_returns - current_state_values
        elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
            # get bootstraps
            episode_start_idx = 0
            advantages = np.array([])
            # current_state_values[batch.game_overs()] = 0
            for idx, game_over in enumerate(batch.game_overs()):
                if game_over:
                    # get advantages for the rollout
                    value_bootstrapping = np.zeros((1, ))
                    rollout_state_values = np.append(
                        current_state_values[episode_start_idx:idx + 1],
                        value_bootstrapping)

                    rollout_advantages, _ = \
                        self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx+1],
                                                                     rollout_state_values)
                    episode_start_idx = idx + 1
                    advantages = np.append(advantages, rollout_advantages)
        else:
            screen.warning(
                "WARNING: The requested policy gradient rescaler is not available"
            )

        # standardize
        advantages = (advantages - np.mean(advantages)) / np.std(advantages)

        # TODO: this will be problematic with a shared memory
        for transition, advantage in zip(self.memory.transitions, advantages):
            transition.info['advantage'] = advantage

        self.action_advantages.add_sample(advantages)
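
`get_general_advantage_estimation_values` is not shown in this listing; a minimal sketch of the standard GAE recursion it presumably implements, with hypothetical discount and lambda values, is:

import numpy as np

def gae_advantages(rewards: np.ndarray, values: np.ndarray,
                   discount: float = 0.99, gae_lambda: float = 0.95) -> np.ndarray:
    # Standard generalized advantage estimation for a single rollout.
    # `values` carries one extra bootstrap entry, as appended in fill_advantages.
    deltas = rewards + discount * values[1:] - values[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + discount * gae_lambda * running
        advantages[t] = running
    return advantages
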
    @staticmethod
    def _prepare_ope_shared_stats(dataset_as_transitions: List[Transition],
                                  batch_size: int, reward_model: Architecture,
                                  q_network: Architecture,
                                  network_keys: List) -> OpeSharedStats:
        """
        Do the preparations needed for the different off-policy estimators.
        Some of the calculations are shared, so we centralize all the work here.

        :param dataset_as_transitions: The evaluation dataset in the form of transitions.
        :param batch_size: The batch size to use.
        :param reward_model: A reward model to be used by the doubly-robust (DR) estimator.
        :param q_network: The Q network whose policy we evaluate.
        :param network_keys: The network keys used for feeding the neural networks.
        :return: An OpeSharedStats tuple with the shared statistics.
        """
        # IPS
        all_reward_model_rewards, all_policy_probs, all_old_policy_probs = [], [], []
        all_v_values_reward_model_based, all_v_values_q_model_based, all_rewards, all_actions = [], [], [], []

        for i in range(math.ceil(len(dataset_as_transitions) / batch_size)):
            batch = dataset_as_transitions[i * batch_size:(i + 1) * batch_size]
            batch_for_inference = Batch(batch)

            all_reward_model_rewards.append(
                reward_model.predict(batch_for_inference.states(network_keys)))

            # we always use the first Q head to calculate the OPEs. We might want to change this in the
            # future; for instance, for a bootstrapped agent this means the OPEs always come from the first QHead.
            q_values, sm_values = q_network.predict(
                batch_for_inference.states(network_keys),
                outputs=[
                    q_network.output_heads[0].q_values,
                    q_network.output_heads[0].softmax
                ])

            all_policy_probs.append(sm_values)
            all_v_values_reward_model_based.append(
                np.sum(all_policy_probs[-1] * all_reward_model_rewards[-1],
                       axis=1))
            all_v_values_q_model_based.append(
                np.sum(all_policy_probs[-1] * q_values, axis=1))
            all_rewards.append(batch_for_inference.rewards())
            all_actions.append(batch_for_inference.actions())
            all_old_policy_probs.append(
                batch_for_inference.info('all_action_probabilities')[
                    range(len(batch_for_inference.actions())),
                    batch_for_inference.actions()])

            for j, t in enumerate(batch):
                t.update_info({
                    'q_value': q_values[j],
                    'softmax_policy_prob': all_policy_probs[-1][j],
                    'v_value_q_model_based': all_v_values_q_model_based[-1][j],
                })

        all_reward_model_rewards = np.concatenate(all_reward_model_rewards,
                                                  axis=0)
        all_policy_probs = np.concatenate(all_policy_probs, axis=0)
        all_v_values_reward_model_based = np.concatenate(
            all_v_values_reward_model_based, axis=0)
        all_rewards = np.concatenate(all_rewards, axis=0)
        all_actions = np.concatenate(all_actions, axis=0)
        all_old_policy_probs = np.concatenate(all_old_policy_probs, axis=0)

        # probabilities of the logged actions under the new (evaluated) policy,
        # and the resulting per-step importance weights
        new_policy_prob = all_policy_probs[np.arange(all_actions.shape[0]),
                                           all_actions]
        rho_all_dataset = new_policy_prob / all_old_policy_probs

        return OpeSharedStats(all_reward_model_rewards, all_policy_probs,
                              all_v_values_reward_model_based, all_rewards,
                              all_actions, all_old_policy_probs,
                              new_policy_prob, rho_all_dataset)
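
The returned shared statistics are enough to compute the standard per-step OPE estimators. A minimal sketch of the usual inverse-propensity-scoring (IPS), direct-method (DM) and doubly-robust (DR) formulas, assuming `OpeSharedStats` exposes fields named after the constructor arguments above (a generic illustration, not necessarily the library's exact implementation):

import numpy as np

def basic_ope_estimates(stats) -> dict:
    # Standard per-step estimators computed from the shared statistics.
    n = len(stats.all_rewards)
    dm = float(np.mean(stats.all_v_values_reward_model_based))
    ips = float(np.mean(stats.rho_all_dataset * stats.all_rewards))
    # Reward-model prediction for the logged action of each transition.
    logged_action_model_reward = stats.all_reward_model_rewards[np.arange(n), stats.all_actions]
    dr = dm + float(np.mean(stats.rho_all_dataset *
                            (stats.all_rewards - logged_action_model_reward)))
    return {'IPS': ips, 'DM': dm, 'DR': dr}
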