    def merge_episodes(self, batch):
        """
        Merge the episodes of a batch into single, flat input arrays.

        :param batch: list of episode dicts, each holding per-step arrays
            ('states', 'actions', 'action_means', 'advantage', and, for
            continuous action spaces, 'action_log_stds')
        :return: tuple of (action_log_stds, action_means, actions,
            batch_advantage, states)
        """
        if self.continuous:
            action_log_stds = np.concatenate(
                [path['action_log_stds'] for path in batch])
            action_log_stds = np.expand_dims(action_log_stds, axis=1)
        else:
            action_log_stds = None

        action_means = np.concatenate([path['action_means'] for path in batch])
        actions = np.concatenate([path['actions'] for path in batch])
        batch_advantage = np.concatenate([path['advantage'] for path in batch])

        if self.normalize_advantage:
            batch_advantage = zero_mean_unit_variance(batch_advantage)

        batch_advantage = np.expand_dims(batch_advantage, axis=1)
        states = np.concatenate([path['states'] for path in batch])

        return action_log_stds, action_means, actions, batch_advantage, states
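
All of the examples here delegate to a zero_mean_unit_variance helper that is not shown in this excerpt. A minimal sketch of such a helper, assuming it standardizes a NumPy array to zero mean and unit variance (the epsilon guard is an added assumption, not necessarily in the original):

import numpy as np

def zero_mean_unit_variance(data, epsilon=1e-8):
    """Standardize an array to zero mean and unit variance."""
    # epsilon guards against division by zero for constant inputs;
    # the original helper may handle this case differently.
    return (data - data.mean()) / (data.std() + epsilon)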
Example #2
    def process(self, state):
        """
        Standardize the state data to zero mean and unit variance.

        :param state: state input array
        :return: the standardized state, cast to float32
        """
        return zero_mean_unit_variance(state.astype(np.float32))
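
For intuition, a quick check of what this preprocessing does to a concrete state (a usage sketch built on the helper sketched above; nothing here is taken from the original library):

import numpy as np

def zero_mean_unit_variance(data):
    # Stand-in matching the sketch above.
    return (data - data.mean()) / (data.std() + 1e-8)

state = np.array([10, 20, 30, 40])
new_state = zero_mean_unit_variance(state.astype(np.float32))
print(new_state)         # ~[-1.342 -0.447  0.447  1.342]
print(new_state.mean())  # ~0.0
print(new_state.std())   # ~1.0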
Example #3
    def generalised_advantage_estimation(self, episode):
        """
         Expects an episode, returns advantages according to config.
        """
        baseline = self.baseline_value_function.predict(episode)

        if self.generalized_advantage_estimation:
            # Bootstrap the value of the state after the final step:
            # zero if the episode terminated, otherwise repeat the last
            # baseline estimate.
            if episode['terminated']:
                adjusted_baseline = np.append(baseline, [0])
            else:
                adjusted_baseline = np.append(baseline, baseline[-1])
            # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
            deltas = (episode['rewards'] + self.gamma * adjusted_baseline[1:]
                      - adjusted_baseline[:-1])
            # GAE: discounted sum of TD residuals with factor gamma * lambda.
            advantage = discount(deltas, self.gamma * self.gae_lambda)
        else:
            # Plain advantage: empirical returns minus the baseline.
            advantage = episode['returns'] - baseline

        if self.normalize_advantage:
            return zero_mean_unit_variance(advantage)
        else:
            return advantage
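
The discount call above is also not defined in this excerpt. A common way to implement a discounted cumulative sum, y_t = sum_k gamma**k * x_{t+k}, is with scipy.signal.lfilter; this is a plausible sketch of what discount does, not necessarily the original implementation:

import numpy as np
import scipy.signal

def discount(x, gamma):
    """Discounted cumulative sum over a 1-D array."""
    # Filtering the reversed sequence with an IIR filter computes the
    # backward-in-time discounted sum; reversing again restores order.
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]

With gamma * gae_lambda as the discount factor, this turns the TD residuals into the generalized advantage estimates A_t = sum_k (gamma * lambda)**k * delta_{t+k}.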