# The snippets below assume module-level imports of numpy as np and the TensorForce `util` helpers.
    def reward_estimation(self, states, rewards, terminals):
        """Process rewards according to the configuration.

        Args:
            states:
            rewards:
            terminals:

        Returns:

        """
        discounted_rewards = util.cumulative_discount(
            values=rewards,
            terminals=terminals,
            discount=self.discount
        )

        if self.baseline:
            state_values = list()
            for name, state in states.items():
                state_value = self.baseline[name].predict(states=state)
                state_values.append(state_value)

            state_values = np.mean(state_values, axis=0)

            if self.gae_rewards:
                td_residuals = rewards + np.array([
                    self.discount * state_values[n + 1] - state_values[n]
                    if (n < len(state_values) - 1 and not terminal) else 0.0
                    for n, terminal in enumerate(terminals)
                ])
                rewards = util.cumulative_discount(
                    values=td_residuals,
                    terminals=terminals,
                    discount=(self.discount * self.gae_lambda)
                )
            else:
                rewards = discounted_rewards - state_values
        else:
            rewards = discounted_rewards

        mean = rewards.mean()
        stddev = rewards.std()
        self.logger.debug('Reward mean {} and variance {}.'.format(mean, stddev * stddev))

        if self.normalize_rewards:
            rewards = (rewards - mean) / max(stddev, util.epsilon)

        self.logger.debug('First ten rewards: {}.'.format(rewards[:10]))

        return rewards, discounted_rewards
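
# A minimal standalone sketch (module level, not part of the class above) of what
# `util.cumulative_discount(values, terminals, discount)` is assumed to compute here: a
# reverse discounted sum that resets at episode boundaries. The optional `cumulative_start`
# seeds the running sum (used for n-step bootstrapping further below). Illustrative only,
# not the TensorForce implementation.
import numpy as np

def _cumulative_discount_sketch(values, terminals, discount, cumulative_start=0.0):
    discounted = np.zeros(len(values))
    cumulative = cumulative_start
    for t in reversed(range(len(values))):
        if terminals[t]:
            cumulative = 0.0  # do not propagate value across episode ends
        cumulative = values[t] + discount * cumulative
        discounted[t] = cumulative
    return discounted

# Example: _cumulative_discount_sketch([1.0, 1.0, 1.0], [False, False, True], 0.9)
# -> [2.71, 1.9, 1.0]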

    def advantage_estimation(self, batch):
        """Expects a batch, returns advantages according to config.

        Args:
            batch: Dict containing at least 'states', 'rewards', 'terminals' and 'returns'.

        Returns:
            Advantage estimates, either via generalized advantage estimation or as returns
            minus baseline estimates.
        """
        if not self.baseline:
            return batch['returns']

        estimates = self.baseline.predict(states=batch['states'])
        if self.generalized_advantage_estimation:
            deltas = np.array([
                self.discount * estimates[n + 1] - estimates[n] if
                (n < len(estimates) - 1 and not terminal) else 0.0
                for n, terminal in enumerate(batch['terminals'])
            ])
            deltas += batch['rewards']
            advantage = util.cumulative_discount(rewards=deltas,
                                                 terminals=batch['terminals'],
                                                 discount=(self.discount *
                                                           self.gae_lambda))
        else:
            advantage = np.array(batch['returns']) - estimates

        if self.normalize_advantage:
            advantage -= advantage.mean()
            advantage /= advantage.std() + 1e-8

        return advantage
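
# Hedged illustration of the generalized advantage estimation branch above, with a
# hand-rolled helper standing in for the baseline and `util.cumulative_discount`.
# Everything here (names, numbers) is illustrative, not TensorForce API.
import numpy as np

def _gae_sketch(rewards, values, terminals, discount=0.99, gae_lambda=0.97):
    # TD residual per step: r_t plus (gamma * V(s_{t+1}) - V(s_t)); the value-difference
    # term is zeroed at the last step and at terminals, mirroring the snippet above.
    deltas = np.array([
        rewards[n] + (discount * values[n + 1] - values[n]
                      if (n < len(values) - 1 and not terminal) else 0.0)
        for n, terminal in enumerate(terminals)
    ])
    # Discounted sum of residuals with factor gamma * lambda, reset at terminals.
    advantage = np.zeros(len(deltas))
    cumulative = 0.0
    for t in reversed(range(len(deltas))):
        if terminals[t]:
            cumulative = 0.0
        cumulative = deltas[t] + discount * gae_lambda * cumulative
        advantage[t] = cumulative
    return advantage

# Example call with made-up values:
# _gae_sketch(rewards=[1.0, 0.0, 1.0], values=[0.5, 0.4, 0.2], terminals=[False, False, True])
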
Example #3
    def reward_estimation(self, states, rewards, terminals):
        """Process rewards according to the configuration.

        Args:
            states:
            rewards:
            terminals:

        Returns:

        """
        discounted_rewards = util.cumulative_discount(
            values=rewards,
            terminals=terminals,
            discount=self.discount
        )

        if self.baseline:
            state_values = []
            for name, state in states.items():
                state_values.append(self.baseline[name].predict(states=state))

            state_values = np.mean(state_values, axis=0)
            if self.gae_rewards:
                td_residuals = rewards + np.array([
                    self.discount * state_values[n + 1] - state_values[n]
                    if (n < len(state_values) - 1 and not terminal) else 0.0
                    for n, terminal in enumerate(terminals)
                ])
                rewards = util.cumulative_discount(
                    values=td_residuals,
                    terminals=terminals,
                    discount=(self.discount * self.gae_lambda)
                )
            else:
                rewards = discounted_rewards - state_values
        else:
            rewards = discounted_rewards

        if self.normalize_rewards:
            rewards = (rewards - rewards.mean()) / max(rewards.std(), util.epsilon)

        return rewards, discounted_rewards

    def update(self, batch):
        """Generic policy gradient update on a batch of experiences. Each concrete model
        implements its specific update logic on top of this.

        Args:
            batch: Dict of batch data with 'states', 'actions', 'rewards' and 'terminals'.

        Returns:
            None.
        """
        batch['returns'] = util.cumulative_discount(rewards=batch['rewards'], terminals=batch['terminals'], discount=self.discount)
        batch['rewards'] = self.advantage_estimation(batch)
        if self.baseline:
            self.baseline.update(states=batch['states'], returns=batch['returns'])
        super(PolicyGradientModel, self).update(batch)
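
# Hedged sketch of the data flow in `update` above, using plain Python objects and the
# `_cumulative_discount_sketch` helper defined earlier in this listing; the stub baseline
# interface is an assumption for illustration, not the TensorForce baseline API.
def _policy_gradient_update_sketch(batch, discount, baseline=None):
    # 1. Monte Carlo returns: discounted reward sums, reset at episode boundaries.
    batch['returns'] = _cumulative_discount_sketch(
        values=batch['rewards'], terminals=batch['terminals'], discount=discount)
    # 2. Replace raw rewards with advantages (returns minus baseline predictions here;
    #    the real model may use generalized advantage estimation instead).
    if baseline is not None:
        batch['rewards'] = batch['returns'] - baseline.predict(states=batch['states'])
        # 3. Fit the baseline towards the fresh returns for the next update.
        baseline.update(states=batch['states'], returns=batch['returns'])
    else:
        batch['rewards'] = batch['returns']
    # 4. The concrete model would now run its gradient step on the modified batch.
    return batch
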
Example #5
    def update_feed_dict(self, batch):
        # Assumes a temporally consistent sequence of experiences.
        # Feed the last state in the batch (the state at time t + 1) to the target network.
        feed_dict = {
            next_state: [batch['states'][name][-1]]
            for name, next_state in self.next_state.items()
        }
        feed_dict.update({
            internal: [batch['internals'][n][-1]]
            for n, internal in enumerate(self.next_internal_inputs,
                                         self.network_internal_index)
        })
        # calculate n-step reward targets
        target_q_vals = self.session.run(self.target_values,
                                         feed_dict=feed_dict)
        nstep_rewards = dict()

        for name, value in target_q_vals.items():
            nstep_rewards[name] = util.cumulative_discount(
                values=batch['rewards'][:-1],
                terminals=batch['terminals'][:-1],
                discount=self.discount,
                cumulative_start=value[0])

        # create update feed dict
        feed_dict = {
            state: batch['states'][name][:-1]
            for name, state in self.state.items()
        }
        feed_dict.update({
            action: batch['actions'][name][:-1]
            for name, action in self.action.items()
        })
        feed_dict.update({
            internal: batch['internals'][n][:-1]
            for n, internal in enumerate(self.internal_inputs)
        })
        feed_dict.update({
            self.nstep_rewards[name]: reward
            for name, reward in nstep_rewards.items()
        })
        return feed_dict
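
# Illustrative n-step reward target, mirroring the computation above: a discounted sum of
# the first rewards in the batch, seeded via `cumulative_start` with the target network's
# value of the final state. Uses the `_cumulative_discount_sketch` helper defined earlier
# in this listing; the rewards and bootstrap value are made up.
example_nstep_targets = _cumulative_discount_sketch(
    values=[1.0, 0.0, 1.0],           # batch['rewards'][:-1]
    terminals=[False, False, False],  # batch['terminals'][:-1]
    discount=0.9,
    cumulative_start=2.0,             # target Q-value of the last state
)
# -> approximately [3.268, 2.52, 2.8]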

    def advantage_estimation(self, batch):
        """Expects a batch, returns advantages according to config.

        Args:
            batch: Dict containing at least 'states', 'rewards', 'terminals' and 'returns'.

        Returns:
            Advantage estimates, either via generalized advantage estimation or as returns
            minus baseline estimates.
        """
        if not self.baseline:
            return batch['returns']

        estimates = self.baseline.predict(states=batch['states'])
        if self.generalized_advantage_estimation:
            deltas = np.array([
                self.discount * estimates[n + 1] - estimates[n]
                if (n < len(estimates) - 1 and not terminal) else 0.0
                for n, terminal in enumerate(batch['terminals'])
            ])
            deltas += batch['rewards']
            # if terminals[-1]:
            #     adjusted_estimate = np.append(estimate, [0])
            # else:
            #     adjusted_estimate = np.append(estimate, estimate[-1])
            # deltas = batch['rewards'] + self.discount * adjusted_estimate[1:] - adjusted_estimate[:-1]
            advantage = util.cumulative_discount(rewards=deltas,
                                                 terminals=batch['terminals'],
                                                 discount=(self.discount *
                                                           self.gae_lambda))
        else:
            advantage = batch['returns'] - estimates

        if self.normalize_advantage:
            advantage -= advantage.mean()
            advantage /= advantage.std() + 1e-8

        return advantage