Example #1
    def _build_policy_head(self, input_state):
        self.adv_actor_ph = tf.placeholder("float", [None], name='advantage')
        self.w_mu, self.b_mu, self.mu = layers.fc('mean',
                                                  input_state,
                                                  self.num_actions,
                                                  activation='linear')
        self.sigma, dist_params = self._build_sigma(input_state)

        self.dist = DiagNormal(dist_params)
        self.log_output_selected_action = self.dist.log_likelihood(
            self.selected_action_ph)

        self.output_layer_entropy = self.dist.entropy()
        self.entropy = tf.reduce_sum(self.output_layer_entropy)

        self.actor_objective = -tf.reduce_sum(
            self.log_output_selected_action * self.adv_actor_ph +
            self.beta * self.output_layer_entropy)
        self.sample_action = self.dist.sample()

        return self.actor_objective
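Note: layers.fc and DiagNormal are helpers from the surrounding repository and are not shown in these snippets. As a rough orientation only, a minimal DiagNormal sketch consistent with how Examples #1 and #3 use it (built from the concatenated [mu, sigma] tensor; the variant in Example #2 instead passes mu and sigma as separate arguments) could look like this in TensorFlow 1.x:

import numpy as np
import tensorflow as tf

class DiagNormal(object):
    """Diagonal Gaussian parameterized by a concatenated [mu, sigma] tensor (sketch)."""

    def __init__(self, params):
        # params: [batch, 2 * num_actions]; first half are means, second half stddevs
        self._params = params
        self.mu, self.sigma = tf.split(params, 2, axis=1)

    def log_likelihood(self, x):
        # Sum of per-dimension Gaussian log densities, shape [batch]
        return tf.reduce_sum(
            -0.5 * tf.square((x - self.mu) / self.sigma)
            - tf.log(self.sigma) - 0.5 * np.log(2.0 * np.pi), axis=1)

    def entropy(self):
        # Per-sample entropy of a diagonal Gaussian, shape [batch]
        return tf.reduce_sum(
            0.5 * np.log(2.0 * np.pi * np.e) + tf.log(self.sigma), axis=1)

    def sample(self):
        # Reparameterized draw: mu + sigma * eps with eps ~ N(0, I)
        return self.mu + self.sigma * tf.random_normal(tf.shape(self.mu))

    def params(self):
        return self._params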
Example #2
    def _build_policy_head(self, input_state):
        self.adv_actor_ph = tf.placeholder("float", [None], name='advantage')
        self.w_mu, self.b_mu, self.mu = layers.fc(
            'mean', input_state, self.num_actions, activation='linear')
        self.sigma = self._build_sigma(input_state)

        self.N = DiagNormal(self.mu, self.sigma)
        self.log_output_selected_action = self.N.log_likelihood(self.selected_action_ph)
        self.log_output_selected_action = tf.expand_dims(self.log_output_selected_action, 1)
        
        self.output_layer_entropy = self.N.entropy()
        self.entropy = tf.reduce_sum(self.output_layer_entropy)

        self.actor_objective = -tf.reduce_sum(
            self.log_output_selected_action * self.adv_actor_ph
            + self.beta * self.output_layer_entropy
        )
        self.sample_action = self.N.sample()
        # self.sample_action = tf.Print(self.sample_action, [self.sample_action], 'Action: ')

        return self.actor_objective
Example #3
class ContinuousPolicyValueNetwork(PolicyValueNetwork):
    '''
    Shared policy-value network with a policy head parameterizing a
    multivariate normal with diagonal covariance
    '''
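    # _build_policy_head below parameterizes a diagonal Gaussian policy
    #     pi(a | s) = N(a; mu(s), diag(sigma^2)),
    # and the actor objective it returns reads off the code as
    #     -sum_t [ log pi(a_t | s_t) * A_t + beta * H(pi(. | s_t)) ],
    # where A_t is the advantage fed via adv_actor_ph and H is the entropy term.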
    def __init__(self, conf, **kwargs):
        self.action_space = conf['args'].action_space
        self.use_state_dependent_std = False
        super(ContinuousPolicyValueNetwork, self).__init__(conf, **kwargs)

    def _build_policy_head(self, input_state):
        self.adv_actor_ph = tf.placeholder("float", [None], name='advantage')
        self.w_mu, self.b_mu, self.mu = layers.fc('mean',
                                                  input_state,
                                                  self.num_actions,
                                                  activation='linear')
        self.sigma, dist_params = self._build_sigma(input_state)

        self.dist = DiagNormal(dist_params)
        self.log_output_selected_action = self.dist.log_likelihood(
            self.selected_action_ph)

        self.output_layer_entropy = self.dist.entropy()
        self.entropy = tf.reduce_sum(self.output_layer_entropy)

        self.actor_objective = -tf.reduce_sum(
            self.log_output_selected_action * self.adv_actor_ph +
            self.beta * self.output_layer_entropy)
        self.sample_action = self.dist.sample()

        return self.actor_objective

    def _build_sigma(self, input_state):
        if self.use_state_dependent_std:
            self.w_sigma2, self.b_sigma2, self.sigma_hat = layers.fc(
                'std2', input_state, self.num_actions, activation='linear')
            self.sigma2 = tf.log(1 + tf.exp(self.sigma_hat))
            sigma = tf.sqrt(self.sigma2 + 1e-8)
            return sigma, tf.concat([self.mu, sigma], 1)
        else:
            self.log_sigma = tf.get_variable(
                'log_sigma',
                self.mu.get_shape().as_list()[1],
                dtype=tf.float32,
                initializer=tf.random_uniform_initializer(-2, -1))
            sigma = tf.expand_dims(tf.exp(self.log_sigma), 0)
            tiled_sigma = tf.tile(sigma, [tf.shape(self.mu)[0], 1])
            return sigma, tf.concat([self.mu, tiled_sigma], 1)
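    # Two ways to obtain the standard deviation above: the state-dependent branch pushes
    # the raw 'std2' output sigma_hat through a softplus, sigma^2 = log(1 + exp(sigma_hat)),
    # then sigma = sqrt(sigma^2 + 1e-8); e.g. sigma_hat = 0 gives sigma^2 = log 2 ~ 0.693
    # and sigma ~ 0.83. The state-independent branch learns log_sigma directly,
    # exponentiates it, and tiles it across the batch so both branches return the same shapes.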

    def get_action(self, session, state, lstm_state=None):
        feed_dict = {self.input_ph: [state]}
        if lstm_state is not None:
            feed_dict[self.step_size] = [1]
            feed_dict[self.initial_lstm_state] = lstm_state

            action, lstm_state, params = session.run(
                [self.sample_action, self.lstm_state,
                 self.dist.params()],
                feed_dict=feed_dict)

            return action[0], params, lstm_state
        else:
            action, params = session.run(
                [self.sample_action, self.dist.params()], feed_dict=feed_dict)

            return action[0], params[0]

    def get_action_and_value(self, session, state, lstm_state=None):
        feed_dict = {self.input_ph: [state]}
        if lstm_state is not None:
            feed_dict[self.step_size] = [1]
            feed_dict[self.initial_lstm_state] = lstm_state

            action, v, lstm_state, params = session.run(
                [self.sample_action, self.output_layer_v, self.lstm_state,
                 self.dist.params()],
                feed_dict=feed_dict)

            return action[0], v[0, 0], params[0], lstm_state
        else:
            action, v, params = session.run(
                [self.sample_action, self.output_layer_v,
                 self.dist.params()],
                feed_dict=feed_dict)

            return action[0], v[0, 0], params[0]
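For orientation, the params row returned by get_action / get_action_and_value is, per _build_sigma, the concatenation [mu, sigma] for the queried state. A small off-graph NumPy sketch, with purely illustrative numbers and assuming DiagNormal.sample draws mu + sigma * eps as in the sketch after Example #1, of how an action relates to those parameters:

import numpy as np

# Hypothetical params row for a 2-dimensional action space: [mu_1, mu_2, sigma_1, sigma_2].
params = np.array([0.1, -0.3, 0.2, 0.5], dtype=np.float32)
mu, sigma = np.split(params, 2)

# Reparameterized draw a = mu + sigma * eps, eps ~ N(0, I), mirroring sample_action.
action = mu + sigma * np.random.randn(mu.shape[0]).astype(np.float32)
print(action)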