Code example #1: CategoricalOneHotPolicy.__init__
    def __init__(self, neural_network=None,
                 session=None,
                 state=None,
                 random=None,
                 action_count=1,
                 scope='policy'):
        super(CategoricalOneHotPolicy, self).__init__(neural_network, session, state, random, action_count)
        self.dist = Categorical(random)

        with tf.variable_scope(scope):
            self.action_layer = linear(self.neural_network.get_output(),
                                       {'num_outputs': self.action_count},
                                       'outputs')
            self.outputs = tf.nn.softmax(self.action_layer)
            self.output_sample = tf.multinomial(self.outputs, 1)
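
One detail worth noting when comparing this snippet with code example #2 below: tf.multinomial interprets its input as unnormalized log-probabilities, so feeding it the softmax output (as here) and feeding it the raw logits (as in #2) do not sample from the same distribution. A minimal, self-contained TF 1.x sketch of that difference, with purely illustrative values:

import numpy as np
import tensorflow as tf  # TF 1.x, matching the snippets in this section

logits = tf.constant([[2.0, 0.0, -1.0]])   # raw layer output, one batch row
probs = tf.nn.softmax(logits)              # normalized probabilities

sample_from_logits = tf.multinomial(logits, num_samples=1000)
sample_from_probs = tf.multinomial(probs, num_samples=1000)

with tf.Session() as sess:
    a, b = sess.run([sample_from_logits, sample_from_probs])
    # Probabilities fed where log-probabilities are expected flatten the
    # sampling distribution, so the two empirical frequency vectors differ.
    print(np.bincount(a[0], minlength=3) / 1000.0)
    print(np.bincount(b[0], minlength=3) / 1000.0)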
Code example #2: CategoricalOneHotPolicy.__init__ (variant passing [distribution, sample] to the base class)
    def __init__(self,
                 network,
                 session,
                 state,
                 random,
                 action_count=1,
                 scope='policy'):
        with tf.variable_scope(scope):
            logits = linear(layer_input=network.output,
                            config={'num_outputs': action_count},
                            scope='outputs')
            distribution = tf.nn.softmax(logits)
            sample = tf.map_fn(
                lambda t: tf.multinomial(logits=t, num_samples=1),
                elems=logits,
                dtype=tf.int64)

        super(CategoricalOneHotPolicy,
              self).__init__(network, [distribution, sample], session, state,
                             random, action_count)
        self.dist = Categorical(random)
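
For reference, a minimal sketch of the tf.map_fn pattern used above, assuming (as the code suggests) that the network emits rank-3 logits of shape (batch, time, num_actions); tf.multinomial only accepts rank-2 input, so it is applied once per batch element. Shapes here are illustrative only:

import tensorflow as tf  # TF 1.x, matching the snippets in this section

logits = tf.random_normal((4, 7, 3))  # (batch, time, num_actions), illustrative
sample = tf.map_fn(
    lambda t: tf.multinomial(logits=t, num_samples=1),  # (7, 3) -> (7, 1) per element
    elems=logits,
    dtype=tf.int64)  # tf.multinomial returns int64, so map_fn needs the dtype hint

with tf.Session() as sess:
    print(sess.run(sample).shape)  # (4, 7, 1): one sampled action index per step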
Code example #3: GaussianPolicy.__init__ (variant with rank-3 means and log stddevs)
    def __init__(self,
                 network,
                 session,
                 state,
                 random,
                 action_count=1,
                 scope='policy'):
        with tf.variable_scope(scope):
            action_means = linear(network.output,
                                  {'num_outputs': action_count}, 'action_mu')
            # Random init for log standard deviations
            log_standard_devs_init = tf.Variable(
                0.01 * random.randn(1, 1, action_count), dtype=tf.float32)
            action_log_stds = tf.tile(
                log_standard_devs_init,
                (tf.shape(action_means)[0], tf.shape(action_means)[1], 1))

        super(GaussianPolicy,
              self).__init__(network, [action_means, action_log_stds], session,
                             state, random, action_count)
        self.dist = Gaussian(random)
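
The Gaussian distribution class itself is not shown in these snippets. As a generic illustration only (not the library's implementation) of what a diagonal Gaussian policy does with action_means and action_log_stds, here is a small NumPy sketch of sampling and log-density computation:

import numpy as np

def sample_action(action_means, action_log_stds, rng=np.random):
    # Reparameterized sample: mean + stddev * standard normal noise
    noise = rng.standard_normal(action_means.shape)
    return action_means + np.exp(action_log_stds) * noise

def log_prob(actions, action_means, action_log_stds):
    # Sum of independent per-dimension Gaussian log densities
    variance = np.exp(2.0 * action_log_stds)
    return np.sum(
        -0.5 * np.log(2.0 * np.pi) - action_log_stds
        - 0.5 * (actions - action_means) ** 2 / variance,
        axis=-1)

means = np.zeros((1, 2))                  # illustrative (batch, action_count)
log_stds = np.log(0.5) * np.ones((1, 2))  # stddev of 0.5 in both dimensions
action = sample_action(means, log_stds)
print(action, log_prob(action, means, log_stds))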
Code example #4: GaussianPolicy.__init__ (variant with rank-2 means and log stddevs)
    def __init__(self,
                 neural_network=None,
                 session=None,
                 state=None,
                 random=None,
                 action_count=1,
                 scope='policy'):
        super(GaussianPolicy, self).__init__(neural_network, session, state,
                                             random, action_count)
        self.dist = Gaussian(random)

        with tf.variable_scope(scope):
            self.action_means = linear(self.neural_network.get_output(),
                                       {'num_outputs': self.action_count},
                                       'action_mu')

            # Random init for log standard deviations
            log_standard_devs_init = tf.Variable(
                0.01 * self.random.randn(1, self.action_count),
                dtype=tf.float32)

            self.action_log_stds = tf.tile(
                log_standard_devs_init,
                tf.stack((tf.shape(self.action_means)[0], 1)))
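
The visible difference from code example #3 is the rank of the tensors: here a single learned (1, action_count) row of log standard deviations is tiled across the batch dimension only. A small TF 1.x sketch of that tiling step, with illustrative shapes:

import numpy as np
import tensorflow as tf  # TF 1.x, matching the snippets in this section

action_count = 3
# One learned row of log stddevs, shared by every element of the batch
log_std_row = tf.Variable(0.01 * np.random.randn(1, action_count), dtype=tf.float32)
action_means = tf.placeholder(tf.float32, shape=(None, action_count))
# Repeat the row once per batch element; the batch size is only known at run time
action_log_stds = tf.tile(log_std_row, tf.stack((tf.shape(action_means)[0], 1)))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(action_log_stds,
                   feed_dict={action_means: np.zeros((5, action_count))})
    print(out.shape)  # (5, 3): the same log-stddev row repeated five times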
Code example #5: NAF create_outputs
    def create_outputs(self, last_hidden_layer, scope):
        """
        Creates NAF specific outputs.

        :param last_hidden_layer: Last hidden layer of the network
        :param scope: TF name scope

        :return: Output variables and all TF variables created in this scope
        """

        with tf.name_scope(scope):
            # State-value function
            v = linear(last_hidden_layer,
                       {'num_outputs': 1,
                        'weights_regularizer': self.config.weights_regularizer,
                        'weights_regularizer_args': [self.config.weights_regularizer_args]},
                       scope + 'v')

            # Action outputs
            mu = linear(last_hidden_layer,
                        {'num_outputs': self.action_count,
                         'weights_regularizer': self.config.weights_regularizer,
                         'weights_regularizer_args': [self.config.weights_regularizer_args]},
                        scope + 'mu')

            # Advantage computation
            # Network outputs entries of lower triangular matrix L
            lower_triangular_size = int(self.action_count * (self.action_count + 1) / 2)
            l_entries = linear(last_hidden_layer, {'num_outputs': lower_triangular_size,
                                                   'weights_regularizer': self.config.weights_regularizer,
                                                   'weights_regularizer_args': [self.config.weights_regularizer_args]},
                               scope + 'l')

            # Iteratively construct the lower triangular matrix L from the flat entries
            l_rows = []
            offset = 0

            for i in range(self.action_count):
                # Diagonal elements are exponentiated so they stay positive;
                # otherwise the gradient is often zero.
                # Slice the lower triangular entries out of the flat
                # representation using a moving offset.

                diagonal = tf.exp(tf.slice(l_entries, (0, offset), (-1, 1)))

                n = self.action_count - i - 1
                # Slice out the remaining below-diagonal entries
                # (minus 1 because the diagonal entry was already taken)
                non_diagonal = tf.slice(l_entries, (0, offset + 1), (-1, n))

                # Fill up row with zeros
                row = tf.pad(tf.concat(axis=1, values=(diagonal, non_diagonal)), ((0, 0), (i, 0)))
                offset += (self.action_count - i)
                l_rows.append(row)

            # Stack rows to matrix
            l_matrix = tf.transpose(tf.stack(l_rows, axis=1), (0, 2, 1))

            # P = LL^T
            p_matrix = tf.matmul(l_matrix, tf.transpose(l_matrix, (0, 2, 1)))

            # Need to adjust dimensions to multiply with P.
            action_diff = tf.expand_dims(self.actions - mu, -1)

            # A = -0.5 * (a - mu)^T P (a - mu)
            advantage = -0.5 * tf.matmul(tf.transpose(action_diff, [0, 2, 1]),
                                         tf.matmul(p_matrix, action_diff))
            advantage = tf.reshape(advantage, [-1, 1])

            with tf.name_scope('q_values'):
                # Q = A + V
                q_value = v + advantage

        # Get all variables under this scope for target network update
        return v, mu, advantage, q_value, get_variables(scope)
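
As a sanity check on the matrix construction above (illustrative NumPy only, not part of the original code): the stacked rows become the columns of L after the transpose, so L is assembled column by column with an exponentiated diagonal. That makes P = L L^T positive semi-definite, the advantage A = -0.5 (a - mu)^T P (a - mu) never positive, and Q = V + A maximal (equal to V) at a = mu:

import numpy as np

action_count = 3
flat = np.random.randn(action_count * (action_count + 1) // 2)

L = np.zeros((action_count, action_count))
offset = 0
for i in range(action_count):
    # Exponentiated diagonal entry, then the remaining below-diagonal entries
    L[i, i] = np.exp(flat[offset])
    L[i + 1:, i] = flat[offset + 1:offset + (action_count - i)]
    offset += action_count - i

P = L.dot(L.T)                          # P = L L^T is positive semi-definite
a_minus_mu = np.random.randn(action_count)
advantage = -0.5 * a_minus_mu.dot(P).dot(a_minus_mu)
print(advantage <= 0.0)                 # True for any action, so Q <= V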