def __init__(self, neural_network=None, session=None, state=None, random=None, action_count=1, scope='policy'):
    super(CategoricalOneHotPolicy, self).__init__(neural_network, session, state, random, action_count)
    self.dist = Categorical(random)

    with tf.variable_scope(scope):
        # One logit per discrete action
        self.action_layer = linear(self.neural_network.get_output(), {'num_outputs': self.action_count}, 'outputs')
        self.outputs = tf.nn.softmax(self.action_layer)

        # tf.multinomial expects unnormalised logits, so sample from the
        # pre-softmax layer rather than from the softmax probabilities
        self.output_sample = tf.multinomial(self.action_layer, 1)
def __init__(self, network, session, state, random, action_count=1, scope='policy'):
    with tf.variable_scope(scope):
        logits = linear(layer_input=network.output, config={'num_outputs': action_count}, scope='outputs')
        distribution = tf.nn.softmax(logits)

        # tf.multinomial only accepts 2-D logits, so map the sampling op
        # over the leading (batch) dimension
        sample = tf.map_fn(
            lambda t: tf.multinomial(logits=t, num_samples=1),
            elems=logits,
            dtype=tf.int64)

    super(CategoricalOneHotPolicy, self).__init__(network, [distribution, sample], session, state, random, action_count)
    self.dist = Categorical(random)
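# Not part of the original source: a minimal, self-contained NumPy sketch of
# why the sampling ops above are fed logits. tf.multinomial interprets its
# input as unnormalised log-probabilities, so passing already-softmaxed
# values samples from a flattened, double-softmaxed distribution. All names
# below are illustrative only.
import numpy as np

def softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

logits = np.array([2.0, 1.0, 0.1])
probs = softmax(logits)    # the distribution the policy intends to sample
double = softmax(probs)    # what sampling from softmax outputs amounts to

rng = np.random.RandomState(0)
samples = rng.choice(len(probs), size=10000, p=probs)
print(probs)                            # ~[0.66, 0.24, 0.10]
print(double)                           # noticeably flatter, ~[0.45, 0.30, 0.26]
print(np.bincount(samples) / 10000.0)   # empirical frequencies approach probs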
def __init__(self, network, session, state, random, action_count=1, scope='policy'):
    with tf.variable_scope(scope):
        action_means = linear(network.output, {'num_outputs': action_count}, 'action_mu')

        # Random init for log standard deviations, tiled over batch and time
        log_standard_devs_init = tf.Variable(
            0.01 * random.randn(1, 1, action_count), dtype=tf.float32)
        action_log_stds = tf.tile(
            log_standard_devs_init,
            (tf.shape(action_means)[0], tf.shape(action_means)[1], 1))

    super(GaussianPolicy, self).__init__(network, [action_means, action_log_stds], session, state, random, action_count)
    self.dist = Gaussian(random)
def __init__(self, neural_network=None, session=None, state=None, random=None, action_count=1, scope='policy'):
    super(GaussianPolicy, self).__init__(neural_network, session, state, random, action_count)
    self.dist = Gaussian(random)

    with tf.variable_scope(scope):
        self.action_means = linear(self.neural_network.get_output(), {'num_outputs': self.action_count}, 'action_mu')

        # Random init for log standard deviations, tiled over the batch
        log_standard_devs_init = tf.Variable(
            0.01 * self.random.randn(1, self.action_count), dtype=tf.float32)
        self.action_log_stds = tf.tile(
            log_standard_devs_init,
            tf.stack((tf.shape(self.action_means)[0], 1)))
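# Not part of the original source: a minimal NumPy sketch (illustrative names,
# batch of one) of how the [action_means, action_log_stds] pair parameterises
# Gaussian action sampling: a = mu + exp(log_std) * eps with eps ~ N(0, I).
# Predicting log standard deviations keeps sigma = exp(log_std) positive, and
# the small 0.01 * randn initialisation above starts sigma close to 1.
import numpy as np

def sample_gaussian_action(means, log_stds, rng):
    eps = rng.randn(*means.shape)
    return means + np.exp(log_stds) * eps

rng = np.random.RandomState(0)
means = np.array([[0.5, -0.2]])       # batch of 1, action_count = 2
log_stds = 0.01 * rng.randn(1, 2)     # near zero, so sigma is close to 1
print(sample_gaussian_action(means, log_stds, rng))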
def create_outputs(self, last_hidden_layer, scope):
    """
    Creates NAF specific outputs.

    :param last_hidden_layer: Points to last hidden layer
    :param scope: TF name scope
    :return: Output variables and all TF variables created in this scope
    """
    with tf.name_scope(scope):
        # State-value function
        v = linear(last_hidden_layer, {'num_outputs': 1,
                                       'weights_regularizer': self.config.weights_regularizer,
                                       'weights_regularizer_args': [self.config.weights_regularizer_args]},
                   scope + 'v')

        # Action outputs
        mu = linear(last_hidden_layer, {'num_outputs': self.action_count,
                                        'weights_regularizer': self.config.weights_regularizer,
                                        'weights_regularizer_args': [self.config.weights_regularizer_args]},
                    scope + 'mu')

        # Advantage computation
        # Network outputs entries of lower triangular matrix L
        lower_triangular_size = int(self.action_count * (self.action_count + 1) / 2)
        l_entries = linear(last_hidden_layer, {'num_outputs': lower_triangular_size,
                                               'weights_regularizer': self.config.weights_regularizer,
                                               'weights_regularizer_args': [self.config.weights_regularizer_args]},
                           scope + 'l')

        # Iteratively construct the lower-triangular matrix row by row
        l_rows = []
        offset = 0

        for i in range(self.action_count):
            # Diagonal elements are exponentiated, otherwise the gradient is often 0.
            # Slice entries out of the flat representation via a moving offset
            diagonal = tf.exp(tf.slice(l_entries, (0, offset), (-1, 1)))

            n = self.action_count - i - 1
            # Slice out the remaining n non-diagonal entries of this row
            # (offset + 1 because the diagonal entry was already taken)
            non_diagonal = tf.slice(l_entries, (0, offset + 1), (-1, n))

            # Fill up the row with zeros
            row = tf.pad(tf.concat(axis=1, values=(diagonal, non_diagonal)), ((0, 0), (i, 0)))
            offset += (self.action_count - i)
            l_rows.append(row)

        # Stack rows to matrix
        l_matrix = tf.transpose(tf.stack(l_rows, axis=1), (0, 2, 1))

        # P = LL^T
        p_matrix = tf.matmul(l_matrix, tf.transpose(l_matrix, (0, 2, 1)))

        # Adjust dimensions so (a - mu) can be multiplied with P
        action_diff = tf.expand_dims(self.actions - mu, -1)

        # A = -0.5 (a - mu)^T P (a - mu)
        advantage = -0.5 * tf.matmul(tf.transpose(action_diff, [0, 2, 1]),
                                     tf.matmul(p_matrix, action_diff))
        advantage = tf.reshape(advantage, [-1, 1])

        with tf.name_scope('q_values'):
            # Q = A + V
            q_value = v + advantage

    # Get all variables under this scope for target network update
    return v, mu, advantage, q_value, get_variables(scope)
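# Not part of the original source: a standalone NumPy sketch of the same
# lower-triangular packing used in create_outputs above (single example,
# action_count = 3; names are illustrative). It rebuilds L from the flat
# l_entries vector, forms P = L L^T, and shows that the quadratic advantage
# A = -0.5 (a - mu)^T P (a - mu) is zero at a = mu and negative elsewhere,
# so Q = A + V is maximised by the action mu.
import numpy as np

def naf_advantage(l_entries, mu, action):
    n = len(mu)
    L = np.zeros((n, n))
    offset = 0
    for i in range(n):
        # Diagonal entries are exponentiated, as in create_outputs
        L[i, i] = np.exp(l_entries[offset])
        # The remaining n - i - 1 entries fill column i below the diagonal
        L[i + 1:, i] = l_entries[offset + 1:offset + (n - i)]
        offset += n - i
    P = np.dot(L, L.T)                   # positive definite by construction
    diff = action - mu
    return -0.5 * diff.dot(P).dot(diff)

rng = np.random.RandomState(0)
n = 3
l_entries = rng.randn(n * (n + 1) // 2)
mu = rng.randn(n)
print(naf_advantage(l_entries, mu, mu))                 # 0.0 at the mean
print(naf_advantage(l_entries, mu, mu + rng.randn(n)))  # negative elsewhere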