def __init__(self, state, action, state_dims, action_dims, dense1_size, dense2_size, final_layer_init, num_atoms, v_min, v_max, is_training=False, scope='critic'):
    # state - State input to pass through the network
    # action - Action input for which the Z distribution should be predicted

    self.state = state
    self.action = action
    self.state_dims = np.prod(state_dims)  # Used to calculate the fan_in of the state layer (e.g. if state_dims is (3,2) fan_in should equal 6)
    self.action_dims = np.prod(action_dims)
    self.is_training = is_training
    self.scope = scope

    with tf.variable_scope(self.scope):

        self.input_norm = batchnorm(self.state, self.is_training, scope='input_norm')

        self.dense1_mul = dense(self.input_norm, dense1_size,
                                weight_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(self.state_dims))), 1/tf.sqrt(tf.to_float(self.state_dims))),
                                bias_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(self.state_dims))), 1/tf.sqrt(tf.to_float(self.state_dims))),
                                scope='dense1')

        self.dense1_bn = batchnorm(self.dense1_mul, self.is_training, scope='dense1')

        self.dense1 = relu(self.dense1_bn, scope='dense1')

        # Merge first dense layer with action input to get second dense layer
        self.dense2a = dense(self.dense1, dense2_size,
                             weight_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))), 1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))),
                             bias_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))), 1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))),
                             scope='dense2a')

        self.dense2b = dense(self.action, dense2_size,
                             weight_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))), 1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))),
                             bias_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))), 1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))),
                             scope='dense2b')

        self.dense2 = relu(self.dense2a + self.dense2b, scope='dense2')

        self.output_logits = dense(self.dense2, num_atoms,
                                   weight_init=tf.random_uniform_initializer(-1*final_layer_init, final_layer_init),
                                   bias_init=tf.random_uniform_initializer(-1*final_layer_init, final_layer_init),
                                   scope='output_logits')

        self.output_probs = softmax(self.output_logits, scope='output_probs')

        self.network_params = tf.trainable_variables(scope=self.scope)
        self.bn_params = [v for v in tf.global_variables(scope=self.scope) if 'batch_normalization/moving' in v.name]

        self.z_atoms = tf.lin_space(v_min, v_max, num_atoms)

        self.Q_val = tf.reduce_sum(self.z_atoms * self.output_probs)  # The Q value is the mean of the categorical output Z-distribution

        self.action_grads = tf.gradients(self.output_probs, self.action, self.z_atoms)  # Gradient of mean of output Z-distribution wrt action input - used to train actor network; weighting the grads by z_atoms gives the mean across the output distribution
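# --- Hedged usage sketch (not part of the original file) ---
# The Z-distribution head above outputs per-atom probabilities; the scalar Q value is the
# expectation of the fixed support z_atoms under those probabilities. The NumPy check below
# makes that relationship concrete (the v_min, v_max and num_atoms values are illustrative only).
import numpy as np

v_min, v_max, num_atoms = -10.0, 10.0, 51
z_atoms = np.linspace(v_min, v_max, num_atoms)    # same support as tf.lin_space(v_min, v_max, num_atoms)
probs = np.full(num_atoms, 1.0 / num_atoms)       # e.g. a uniform predicted distribution
q_val = np.sum(z_atoms * probs)                   # Q(s,a) = E[Z] = sum_i z_i * p_i
print(q_val)                                      # ~0.0 for a uniform distribution on a symmetric support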
def __init__(self, state, state_dims, action_dims, action_bound_low, action_bound_high, dense1_size, dense2_size, final_layer_init, is_training=False, scope='actor'):
    # state - State input to pass through the network
    # action_bound_low/action_bound_high - The network outputs in the range [-1,1]; this is rescaled to lie within [action_bound_low, action_bound_high] of the action space

    self.state = state
    self.state_dims = np.prod(state_dims)  # Used to calculate the fan_in of the state layer (e.g. if state_dims is (3,2) fan_in should equal 6)
    self.action_dims = np.prod(action_dims)
    self.action_bound_low = action_bound_low
    self.action_bound_high = action_bound_high
    self.is_training = is_training
    self.scope = scope

    with tf.variable_scope(self.scope):

        self.input_norm = batchnorm(self.state, self.is_training, scope='input_norm')

        self.dense1_mul = dense(self.input_norm, dense1_size,
                                weight_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(self.state_dims))), 1/tf.sqrt(tf.to_float(self.state_dims))),
                                bias_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(self.state_dims))), 1/tf.sqrt(tf.to_float(self.state_dims))),
                                scope='dense1')

        self.dense1_bn = batchnorm(self.dense1_mul, self.is_training, scope='dense1')

        self.dense1 = relu(self.dense1_bn, scope='dense1')

        self.dense2_mul = dense(self.dense1, dense2_size,
                                weight_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size))), 1/tf.sqrt(tf.to_float(dense1_size))),
                                bias_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size))), 1/tf.sqrt(tf.to_float(dense1_size))),
                                scope='dense2')

        self.dense2_bn = batchnorm(self.dense2_mul, self.is_training, scope='dense2')

        self.dense2 = relu(self.dense2_bn, scope='dense2')

        self.output_mul = dense(self.dense2, self.action_dims,
                                weight_init=tf.random_uniform_initializer(-1*final_layer_init, final_layer_init),
                                bias_init=tf.random_uniform_initializer(-1*final_layer_init, final_layer_init),
                                scope='output')

        self.output_tanh = tanh(self.output_mul, scope='output')

        # Scale tanh output to lower and upper action bounds
        self.output = tf.multiply(0.5, tf.multiply(self.output_tanh, (self.action_bound_high - self.action_bound_low)) + (self.action_bound_high + self.action_bound_low))

        self.network_params = tf.trainable_variables(scope=self.scope)
        self.bn_params = [v for v in tf.global_variables(scope=self.scope) if 'batch_normalization/moving' in v.name]
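# --- Hedged sketch (assumption, not part of the original file) ---
# Quick NumPy check of the bound scaling applied to self.output above:
# y = 0.5 * (tanh_out * (high - low) + (high + low)) maps tanh_out = -1 to the lower bound
# and tanh_out = +1 to the upper bound, linearly in between.
import numpy as np

low, high = np.array([-2.0]), np.array([2.0])     # illustrative action bounds
for tanh_out in (-1.0, 0.0, 1.0):
    scaled = 0.5 * (tanh_out * (high - low) + (high + low))
    print(tanh_out, scaled)                       # -1 -> [-2.], 0 -> [0.], 1 -> [2.]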
def __init__(self, state, action, state_dims, action_dims, args, is_training=False, scope='critic'):
    # state - State input to pass through the network
    # action - Action input for which the Q value should be predicted

    self.state = state
    self.action = action
    self.state_dims = np.prod(state_dims)  # Used to calculate the fan_in of the state layer (e.g. if state_dims is (3,2) fan_in should equal 6)
    self.action_dims = np.prod(action_dims)
    self.args = args
    self.is_training = is_training
    self.scope = scope

    # Network params
    dense1_size = self.args.dense1_size
    dense2_size = self.args.dense2_size
    final_layer_init = self.args.final_layer_init

    with tf.variable_scope(self.scope):

        self.input_norm = batchnorm(self.state, self.is_training, scope='input_norm')

        self.dense1_mul = dense(self.input_norm, dense1_size,
                                weight_init=tf.random_uniform_initializer((-1 / tf.sqrt(tf.to_float(self.state_dims))), 1 / tf.sqrt(tf.to_float(self.state_dims))),
                                bias_init=tf.random_uniform_initializer((-1 / tf.sqrt(tf.to_float(self.state_dims))), 1 / tf.sqrt(tf.to_float(self.state_dims))),
                                scope='dense1')

        self.dense1_bn = batchnorm(self.dense1_mul, self.is_training, scope='dense1')

        self.dense1 = relu(self.dense1_bn, scope='dense1')

        # Merge first dense layer with action input to get second dense layer
        self.dense2a = dense(self.dense1, dense2_size,
                             weight_init=tf.random_uniform_initializer((-1 / tf.sqrt(tf.to_float(dense1_size + self.action_dims))), 1 / tf.sqrt(tf.to_float(dense1_size + self.action_dims))),
                             bias_init=tf.random_uniform_initializer((-1 / tf.sqrt(tf.to_float(dense1_size + self.action_dims))), 1 / tf.sqrt(tf.to_float(dense1_size + self.action_dims))),
                             scope='dense2a')

        self.dense2b = dense(self.action, dense2_size,
                             weight_init=tf.random_uniform_initializer((-1 / tf.sqrt(tf.to_float(dense1_size + self.action_dims))), 1 / tf.sqrt(tf.to_float(dense1_size + self.action_dims))),
                             bias_init=tf.random_uniform_initializer((-1 / tf.sqrt(tf.to_float(dense1_size + self.action_dims))), 1 / tf.sqrt(tf.to_float(dense1_size + self.action_dims))),
                             scope='dense2b')

        self.dense2 = relu(self.dense2a + self.dense2b, scope='dense2')

        self.output = dense(self.dense2, 1,
                            weight_init=tf.random_uniform_initializer(-1 * final_layer_init, final_layer_init),
                            bias_init=tf.random_uniform_initializer(-1 * final_layer_init, final_layer_init),
                            scope='output')

        self.network_params = tf.trainable_variables(scope=self.scope)

        self.action_grads = tf.gradients(self.output, self.action)  # Gradient of value output wrt action input - used to train actor network
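# --- Hedged training sketch (assumption; none of these names are defined in this file) ---
# In DDPG-style training the critic's action_grads are fed back into the actor. Assuming
# `actor` and `critic` are instances built from the constructors above, the actor parameters
# are updated along the chain rule d output / d action * d action / d theta; `batch_size`
# and `learn_rate` are placeholders for values set elsewhere in the training script.
import tensorflow as tf

def actor_train_op(actor, critic, batch_size, learn_rate):
    # tf.gradients sums over the batch, so the gradients are divided by batch_size;
    # the minus sign turns gradient descent into ascent on the critic's value estimate.
    grads = tf.gradients(actor.output, actor.network_params, -critic.action_grads[0])
    grads = [g / float(batch_size) for g in grads]
    return tf.train.AdamOptimizer(learn_rate).apply_gradients(zip(grads, actor.network_params))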
def __init__(self, state, audio, state_dims, action_dims, action_bound_low, action_bound_high, args, is_training=False, scope='actor'):
    # state - State input to pass through the network
    # audio - Audio feature input, merged with the visual features
    # action_bound_low/action_bound_high - The network outputs in the range [-1,1]; this is rescaled to lie within [action_bound_low, action_bound_high] of the action space

    self.state = state
    self.audio = audio
    self.state_dims = np.prod(state_dims)  # Used to calculate the fan_in of the state layer (e.g. if state_dims is (3,2) fan_in should equal 6)
    self.action_dims = np.prod(action_dims)
    self.action_bound_low = action_bound_low
    self.action_bound_high = action_bound_high
    self.args = args
    self.is_training = is_training
    self.scope = scope

    # Network params
    dense1_size = self.args.dense1_size
    dense2_size = self.args.dense2_size
    final_layer_init = self.args.final_layer_init

    with tf.variable_scope(self.scope):

        # Audio branch: two fully connected ReLU layers
        self.fc1_a = tf.layers.dense(self.audio, 50, tf.nn.relu)
        self.fc2_a = tf.layers.dense(self.fc1_a, 50, tf.nn.relu)

        # Merge the CNN features of the visual state with the audio features
        self.final_flat = tf.concat([createNetwork_cnn(self.state), self.fc2_a], 1)
        # self.state_dims = np.prod(self.final_flat)
        # print(np.shape(self.final_flat), self.state_dims)

        self.input_norm = batchnorm(self.final_flat, self.is_training, scope='input_norm')

        self.dense1_mul = dense(self.input_norm, dense1_size,
                                weight_init=tf.random_uniform_initializer((-1 / tf.sqrt(tf.to_float(self.state_dims))), 1 / tf.sqrt(tf.to_float(self.state_dims))),
                                bias_init=tf.random_uniform_initializer((-1 / tf.sqrt(tf.to_float(self.state_dims))), 1 / tf.sqrt(tf.to_float(self.state_dims))),
                                scope='dense1')

        self.dense1_bn = batchnorm(self.dense1_mul, self.is_training, scope='dense1')

        self.dense1 = relu(self.dense1_bn, scope='dense1')

        self.dense2_mul = dense(self.dense1, dense2_size,
                                weight_init=tf.random_uniform_initializer((-1 / tf.sqrt(tf.to_float(dense1_size))), 1 / tf.sqrt(tf.to_float(dense1_size))),
                                bias_init=tf.random_uniform_initializer((-1 / tf.sqrt(tf.to_float(dense1_size))), 1 / tf.sqrt(tf.to_float(dense1_size))),
                                scope='dense2')

        self.dense2_bn = batchnorm(self.dense2_mul, self.is_training, scope='dense2')

        self.dense2 = relu(self.dense2_bn, scope='dense2')

        self.output_mul = dense(self.dense2, self.action_dims,
                                weight_init=tf.random_uniform_initializer(-1 * final_layer_init, final_layer_init),
                                bias_init=tf.random_uniform_initializer(-1 * final_layer_init, final_layer_init),
                                scope='output')

        self.output_tanh = tanh(self.output_mul, scope='output')

        # Scale tanh output to lower and upper action bounds
        self.output = tf.multiply(0.5, tf.multiply(self.output_tanh, (self.action_bound_high - self.action_bound_low)) + (self.action_bound_high + self.action_bound_low))

        self.network_params = tf.trainable_variables(scope=self.scope)
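# --- Hedged, self-contained sketch (assumption, not part of the original file) ---
# The audio branch and feature merge used above, reproduced in isolation. A dummy placeholder
# stands in for createNetwork_cnn(self.state), whose definition lives elsewhere in the repo;
# all shapes below are illustrative only.
import tensorflow as tf

audio_ph = tf.placeholder(tf.float32, shape=(None, 20))       # audio feature input (shape assumed)
visual_feat = tf.placeholder(tf.float32, shape=(None, 256))   # stand-in for the CNN feature vector

fc1_a = tf.layers.dense(audio_ph, 50, tf.nn.relu)
fc2_a = tf.layers.dense(fc1_a, 50, tf.nn.relu)
merged = tf.concat([visual_feat, fc2_a], 1)                   # feeds the dense1/dense2 stack above
print(merged.shape)                                           # (?, 306)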