Example #1
    def __init__(self, state, action, state_dims, action_dims, dense1_size, dense2_size, final_layer_init, num_atoms, v_min, v_max, is_training=False, scope='critic'):
        # state - State input to pass through the network
        # action - Action input for which the Z distribution should be predicted
        
        self.state = state
        self.action = action
        self.state_dims = np.prod(state_dims)       #Used to calculate the fan_in of the state layer (e.g. if state_dims is (3,2) fan_in should equal 6)
        self.action_dims = np.prod(action_dims)
        self.is_training = is_training
        self.scope = scope    

        
        with tf.variable_scope(self.scope):
            self.input_norm = batchnorm(self.state, self.is_training, scope='input_norm')
           
            self.dense1_mul = dense(self.input_norm, dense1_size, weight_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(self.state_dims))), 1/tf.sqrt(tf.to_float(self.state_dims))),
                                bias_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(self.state_dims))), 1/tf.sqrt(tf.to_float(self.state_dims))), scope='dense1')  
            
            self.dense1_bn = batchnorm(self.dense1_mul, self.is_training, scope='dense1')
            
            self.dense1 = relu(self.dense1_bn, scope='dense1')
            
            #Merge first dense layer with action input to get second dense layer            
            self.dense2a = dense(self.dense1, dense2_size, weight_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))), 1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))),
                                bias_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))), 1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))), scope='dense2a')        
            
            self.dense2b = dense(self.action, dense2_size, weight_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))), 1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))),
                                bias_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))), 1/tf.sqrt(tf.to_float(dense1_size+self.action_dims))), scope='dense2b') 
            
            self.dense2 = relu(self.dense2a + self.dense2b, scope='dense2')
            
            self.output_logits = dense(self.dense2, num_atoms, weight_init=tf.random_uniform_initializer(-1*final_layer_init, final_layer_init),
                                       bias_init=tf.random_uniform_initializer(-1*final_layer_init, final_layer_init), scope='output_logits')  
            
            self.output_probs = softmax(self.output_logits, scope='output_probs')
                         
                          
            self.network_params = tf.trainable_variables(scope=self.scope)
            self.bn_params = [v for v in tf.global_variables(scope=self.scope) if 'batch_normalization/moving' in v.name]
            
            
            self.z_atoms = tf.lin_space(v_min, v_max, num_atoms)
            
            self.Q_val = tf.reduce_sum(self.z_atoms * self.output_probs) # the Q value is the mean of the categorical output Z-distribution
          
            self.action_grads = tf.gradients(self.output_probs, self.action, self.z_atoms) # Gradient of the mean of the output Z-distribution wrt the action input - used to train the actor network; weighting the grads by z_atoms gives the gradient of the distribution's mean
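
All of the examples below call thin helper wrappers (dense, batchnorm, relu, softmax, tanh) that are not shown, and they initialise the hidden layers uniformly in [-1/sqrt(fan_in), 1/sqrt(fan_in)]. A minimal TF 1.x sketch of what such wrappers might look like - the signatures are an assumption inferred from how they are called, not the project's actual implementation:

    # Hypothetical helper layers assumed by the examples; the real project may differ.
    import tensorflow as tf

    def dense(x, output_size, weight_init=None, bias_init=tf.zeros_initializer(), scope='dense'):
        with tf.variable_scope(scope):
            return tf.layers.dense(x, output_size,
                                   kernel_initializer=weight_init,
                                   bias_initializer=bias_init)

    def batchnorm(x, is_training, scope='batchnorm'):
        with tf.variable_scope(scope):
            # Creates batch_normalization/moving_* variables, matching the bn_params filter above
            return tf.layers.batch_normalization(x, training=is_training)

    def relu(x, scope='relu'):
        with tf.variable_scope(scope):
            return tf.nn.relu(x)

    def softmax(x, scope='softmax'):
        with tf.variable_scope(scope):
            return tf.nn.softmax(x)

    def tanh(x, scope='tanh'):
        with tf.variable_scope(scope):
            return tf.nn.tanh(x)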
Example #2
	def __init__(self, state, state_dims, action_dims, action_bound_low, action_bound_high, dense1_size, dense2_size, final_layer_init, is_training=False, scope='actor'):
		# state - State input to pass through the network
		# action_bound_low/high - the tanh output is in [-1, 1] and is rescaled to lie within [action_bound_low, action_bound_high]
		
		self.state = state
		self.state_dims = np.prod(state_dims)       #Used to calculate the fan_in of the state layer (e.g. if state_dims is (3,2) fan_in should equal 6)
		self.action_dims = np.prod(action_dims)
		self.action_bound_low = action_bound_low
		self.action_bound_high = action_bound_high
		self.is_training = is_training
		self.scope = scope
		
		with tf.variable_scope(self.scope):
		
			self.input_norm = batchnorm(self.state, self.is_training, scope='input_norm')
		   
			self.dense1_mul = dense(self.input_norm, dense1_size, weight_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(self.state_dims))), 1/tf.sqrt(tf.to_float(self.state_dims))),
								bias_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(self.state_dims))), 1/tf.sqrt(tf.to_float(self.state_dims))), scope='dense1')  
			
			self.dense1_bn = batchnorm(self.dense1_mul, self.is_training, scope='dense1')
			
			self.dense1 = relu(self.dense1_bn, scope='dense1')
			
			self.dense2_mul = dense(self.dense1, dense2_size, weight_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size))), 1/tf.sqrt(tf.to_float(dense1_size))),
								bias_init=tf.random_uniform_initializer((-1/tf.sqrt(tf.to_float(dense1_size))), 1/tf.sqrt(tf.to_float(dense1_size))), scope='dense2')        
			
			self.dense2_bn = batchnorm(self.dense2_mul, self.is_training, scope='dense2')
			
			self.dense2 = relu(self.dense2_bn, scope='dense2')
			
			self.output_mul = dense(self.dense2, self.action_dims, weight_init=tf.random_uniform_initializer(-1*final_layer_init, final_layer_init),
								bias_init=tf.random_uniform_initializer(-1*final_layer_init, final_layer_init), scope='output') 
			
			self.output_tanh = tanh(self.output_mul, scope='output')
			
			# Scale tanh output to lower and upper action bounds
			self.output = tf.multiply(0.5, tf.multiply(self.output_tanh, (self.action_bound_high-self.action_bound_low)) + (self.action_bound_high+self.action_bound_low))
			
		   
			self.network_params = tf.trainable_variables(scope=self.scope)
			self.bn_params = [v for v in tf.global_variables(scope=self.scope) if 'batch_normalization/moving' in v.name]
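
The final rescaling in the actor maps the tanh output from [-1, 1] onto [action_bound_low, action_bound_high]. A standalone check of that formula (the bounds below are chosen purely for illustration):

    # Quick NumPy check of the tanh rescaling used above; the bounds are illustrative.
    import numpy as np

    action_bound_low, action_bound_high = -2.0, 2.0
    tanh_out = np.array([-1.0, 0.0, 1.0])
    output = 0.5 * (tanh_out * (action_bound_high - action_bound_low)
                    + (action_bound_high + action_bound_low))
    print(output)  # [-2.  0.  2.] -> the ends of the tanh range hit the action bounds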
Example #3
    def __init__(self,
                 state,
                 action,
                 state_dims,
                 action_dims,
                 args,
                 is_training=False,
                 scope='critic'):
        # state - State input to pass through the network
        # action - Action input for which the Q value should be predicted

        self.state = state
        self.action = action
        self.state_dims = np.prod(
            state_dims
        )  #Used to calculate the fan_in of the state layer (e.g. if state_dims is (3,2) fan_in should equal 6)
        self.action_dims = np.prod(action_dims)
        self.args = args
        self.is_training = is_training
        self.scope = scope

        # Networks params
        dense1_size = self.args.dense1_size
        dense2_size = self.args.dense2_size
        final_layer_init = self.args.final_layer_init

        with tf.variable_scope(self.scope):
            self.input_norm = batchnorm(self.state,
                                        self.is_training,
                                        scope='input_norm')

            self.dense1_mul = dense(
                self.input_norm,
                dense1_size,
                weight_init=tf.random_uniform_initializer(
                    (-1 / tf.sqrt(tf.to_float(self.state_dims))),
                    1 / tf.sqrt(tf.to_float(self.state_dims))),
                bias_init=tf.random_uniform_initializer(
                    (-1 / tf.sqrt(tf.to_float(self.state_dims))),
                    1 / tf.sqrt(tf.to_float(self.state_dims))),
                scope='dense1')

            self.dense1_bn = batchnorm(self.dense1_mul,
                                       self.is_training,
                                       scope='dense1')

            self.dense1 = relu(self.dense1_bn, scope='dense1')

            #Merge first dense layer with action input to get second dense layer
            self.dense2a = dense(
                self.dense1,
                dense2_size,
                weight_init=tf.random_uniform_initializer(
                    (-1 /
                     tf.sqrt(tf.to_float(dense1_size + self.action_dims))),
                    1 / tf.sqrt(tf.to_float(dense1_size + self.action_dims))),
                bias_init=tf.random_uniform_initializer(
                    (-1 /
                     tf.sqrt(tf.to_float(dense1_size + self.action_dims))),
                    1 / tf.sqrt(tf.to_float(dense1_size + self.action_dims))),
                scope='dense2a')

            self.dense2b = dense(
                self.action,
                dense2_size,
                weight_init=tf.random_uniform_initializer(
                    (-1 /
                     tf.sqrt(tf.to_float(dense1_size + self.action_dims))),
                    1 / tf.sqrt(tf.to_float(dense1_size + self.action_dims))),
                bias_init=tf.random_uniform_initializer(
                    (-1 /
                     tf.sqrt(tf.to_float(dense1_size + self.action_dims))),
                    1 / tf.sqrt(tf.to_float(dense1_size + self.action_dims))),
                scope='dense2b')

            self.dense2 = relu(self.dense2a + self.dense2b, scope='dense2')

            self.output = dense(self.dense2,
                                1,
                                weight_init=tf.random_uniform_initializer(
                                    -1 * final_layer_init, final_layer_init),
                                bias_init=tf.random_uniform_initializer(
                                    -1 * final_layer_init, final_layer_init),
                                scope='output')

            self.network_params = tf.trainable_variables(scope=self.scope)

            self.action_grads = tf.gradients(
                self.output, self.action
            )  # Gradient of value output wrt action input - used to train actor network
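
The action_grads tensor is typically fed back into the actor's update, DDPG-style: the critic's gradient with respect to the action is pushed through the actor's parameters. A sketch of that wiring (the actor/critic object names and learning rate are assumptions, not part of this example):

    # Hypothetical DDPG-style actor update using critic.action_grads; names are illustrative.
    actor_param_grads = tf.gradients(actor.output, actor.network_params,
                                     grad_ys=-critic.action_grads[0])  # negate to ascend Q
    train_actor_op = tf.train.AdamOptimizer(1e-4).apply_gradients(
        list(zip(actor_param_grads, actor.network_params)))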
Example #4
    def __init__(self,
                 state,
                 audio,
                 state_dims,
                 action_dims,
                 action_bound_low,
                 action_bound_high,
                 args,
                 is_training=False,
                 scope='actor'):
        # state - State input to pass through the network
        # action_bound_low/high - the tanh output is in [-1, 1] and is rescaled to lie within [action_bound_low, action_bound_high]

        self.state = state
        self.audio = audio
        self.state_dims = np.prod(
            state_dims
        )  #Used to calculate the fan_in of the state layer (e.g. if state_dims is (3,2) fan_in should equal 6)
        self.action_dims = np.prod(action_dims)
        self.action_bound_low = action_bound_low
        self.action_bound_high = action_bound_high
        self.args = args
        self.is_training = is_training
        self.scope = scope

        # Networks params
        dense1_size = self.args.dense1_size
        dense2_size = self.args.dense2_size
        final_layer_init = self.args.final_layer_init

        with tf.variable_scope(self.scope):
            self.fc1_a = tf.layers.dense(self.audio, 50, tf.nn.relu)
            self.fc2_a = tf.layers.dense(self.fc1_a, 50, tf.nn.relu)
            self.final_flat = tf.concat(
                [createNetwork_cnn(self.state), self.fc2_a], 1)
            # self.state_dims = np.prod(self.final_flat)
            # print(np.shape(self.final_flat), self.state_dims)

            self.input_norm = batchnorm(self.final_flat,
                                        self.is_training,
                                        scope='input_norm')

            self.dense1_mul = dense(
                self.input_norm,
                dense1_size,
                weight_init=tf.random_uniform_initializer(
                    (-1 / tf.sqrt(tf.to_float(self.state_dims))),
                    1 / tf.sqrt(tf.to_float(self.state_dims))),
                bias_init=tf.random_uniform_initializer(
                    (-1 / tf.sqrt(tf.to_float(self.state_dims))),
                    1 / tf.sqrt(tf.to_float(self.state_dims))),
                scope='dense1')

            self.dense1_bn = batchnorm(self.dense1_mul,
                                       self.is_training,
                                       scope='dense1')

            self.dense1 = relu(self.dense1_bn, scope='dense1')

            self.dense2_mul = dense(
                self.dense1,
                dense2_size,
                weight_init=tf.random_uniform_initializer(
                    (-1 / tf.sqrt(tf.to_float(dense1_size))),
                    1 / tf.sqrt(tf.to_float(dense1_size))),
                bias_init=tf.random_uniform_initializer(
                    (-1 / tf.sqrt(tf.to_float(dense1_size))),
                    1 / tf.sqrt(tf.to_float(dense1_size))),
                scope='dense2')

            self.dense2_bn = batchnorm(self.dense2_mul,
                                       self.is_training,
                                       scope='dense2')

            self.dense2 = relu(self.dense2_bn, scope='dense2')

            self.output_mul = dense(self.dense2,
                                    self.action_dims,
                                    weight_init=tf.random_uniform_initializer(
                                        -1 * final_layer_init,
                                        final_layer_init),
                                    bias_init=tf.random_uniform_initializer(
                                        -1 * final_layer_init,
                                        final_layer_init),
                                    scope='output')

            self.output_tanh = tanh(self.output_mul, scope='output')

            # Scale tanh output to lower and upper action bounds
            self.output = tf.multiply(
                0.5,
                tf.multiply(self.output_tanh,
                            (self.action_bound_high - self.action_bound_low)) +
                (self.action_bound_high + self.action_bound_low))

            self.network_params = tf.trainable_variables(scope=self.scope)
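
Example #4 also calls createNetwork_cnn, which is not shown. A hypothetical stand-in that produces the flattened visual features the concat expects (the filter sizes and depth here are assumptions, not the example's real architecture):

    # Hypothetical CNN feature extractor standing in for createNetwork_cnn; the real architecture may differ.
    def createNetwork_cnn(state):
        conv1 = tf.layers.conv2d(state, 32, 8, strides=4, activation=tf.nn.relu)
        conv2 = tf.layers.conv2d(conv1, 64, 4, strides=2, activation=tf.nn.relu)
        return tf.layers.flatten(conv2)  # [batch, features], so it can be concatenated with fc2_a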