def _setup_stochastic_policy(self, obs, action, reuse=False, scope="pi"):
    """Create the variables of a stochastic policy.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    action : tf.compat.v1.placeholder
        the input action placeholder
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the scope name of the policy
    """
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        pi_h = obs

        # create the hidden layers
        for i, layer_size in enumerate(self.layers):
            pi_h = layer(
                pi_h, layer_size, 'fc{}'.format(i),
                act_fun=self.act_fun,
                layer_norm=self.layer_norm)

        # create the output mean
        policy_mean = layer(
            pi_h, self.ac_space.shape[0], 'mean',
            act_fun=None,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-3e-3, maxval=3e-3))

        # create the output log_std
        log_std = layer(
            pi_h, self.ac_space.shape[0], 'log_std',
            act_fun=None,
        )

        # OpenAI Variation to cap the standard deviation
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

        std = tf.exp(log_std)

        # Reparameterization trick
        policy = policy_mean + tf.random.normal(tf.shape(policy_mean)) * std
        logp_pi = gaussian_likelihood(policy, policy_mean, log_std)
        logp_ac = gaussian_likelihood(action, policy_mean, log_std)

        # Apply squashing and account for it in the probability
        _, _, logp_ac = apply_squashing_func(policy_mean, action, logp_ac)
        _, policy, _ = apply_squashing_func(policy_mean, policy, logp_pi)

    # Store the variables under their respective parameters.
    self.policy = policy
    self.logp_ac = logp_ac
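# For reference, the `gaussian_likelihood` and `apply_squashing_func` helpers
# used above are defined elsewhere in the library. The sketch below follows
# the standard SAC formulation (diagonal-Gaussian log-likelihood and a tanh
# squashing correction); the `EPS` constant and the exact correction term are
# assumptions and may differ from the library's own implementation.
import numpy as np
import tensorflow as tf

EPS = 1e-6  # small constant to avoid division by zero and log(0)


def gaussian_likelihood(input_, mu_, log_std):
    """Log-likelihood of `input_` under a diagonal Gaussian N(mu_, exp(log_std)^2)."""
    pre_sum = -0.5 * (
        ((input_ - mu_) / (tf.exp(log_std) + EPS)) ** 2
        + 2 * log_std
        + np.log(2 * np.pi)
    )
    return tf.reduce_sum(pre_sum, axis=1)


def apply_squashing_func(mu_, pi_, logp_pi):
    """Squash actions through tanh and correct the log-probability accordingly."""
    deterministic_policy = tf.nn.tanh(mu_)
    policy = tf.nn.tanh(pi_)
    # change-of-variables correction: subtract the log-determinant of the
    # tanh Jacobian from the Gaussian log-probability
    logp_pi -= tf.reduce_sum(tf.math.log(1 - policy ** 2 + EPS), axis=1)
    return deterministic_policy, policy, logp_pi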
def make_actor(self, obs, reuse=False, scope="pi"):
    """Create an actor tensor.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the scope name of the actor

    Returns
    -------
    tf.Variable
        the output from the actor
    """
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        pi_h = obs

        # zero out the fingerprint observations for the worker policy
        if self.zero_fingerprint:
            pi_h = self._remove_fingerprint(
                pi_h,
                self.ob_space.shape[0],
                self.fingerprint_dim,
                self.co_space.shape[0]
            )

        # create the hidden layers
        for i, layer_size in enumerate(self.layers):
            pi_h = layer(
                pi_h, layer_size, 'fc{}'.format(i),
                act_fun=self.act_fun,
                layer_norm=self.layer_norm)

        # create the output layer
        policy = layer(
            pi_h, self.ac_space.shape[0], 'output',
            act_fun=tf.nn.tanh,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-3e-3, maxval=3e-3))

        # scaling terms to the output from the policy
        ac_means = (self.ac_space.high + self.ac_space.low) / 2.
        ac_magnitudes = (self.ac_space.high - self.ac_space.low) / 2.

        policy = ac_means + ac_magnitudes * tf.cast(policy, tf.float32)

    return policy
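# The scaling terms above map the tanh output of the actor from [-1, 1] onto
# the bounds of the action space. A quick sanity check with hypothetical
# bounds (illustration only, not part of the library):
import numpy as np

low, high = np.array([-2.0, 0.0]), np.array([2.0, 1.0])
ac_means = (high + low) / 2.       # [0.0, 0.5]
ac_magnitudes = (high - low) / 2.  # [2.0, 0.5]

# a tanh output of -1 maps to `low`, and +1 maps to `high`
assert np.allclose(ac_means + ac_magnitudes * -1.0, low)
assert np.allclose(ac_means + ac_magnitudes * 1.0, high)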
def make_critic(self, obs, action, reuse=False, scope="qf"):
    """Create a critic tensor.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    action : tf.compat.v1.placeholder
        the input action placeholder
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the scope name of the critic

    Returns
    -------
    tf.Variable
        the output from the critic
    """
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        # concatenate the observations and actions
        qf_h = tf.concat([obs, action], axis=-1)

        # zero out the fingerprint observations for the worker policy
        if self.zero_fingerprint:
            qf_h = self._remove_fingerprint(
                qf_h,
                self.ob_space.shape[0],
                self.fingerprint_dim,
                self.co_space.shape[0] + self.ac_space.shape[0]
            )

        # create the hidden layers
        for i, layer_size in enumerate(self.layers):
            qf_h = layer(
                qf_h, layer_size, 'fc{}'.format(i),
                act_fun=self.act_fun,
                layer_norm=self.layer_norm)

        # create the output layer
        qvalue_fn = layer(
            qf_h, 1, 'qf_output',
            kernel_initializer=tf.random_uniform_initializer(
                minval=-3e-3, maxval=3e-3))

    return qvalue_fn
def _setup_deterministic_policy(self, obs, reuse=False, scope="pi"):
    """Create the variables of a deterministic policy.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the scope name of the policy
    """
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        pi_h = obs

        # create the hidden layers
        for i, layer_size in enumerate(self.layers):
            pi_h = layer(
                pi_h, layer_size, 'fc{}'.format(i),
                act_fun=self.act_fun,
                layer_norm=self.layer_norm)

        # create the output layer
        policy = layer(
            pi_h, self.ac_space.shape[0], 'output',
            act_fun=tf.nn.tanh,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-3e-3, maxval=3e-3))

        # scaling terms to the output from the policy
        ac_means = (self.ac_space.high + self.ac_space.low) / 2.
        ac_magnitudes = (self.ac_space.high - self.ac_space.low) / 2.

        policy = ac_means + ac_magnitudes * tf.cast(policy, tf.float32)

    # Store the variables under their respective parameters.
    self.policy = policy
def make_critic(self, obs, action, reuse=False, scope="qf"):
    """Create a critic tensor.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    action : tf.compat.v1.placeholder
        the input action placeholder
    reuse : bool
        whether or not to reuse parameters
    scope : str
        an outer scope term

    Returns
    -------
    tf.Variable
        the output from the critic
    """
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        # concatenate the observations and actions
        qf_h = tf.concat([obs, action], axis=-1)

        # create the hidden layers
        for i, layer_size in enumerate(self.layers):
            qf_h = layer(
                qf_h, layer_size, 'fc{}'.format(i),
                act_fun=self.act_fun,
                layer_norm=self.layer_norm)

        # create the output layer
        qvalue_fn = layer(
            qf_h, 1, 'qf_output',
            kernel_initializer=tf.random_uniform_initializer(
                minval=-3e-3, maxval=3e-3))

    return qvalue_fn
def make_critic(self, obs, action=None, reuse=False, scope="value_fns",
                create_qf=True, create_vf=True):
    """Create the critic variables.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    action : tf.compat.v1.placeholder
        the input action placeholder
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the scope name of the critic
    create_qf : bool
        whether to create the Q-functions
    create_vf : bool
        whether to create the value function

    Returns
    -------
    tf.Variable
        the output from the first Q-function. Set to None if `create_qf` is
        False.
    tf.Variable
        the output from the second Q-function. Set to None if `create_qf` is
        False.
    tf.Variable
        the output from the value function. Set to None if `create_vf` is
        False.
    """
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        # zero out the fingerprint observations for the worker policy
        if self.zero_fingerprint:
            obs = self._remove_fingerprint(
                obs,
                self.ob_space.shape[0],
                self.fingerprint_dim,
                self.co_space.shape[0]
            )

        # Value function
        if create_vf:
            with tf.compat.v1.variable_scope("vf", reuse=reuse):
                vf_h = obs

                # create the hidden layers
                for i, layer_size in enumerate(self.layers):
                    vf_h = layer(
                        vf_h, layer_size, 'fc{}'.format(i),
                        act_fun=self.act_fun,
                        layer_norm=self.layer_norm
                    )

                # create the output layer
                value_fn = layer(
                    vf_h, 1, 'vf_output',
                    kernel_initializer=tf.random_uniform_initializer(
                        minval=-3e-3, maxval=3e-3)
                )
        else:
            value_fn = None

        # Double Q values to reduce overestimation
        if create_qf:
            with tf.compat.v1.variable_scope('qf1', reuse=reuse):
                # concatenate the observations and actions
                qf1_h = tf.concat([obs, action], axis=-1)

                # create the hidden layers
                for i, layer_size in enumerate(self.layers):
                    qf1_h = layer(
                        qf1_h, layer_size, 'fc{}'.format(i),
                        act_fun=self.act_fun,
                        layer_norm=self.layer_norm
                    )

                # create the output layer
                qf1 = layer(
                    qf1_h, 1, 'qf_output',
                    kernel_initializer=tf.random_uniform_initializer(
                        minval=-3e-3, maxval=3e-3)
                )

            with tf.compat.v1.variable_scope('qf2', reuse=reuse):
                # concatenate the observations and actions
                qf2_h = tf.concat([obs, action], axis=-1)

                # create the hidden layers
                for i, layer_size in enumerate(self.layers):
                    qf2_h = layer(
                        qf2_h, layer_size, 'fc{}'.format(i),
                        act_fun=self.act_fun,
                        layer_norm=self.layer_norm
                    )

                # create the output layer
                qf2 = layer(
                    qf2_h, 1, 'qf_output',
                    kernel_initializer=tf.random_uniform_initializer(
                        minval=-3e-3, maxval=3e-3)
                )
        else:
            qf1, qf2 = None, None

    return qf1, qf2, value_fn
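# For context, the three outputs returned above are typically combined into
# SAC-style Bellman targets during training. The helper below is a sketch of
# the standard formulation only; its name, signature, and the default
# `gamma`/`alpha` values are assumptions, not this library's training code.
import tensorflow as tf


def sac_value_targets(rew, done, value_target, qf1_pi, qf2_pi, logp_pi,
                      gamma=0.99, alpha=0.2):
    """Return the targets used to train the Q-functions and value function.

    `value_target` is the target-network value of the next observation,
    `qf1_pi`/`qf2_pi` are the Q-functions evaluated at actions sampled from
    the current policy, and `logp_pi` is the log-probability of those actions.
    """
    # Q-function target: reward plus the discounted value of the next state
    q_backup = tf.stop_gradient(rew + (1. - done) * gamma * value_target)

    # Value-function target: soft value of the current policy, using the
    # minimum of the two Q-functions to reduce overestimation bias
    min_qf_pi = tf.minimum(qf1_pi, qf2_pi)
    v_backup = tf.stop_gradient(min_qf_pi - alpha * logp_pi)

    return q_backup, v_backup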
def make_actor(self, obs, action, reuse=False, scope="pi"):
    """Create the actor variables.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    action : tf.compat.v1.placeholder
        the input action placeholder
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the scope name of the actor

    Returns
    -------
    tf.Variable
        the output from the deterministic actor
    tf.Variable
        the output from the stochastic actor
    tf.Variable
        the log-probability of the action sampled by the stochastic actor
    tf.Variable
        the log-probability of the fixed action provided via the `action`
        placeholder
    """
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        pi_h = obs

        # zero out the fingerprint observations for the worker policy
        if self.zero_fingerprint:
            pi_h = self._remove_fingerprint(
                pi_h,
                self.ob_space.shape[0],
                self.fingerprint_dim,
                self.co_space.shape[0]
            )

        # create the hidden layers
        for i, layer_size in enumerate(self.layers):
            pi_h = layer(
                pi_h, layer_size, 'fc{}'.format(i),
                act_fun=self.act_fun,
                layer_norm=self.layer_norm
            )

        # create the output mean
        policy_mean = layer(
            pi_h, self.ac_space.shape[0], 'mean',
            act_fun=None,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-3e-3, maxval=3e-3)
        )

        # create the output log_std
        log_std = layer(
            pi_h, self.ac_space.shape[0], 'log_std',
            act_fun=None,
        )

        # OpenAI Variation to cap the standard deviation
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

        std = tf.exp(log_std)

        # Reparameterization trick
        policy = policy_mean + tf.random.normal(tf.shape(policy_mean)) * std
        logp_pi = gaussian_likelihood(policy, policy_mean, log_std)
        logp_ac = gaussian_likelihood(action, policy_mean, log_std)

        # Apply squashing and account for it in the probability
        _, _, logp_ac = apply_squashing_func(
            policy_mean, action, logp_ac)
        deterministic_policy, policy, logp_pi = apply_squashing_func(
            policy_mean, policy, logp_pi)

    return deterministic_policy, policy, logp_pi, logp_ac
def test_layer(self):
    """Check the functionality of the layer() method.

    This method is tested for the following features:

    1. the number of outputs from the layer equals num_outputs
    2. the name is properly used
    3. the proper activation function is applied if requested
    4. layer_norm is applied if requested
    """
    # =================================================================== #
    # test case 1                                                         #
    # =================================================================== #

    # Choose a random number of outputs.
    num_outputs = random.randint(1, 10)

    # Create the layer.
    out_val = layer(
        val=tf.compat.v1.placeholder(
            tf.float32,
            shape=(None, 1),
            name='input_test1',
        ),
        num_outputs=num_outputs,
        name="test1",
    )

    # Test the number of outputs.
    self.assertEqual(out_val.shape[-1], num_outputs)

    # Clear the graph.
    tf.compat.v1.reset_default_graph()

    # =================================================================== #
    # test case 2                                                         #
    # =================================================================== #

    # Create the layer.
    out_val = layer(
        val=tf.compat.v1.placeholder(
            tf.float32,
            shape=(None, 1),
            name='input_test2',
        ),
        num_outputs=num_outputs,
        name="test2",
    )

    # Test the name matches what is expected.
    self.assertEqual(out_val.name, "test2/BiasAdd:0")

    # Clear the graph.
    tf.compat.v1.reset_default_graph()

    # =================================================================== #
    # test case 3                                                         #
    # =================================================================== #

    # Create the layer.
    out_val = layer(
        val=tf.compat.v1.placeholder(
            tf.float32,
            shape=(None, 1),
            name='input_test3',
        ),
        act_fun=tf.nn.relu,
        num_outputs=num_outputs,
        name="test3",
    )

    # Test that the name matches the activation function that was added.
    self.assertEqual(out_val.name, "Relu:0")

    # Clear the graph.
    tf.compat.v1.reset_default_graph()

    # =================================================================== #
    # test case 4                                                         #
    # =================================================================== #

    # Create the layer.
    _ = layer(
        val=tf.compat.v1.placeholder(
            tf.float32,
            shape=(None, 1),
            name='input_test4',
        ),
        layer_norm=True,
        num_outputs=num_outputs,
        name="test4",
    )

    # Test that the LayerNorm layer was added.
    self.assertListEqual(
        sorted([var.name for var in get_trainable_vars()]),
        ['LayerNorm/beta:0',
         'LayerNorm/gamma:0',
         'test4/bias:0',
         'test4/kernel:0']
    )

    # Clear the graph.
    tf.compat.v1.reset_default_graph()
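# For context, the `layer()` helper exercised by this test is defined
# elsewhere in the library. The sketch below is merely consistent with the
# behavior the test checks (dense op created under `name`, LayerNorm applied
# to the pre-activation output, activation applied last); the default
# `kernel_initializer` is an assumption, and `tf.contrib.layers.layer_norm`
# is only available under TensorFlow 1.x.
import tensorflow as tf


def layer(val, num_outputs, name, act_fun=None,
          kernel_initializer=tf.random_uniform_initializer(
              minval=-3e-3, maxval=3e-3),
          layer_norm=False):
    """Create a fully-connected layer with optional LayerNorm and activation."""
    # the dense output tensor is named "<name>/BiasAdd:0" (test case 2)
    val = tf.compat.v1.layers.dense(
        val, num_outputs, name=name,
        kernel_initializer=kernel_initializer)
    if layer_norm:
        # creates "LayerNorm/gamma:0" and "LayerNorm/beta:0" (test case 4)
        val = tf.contrib.layers.layer_norm(val, center=True, scale=True)
    if act_fun is not None:
        # the activation tensor is created outside the dense scope,
        # e.g. "Relu:0" for tf.nn.relu (test case 3)
        val = act_fun(val)
    return val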