def _setup_stochastic_policy(self, obs, action, reuse=False, scope="pi"):
    """Create the variables of a stochastic policy.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    action : tf.compat.v1.placeholder
        the input action placeholder
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the scope name of the policy
    """
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        pi_h = obs

        # create the hidden layers
        for i, layer_size in enumerate(self.layers):
            pi_h = layer(
                pi_h, layer_size, 'fc{}'.format(i),
                act_fun=self.act_fun,
                layer_norm=self.layer_norm)

        # create the output mean
        policy_mean = layer(
            pi_h, self.ac_space.shape[0], 'mean',
            act_fun=None,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-3e-3, maxval=3e-3))

        # create the output log_std
        log_std = layer(
            pi_h, self.ac_space.shape[0], 'log_std',
            act_fun=None,
        )

        # OpenAI Variation to cap the standard deviation
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

        std = tf.exp(log_std)

        # Reparameterization trick
        policy = policy_mean + tf.random.normal(tf.shape(policy_mean)) * std
        logp_pi = gaussian_likelihood(policy, policy_mean, log_std)
        logp_ac = gaussian_likelihood(action, policy_mean, log_std)

        # Apply squashing and account for it in the probability
        _, _, logp_ac = apply_squashing_func(policy_mean, action, logp_ac)
        _, policy, _ = apply_squashing_func(policy_mean, policy, logp_pi)

    # Store the variables under their respective parameters.
    self.policy = policy
    self.logp_ac = logp_ac
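# The `gaussian_likelihood` helper called by the reparameterization block
# above is not defined in this section. A minimal sketch of what it is
# assumed to compute is given below: the log-likelihood of `input_` under a
# diagonal Gaussian with mean `mu_` and log standard deviation `log_std`,
# summed over the action dimensions. The EPS constant and the exact
# formulation are assumptions drawn from common SAC implementations, not
# taken from this code base.
import numpy as np
import tensorflow as tf

EPS = 1e-6  # assumed numerical-stability constant


def gaussian_likelihood_sketch(input_, mu_, log_std):
    """Return the log-probability of input_ under N(mu_, exp(log_std)**2)."""
    pre_sum = -0.5 * (
        ((input_ - mu_) / (tf.exp(log_std) + EPS)) ** 2
        + 2 * log_std
        + np.log(2 * np.pi)
    )
    return tf.reduce_sum(pre_sum, axis=1)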
def test_apply_squashing(self):
    """Check the functionality of the apply_squashing_func() method."""
    # Some inputs
    mu_ = tf.constant([[0, 0.5, 1, 2]], dtype=tf.float32)
    pi_ = tf.constant([[0, 0.5, 1, 2]], dtype=tf.float32)
    logp_pi = tf.constant([[0, 0.5, 1, 2]], dtype=tf.float32)

    # Run the function.
    det_policy, policy, logp_pi = apply_squashing_func(mu_, pi_, logp_pi)

    # Initialize everything.
    sess = tf.compat.v1.Session()
    sess.run(tf.compat.v1.global_variables_initializer())

    # Test the output from the deterministic squashed output.
    np.testing.assert_almost_equal(
        sess.run(det_policy),
        [[0., 0.4621172, 0.7615942, 0.9640276]])

    # Clear the graph.
    tf.compat.v1.reset_default_graph()
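# The assertion above pins down the deterministic branch of
# apply_squashing_func: its output equals tanh(mu_) element-wise
# (tanh(0.5) ~= 0.4621172, tanh(1) ~= 0.7615942, tanh(2) ~= 0.9640276).
# The sketch below is consistent with that check; the exact log-probability
# correction used by this code base is an assumption (the standard
# change-of-variables term for a tanh squash), shown for illustration only.
import tensorflow as tf


def apply_squashing_sketch(mu_, pi_, logp_pi):
    """Squash a Gaussian policy with tanh and correct its log-probability."""
    deterministic_policy = tf.nn.tanh(mu_)
    policy = tf.nn.tanh(pi_)
    # Assumed log-det-Jacobian correction for the tanh squash:
    # subtract sum(log(1 - tanh(pi)**2)) over the action dimensions.
    logp_pi -= tf.reduce_sum(tf.math.log(1 - policy ** 2 + 1e-6), axis=1)
    return deterministic_policy, policy, logp_pi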
def make_actor(self, obs, action, reuse=False, scope="pi"):
    """Create the actor variables.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    action : tf.compat.v1.placeholder
        the input action placeholder
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the scope name of the actor

    Returns
    -------
    tf.Variable
        the output from the deterministic actor
    tf.Variable
        the output from the stochastic actor
    tf.Variable
        the log-probability of the action sampled by the stochastic actor,
        given the input observation
    tf.Variable
        the log-probability of the fixed input action, given the input
        observation
    """
    # Initial image pre-processing (for convolutional policies).
    if self.model_params["model_type"] == "conv":
        pi_h = create_conv(
            obs=obs,
            image_height=self.model_params["image_height"],
            image_width=self.model_params["image_width"],
            image_channels=self.model_params["image_channels"],
            ignore_flat_channels=self.model_params["ignore_flat_channels"],
            ignore_image=self.model_params["ignore_image"],
            filters=self.model_params["filters"],
            kernel_sizes=self.model_params["kernel_sizes"],
            strides=self.model_params["strides"],
            act_fun=self.model_params["act_fun"],
            layer_norm=self.model_params["layer_norm"],
            scope=scope,
            reuse=reuse,
        )
    else:
        pi_h = obs

    # Create the model.
    policy_mean, log_std = create_fcnet(
        obs=pi_h,
        layers=self.model_params["layers"],
        num_output=self.ac_space.shape[0],
        stochastic=True,
        act_fun=self.model_params["act_fun"],
        layer_norm=self.model_params["layer_norm"],
        scope=scope,
        reuse=reuse,
    )

    # OpenAI Variation to cap the standard deviation
    log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

    std = tf.exp(log_std)

    # Reparameterization trick
    policy = policy_mean + tf.random.normal(tf.shape(policy_mean)) * std
    logp_pi = gaussian_likelihood(policy, policy_mean, log_std)
    logp_ac = gaussian_likelihood(action, policy_mean, log_std)

    # Apply squashing and account for it in the probability
    _, _, logp_ac = apply_squashing_func(policy_mean, action, logp_ac)
    deterministic_policy, policy, logp_pi = apply_squashing_func(
        policy_mean, policy, logp_pi)

    return deterministic_policy, policy, logp_pi, logp_ac
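# LOG_STD_MIN and LOG_STD_MAX are module-level constants that do not appear
# in this section. The values used below (-20 and 2, common in public SAC
# implementations) are assumptions for illustration only; the point is that
# clipping log_std keeps the reparameterized noise scale in a sane range.
import numpy as np

LOG_STD_MIN_ASSUMED = -20.0
LOG_STD_MAX_ASSUMED = 2.0

log_std = np.array([-50.0, -3.0, 0.0, 5.0])
std = np.exp(np.clip(log_std, LOG_STD_MIN_ASSUMED, LOG_STD_MAX_ASSUMED))
# std is now bounded to [exp(-20), exp(2)] ~= [2.1e-09, 7.39], so the
# reparameterized sample mean + N(0, 1) * std neither blows up nor collapses
# to an exactly deterministic action before the tanh squash.
print(std)  # [2.06115362e-09 4.97870684e-02 1.00000000e+00 7.38905610e+00]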
def make_actor(self, obs, action, reuse=False, scope="pi"):
    """Create the actor variables.

    Parameters
    ----------
    obs : tf.compat.v1.placeholder
        the input observation placeholder
    action : tf.compat.v1.placeholder
        the input action placeholder
    reuse : bool
        whether or not to reuse parameters
    scope : str
        the scope name of the actor

    Returns
    -------
    tf.Variable
        the output from the deterministic actor
    tf.Variable
        the output from the stochastic actor
    tf.Variable
        the log-probability of the action sampled by the stochastic actor,
        given the input observation
    tf.Variable
        the log-probability of the fixed input action, given the input
        observation
    """
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        pi_h = obs

        # zero out the fingerprint observations for the worker policy
        if self.zero_fingerprint:
            pi_h = self._remove_fingerprint(
                pi_h,
                self.ob_space.shape[0],
                self.fingerprint_dim,
                self.co_space.shape[0]
            )

        # create the hidden layers
        for i, layer_size in enumerate(self.layers):
            pi_h = layer(
                pi_h, layer_size, 'fc{}'.format(i),
                act_fun=self.act_fun,
                layer_norm=self.layer_norm
            )

        # create the output mean
        policy_mean = layer(
            pi_h, self.ac_space.shape[0], 'mean',
            act_fun=None,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-3e-3, maxval=3e-3)
        )

        # create the output log_std
        log_std = layer(
            pi_h, self.ac_space.shape[0], 'log_std',
            act_fun=None,
        )

        # OpenAI Variation to cap the standard deviation
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

        std = tf.exp(log_std)

        # Reparameterization trick
        policy = policy_mean + tf.random.normal(tf.shape(policy_mean)) * std
        logp_pi = gaussian_likelihood(policy, policy_mean, log_std)
        logp_ac = gaussian_likelihood(action, policy_mean, log_std)

        # Apply squashing and account for it in the probability
        _, _, logp_ac = apply_squashing_func(
            policy_mean, action, logp_ac)
        deterministic_policy, policy, logp_pi = apply_squashing_func(
            policy_mean, policy, logp_pi)

    return deterministic_policy, policy, logp_pi, logp_ac
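# `self._remove_fingerprint` is defined elsewhere in the class. Based on its
# arguments (observation size, fingerprint dimension, context size), it is
# assumed to zero out the fingerprint elements appended to the end of the raw
# observation before the hidden layers are built. The standalone sketch below
# illustrates that assumption and is not the actual implementation.
import tensorflow as tf


def remove_fingerprint_sketch(val, ob_dim, fingerprint_dim, additional_dim):
    """Zero the fingerprint slice of a flat (obs ++ context) input tensor."""
    mask = tf.constant(
        [1.0] * (ob_dim - fingerprint_dim)  # keep the raw observation
        + [0.0] * fingerprint_dim           # zero the fingerprint terms
        + [1.0] * additional_dim,           # keep the contextual terms
        dtype=tf.float32,
    )
    # Broadcast the (ob_dim + additional_dim)-wide mask over the batch.
    return val * mask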