import pytest
import tensorflow as tf  # Assumes the TF 1.x-style API (tf.Graph, tf.Session) used throughout this code.

# Assumed import path for GaussianDistribution; adjust to the local module layout if it differs.
from mlagents.trainers.distributions import GaussianDistribution

# Assumed action-space size, matching the two-column logits variables used in the tests below.
VECTOR_ACTION_SPACE = [2]


def test_tanh_distribution():
    with tf.Graph().as_default():
        logits = tf.Variable(initial_value=[[0, 0]], trainable=True, dtype=tf.float32)
        distribution = GaussianDistribution(
            logits, act_size=VECTOR_ACTION_SPACE, reparameterize=False, tanh_squash=True
        )
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            output = sess.run(distribution.sample)
            for _ in range(10):
                output = sess.run([distribution.sample, distribution.log_probs])
                for out in output:
                    assert out.shape[1] == VECTOR_ACTION_SPACE[0]
                # Assert the squashed action never leaves [-1, 1].
                action = output[0][0]
                for act in action:
                    assert -1 <= act <= 1
                output = sess.run([distribution.total_log_probs])
                assert output[0].shape[0] == 1
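# Illustrative sketch (not part of the original tests): a tanh-squashed Gaussian keeps samples
# in (-1, 1) by construction, and its log-probability picks up a change-of-variables correction,
# log p(a) = log p(u) - sum_i log(1 - tanh(u_i)^2 + eps), where u is the pre-squash sample.
# This is a plain NumPy restatement of the usual SAC-style squash, assumed for illustration only,
# not a dump of GaussianDistribution's internals.
def _tanh_squash_log_prob_sketch():
    import numpy as np

    rng = np.random.default_rng(0)
    mu, sigma, eps = 0.0, 1.0, 1e-6
    u = rng.normal(mu, sigma, size=(5, 2))  # pre-squash Gaussian samples
    a = np.tanh(u)  # squashed actions, strictly inside (-1, 1)
    gauss_log_prob = -0.5 * (((u - mu) / sigma) ** 2 + np.log(2 * np.pi * sigma ** 2))
    squashed_log_prob = gauss_log_prob - np.log(1 - a ** 2 + eps)
    assert np.all(np.abs(a) < 1)
    # The Jacobian 1 - tanh(u)^2 is <= 1, so the correction can only raise the log density.
    assert np.all(squashed_log_prob >= gauss_log_prob - 1e-5)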
def test_gaussian_distribution():
    with tf.Graph().as_default():
        logits = tf.Variable(initial_value=[[1, 1]], trainable=True, dtype=tf.float32)
        distribution = GaussianDistribution(
            logits,
            act_size=VECTOR_ACTION_SPACE,
            reparameterize=False,
            tanh_squash=False,
        )
        with tf.Session() as sess:
            init = tf.global_variables_initializer()
            sess.run(init)
            output = sess.run(distribution.sample)
            for _ in range(10):
                output = sess.run([distribution.sample, distribution.log_probs])
                for out in output:
                    assert out.shape[1] == VECTOR_ACTION_SPACE[0]
                output = sess.run([distribution.total_log_probs])
                assert output[0].shape[0] == 1
            # Test that the entropy is correct.
            log_std_tensor = tf.get_default_graph().get_tensor_by_name("log_std/BiasAdd:0")
            feed_dict = {log_std_tensor: [[1.0, 1.0]]}
            entropy = sess.run([distribution.entropy], feed_dict=feed_dict)
            # Entropy with log_std of 1.0 should be 2.42 (see the sanity check below).
            assert pytest.approx(entropy[0], 0.01) == 2.42
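# Where the 2.42 above comes from: the differential entropy of a univariate Gaussian is
# 0.5 * ln(2 * pi * e * sigma^2) = 0.5 * ln(2 * pi * e) + log_std. With log_std = 1.0 that is
# roughly 1.4189 + 1.0 = 2.42, matching the per-dimension value the test asserts. This is a
# plain restatement of the formula, not a call into GaussianDistribution itself.
def test_entropy_formula_sanity():
    import math

    expected = 0.5 * math.log(2 * math.pi * math.e) + 1.0
    assert pytest.approx(expected, 0.01) == 2.42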
def _create_cc_actor(
    self,
    encoded: tf.Tensor,
    tanh_squash: bool = False,
    reparameterize: bool = False,
    condition_sigma_on_obs: bool = True,
) -> None:
    """
    Creates the continuous-control actor model.
    :param encoded: Encoded observation tensor to feed the policy.
    :param tanh_squash: Whether to use a tanh squash on the sampled action, or a clipped output.
    :param reparameterize: Whether we are using the resampling (reparameterization) trick to update the policy.
    :param condition_sigma_on_obs: Whether the sigma of the Gaussian is conditioned on the
        observation encoding rather than learned as a standalone parameter.
    """
    if self.use_recurrent:
        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
            encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
        )
        self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
    else:
        hidden_policy = encoded

    with tf.variable_scope("policy"):
        distribution = GaussianDistribution(
            hidden_policy,
            self.act_size,
            reparameterize=reparameterize,
            tanh_squash=tanh_squash,
            condition_sigma=condition_sigma_on_obs,
        )

    if tanh_squash:
        self.output_pre = distribution.sample
        self.output = tf.identity(self.output_pre, name="action")
    else:
        self.output_pre = distribution.sample
        # Clip and scale output to ensure actions are always within [-1, 1] range.
        output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
        self.output = tf.identity(output_post, name="action")

    self.selected_actions = tf.stop_gradient(self.output)

    self.all_log_probs = tf.identity(distribution.log_probs, name="action_probs")
    self.entropy = distribution.entropy
    # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
    self.total_log_probs = distribution.total_log_probs
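# Illustrative sketch (assumed, not part of the policy class): the non-squashed branch above keeps
# actions in [-1, 1] by clipping the raw Gaussian sample to [-3, 3] and dividing by 3, so a sample
# of 4.5 becomes 1.0 and a sample of 1.5 becomes 0.5. A plain NumPy restatement with hypothetical samples:
def _clip_and_scale_sketch():
    import numpy as np

    raw = np.array([[4.5, 1.5, -7.0, 0.0]], dtype=np.float32)  # hypothetical pre-clip samples
    scaled = np.clip(raw, -3.0, 3.0) / 3.0
    assert np.allclose(scaled, [[1.0, 0.5, -1.0, 0.0]])
    assert np.all(np.abs(scaled) <= 1.0)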