def create_sac_value_head(
    self, stream_names, hidden_input, num_layers, h_size, scope
):
    """
    Creates one value estimator head for each reward signal in stream_names.
    Also creates the node corresponding to the mean of all the value heads in self.value.
    self.value_heads is a dictionary of stream name to node containing the value estimator head for that signal.
    :param stream_names: The list of reward signal names
    :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
    of the hidden input.
    :param num_layers: Number of hidden layers for value network
    :param h_size: size of hidden layers for value network
    :param scope: TF scope for value network.
    """
    with tf.variable_scope(scope):
        value_hidden = ModelUtils.create_vector_observation_encoder(
            hidden_input, h_size, self.activ_fn, num_layers, "encoder", False
        )
        if self.use_recurrent:
            value_hidden, memory_out = ModelUtils.create_recurrent_encoder(
                value_hidden,
                self.value_memory_in,
                self.sequence_length_ph,
                name="lstm_value",
            )
            self.value_memory_out = memory_out
        self.create_value_heads(stream_names, value_hidden)
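
# For reference, a minimal sketch of what a per-stream value head could look like.
# This is a hypothetical helper, not the actual create_value_heads implementation:
# each reward signal gets its own single-unit dense layer on top of value_hidden,
# and the mean across signals gives an aggregate value estimate.
def _example_value_heads(stream_names, value_hidden):
    value_heads = {}
    for name in stream_names:
        # One scalar value estimate per reward signal.
        value_heads[name] = tf.layers.dense(
            value_hidden, 1, name="{}_value".format(name)
        )
    value = tf.reduce_mean(list(value_heads.values()), axis=0)
    return value_heads, value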
def _create_dc_critic(
    self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
    """
    Creates Discrete control critic (value) network.
    :param h_size: Size of hidden linear layers.
    :param num_layers: Number of hidden linear layers.
    :param vis_encode_type: The type of visual encoder to use.
    """
    hidden_stream = ModelUtils.create_observation_streams(
        self.policy.visual_in,
        self.policy.processed_vector_in,
        1,
        h_size,
        num_layers,
        vis_encode_type,
    )[0]

    if self.policy.use_recurrent:
        hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
            hidden_stream,
            self.memory_in,
            self.policy.sequence_length_ph,
            name="lstm_value",
        )
        self.memory_out = memory_value_out
    else:
        hidden_value = hidden_stream

    self.value_heads, self.value = ModelUtils.create_value_heads(
        self.stream_names, hidden_value
    )

    self.all_old_log_probs = tf.placeholder(
        shape=[None, sum(self.policy.act_size)],
        dtype=tf.float32,
        name="old_probabilities",
    )

    _, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer(
        self.all_old_log_probs, self.policy.action_masks, self.policy.act_size
    )

    action_idx = [0] + list(np.cumsum(self.policy.act_size))

    self.old_log_probs = tf.reduce_sum(
        (
            tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.policy.selected_actions[
                            :, action_idx[i] : action_idx[i + 1]
                        ],
                        logits=old_normalized_logits[
                            :, action_idx[i] : action_idx[i + 1]
                        ],
                    )
                    for i in range(len(self.policy.act_size))
                ],
                axis=1,
            )
        ),
        axis=1,
        keepdims=True,
    )
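
# The action_idx arithmetic above slices the concatenated logits back into
# per-branch blocks. A minimal standalone illustration (hypothetical sizes,
# plain numpy):
#
#   import numpy as np
#   act_size = [3, 2]                             # two discrete branches
#   action_idx = [0] + list(np.cumsum(act_size))  # -> [0, 3, 5]
#   # branch 0 occupies columns 0:3, branch 1 occupies columns 3:5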
def _create_cc_actor(
    self,
    encoded: tf.Tensor,
    tanh_squash: bool = False,
    reparameterize: bool = False,
    condition_sigma_on_obs: bool = True,
) -> None:
    """
    Creates Continuous control actor (policy) network.
    :param encoded: Encoded observation tensor the policy is built on.
    :param tanh_squash: Whether to use a tanh function, or a clipped output.
    :param reparameterize: Whether we are using the resampling trick to update the policy.
    :param condition_sigma_on_obs: Whether the standard deviation is conditioned on the observation.
    """
    if self.use_recurrent:
        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
            encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
        )
        self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
    else:
        hidden_policy = encoded

    with tf.variable_scope("policy"):
        distribution = GaussianDistribution(
            hidden_policy,
            self.act_size,
            reparameterize=reparameterize,
            tanh_squash=tanh_squash,
            condition_sigma=condition_sigma_on_obs,
        )

    if tanh_squash:
        self.output_pre = distribution.sample
        self.output = tf.identity(self.output_pre, name="action")
    else:
        self.output_pre = distribution.sample
        # Clip and scale output to ensure actions are always within [-1, 1] range.
        output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
        self.output = tf.identity(output_post, name="action")

    self.selected_actions = tf.stop_gradient(self.output)

    self.all_log_probs = tf.identity(distribution.log_probs, name="action_probs")
    self.entropy = distribution.entropy

    # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
    self.total_log_probs = distribution.total_log_probs
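
# A quick sanity check of the clip-and-scale trick above (pure numpy,
# illustrative only): pre-activations are clipped to [-3, 3] and divided
# by 3, so emitted actions always land in [-1, 1].
#
#   import numpy as np
#   pre = np.array([-5.0, -1.5, 0.0, 2.0, 4.0])
#   post = np.clip(pre, -3, 3) / 3
#   # post -> [-1.0, -0.5, 0.0, ~0.667, 1.0]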
def _create_dc_actor(self, encoded: tf.Tensor) -> None:
    """
    Creates Discrete control actor (policy) network.
    :param encoded: Encoded observation tensor the policy is built on.
    """
    if self.use_recurrent:
        self.prev_action = tf.placeholder(
            shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
        )
        prev_action_oh = tf.concat(
            [
                tf.one_hot(self.prev_action[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)

        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
            hidden_policy,
            self.memory_in,
            self.sequence_length_ph,
            name="lstm_policy",
        )
        self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
    else:
        hidden_policy = encoded

    self.action_masks = tf.placeholder(
        shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
    )

    with tf.variable_scope("policy"):
        distribution = MultiCategoricalDistribution(
            hidden_policy, self.act_size, self.action_masks
        )
    # It's important that we are able to feed_dict a value into this tensor to get the
    # right one-hot encoding, so we can't do identity on it.
    self.output = distribution.sample
    self.all_log_probs = tf.identity(distribution.log_probs, name="action")
    self.selected_actions = tf.stop_gradient(
        distribution.sample_onehot
    )  # In discrete, these are onehot
    self.entropy = distribution.entropy
    self.total_log_probs = distribution.total_log_probs
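
# Illustration of the prev_action one-hot concatenation used in the recurrent
# path above (hypothetical sizes): with act_size = [3, 2] and a previous action
# of (2, 1), one_hot(2, 3) = [0, 0, 1] and one_hot(1, 2) = [0, 1], so the policy
# input is extended by the length-5 vector [0, 0, 1, 0, 1].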
def _create_cc_critic(
    self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
    """
    Creates Continuous control critic (value) network.
    :param h_size: Size of hidden linear layers.
    :param num_layers: Number of hidden linear layers.
    :param vis_encode_type: The type of visual encoder to use.
    """
    hidden_stream = ModelUtils.create_observation_streams(
        self.policy.visual_in,
        self.policy.processed_vector_in,
        1,
        h_size,
        num_layers,
        vis_encode_type,
    )[0]

    if self.policy.use_recurrent:
        hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
            hidden_stream,
            self.memory_in,
            self.policy.sequence_length_ph,
            name="lstm_value",
        )
        self.memory_out = memory_value_out
    else:
        hidden_value = hidden_stream

    self.value_heads, self.value = ModelUtils.create_value_heads(
        self.stream_names, hidden_value
    )

    # Holds the per-dimension log-probabilities recorded at collection time
    # (the node name "old_probabilities" is kept for graph compatibility).
    self.all_old_log_probs = tf.placeholder(
        shape=[None, sum(self.policy.act_size)],
        dtype=tf.float32,
        name="old_probabilities",
    )

    self.old_log_probs = tf.reduce_sum(
        (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
    )
def create_q_heads(
    self,
    stream_names,
    hidden_input,
    num_layers,
    h_size,
    scope,
    reuse=False,
    num_outputs=1,
):
    """
    Creates two Q-value estimator heads (q1 and q2) for each reward signal in stream_names,
    along with the nodes corresponding to the mean of each set of heads across signals.
    :param stream_names: The list of reward signal names
    :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
    of the hidden input.
    :param num_layers: Number of hidden layers for Q network
    :param h_size: size of hidden layers for Q network
    :param scope: TF scope for Q network.
    :param reuse: Whether or not to reuse variables. Useful for creating Q of policy.
    :param num_outputs: Number of outputs of each Q function. If discrete, equal to number of actions.
    :return: Tuple of (q1_heads, q2_heads, q1, q2). The head dictionaries map stream names
    to Q-value nodes; q1 and q2 are the means across all reward signals.
    """
    with tf.variable_scope(self.join_scopes(scope, "q1_encoding"), reuse=reuse):
        q1_hidden = ModelUtils.create_vector_observation_encoder(
            hidden_input, h_size, self.activ_fn, num_layers, "q1_encoder", reuse
        )
        if self.use_recurrent:
            q1_hidden, memory_out = ModelUtils.create_recurrent_encoder(
                q1_hidden,
                self.q1_memory_in,
                self.sequence_length_ph,
                name="lstm_q1",
            )
            self.q1_memory_out = memory_out

        q1_heads = {}
        for name in stream_names:
            _q1 = tf.layers.dense(q1_hidden, num_outputs, name="{}_q1".format(name))
            q1_heads[name] = _q1

        q1 = tf.reduce_mean(list(q1_heads.values()), axis=0)
    with tf.variable_scope(self.join_scopes(scope, "q2_encoding"), reuse=reuse):
        q2_hidden = ModelUtils.create_vector_observation_encoder(
            hidden_input, h_size, self.activ_fn, num_layers, "q2_encoder", reuse
        )
        if self.use_recurrent:
            q2_hidden, memory_out = ModelUtils.create_recurrent_encoder(
                q2_hidden,
                self.q2_memory_in,
                self.sequence_length_ph,
                name="lstm_q2",
            )
            self.q2_memory_out = memory_out

        q2_heads = {}
        for name in stream_names:
            _q2 = tf.layers.dense(q2_hidden, num_outputs, name="{}_q2".format(name))
            q2_heads[name] = _q2

        q2 = tf.reduce_mean(list(q2_heads.values()), axis=0)

    return q1_heads, q2_heads, q1, q2
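
# A minimal sketch of how a caller might consume the twin heads for the usual
# clipped double-Q trick (hypothetical usage, not code from this file): taking
# the elementwise minimum of q1 and q2 per reward signal reduces the
# overestimation bias of a single Q estimator.
#
#   q1_heads, q2_heads, q1, q2 = self.create_q_heads(
#       stream_names, hidden_critic, num_layers, h_size, "q_network"
#   )
#   min_q_heads = {
#       name: tf.minimum(q1_heads[name], q2_heads[name]) for name in stream_names
#   }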
def _create_dc_actor(self, encoded: tf.Tensor) -> None:
    """
    Creates Discrete control actor (policy) network.
    :param encoded: Encoded observation tensor the policy is built on.
    """
    if self.use_recurrent:
        self.prev_action = tf.placeholder(
            shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
        )
        prev_action_oh = tf.concat(
            [
                tf.one_hot(self.prev_action[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)

        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
            hidden_policy,
            self.memory_in,
            self.sequence_length_ph,
            name="lstm_policy",
        )
        self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
    else:
        hidden_policy = encoded

    policy_branches = []
    with tf.variable_scope("policy"):
        # One logits branch per discrete action dimension.
        for size in self.act_size:
            policy_branches.append(
                tf.layers.dense(
                    hidden_policy,
                    size,
                    activation=None,
                    use_bias=False,
                    kernel_initializer=ModelUtils.scaled_init(0.01),
                )
            )
    raw_log_probs = tf.concat(policy_branches, axis=1, name="action_probs")

    self.action_masks = tf.placeholder(
        shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
    )
    output, self.action_probs, normalized_logits = ModelUtils.create_discrete_action_masking_layer(
        raw_log_probs, self.action_masks, self.act_size
    )

    self.output = tf.identity(output)
    self.all_log_probs = tf.identity(normalized_logits, name="action")

    self.action_holder = tf.placeholder(
        shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
    )
    self.action_oh = tf.concat(
        [
            tf.one_hot(self.action_holder[:, i], self.act_size[i])
            for i in range(len(self.act_size))
        ],
        axis=1,
    )
    self.selected_actions = tf.stop_gradient(self.action_oh)

    action_idx = [0] + list(np.cumsum(self.act_size))

    # Entropy per branch: the cross-entropy of a softmax distribution with itself.
    self.entropy = tf.reduce_sum(
        (
            tf.stack(
                [
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=tf.nn.softmax(
                            self.all_log_probs[:, action_idx[i] : action_idx[i + 1]]
                        ),
                        logits=self.all_log_probs[
                            :, action_idx[i] : action_idx[i + 1]
                        ],
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
        ),
        axis=1,
    )

    self.log_probs = tf.reduce_sum(
        (
            tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
                        logits=normalized_logits[
                            :, action_idx[i] : action_idx[i + 1]
                        ],
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
        ),
        axis=1,
        keepdims=True,
    )
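
# The entropy computation above relies on the identity
#   H(p) = -sum_i p_i * log(p_i) = cross_entropy(labels=softmax(l), logits=l),
# i.e. the cross-entropy of a softmax distribution with itself is its entropy.
# A standalone numpy check (illustrative only):
#
#   import numpy as np
#   logits = np.array([1.0, 2.0, 0.5])
#   p = np.exp(logits) / np.exp(logits).sum()
#   entropy = -(p * np.log(p)).sum()  # equals the cross-entropy of p with itself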
def _create_cc_actor(
    self,
    encoded: tf.Tensor,
    tanh_squash: bool = False,
    reparameterize: bool = False,
    condition_sigma_on_obs: bool = True,
) -> None:
    """
    Creates Continuous control actor (policy) network.
    :param encoded: Encoded observation tensor the policy is built on.
    :param tanh_squash: Whether to use a tanh function, or a clipped output.
    :param reparameterize: Whether we are using the resampling trick to update the policy.
    :param condition_sigma_on_obs: Whether the standard deviation is conditioned on the observation.
    """
    if self.use_recurrent:
        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
            encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
        )
        self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
    else:
        hidden_policy = encoded

    with tf.variable_scope("policy"):
        mu = tf.layers.dense(
            hidden_policy,
            self.act_size[0],
            activation=None,
            name="mu",
            kernel_initializer=ModelUtils.scaled_init(0.01),
            reuse=tf.AUTO_REUSE,
        )

        # Policy-dependent log_sigma
        if condition_sigma_on_obs:
            log_sigma = tf.layers.dense(
                hidden_policy,
                self.act_size[0],
                activation=None,
                name="log_sigma",
                kernel_initializer=ModelUtils.scaled_init(0.01),
            )
        else:
            log_sigma = tf.get_variable(
                "log_sigma",
                [self.act_size[0]],
                dtype=tf.float32,
                initializer=tf.zeros_initializer(),
            )

        log_sigma = tf.clip_by_value(log_sigma, self.log_std_min, self.log_std_max)

        sigma = tf.exp(log_sigma)

        epsilon = tf.random_normal(tf.shape(mu))

        sampled_policy = mu + sigma * epsilon

        # Stop gradient if we're not doing the resampling trick
        if not reparameterize:
            sampled_policy_probs = tf.stop_gradient(sampled_policy)
        else:
            sampled_policy_probs = sampled_policy

        # Compute probability of model output.
        _gauss_pre = -0.5 * (
            ((sampled_policy_probs - mu) / (sigma + EPSILON)) ** 2
            + 2 * log_sigma
            + np.log(2 * np.pi)
        )
        all_probs = tf.reduce_sum(_gauss_pre, axis=1, keepdims=True)

    if tanh_squash:
        self.output_pre = tf.tanh(sampled_policy)

        # Squash correction
        all_probs -= tf.reduce_sum(
            tf.log(1 - self.output_pre ** 2 + EPSILON), axis=1, keepdims=True
        )
        self.output = tf.identity(self.output_pre, name="action")
    else:
        self.output_pre = sampled_policy
        # Clip and scale output to ensure actions are always within [-1, 1] range.
        output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
        self.output = tf.identity(output_post, name="action")

    self.selected_actions = tf.stop_gradient(self.output)

    self.all_log_probs = tf.identity(all_probs, name="action_probs")

    single_dim_entropy = 0.5 * tf.reduce_mean(
        tf.log(2 * np.pi * np.e) + 2 * log_sigma
    )
    # Make entropy the right shape
    self.entropy = tf.ones_like(tf.reshape(mu[:, 0], [-1])) * single_dim_entropy

    # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
    self.log_probs = tf.reduce_sum(
        (tf.identity(self.all_log_probs)), axis=1, keepdims=True
    )
    self.action_holder = tf.placeholder(
        shape=[None, self.act_size[0]], dtype=tf.float32, name="action_holder"
    )
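
# The "squash correction" above is the change-of-variables term for a = tanh(u):
#   log pi(a) = log N(u; mu, sigma) - sum_i log(1 - tanh(u_i)^2 + EPSILON)
# Without it, all_probs would be the density of the pre-squash Gaussian sample
# rather than of the bounded action actually taken.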