def __init__(
    self, m_size, normalize, use_recurrent, brain, seed, stream_names=None
):
    """
    Seeds TensorFlow and creates the placeholders and bookkeeping variables
    shared by models built on this class.
    :param m_size: Memory size; stored only when use_recurrent is truthy, otherwise 0 is stored.
    :param normalize: Normalization flag, stored as self.normalize.
    :param use_recurrent: Whether the model is recurrent.
    :param brain: Object exposing the action/observation space attributes read below.
    :param seed: Value passed to tf.set_random_seed.
    :param stream_names: Optional list of stream names; an empty list is stored when falsy.
    """
    tf.set_random_seed(seed)

    # Plain attribute bookkeeping (creates no graph nodes).
    self.brain = brain
    self.vector_in = None
    self.visual_in = []
    self.stream_names = stream_names or []
    self.use_recurrent = use_recurrent
    self.m_size = m_size if self.use_recurrent else 0
    self.normalize = normalize
    self.act_size = brain.vector_action_space_size
    self.vec_obs_size = brain.vector_observation_space_size
    self.vis_obs_size = brain.number_visual_observations

    # Graph nodes, created in the same order as before.
    step_tensors = self.create_global_steps()
    self.global_step, self.increment_step, self.steps_to_increment = step_tensors
    self.batch_size = tf.placeholder(shape=None, dtype=tf.int32, name="batch_size")
    self.sequence_length = tf.placeholder(
        shape=None, dtype=tf.int32, name="sequence_length"
    )
    self.mask_input = tf.placeholder(shape=[None], dtype=tf.float32, name="masks")
    self.mask = tf.cast(self.mask_input, tf.int32)

    # Non-trainable variables that record model metadata in the graph.
    is_continuous = brain.vector_action_space_type == "continuous"
    tf.Variable(
        int(is_continuous),
        name="is_continuous_control",
        trainable=False,
        dtype=tf.int32,
    )
    tf.Variable(
        self._version_number_,
        name="version_number",
        trainable=False,
        dtype=tf.int32,
    )
    tf.Variable(self.m_size, name="memory_size", trainable=False, dtype=tf.int32)
    # Continuous control reports the first action size; discrete reports the
    # sum over all branches.
    action_output_shape = self.act_size[0] if is_continuous else sum(self.act_size)
    tf.Variable(
        action_output_shape,
        name="action_output_shape",
        trainable=False,
        dtype=tf.int32,
    )

    self.value_heads: Dict[str, tf.Tensor] = {}
    self.normalization_steps: Optional[tf.Variable] = None
    self.running_mean: Optional[tf.Variable] = None
    self.running_variance: Optional[tf.Variable] = None
    self.update_normalization: Optional[tf.Operation] = None
    self.value: Optional[tf.Tensor] = None
def _create_dc_actor(self, encoded: tf.Tensor) -> None:
    """
    Builds the discrete-control actor on top of an already encoded observation.
    :param encoded: Encoded observation tensor fed to the policy (after the
        optional recurrent layer).
    """
    if not self.use_recurrent:
        hidden_policy = encoded
    else:
        self.prev_action = tf.placeholder(
            shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
        )
        # One-hot encode the previous action of every branch and append it to
        # the encoding before it goes through the recurrent layer.
        one_hot_branches = [
            tf.one_hot(self.prev_action[:, branch], branch_size)
            for branch, branch_size in enumerate(self.act_size)
        ]
        prev_action_oh = tf.concat(one_hot_branches, axis=1)
        recurrent_input = tf.concat([encoded, prev_action_oh], axis=1)

        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
            recurrent_input,
            self.memory_in,
            self.sequence_length_ph,
            name="lstm_policy",
        )
        self.memory_out = tf.identity(memory_policy_out, "recurrent_out")

    self.action_masks = tf.placeholder(
        shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
    )

    with tf.variable_scope("policy"):
        distribution = MultiCategoricalDistribution(
            hidden_policy, self.act_size, self.action_masks
        )

    # It's important that we are able to feed_dict a value into this tensor to get
    # the right one-hot encoding, so we can't do identity on it.
    self.output = distribution.sample
    self.all_log_probs = tf.identity(distribution.log_probs, name="action")
    # In discrete, these are onehot
    self.selected_actions = tf.stop_gradient(distribution.sample_onehot)
    self.entropy = distribution.entropy
    self.total_log_probs = distribution.total_log_probs
def create_input_placeholders(
    observation_shapes: List[Tuple], name_prefix: str = ""
) -> Tuple[tf.Tensor, List[tf.Tensor]]:
    """
    Creates the observation placeholders for a list of observation shapes.
    Every 3D shape gets its own visual placeholder; the first dimensions of all
    1D shapes are summed into the width of a single vector placeholder.
    :param observation_shapes: A List of tuples describing each observation.
        Only 1D (vector) and 3D (height, width, channels) tuples are accepted.
    :param name_prefix: Prefix prepended to every placeholder name, so that
        several placeholder sets can coexist without name conflicts.
    :returns: A tuple of (vector placeholder, list of visual placeholders).
    :raises UnityTrainerException: If a shape is neither 1D nor 3D.
    """
    visual_in: List[tf.Tensor] = []
    vector_in_size = 0

    for i, dimension in enumerate(observation_shapes):
        rank = len(dimension)
        if rank == 1:
            vector_in_size += dimension[0]
            continue
        if rank != 3:
            raise UnityTrainerException(
                f"Unsupported shape of {dimension} for observation {i}"
            )
        _res = Tensor3DShape(
            height=dimension[0], width=dimension[1], num_channels=dimension[2]
        )
        visual_in.append(
            ModelUtils.create_visual_input(
                _res, name=name_prefix + "visual_observation_" + str(i)
            )
        )

    vector_in = tf.placeholder(
        shape=[None, vector_in_size],
        dtype=tf.float32,
        name=name_prefix + "vector_observation",
    )
    return vector_in, visual_in
def create_network(self) -> None:
    """
    Builds the nodes that produce the intrinsic reward: the optional VAIL
    tensors, the expert and policy encoder estimates, and the reward itself.
    """
    if self.use_vail:
        self.z_sigma = tf.get_variable(
            "gail_sigma_vail",
            self.z_size,
            dtype=tf.float32,
            initializer=tf.ones_initializer(),
        )
        self.z_sigma_sq = self.z_sigma * self.z_sigma
        # EPSILON is added before taking the log.
        self.z_log_sigma_sq = tf.log(self.z_sigma_sq + EPSILON)
        self.use_noise = tf.placeholder(
            shape=[1], dtype=tf.float32, name="gail_NoiseLevel"
        )

    # The expert call is made with reuse=False; the policy call below passes
    # reuse=True.
    expert_estimate, z_mean_expert, _ = self.create_encoder(
        self.encoded_expert, self.expert_action, self.done_expert, reuse=False
    )
    self.expert_estimate = expert_estimate
    self.z_mean_expert = z_mean_expert

    policy_estimate, z_mean_policy, _ = self.create_encoder(
        self.encoded_policy,
        self.policy.selected_actions,
        self.done_policy,
        reuse=True,
    )
    self.policy_estimate = policy_estimate
    self.z_mean_policy = z_mean_policy

    self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)
    self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)

    self.discriminator_score = tf.reshape(
        self.policy_estimate, [-1], name="gail_reward"
    )
    self.intrinsic_reward = -tf.log(1.0 - self.discriminator_score + EPSILON)
def _create_inputs_and_outputs(self) -> None:
    """
    Exposes, on this object, the input and output tensors owned by the policy,
    the policy network and the target network.
    """
    policy = self.policy
    policy_net = self.policy_network
    target_net = self.target_network

    # Current-step inputs come from the policy, next-step inputs from the
    # target network.
    self.vector_in = policy.vector_in
    self.visual_in = policy.visual_in
    self.next_vector_in = target_net.vector_in
    self.next_visual_in = target_net.visual_in
    self.sequence_length_ph = policy.sequence_length_ph
    self.next_sequence_length_ph = target_net.sequence_length_ph

    if policy.use_continuous_act:
        self.output_pre = policy_net.output_pre
    else:
        self.action_masks = policy_net.action_masks

    # Don't use value estimate during inference.
    self.value = tf.identity(policy_net.value, name="value_estimate_unused")
    self.value_heads = policy_net.value_heads
    self.dones_holder = tf.placeholder(
        shape=[None], dtype=tf.float32, name="dones_holder"
    )

    if policy.use_recurrent:
        self.memory_in = policy_net.memory_in
        self.memory_out = policy_net.memory_out
        if not policy.use_continuous_act:
            self.prev_action = policy_net.prev_action
        self.next_memory_in = target_net.memory_in
def init_load_weights(self):
    """
    Creates one placeholder and one assign op for every global variable in
    self.graph, appending them to self.assign_phs and self.assign_ops.
    Each placeholder takes the dtype of its variable and the shape of the
    variable's current value in self.sess.
    """
    with self.graph.as_default():
        _vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        # Fetch every variable's value in a single session call instead of one
        # eval (one session round trip) per variable.
        values = self.sess.run(_vars)
        for var, value in zip(_vars, values):
            assign_ph = tf.placeholder(var.dtype, shape=value.shape)
            self.assign_phs.append(assign_ph)
            self.assign_ops.append(tf.assign(var, assign_ph))
def _create_dc_critic(
    self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
    """
    Builds the value network for discrete control, together with the
    placeholder and tensors for the old action log probabilities.
    :param h_size: Size of hidden linear layers.
    :param num_layers: Number of hidden linear layers.
    :param vis_encode_type: The type of visual encoder to use.
    """
    policy = self.policy

    observation_streams = ModelUtils.create_observation_streams(
        policy.visual_in,
        policy.processed_vector_in,
        1,
        h_size,
        num_layers,
        vis_encode_type,
    )
    hidden_stream = observation_streams[0]

    if policy.use_recurrent:
        hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
            hidden_stream,
            self.memory_in,
            policy.sequence_length_ph,
            name="lstm_value",
        )
        self.memory_out = memory_value_out
    else:
        hidden_value = hidden_stream

    self.value_heads, self.value = ModelUtils.create_value_heads(
        self.stream_names, hidden_value
    )

    self.all_old_log_probs = tf.placeholder(
        shape=[None, sum(policy.act_size)],
        dtype=tf.float32,
        name="old_probabilities",
    )
    _, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer(
        self.all_old_log_probs, policy.action_masks, policy.act_size
    )

    # Column boundaries of each action branch inside the concatenated tensors.
    branch_bounds = [0] + list(np.cumsum(policy.act_size))
    per_branch_log_probs = []
    for start, end in zip(branch_bounds[:-1], branch_bounds[1:]):
        per_branch_log_probs.append(
            -tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=policy.selected_actions[:, start:end],
                logits=old_normalized_logits[:, start:end],
            )
        )

    # Sum the per-branch terms into one column per row.
    stacked_log_probs = tf.stack(per_branch_log_probs, axis=1)
    self.old_log_probs = tf.reduce_sum(stacked_log_probs, axis=1, keepdims=True)
def create_inputs_and_outputs(self):
    """
    Exposes, on this model, the input and output tensors owned by its policy
    network and target network.
    """
    policy_net = self.policy_network
    target_net = self.target_network
    is_discrete = self.brain.vector_action_space_type == "discrete"

    self.vector_in = policy_net.vector_in
    self.visual_in = policy_net.visual_in
    self.next_vector_in = target_net.vector_in
    self.next_visual_in = target_net.visual_in
    self.action_holder = policy_net.action_holder
    self.sequence_length = policy_net.sequence_length
    self.next_sequence_length = target_net.sequence_length

    if is_discrete:
        self.action_masks = policy_net.action_masks
    else:
        self.output_pre = policy_net.output_pre

    self.output = policy_net.output
    # Don't use value estimate during inference. TODO: Check why PPO uses value_estimate in inference.
    self.value = tf.identity(policy_net.value, name="value_estimate_unused")
    self.value_heads = policy_net.value_heads
    self.all_log_probs = policy_net.all_log_probs
    self.dones_holder = tf.placeholder(
        shape=[None], dtype=tf.float32, name="dones_holder"
    )
    # This is just a dummy to get pretraining to work. PPO has this but SAC doesn't.
    # TODO: Proper input and output specs for models
    self.epsilon = tf.placeholder(
        shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
    )

    if not self.use_recurrent:
        return

    self.memory_in = policy_net.memory_in
    self.memory_out = policy_net.memory_out
    # For Barracuda
    self.inference_memory_out = tf.identity(
        policy_net.policy_memory_out, name="recurrent_out"
    )
    if is_discrete:
        self.prev_action = policy_net.prev_action
    self.next_memory_in = target_net.memory_in
def __init__(
    self,
    policy=None,
    m_size=None,
    h_size=128,
    normalize=False,
    use_recurrent=False,
    num_layers=2,
    stream_names=None,
    vis_encode_type=EncoderType.SIMPLE,
):
    """
    Stores the network settings and declares, as None, every tensor attribute
    that is expected to be filled in later.
    :param policy: Policy object, stored as self.policy.
    :param m_size: Accepted for interface compatibility; not read in this body.
    :param h_size: Size of hidden layers, stored as self.h_size.
    :param normalize: Normalization flag, stored as self.normalize.
    :param use_recurrent: Whether the network is recurrent.
    :param num_layers: Number of hidden layers, stored as self.num_layers.
    :param stream_names: Names of the reward streams.
    :param vis_encode_type: Accepted for interface compatibility; not read in this body.
    """
    self.normalize = normalize
    self.use_recurrent = use_recurrent
    self.num_layers = num_layers
    self.stream_names = stream_names
    self.h_size = h_size
    self.activ_fn = ModelUtils.swish

    self.sequence_length_ph = tf.placeholder(
        shape=None, dtype=tf.int32, name="sac_sequence_length"
    )

    # Recurrent memory tensors.
    self.policy_memory_in: Optional[tf.Tensor] = None
    self.policy_memory_out: Optional[tf.Tensor] = None
    self.value_memory_in: Optional[tf.Tensor] = None
    self.value_memory_out: Optional[tf.Tensor] = None
    self.q1: Optional[tf.Tensor] = None
    self.q2: Optional[tf.Tensor] = None
    self.q1_p: Optional[tf.Tensor] = None
    self.q2_p: Optional[tf.Tensor] = None
    self.q1_memory_in: Optional[tf.Tensor] = None
    self.q2_memory_in: Optional[tf.Tensor] = None
    self.q1_memory_out: Optional[tf.Tensor] = None
    self.q2_memory_out: Optional[tf.Tensor] = None

    # Action-related tensors.
    self.prev_action: Optional[tf.Tensor] = None
    self.action_masks: Optional[tf.Tensor] = None
    self.external_action_in: Optional[tf.Tensor] = None
    self.log_sigma_sq: Optional[tf.Tensor] = None
    self.entropy: Optional[tf.Tensor] = None
    self.deterministic_output: Optional[tf.Tensor] = None
    self.normalized_logprobs: Optional[tf.Tensor] = None
    self.action_probs: Optional[tf.Tensor] = None
    self.output_oh: Optional[tf.Tensor] = None
    self.output_pre: Optional[tf.Tensor] = None

    # Variable collections.
    self.value_vars = None
    self.q_vars = None
    self.critic_vars = None
    self.policy_vars = None

    # These start as None, so they are annotated Optional like every other
    # attribute declared above.
    self.q1_heads: Optional[Dict[str, tf.Tensor]] = None
    self.q2_heads: Optional[Dict[str, tf.Tensor]] = None
    self.q1_pheads: Optional[Dict[str, tf.Tensor]] = None
    self.q2_pheads: Optional[Dict[str, tf.Tensor]] = None

    self.policy = policy
def _create_cc_actor(
    self,
    encoded: tf.Tensor,
    tanh_squash: bool = False,
    reparameterize: bool = False,
    condition_sigma_on_obs: bool = True,
) -> None:
    """
    Builds the continuous-control actor on top of an already encoded observation.
    :param encoded: Encoded observation tensor fed to the policy (after the
        optional recurrent layer).
    :param tanh_squash: Passed to the Gaussian distribution; when False, the
        sampled action is clipped to [-3, 3] and divided by 3 instead.
    :param reparameterize: Passed to the Gaussian distribution.
    :param condition_sigma_on_obs: Passed to the Gaussian distribution as
        condition_sigma.
    """
    hidden_policy = encoded
    if self.use_recurrent:
        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
            encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
        )
        self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")

    with tf.variable_scope("policy"):
        distribution = GaussianDistribution(
            hidden_policy,
            self.act_size,
            reparameterize=reparameterize,
            tanh_squash=tanh_squash,
            condition_sigma=condition_sigma_on_obs,
        )

    self.output_pre = distribution.sample
    if tanh_squash:
        action_tensor = self.output_pre
    else:
        # Clip and scale output to ensure actions are always within [-1, 1] range.
        action_tensor = tf.clip_by_value(self.output_pre, -3, 3) / 3
    self.output = tf.identity(action_tensor, name="action")

    self.selected_actions = tf.stop_gradient(self.output)
    self.all_log_probs = tf.identity(distribution.log_probs, name="action_probs")
    self.entropy = distribution.entropy
    # We keep these tensors the same name, but use new nodes to keep code
    # parallelism with discrete control.
    self.total_log_probs = distribution.total_log_probs
def create_global_steps():
    """
    Creates the TF nodes that hold and advance the global training step.
    :return: Tuple of (global step variable, increment op, placeholder for the
        number of steps to add).
    """
    global_step = tf.Variable(0, name="global_step", trainable=False, dtype=tf.int32)
    steps_to_increment = tf.placeholder(
        shape=[], dtype=tf.int32, name="steps_to_increment"
    )
    # Running increment_step adds the fed step count to the stored step.
    advanced_step = tf.add(global_step, steps_to_increment)
    increment_step = tf.assign(global_step, advanced_step)
    return global_step, increment_step, steps_to_increment
def create_vector_input(
    vec_obs_size: int, name: str = "vector_observation"
) -> tf.Tensor:
    """
    Creates the float32 placeholder for vector observations.
    :param vec_obs_size: Width of the placeholder (second dimension).
    :param name: Name of the placeholder op.
    :return: Placeholder of shape [None, vec_obs_size].
    """
    return tf.placeholder(shape=[None, vec_obs_size], dtype=tf.float32, name=name)
def make_inputs(self) -> None:
    """
    Creates the discriminator's input placeholders: the expert and policy
    "done" flags and the expert actions.
    """
    self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32)
    self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32)

    if not self.policy.behavior_spec.action_spec.is_continuous():
        # Discrete: one integer per branch, expanded to concatenated one-hots.
        action_length = len(self.policy.act_size)
        self.action_in_expert = tf.placeholder(
            shape=[None, action_length], dtype=tf.int32
        )
        one_hot_branches = [
            tf.one_hot(self.action_in_expert[:, i], act_size)
            for i, act_size in enumerate(self.policy.act_size)
        ]
        self.expert_action = tf.concat(one_hot_branches, axis=1)
        return

    # Continuous: the expert action is fed directly as floats.
    action_length = self.policy.act_size[0]
    self.action_in_expert = tf.placeholder(
        shape=[None, action_length], dtype=tf.float32
    )
    self.expert_action = tf.identity(self.action_in_expert)
def create_vector_input(self, name="vector_observation"):
    """
    Creates the vector observation placeholder and stores it in self.vector_in.
    :param name: Name of the placeholder op.
    :return: The placeholder itself, or the result of normalize_vector_obs on
        it when self.normalize is set.
    """
    self.vector_in = tf.placeholder(
        shape=[None, self.vec_obs_size], dtype=tf.float32, name=name
    )
    if not self.normalize:
        return self.vector_in

    # The normalizer is created first, then applied to the placeholder.
    self.create_normalizer(self.vector_in)
    return self.normalize_vector_obs(self.vector_in)
def create_visual_input(camera_parameters: Tensor3DShape, name: str) -> tf.Tensor:
    """
    Creates the float32 placeholder for one visual observation.
    :param camera_parameters: Shape of the observation (height, width, num_channels).
    :param name: Desired name of the input op.
    :return: Placeholder of shape [None, height, width, num_channels].
    """
    input_shape = [
        None,
        camera_parameters.height,
        camera_parameters.width,
        camera_parameters.num_channels,
    ]
    return tf.placeholder(shape=input_shape, dtype=tf.float32, name=name)
def __init__(
    self,
    brain,
    m_size=None,
    h_size=128,
    normalize=False,
    use_recurrent=False,
    num_layers=2,
    stream_names=None,
    seed=0,
    vis_encode_type=EncoderType.SIMPLE,
):
    """
    Builds the target network: observation streams created under TARGET_SCOPE
    and a critic built with create_qs=False.
    All parameters are forwarded positionally to the parent constructor.
    """
    super().__init__(
        brain,
        m_size,
        h_size,
        normalize,
        use_recurrent,
        num_layers,
        stream_names,
        seed,
        vis_encode_type,
    )

    if self.use_recurrent:
        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        self.value_memory_in = self.memory_in

    with tf.variable_scope(TARGET_SCOPE):
        hidden_streams = self.create_observation_streams(
            1,
            self.h_size,
            0,
            vis_encode_type=vis_encode_type,
            stream_scopes=["critic/value/"],
        )

    # The critic builders receive the scope name as an argument, so they are
    # called outside the variable_scope block above.
    if brain.vector_action_space_type == "continuous":
        build_critic = self.create_cc_critic
    else:
        build_critic = self.create_dc_critic
    build_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)

    if self.use_recurrent:
        # Needed for Barracuda to work
        self.memory_out = tf.concat(self.value_memory_out, axis=1)
def _create_memory_ins(self, m_size):
    """
    Creates the combined LSTM memory placeholder and slices it into the value,
    Q1 and Q2 memory inputs.
    :param m_size: Memory size of a single network; the placeholder is three
        times this wide.
    """
    self.memory_in = tf.placeholder(
        shape=[None, m_size * 3], dtype=tf.float32, name="value_recurrent_in"
    )

    # Re-break-up for each network
    num_mems = 3
    total_width = self.memory_in.get_shape().as_list()[1]
    chunk = total_width // num_mems
    mem_ins = [
        self.memory_in[:, chunk * k : chunk * (k + 1)] for k in range(num_mems)
    ]
    self.value_memory_in, self.q1_memory_in, self.q2_memory_in = mem_ins
def _create_cc_critic(
    self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
    """
    Builds the value network for continuous control, together with the
    placeholder and tensor for the old action log probabilities.
    :param h_size: Size of hidden linear layers.
    :param num_layers: Number of hidden linear layers.
    :param vis_encode_type: The type of visual encoder to use.
    """
    policy = self.policy

    observation_streams = ModelUtils.create_observation_streams(
        policy.visual_in,
        policy.processed_vector_in,
        1,
        h_size,
        num_layers,
        vis_encode_type,
    )
    hidden_stream = observation_streams[0]

    if policy.use_recurrent:
        hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
            hidden_stream,
            self.memory_in,
            policy.sequence_length_ph,
            name="lstm_value",
        )
        self.memory_out = memory_value_out
    else:
        hidden_value = hidden_stream

    self.value_heads, self.value = ModelUtils.create_value_heads(
        self.stream_names, hidden_value
    )

    self.all_old_log_probs = tf.placeholder(
        shape=[None, sum(policy.act_size)],
        dtype=tf.float32,
        name="old_probabilities",
    )
    # Sum the fed log probabilities across columns, keeping one column per row.
    old_log_probs_copy = tf.identity(self.all_old_log_probs)
    self.old_log_probs = tf.reduce_sum(old_log_probs_copy, axis=1, keepdims=True)
def create_memory_ins(self, m_size):
    """
    Creates the LSTM memory inputs and splits them into four equal slices:
    value, Q1, Q2 and policy.
    :param m_size: The total size of the memory (assumed divisible by 4).
    """
    quarter = m_size // 4  # We assume m_size is divisible by 4

    # The Policy input is created separately from the rest, so that in
    # inference only the Policy network has to be run. Barracuda will grab the
    # recurrent_in and recurrent_out named tensors.
    self.inference_memory_in = tf.placeholder(
        shape=[None, quarter], dtype=tf.float32, name="recurrent_in"
    )

    # The non-Policy inputs use a default placeholder so nothing has to be
    # provided during Barracuda inference. The default value is just the tiled
    # policy input, which is thrown away.
    three_fourths_m_size = m_size * 3 // 4
    tiled_policy_memory = tf.tile(self.inference_memory_in, [1, 3])
    self.other_memory_in = tf.placeholder_with_default(
        input=tiled_policy_memory,
        shape=[None, three_fourths_m_size],
        name="other_recurrent_in",
    )

    # Concat and use this as the "placeholder" for training
    self.memory_in = tf.concat(
        [self.other_memory_in, self.inference_memory_in], axis=1
    )

    # Re-break-up for each network
    num_mems = 4
    chunk = m_size // num_mems
    mem_ins = [
        self.memory_in[:, chunk * k : chunk * (k + 1)] for k in range(num_mems)
    ]
    (
        self.value_memory_in,
        self.q1_memory_in,
        self.q2_memory_in,
        self.policy_memory_in,
    ) = mem_ins
def _create_cc_critic(self, hidden_value, scope, create_qs=True):
    """
    Builds the critic: the value head and, optionally, the Q heads.
    :param hidden_value: Encoded observation tensor fed to the heads.
    :param scope: Parent scope name; the critic lives under "<scope>/critic".
    :param create_qs: Whether to also build the Q heads.
    """
    scope = self.join_scopes(scope, "critic")
    value_scope = self.join_scopes(scope, "value")

    self.create_sac_value_head(
        self.stream_names, hidden_value, self.num_layers, self.h_size, value_scope
    )
    self.external_action_in = tf.placeholder(
        shape=[None, self.policy.act_size[0]],
        dtype=tf.float32,
        name="external_action_in",
    )
    self.value_vars = self.get_vars(value_scope)

    if create_qs:
        q_scope = self.join_scopes(scope, "q")
        # One Q input uses the externally fed action, the other the policy's
        # own output; the second set of heads is built with reuse=True in the
        # same scope.
        hidden_q = tf.concat([hidden_value, self.external_action_in], axis=-1)
        hidden_qp = tf.concat([hidden_value, self.policy.output], axis=-1)

        q_heads = self.create_q_heads(
            self.stream_names, hidden_q, self.num_layers, self.h_size, q_scope
        )
        self.q1_heads, self.q2_heads, self.q1, self.q2 = q_heads

        q_policy_heads = self.create_q_heads(
            self.stream_names,
            hidden_qp,
            self.num_layers,
            self.h_size,
            q_scope,
            reuse=True,
        )
        self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = q_policy_heads

        self.q_vars = self.get_vars(q_scope)

    self.critic_vars = self.get_vars(scope)
def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
    """
    Wraps a policy in a PPO optimizer: builds the critic, the learning-rate
    schedule, the losses and the optimizer ops inside the policy's graph.
    :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
    :param trainer_params: Trainer settings holding the hyperparameters and
        max_steps read below.
    """
    # Create the graph here to give more granular control of the TF graph to the Optimizer.
    policy.create_tf_graph()

    with policy.graph.as_default():
        with tf.variable_scope("optimizer/"):
            super().__init__(policy, trainer_params)

            hyperparameters: PPOSettings = cast(
                PPOSettings, trainer_params.hyperparameters
            )
            lr = float(hyperparameters.learning_rate)
            self._schedule = hyperparameters.learning_rate_schedule
            epsilon = float(hyperparameters.epsilon)
            beta = float(hyperparameters.beta)
            max_step = float(trainer_params.max_steps)

            network_settings = policy.network_settings
            h_size = int(network_settings.hidden_units)
            # At least one hidden layer is always used.
            num_layers = max(network_settings.num_layers, 1)
            vis_encode_type = network_settings.vis_encode_type

            self.burn_in_ratio = 0.0
            self.stream_names = list(self.reward_signals.keys())
            self.tf_optimizer_op: Optional[tf.train.Optimizer] = None
            self.grads = None
            self.update_batch: Optional[tf.Operation] = None

            # Maps reported stat names to keys of the update dictionary.
            self.stats_name_to_update_name = {
                "Losses/Value Loss": "value_loss",
                "Losses/Policy Loss": "policy_loss",
                "Policy/Learning Rate": "learning_rate",
                "Policy/Epsilon": "decay_epsilon",
                "Policy/Beta": "decay_beta",
            }

            if self.policy.use_recurrent:
                self.m_size = self.policy.m_size
                self.memory_in = tf.placeholder(
                    shape=[None, self.m_size],
                    dtype=tf.float32,
                    name="recurrent_value_in",
                )

            if policy.use_continuous_act:
                build_critic = self._create_cc_critic
            else:
                build_critic = self._create_dc_critic
            build_critic(h_size, num_layers, vis_encode_type)

            self.learning_rate = ModelUtils.create_schedule(
                self._schedule,
                lr,
                self.policy.global_step,
                int(max_step),
                min_value=1e-10,
            )
            self._create_losses(
                self.policy.total_log_probs,
                self.old_log_probs,
                self.value_heads,
                self.policy.entropy,
                beta,
                epsilon,
                lr,
                max_step,
            )
            self._create_ppo_optimizer_ops()

        self.update_dict.update(
            value_loss=self.value_loss,
            policy_loss=self.abs_policy_loss,
            update_batch=self.update_batch,
            learning_rate=self.learning_rate,
            decay_epsilon=self.decay_epsilon,
            decay_beta=self.decay_beta,
        )
def create_input_placeholders(self):
    """
    Creates, inside self.graph, the policy's input placeholders, the optional
    observation normalizer, and the non-trainable variables that record model
    metadata (control type, versions, memory size, action output shape).
    """
    with self.graph.as_default():
        step_tensors = ModelUtils.create_global_steps()
        (
            self.global_step,
            self.increment_step_op,
            self.steps_to_increment,
        ) = step_tensors

        self.vector_in, self.visual_in = ModelUtils.create_input_placeholders(
            self.behavior_spec.observation_shapes
        )

        if not self.normalize:
            self.processed_vector_in = self.vector_in
            self.update_normalization_op = None
        else:
            self.first_normalization_update = True
            normalizer = ModelUtils.create_normalizer(self.vector_in)
            self.update_normalization_op = normalizer.update_op
            self.init_normalization_op = normalizer.init_op
            self.normalization_steps = normalizer.steps
            self.running_mean = normalizer.running_mean
            self.running_variance = normalizer.running_variance
            self.processed_vector_in = ModelUtils.normalize_vector_obs(
                self.vector_in,
                self.running_mean,
                self.running_variance,
                self.normalization_steps,
            )

        self.batch_size_ph = tf.placeholder(
            shape=None, dtype=tf.int32, name="batch_size"
        )
        self.sequence_length_ph = tf.placeholder(
            shape=None, dtype=tf.int32, name="sequence_length"
        )
        self.mask_input = tf.placeholder(
            shape=[None], dtype=tf.float32, name="masks"
        )
        # Only needed for PPO, but needed for BC module
        self.epsilon = tf.placeholder(
            shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
        )
        self.mask = tf.cast(self.mask_input, tf.int32)

        is_continuous = self.behavior_spec.is_action_continuous()
        tf.Variable(
            int(is_continuous),
            name="is_continuous_control",
            trainable=False,
            dtype=tf.int32,
        )

        # One variable per component of the trainer version.
        int_version = TFPolicy._convert_version_string(__version__)
        version_var_names = (
            "trainer_major_version",
            "trainer_minor_version",
            "trainer_patch_version",
        )
        self.version_tensors = tuple(
            tf.Variable(
                int_version[idx], name=var_name, trainable=False, dtype=tf.int32
            )
            for idx, var_name in enumerate(version_var_names)
        )

        tf.Variable(
            MODEL_FORMAT_VERSION,
            name="version_number",
            trainable=False,
            dtype=tf.int32,
        )
        tf.Variable(
            self.m_size, name="memory_size", trainable=False, dtype=tf.int32
        )

        # Continuous control reports the first action size; discrete reports
        # the sum over all branches.
        if is_continuous:
            action_output_shape = self.act_size[0]
        else:
            action_output_shape = sum(self.act_size)
        tf.Variable(
            action_output_shape,
            name="action_output_shape",
            trainable=False,
            dtype=tf.int32,
        )
def _create_losses(
    self,
    q1_streams: Dict[str, tf.Tensor],
    q2_streams: Dict[str, tf.Tensor],
    lr: tf.Tensor,
    max_step: int,
    stream_names: List[str],
    discrete: bool = False,
) -> None:
    """
    Creates training-specific Tensorflow ops for SAC models.
    Sets self.q1_loss, self.q2_loss, self.value_loss, self.total_value_loss,
    self.policy_loss, self.entropy_loss, self.ent_coef and self.entropy, plus one
    rewards placeholder per stream in self.rewards_holders.
    :param q1_streams: Q1 streams from policy network
    :param q2_streams: Q2 streams from policy network
    :param lr: Learning rate (not referenced in this body)
    :param max_step: Total number of training steps (not referenced in this body).
    :param stream_names: List of reward stream names.
    :param discrete: Whether or not to use discrete action losses.
    """
    # Target entropy: one value per action branch when discrete, a single
    # scalar when continuous.
    if discrete:
        self.target_entropy = [
            self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
            for i in self.act_size
        ]
        discrete_action_probs = tf.exp(self.policy.all_log_probs)
        # p * log(p) per action; summed per branch further below.
        per_action_entropy = discrete_action_probs * self.policy.all_log_probs
    else:
        self.target_entropy = (
            -1
            * self.continuous_target_entropy_scale
            * np.prod(self.act_size[0]).astype(np.float32))
    self.rewards_holders = {}
    self.min_policy_qs = {}
    # For each reward stream: the element-wise minimum of the two policy Q
    # estimates, and a placeholder for that stream's rewards.
    for name in stream_names:
        if discrete:
            # Weight each Q value by its action probability, sum within each
            # branch, then average across branches.
            _branched_mpq1 = ModelUtils.break_into_branches(
                self.policy_network.q1_pheads[name] * discrete_action_probs,
                self.act_size,
            )
            branched_mpq1 = tf.stack([
                tf.reduce_sum(_br, axis=1, keep_dims=True)
                for _br in _branched_mpq1
            ])
            _q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0)
            _branched_mpq2 = ModelUtils.break_into_branches(
                self.policy_network.q2_pheads[name] * discrete_action_probs,
                self.act_size,
            )
            branched_mpq2 = tf.stack([
                tf.reduce_sum(_br, axis=1, keep_dims=True)
                for _br in _branched_mpq2
            ])
            _q2_p_mean = tf.reduce_mean(branched_mpq2, axis=0)
            self.min_policy_qs[name] = tf.minimum(_q1_p_mean, _q2_p_mean)
        else:
            self.min_policy_qs[name] = tf.minimum(
                self.policy_network.q1_pheads[name],
                self.policy_network.q2_pheads[name],
            )
        rewards_holder = tf.placeholder(shape=[None], dtype=tf.float32, name=f"{name}_rewards")
        self.rewards_holders[name] = rewards_holder
    q1_losses = []
    q2_losses = []
    # Multiple q losses per stream
    expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
    for i, name in enumerate(stream_names):
        _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)
        # Backup target: reward plus the discounted target-network value. The
        # done flag only zeroes the bootstrap term for streams whose
        # use_dones_in_backup entry is non-zero. No gradient flows through it.
        q_backup = tf.stop_gradient(
            _expanded_rewards
            + (1.0 - self.use_dones_in_backup[name] * expanded_dones)
            * self.gammas[i]
            * self.target_network.value_heads[name])
        if discrete:
            # We need to break up the Q functions by branch, and update them individually.
            branched_q1_stream = ModelUtils.break_into_branches(
                self.policy.selected_actions * q1_streams[name], self.act_size)
            branched_q2_stream = ModelUtils.break_into_branches(
                self.policy.selected_actions * q2_streams[name], self.act_size)
            # Reduce each branch into scalar
            branched_q1_stream = [
                tf.reduce_sum(_branch, axis=1, keep_dims=True)
                for _branch in branched_q1_stream
            ]
            branched_q2_stream = [
                tf.reduce_sum(_branch, axis=1, keep_dims=True)
                for _branch in branched_q2_stream
            ]
            q1_stream = tf.reduce_mean(branched_q1_stream, axis=0)
            q2_stream = tf.reduce_mean(branched_q2_stream, axis=0)
        else:
            q1_stream = q1_streams[name]
            q2_stream = q2_streams[name]
        # Masked squared error between the backup target and each Q estimate.
        _q1_loss = 0.5 * tf.reduce_mean(
            tf.to_float(self.policy.mask)
            * tf.squared_difference(q_backup, q1_stream))
        _q2_loss = 0.5 * tf.reduce_mean(
            tf.to_float(self.policy.mask)
            * tf.squared_difference(q_backup, q2_stream))
        q1_losses.append(_q1_loss)
        q2_losses.append(_q2_loss)
    # Average the per-stream Q losses.
    self.q1_loss = tf.reduce_mean(q1_losses)
    self.q2_loss = tf.reduce_mean(q2_losses)
    # Learn entropy coefficient
    if discrete:
        # Create a log_ent_coef for each branch
        self.log_ent_coef = tf.get_variable(
            "log_ent_coef",
            dtype=tf.float32,
            initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(np.float32),
            trainable=True,
        )
    else:
        self.log_ent_coef = tf.get_variable(
            "log_ent_coef",
            dtype=tf.float32,
            initializer=np.log(self.init_entcoef).astype(np.float32),
            trainable=True,
        )
    # The coefficient is stored as its log and exponentiated here.
    self.ent_coef = tf.exp(self.log_ent_coef)
    if discrete:
        # We also have to do a different entropy and target_entropy per branch.
        branched_per_action_ent = ModelUtils.break_into_branches(
            per_action_entropy, self.act_size)
        branched_ent_sums = tf.stack(
            [
                tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
                for _lp, _te in zip(branched_per_action_ent, self.target_entropy)
            ],
            axis=1,
        )
        # The stop_gradient means only log_ent_coef receives gradient from
        # this loss.
        self.entropy_loss = -tf.reduce_mean(
            tf.to_float(self.policy.mask)
            * tf.reduce_mean(
                self.log_ent_coef
                * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
                axis=1,
            ))
        # Same with policy loss, we have to do the loss per branch and average them,
        # so that larger branches don't get more weight.
        # The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q
        branched_q_term = ModelUtils.break_into_branches(
            discrete_action_probs * self.policy_network.q1_p, self.act_size)
        branched_policy_loss = tf.stack([
            tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True)
            for i, (_lp, _qt) in enumerate(
                zip(branched_per_action_ent, branched_q_term))
        ])
        self.policy_loss = tf.reduce_mean(
            tf.to_float(self.policy.mask) * tf.squeeze(branched_policy_loss))
        # Do vbackup entropy bonus per branch as well.
        branched_ent_bonus = tf.stack([
            tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
            for i, _lp in enumerate(branched_per_action_ent)
        ])
        value_losses = []
        for name in stream_names:
            # Value target: minimum policy Q minus the branch-averaged entropy
            # term; no gradient flows through the target.
            v_backup = tf.stop_gradient(
                self.min_policy_qs[name]
                - tf.reduce_mean(branched_ent_bonus, axis=0))
            value_losses.append(0.5 * tf.reduce_mean(
                tf.to_float(self.policy.mask)
                * tf.squared_difference(
                    self.policy_network.value_heads[name], v_backup)))
    else:
        # The stop_gradient means only log_ent_coef receives gradient from
        # this loss.
        self.entropy_loss = -tf.reduce_mean(
            self.log_ent_coef
            * tf.to_float(self.policy.mask)
            * tf.stop_gradient(
                tf.reduce_sum(
                    self.policy.all_log_probs + self.target_entropy,
                    axis=1,
                    keep_dims=True,
                )))
        batch_policy_loss = tf.reduce_mean(
            self.ent_coef * self.policy.all_log_probs - self.policy_network.q1_p,
            axis=1,
        )
        self.policy_loss = tf.reduce_mean(
            tf.to_float(self.policy.mask) * batch_policy_loss)
        value_losses = []
        for name in stream_names:
            # Value target: minimum policy Q minus the entropy term; no
            # gradient flows through the target.
            v_backup = tf.stop_gradient(
                self.min_policy_qs[name]
                - tf.reduce_sum(
                    self.ent_coef * self.policy.all_log_probs, axis=1))
            value_losses.append(0.5 * tf.reduce_mean(
                tf.to_float(self.policy.mask)
                * tf.squared_difference(
                    self.policy_network.value_heads[name], v_backup)))
    # Average the per-stream value losses and combine with the Q losses.
    self.value_loss = tf.reduce_mean(value_losses)
    self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss
    self.entropy = self.policy_network.entropy
def create_input_placeholders(self):
    """
    Builds the policy's input tensors and model metadata inside ``self.graph``.

    In order, this sets up: the global-step tensors, the visual and vector
    observation inputs (the vector input is normalized when ``self.normalize``
    is set), the batch-size / sequence-length / mask / epsilon placeholders,
    and a handful of non-trainable int32 variables describing the model.
    """

    def _int_constant(value, name):
        # Non-trainable int32 variable. The Python handle is intentionally
        # dropped; only the named graph variable is kept.
        tf.Variable(value, name=name, trainable=False, dtype=tf.int32)

    with self.graph.as_default():
        step_tensors = ModelUtils.create_global_steps()
        (
            self.global_step,
            self.increment_step_op,
            self.steps_to_increment,
        ) = step_tensors

        self.visual_in = ModelUtils.create_visual_input_placeholders(
            self.brain.camera_resolutions
        )
        self.vector_in = ModelUtils.create_vector_input(self.vec_obs_size)

        if not self.normalize:
            # No normalization: the raw vector input is used as-is.
            self.processed_vector_in = self.vector_in
            self.update_normalization_op = None
        else:
            normalizer = ModelUtils.create_normalizer(self.vector_in)
            self.update_normalization_op = normalizer.update_op
            self.normalization_steps = normalizer.steps
            self.running_mean = normalizer.running_mean
            self.running_variance = normalizer.running_variance
            self.processed_vector_in = ModelUtils.normalize_vector_obs(
                self.vector_in,
                self.running_mean,
                self.running_variance,
                self.normalization_steps,
            )

        self.batch_size_ph = tf.placeholder(
            shape=None, dtype=tf.int32, name="batch_size"
        )
        self.sequence_length_ph = tf.placeholder(
            shape=None, dtype=tf.int32, name="sequence_length"
        )
        self.mask_input = tf.placeholder(
            shape=[None], dtype=tf.float32, name="masks"
        )
        # Only needed for PPO, but needed for BC module
        self.epsilon = tf.placeholder(
            shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
        )
        # Integer copy of the float mask placeholder.
        self.mask = tf.cast(self.mask_input, tf.int32)

        is_continuous = self.brain.vector_action_space_type == "continuous"
        _int_constant(int(is_continuous), "is_continuous_control")
        _int_constant(self._version_number_, "version_number")
        _int_constant(self.m_size, "memory_size")
        if is_continuous:
            # Continuous control: the first entry of act_size.
            _int_constant(self.act_size[0], "action_output_shape")
        else:
            # Discrete control: the sum of all branch sizes.
            _int_constant(sum(self.act_size), "action_output_shape")
def make_inputs(self) -> None:
    """
    Builds the discriminator's input tensors for expert and policy data.

    Creates placeholders for the done flags and for the expert's actions and
    observations, encodes visual observations for both streams, and stores
    the concatenated features in ``self.encoded_expert`` and
    ``self.encoded_policy``.
    """
    policy = self.policy

    self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
    self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
    # Append a trailing axis to each done flag.
    self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
    self.done_policy = tf.expand_dims(self.done_policy_holder, -1)

    if policy.brain.vector_action_space_type != "continuous":
        # Discrete actions: one integer column per branch, expanded into a
        # concatenation of per-branch one-hot vectors.
        self.action_in_expert = tf.placeholder(
            shape=[None, len(policy.act_size)], dtype=tf.int32
        )
        one_hot_branches = []
        for branch, branch_size in enumerate(policy.act_size):
            one_hot_branches.append(
                tf.one_hot(self.action_in_expert[:, branch], branch_size)
            )
        self.expert_action = tf.concat(one_hot_branches, axis=1)
    else:
        self.action_in_expert = tf.placeholder(
            shape=[None, policy.act_size[0]], dtype=tf.float32
        )
        self.expert_action = tf.identity(self.action_in_expert)

    policy_features = []
    expert_features = []

    if policy.vec_obs_size > 0:
        self.obs_in_expert = tf.placeholder(
            shape=[None, policy.vec_obs_size], dtype=tf.float32
        )
        if policy.normalize:
            # Expert observations go through the policy's running statistics;
            # the policy side uses its already-processed vector input.
            expert_vector = ModelUtils.normalize_vector_obs(
                self.obs_in_expert,
                policy.running_mean,
                policy.running_variance,
                policy.normalization_steps,
            )
            policy_vector = policy.processed_vector_in
        else:
            expert_vector = self.obs_in_expert
            policy_vector = policy.vector_in
        expert_features.append(expert_vector)
        policy_features.append(policy_vector)

    if policy.vis_obs_size > 0:
        self.expert_visual_in: List[tf.Tensor] = []
        policy_visual = []
        expert_visual = []
        for idx in range(policy.vis_obs_size):
            # Input op for the expert's idx-th visual observation.
            expert_image = ModelUtils.create_visual_input(
                policy.brain.camera_resolutions[idx],
                name=f"gail_visual_observation_{idx}",
            )
            self.expert_visual_in.append(expert_image)

            # Both streams use the same encoder name; only the last argument
            # differs (False for policy, True for expert) -- presumably a
            # variable-reuse flag, confirm against ModelUtils.
            encoder_name = f"gail_stream_{idx}_visual_obs_encoder"
            policy_encoding = ModelUtils.create_visual_observation_encoder(
                policy.visual_in[idx],
                self.encoding_size,
                ModelUtils.swish,
                1,
                encoder_name,
                False,
            )
            expert_encoding = ModelUtils.create_visual_observation_encoder(
                expert_image,
                self.encoding_size,
                ModelUtils.swish,
                1,
                encoder_name,
                True,
            )
            policy_visual.append(policy_encoding)
            expert_visual.append(expert_encoding)
        merged_policy_visual = tf.concat(policy_visual, axis=1)
        merged_expert_visual = tf.concat(expert_visual, axis=1)
        policy_features.append(merged_policy_visual)
        expert_features.append(merged_expert_visual)

    self.encoded_expert = tf.concat(expert_features, axis=1)
    self.encoded_policy = tf.concat(policy_features, axis=1)
def create_dc_actor(self, hidden_policy, scope):
    """
    Creates Discrete control actor for SAC.
    :param hidden_policy: Output of feature extractor (i.e. the input for vector obs, output of CNN
        for visual obs).
    :param scope: TF scope to assign whatever is created in this block; "policy" is
        joined onto it before use.
    """

    def _one_hot_actions():
        # Concatenated per-branch one-hot encoding of the fed-in action indices.
        return tf.concat(
            [
                tf.one_hot(self.action_holder[:, branch], branch_size)
                for branch, branch_size in enumerate(self.act_size)
            ],
            axis=1,
        )

    scope = self.join_scopes(scope, "policy")

    # Create inputs outside of the scope
    self.action_masks = tf.placeholder(
        shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
    )
    if self.use_recurrent:
        self.prev_action = tf.placeholder(
            shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
        )

    with tf.variable_scope(scope):
        hidden_policy = self.create_vector_observation_encoder(
            hidden_policy,
            self.h_size,
            self.activ_fn,
            self.num_layers,
            "encoder",
            False,
        )

    if self.use_recurrent:
        # The previous action (one-hot per branch) is appended to the encoded
        # observation before the recurrent layer.
        previous_one_hot = tf.concat(
            [
                tf.one_hot(self.prev_action[:, branch], branch_size)
                for branch, branch_size in enumerate(self.act_size)
            ],
            axis=1,
        )
        hidden_policy = tf.concat([hidden_policy, previous_one_hot], axis=1)
        hidden_policy, recurrent_state = self.create_recurrent_encoder(
            hidden_policy,
            self.policy_memory_in,
            self.sequence_length,
            name="lstm_policy",
        )
        self.policy_memory_out = recurrent_state

    with tf.variable_scope(scope):
        # One bias-free linear layer of logits per action branch.
        policy_branches = [
            tf.layers.dense(
                hidden_policy,
                branch_size,
                activation=None,
                use_bias=False,
                kernel_initializer=tf.initializers.variance_scaling(0.01),
            )
            for branch_size in self.act_size
        ]
        all_logits = tf.concat(policy_branches, axis=1, name="action_probs")

        masking_result = self.create_discrete_action_masking_layer(
            all_logits, self.action_masks, self.act_size
        )
        output, normalized_probs, normalized_logprobs = masking_result

        self.action_probs = normalized_probs
        # Really, this is entropy, but it has an analogous purpose to the log probs in the
        # continuous case.
        self.all_log_probs = normalized_probs * normalized_logprobs
        self.output = output

        # Create action input (discrete)
        self.action_holder = tf.placeholder(
            shape=[None, len(policy_branches)],
            dtype=tf.int32,
            name="action_holder",
        )
        self.output_oh = _one_hot_actions()

        # For Curiosity and GAIL to retrieve selected actions. We don't
        # need the mask at this point because it's already stored in the buffer.
        self.selected_actions = tf.stop_gradient(self.output_oh)

        # Built as a second, separate one-hot tensor from the same placeholder.
        self.external_action_in = _one_hot_actions()

        # This is total entropy over all branches
        self.entropy = -1 * tf.reduce_sum(self.all_log_probs, axis=1)

    # Extract the normalized logprobs for Barracuda
    self.normalized_logprobs = tf.identity(normalized_logprobs, name="action")

    # We kept the LSTMs at a different scope than the rest, so add them if they exist.
    self.policy_vars = self.get_vars(scope)
    if self.use_recurrent:
        self.policy_vars += self.get_vars("lstm")
def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
    """
    Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
    The PPO optimizer has a value estimator and a loss function.
    :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
    :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
        "learning_rate", "hidden_units", "epsilon", "beta", "max_steps" and "num_layers" are
        required keys; "learning_rate_schedule", "vis_encode_type" and "burn_in_ratio" fall
        back to "linear", "simple" and 0.0 respectively.
    """
    # Create the graph here to give more granular control of the TF graph to the Optimizer.
    policy.create_tf_graph()

    with policy.graph.as_default():
        with tf.variable_scope("optimizer/"):
            super().__init__(policy, trainer_params)

            # Pull the hyperparameters out of the trainer configuration.
            base_lr = float(trainer_params["learning_rate"])
            schedule = LearningRateSchedule(
                trainer_params.get("learning_rate_schedule", "linear")
            )
            hidden_units = int(trainer_params["hidden_units"])
            clip_epsilon = float(trainer_params["epsilon"])
            entropy_beta = float(trainer_params["beta"])
            total_steps = float(trainer_params["max_steps"])
            layer_count = int(trainer_params["num_layers"])
            encoder_type = EncoderType(
                trainer_params.get("vis_encode_type", "simple")
            )
            self.burn_in_ratio = float(trainer_params.get("burn_in_ratio", 0.0))

            self.stream_names = list(self.reward_signals.keys())

            self.tf_optimizer: Optional[tf.train.AdamOptimizer] = None
            self.grads = None
            self.update_batch: Optional[tf.Operation] = None

            # Maps reported stat names to keys of the update dictionary.
            self.stats_name_to_update_name = {
                "Losses/Value Loss": "value_loss",
                "Losses/Policy Loss": "policy_loss",
                "Policy/Learning Rate": "learning_rate",
            }

            if self.policy.use_recurrent:
                self.m_size = self.policy.m_size
                self.memory_in = tf.placeholder(
                    shape=[None, self.m_size],
                    dtype=tf.float32,
                    name="recurrent_value_in",
                )

            # At least one hidden layer is always used.
            layer_count = max(layer_count, 1)

            build_critic = (
                self._create_cc_critic
                if policy.use_continuous_act
                else self._create_dc_critic
            )
            build_critic(hidden_units, layer_count, encoder_type)

            self.learning_rate = ModelUtils.create_learning_rate(
                schedule, base_lr, self.policy.global_step, int(total_steps)
            )
            self._create_losses(
                self.policy.total_log_probs,
                self.old_log_probs,
                self.value_heads,
                self.policy.entropy,
                entropy_beta,
                clip_epsilon,
                base_lr,
                total_steps,
            )
            self._create_ppo_optimizer_ops()

        # The reported policy loss is the absolute value of the actual loss.
        update_ops = {
            "value_loss": self.value_loss,
            "policy_loss": self.abs_policy_loss,
            "update_batch": self.update_batch,
            "learning_rate": self.learning_rate,
        }
        self.update_dict.update(update_ops)

        self.policy.initialize_or_load()
def make_inputs(self) -> None:
    """
    Builds the discriminator's input tensors for expert and policy data.

    Creates placeholders for the done flags and for the expert's actions and
    observations, encodes visual observations for both streams, and stores
    the concatenated features in ``self.encoded_expert`` and
    ``self.encoded_policy``.
    """
    policy = self.policy

    self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
    self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
    # Append a trailing axis to each done flag.
    self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
    self.done_policy = tf.expand_dims(self.done_policy_holder, -1)

    if not policy.behavior_spec.is_action_continuous():
        # Discrete actions: one integer column per branch, expanded into a
        # concatenation of per-branch one-hot vectors.
        self.action_in_expert = tf.placeholder(
            shape=[None, len(policy.act_size)], dtype=tf.int32
        )
        one_hot_branches = []
        for branch, branch_size in enumerate(policy.act_size):
            one_hot_branches.append(
                tf.one_hot(self.action_in_expert[:, branch], branch_size)
            )
        self.expert_action = tf.concat(one_hot_branches, axis=1)
    else:
        self.action_in_expert = tf.placeholder(
            shape=[None, policy.act_size[0]], dtype=tf.float32
        )
        self.expert_action = tf.identity(self.action_in_expert)

    policy_features = []
    expert_features = []

    # Expert-side observation inputs, created with the "gail_" name prefix.
    expert_inputs = ModelUtils.create_input_placeholders(
        policy.behavior_spec.observation_shapes, "gail_"
    )
    (self.obs_in_expert, self.expert_visual_in) = expert_inputs

    if policy.vec_obs_size > 0:
        if policy.normalize:
            # Expert observations go through the policy's running statistics;
            # the policy side uses its already-processed vector input.
            expert_vector = ModelUtils.normalize_vector_obs(
                self.obs_in_expert,
                policy.running_mean,
                policy.running_variance,
                policy.normalization_steps,
            )
            policy_vector = policy.processed_vector_in
        else:
            expert_vector = self.obs_in_expert
            policy_vector = policy.vector_in
        expert_features.append(expert_vector)
        policy_features.append(policy_vector)

    if self.expert_visual_in:
        policy_visual = []
        expert_visual = []
        image_pairs = zip(policy.visual_in, self.expert_visual_in)
        for idx, (policy_image, expert_image) in enumerate(image_pairs):
            # Both streams use the same encoder name; only the last argument
            # differs (False for policy, True for expert) -- presumably a
            # variable-reuse flag, confirm against ModelUtils.
            encoder_name = f"gail_stream_{idx}_visual_obs_encoder"
            policy_encoding = ModelUtils.create_visual_observation_encoder(
                policy_image,
                self.encoding_size,
                ModelUtils.swish,
                1,
                encoder_name,
                False,
            )
            expert_encoding = ModelUtils.create_visual_observation_encoder(
                expert_image,
                self.encoding_size,
                ModelUtils.swish,
                1,
                encoder_name,
                True,
            )
            policy_visual.append(policy_encoding)
            expert_visual.append(expert_encoding)
        merged_policy_visual = tf.concat(policy_visual, axis=1)
        merged_expert_visual = tf.concat(expert_visual, axis=1)
        policy_features.append(merged_policy_visual)
        expert_features.append(merged_expert_visual)

    self.encoded_expert = tf.concat(expert_features, axis=1)
    self.encoded_policy = tf.concat(policy_features, axis=1)
def _create_losses(self, probs, old_probs, value_heads, entropy, beta, epsilon, lr, max_step):
    """
    Creates training-specific Tensorflow ops for PPO models.
    :param probs: Current policy output; used as log-probabilities, since the
        ratio is computed as exp(probs - old_probs)
    :param old_probs: Past policy output, on the same scale as ``probs``
    :param value_heads: Value estimate tensors from each value stream
    :param entropy: Current policy entropy
    :param beta: Entropy regularization strength (initial value of a linear decay)
    :param epsilon: Value for policy-divergence threshold (initial value of a linear decay)
    :param lr: Learning rate (not read by this method)
    :param max_step: Total number of training steps.
    """

    def _masked_mean(tensor):
        # Mean over partition 1 of a two-way split by the policy mask,
        # i.e. over the entries whose mask value is 1.
        kept = tf.dynamic_partition(tensor, self.policy.mask, 2)[1]
        return tf.reduce_mean(kept)

    # One returns placeholder and one old-value placeholder per value stream.
    self.returns_holders = {}
    self.old_values = {}
    for stream in value_heads.keys():
        self.returns_holders[stream] = tf.placeholder(
            shape=[None], dtype=tf.float32, name=f"{stream}_returns"
        )
        self.old_values[stream] = tf.placeholder(
            shape=[None], dtype=tf.float32, name=f"{stream}_value_estimate"
        )

    self.advantage = tf.placeholder(
        shape=[None], dtype=tf.float32, name="advantages"
    )
    advantage = tf.expand_dims(self.advantage, -1)

    # Clip range and entropy weight both decay linearly with the global step,
    # down to 0.1 and 1e-5 respectively.
    decay_epsilon = tf.train.polynomial_decay(
        epsilon, self.policy.global_step, max_step, 0.1, power=1.0
    )
    decay_beta = tf.train.polynomial_decay(
        beta, self.policy.global_step, max_step, 1e-5, power=1.0
    )

    value_losses = []
    for stream, head in value_heads.items():
        previous = self.old_values[stream]
        target = self.returns_holders[stream]

        # Keep the new estimate within +/- decay_epsilon of the old one.
        delta = tf.reduce_sum(head, axis=1) - previous
        bounded_delta = tf.clip_by_value(delta, -decay_epsilon, decay_epsilon)
        clipped_value_estimate = previous + bounded_delta

        unclipped_error = tf.squared_difference(target, tf.reduce_sum(head, axis=1))
        clipped_error = tf.squared_difference(target, clipped_value_estimate)
        # Take the larger of the two errors per element.
        value_losses.append(
            _masked_mean(tf.maximum(unclipped_error, clipped_error))
        )
    self.value_loss = tf.reduce_mean(value_losses)

    ratio = tf.exp(probs - old_probs)
    unclipped_objective = ratio * advantage
    clipped_ratio = tf.clip_by_value(
        ratio, 1.0 - decay_epsilon, 1.0 + decay_epsilon
    )
    clipped_objective = clipped_ratio * advantage
    self.policy_loss = -_masked_mean(
        tf.minimum(unclipped_objective, clipped_objective)
    )
    # For cleaner stats reporting
    self.abs_policy_loss = tf.abs(self.policy_loss)

    policy_and_value = self.policy_loss + 0.5 * self.value_loss
    entropy_bonus = decay_beta * _masked_mean(entropy)
    self.loss = policy_and_value - entropy_bonus
def create_cc_actor(self, hidden_policy, scope):
    """
    Creates Continuous control actor for SAC.
    :param hidden_policy: Output of feature extractor (i.e. the input for vector obs, output of CNN
        for visual obs).
    :param scope: TF scope to assign whatever is created in this block; "policy" is
        joined onto it before use.
    """
    action_dim = self.act_size[0]

    # Create action input (continuous)
    self.action_holder = tf.placeholder(
        shape=[None, action_dim], dtype=tf.float32, name="action_holder"
    )
    self.external_action_in = self.action_holder

    scope = self.join_scopes(scope, "policy")

    with tf.variable_scope(scope):
        hidden_policy = self.create_vector_observation_encoder(
            hidden_policy,
            self.h_size,
            self.activ_fn,
            self.num_layers,
            "encoder",
            False,
        )

    if self.use_recurrent:
        hidden_policy, recurrent_state = self.create_recurrent_encoder(
            hidden_policy,
            self.policy_memory_in,
            self.sequence_length,
            name="lstm_policy",
        )
        self.policy_memory_out = recurrent_state

    with tf.variable_scope(scope):
        mean = tf.layers.dense(
            hidden_policy,
            action_dim,
            activation=None,
            name="mu",
            kernel_initializer=LearningModel.scaled_init(0.01),
        )

        # Policy-dependent log_sigma_sq. The layer is named "log_std" and its
        # exponential multiplies the sampled noise directly below.
        raw_log_sigma = tf.layers.dense(
            hidden_policy,
            action_dim,
            activation=None,
            name="log_std",
            kernel_initializer=LearningModel.scaled_init(0.01),
        )
        self.log_sigma_sq = tf.clip_by_value(raw_log_sigma, LOG_STD_MIN, LOG_STD_MAX)

        noise_scale = tf.exp(self.log_sigma_sq)

        # Do the reparameterization trick
        sampled = mean + tf.random_normal(tf.shape(mean)) * noise_scale

        # Per-dimension Gaussian log-density of the sample, built step by step.
        standardized = (sampled - mean) / (tf.exp(self.log_sigma_sq) + EPSILON)
        squared = standardized ** 2
        log_density_terms = squared + 2 * self.log_sigma_sq + np.log(2 * np.pi)
        gauss_log_density = -0.5 * log_density_terms

        all_probs = tf.reduce_sum(gauss_log_density, axis=1, keepdims=True)

        self.entropy = tf.reduce_sum(
            self.log_sigma_sq + 0.5 * np.log(2.0 * np.pi * np.e), axis=-1
        )

        # Squash probabilities
        # Keep deterministic around in case we want to use it.
        self.deterministic_output = tf.tanh(mean)

        # Note that this is just for symmetry with PPO.
        self.output_pre = tf.tanh(sampled)

        # Squash correction
        squash_correction = tf.reduce_sum(
            tf.log(1 - self.output_pre ** 2 + EPSILON), axis=1, keepdims=True
        )
        all_probs -= squash_correction

        self.all_log_probs = all_probs
        self.selected_actions = tf.stop_gradient(self.output_pre)

        self.action_probs = all_probs

    # Extract output for Barracuda
    self.output = tf.identity(self.output_pre, name="action")

    # Get all policy vars
    self.policy_vars = self.get_vars(scope)