def create_forward_model(
    self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
    """
    Build the forward-model TensorFlow ops of the Curiosity module.

    The forward model predicts the encoding of the next state from the
    encoding of the current state concatenated with the selected actions.
    Sets ``self.intrinsic_reward`` (per-sample prediction error) and
    ``self.forward_loss`` (mean error over partition 1 of the policy mask).

    :param encoded_state: Tensor corresponding to encoded current state.
    :param encoded_next_state: Tensor corresponding to encoded next state.
    """
    policy = self.policy
    state_and_action = tf.concat(
        [encoded_state, policy.selected_actions], axis=1
    )
    features = tf.layers.dense(
        state_and_action, 256, activation=ModelUtils.swish
    )

    # Output width: encoding_size times (vis_obs_size, plus one when
    # vec_obs_size is positive).
    encoding_count = policy.vis_obs_size + int(policy.vec_obs_size > 0)
    predicted_encoding = tf.layers.dense(
        features, self.encoding_size * encoding_count, activation=None
    )

    prediction_error = tf.squared_difference(
        predicted_encoding, encoded_next_state
    )
    half_squared_error = 0.5 * tf.reduce_sum(prediction_error, axis=1)
    self.intrinsic_reward = half_squared_error

    # Partition 1 holds the entries whose mask value is 1.
    kept_errors = tf.dynamic_partition(half_squared_error, policy.mask, 2)[1]
    self.forward_loss = tf.reduce_mean(kept_errors)
def create_loss(self, learning_rate: float, anneal_steps: int) -> None:
    """
    Build the loss and the optimizer update op for the BC module.

    Continuous policies use the mean squared difference between the policy
    output and the expert action. Otherwise the loss is the mean of
    ``-log(softmax(all_log_probs) + 1e-7) * expert_action``.
    Sets ``self.loss``, ``self.annealed_learning_rate`` and
    ``self.update_batch``.

    :param learning_rate: The learning rate for the optimizer
    :param anneal_steps: Number of steps over which to anneal the
        learning_rate; when not positive, the rate is held in a plain
        ``tf.Variable`` instead of a decay schedule.
    """
    policy = self.policy
    policy_action = policy.output

    if not policy.use_continuous_act:
        branch_probs = tf.nn.softmax(policy.all_log_probs)
        weighted_nll = -tf.log(branch_probs + 1e-7) * self.expert_action
        self.loss = tf.reduce_mean(weighted_nll)
    else:
        action_error = tf.squared_difference(
            policy_action, self.expert_action
        )
        self.loss = tf.reduce_mean(action_error)

    if anneal_steps > 0:
        # Linear (power=1.0) decay from learning_rate down to 0.0.
        schedule = tf.train.polynomial_decay(
            learning_rate, policy.global_step, anneal_steps, 0.0, power=1.0
        )
    else:
        schedule = tf.Variable(learning_rate)
    self.annealed_learning_rate = schedule

    adam = tf.train.AdamOptimizer(
        learning_rate=self.annealed_learning_rate, name="bc_adam"
    )
    self.update_batch = adam.minimize(self.loss)
def create_inverse_model(
    self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
    """
    Build the inverse-model TensorFlow ops of the Curiosity module.

    The inverse model predicts the action taken from the encodings of the
    current and next state. Sets ``self.inverse_loss``: summed squared error
    for continuous actions, cross-entropy against the selected actions
    otherwise, averaged over partition 1 of the policy mask.

    :param encoded_state: Tensor corresponding to encoded current state.
    :param encoded_next_state: Tensor corresponding to encoded next state.
    """
    model = self.policy_model
    state_pair = tf.concat([encoded_state, encoded_next_state], axis=1)
    features = tf.layers.dense(
        state_pair, 256, activation=LearningModel.swish
    )

    if model.brain.vector_action_space_type == "continuous":
        predicted_action = tf.layers.dense(
            features, model.act_size[0], activation=None
        )
        action_error = tf.squared_difference(
            predicted_action, model.selected_actions
        )
        per_sample_loss = tf.reduce_sum(action_error, axis=1)
    else:
        # One softmax head per entry of act_size, concatenated on axis 1.
        branch_probs = [
            tf.layers.dense(features, branch_size, activation=tf.nn.softmax)
            for branch_size in model.act_size
        ]
        predicted_action = tf.concat(branch_probs, axis=1)
        per_sample_loss = tf.reduce_sum(
            -tf.log(predicted_action + 1e-10) * model.selected_actions,
            axis=1,
        )

    # Partition 1 holds the entries whose mask value is 1.
    kept_losses = tf.dynamic_partition(per_sample_loss, model.mask, 2)[1]
    self.inverse_loss = tf.reduce_mean(kept_losses)
def _create_losses(
    self, probs, old_probs, value_heads, entropy, beta, epsilon, lr, max_step
):
    """
    Build the training-specific TensorFlow ops for PPO models.

    Creates per-stream return / old-value placeholders and the advantage
    placeholder, then sets ``self.value_loss``, ``self.policy_loss``,
    ``self.abs_policy_loss`` and the combined ``self.loss``.

    :param probs: Current policy probabilities
    :param old_probs: Past policy probabilities
    :param value_heads: Value estimate tensors from each value stream
    :param entropy: Current policy entropy
    :param beta: Entropy regularization strength
    :param epsilon: Value for policy-divergence threshold
    :param lr: Learning rate (not referenced in this method body)
    :param max_step: Total number of training steps.
    """
    self.returns_holders = {}
    self.old_values = {}
    for stream in value_heads:
        self.returns_holders[stream] = tf.placeholder(
            shape=[None], dtype=tf.float32, name="{}_returns".format(stream)
        )
        self.old_values[stream] = tf.placeholder(
            shape=[None],
            dtype=tf.float32,
            name="{}_value_estimate".format(stream),
        )

    self.advantage = tf.placeholder(
        shape=[None], dtype=tf.float32, name="advantages"
    )
    expanded_advantage = tf.expand_dims(self.advantage, -1)

    # Both schedules decay linearly (power=1.0) over max_step steps.
    global_step = self.policy.global_step
    clip_range = tf.train.polynomial_decay(
        epsilon, global_step, max_step, 0.1, power=1.0
    )
    entropy_weight = tf.train.polynomial_decay(
        beta, global_step, max_step, 1e-5, power=1.0
    )

    stream_losses = []
    for stream, head in value_heads.items():
        previous_value = self.old_values[stream]
        target_return = self.returns_holders[stream]

        # Clip the change of the value estimate relative to the old value.
        value_delta = tf.reduce_sum(head, axis=1) - previous_value
        clipped_estimate = previous_value + tf.clip_by_value(
            value_delta, -clip_range, clip_range
        )
        raw_error = tf.squared_difference(
            target_return, tf.reduce_sum(head, axis=1)
        )
        clipped_error = tf.squared_difference(target_return, clipped_estimate)

        # Take the larger of the two errors, restricted to mask partition 1.
        larger_error = tf.maximum(raw_error, clipped_error)
        kept_error = tf.dynamic_partition(larger_error, self.policy.mask, 2)[1]
        stream_losses.append(tf.reduce_mean(kept_error))
    self.value_loss = tf.reduce_mean(stream_losses)

    ratio = tf.exp(probs - old_probs)
    raw_surrogate = ratio * expanded_advantage
    clipped_ratio = tf.clip_by_value(
        ratio, 1.0 - clip_range, 1.0 + clip_range
    )
    clipped_surrogate = clipped_ratio * expanded_advantage
    surrogate = tf.minimum(raw_surrogate, clipped_surrogate)
    kept_surrogate = tf.dynamic_partition(surrogate, self.policy.mask, 2)[1]
    self.policy_loss = -tf.reduce_mean(kept_surrogate)

    # For cleaner stats reporting
    self.abs_policy_loss = tf.abs(self.policy_loss)

    policy_and_value = self.policy_loss + 0.5 * self.value_loss
    entropy_bonus = entropy_weight * tf.reduce_mean(
        tf.dynamic_partition(entropy, self.policy.mask, 2)[1]
    )
    self.loss = policy_and_value - entropy_bonus
def _create_losses(
    self,
    q1_streams: Dict[str, tf.Tensor],
    q2_streams: Dict[str, tf.Tensor],
    lr: tf.Tensor,
    max_step: int,
    stream_names: List[str],
    discrete: bool = False,
) -> None:
    """
    Creates training-specific Tensorflow ops for SAC models.

    Sets the following attributes: target_entropy, rewards_holders,
    min_policy_qs, q1_loss, q2_loss, log_ent_coef, ent_coef, entropy_loss,
    policy_loss, value_loss, total_value_loss and entropy.

    :param q1_streams: Q1 streams from policy network
    :param q2_streams: Q2 streams from policy network
    :param lr: Learning rate (not referenced in this method body)
    :param max_step: Total number of training steps (not referenced in this
        method body).
    :param stream_names: List of reward stream names.
    :param discrete: Whether or not to use discrete action losses.
    """
    if discrete:
        # One target entropy per entry of act_size: scale * log(entry).
        self.target_entropy = [
            self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
            for i in self.act_size
        ]
        discrete_action_probs = tf.exp(self.policy.all_log_probs)
        # p * log(p) per action; summed per branch further below.
        per_action_entropy = discrete_action_probs * self.policy.all_log_probs
    else:
        self.target_entropy = (
            -1
            * self.continuous_target_entropy_scale
            * np.prod(self.act_size[0]).astype(np.float32))

    self.rewards_holders = {}
    self.min_policy_qs = {}
    for name in stream_names:
        if discrete:
            # Weight each policy Q head by the action probabilities, sum
            # within each branch, then average across branches.
            _branched_mpq1 = ModelUtils.break_into_branches(
                self.policy_network.q1_pheads[name] * discrete_action_probs,
                self.act_size,
            )
            branched_mpq1 = tf.stack([
                tf.reduce_sum(_br, axis=1, keep_dims=True)
                for _br in _branched_mpq1
            ])
            _q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0)

            _branched_mpq2 = ModelUtils.break_into_branches(
                self.policy_network.q2_pheads[name] * discrete_action_probs,
                self.act_size,
            )
            branched_mpq2 = tf.stack([
                tf.reduce_sum(_br, axis=1, keep_dims=True)
                for _br in _branched_mpq2
            ])
            _q2_p_mean = tf.reduce_mean(branched_mpq2, axis=0)

            self.min_policy_qs[name] = tf.minimum(_q1_p_mean, _q2_p_mean)
        else:
            # Elementwise minimum of the two policy Q heads.
            self.min_policy_qs[name] = tf.minimum(
                self.policy_network.q1_pheads[name],
                self.policy_network.q2_pheads[name],
            )

        # Placeholder for this reward stream's rewards.
        rewards_holder = tf.placeholder(shape=[None], dtype=tf.float32, name=f"{name}_rewards")
        self.rewards_holders[name] = rewards_holder

    q1_losses = []
    q2_losses = []
    # Multiple q losses per stream
    expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
    for i, name in enumerate(stream_names):
        _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)

        # Backup target: reward + (1 - use_dones * done) * gamma * target
        # value head; no gradient flows through it.
        q_backup = tf.stop_gradient(
            _expanded_rewards
            + (1.0 - self.use_dones_in_backup[name] * expanded_dones)
            * self.gammas[i]
            * self.target_network.value_heads[name])

        if discrete:
            # We need to break up the Q functions by branch, and update them individually.
            # NOTE(review): presumably selected_actions is one-hot per branch,
            # so the product keeps only the chosen action's Q - confirm.
            branched_q1_stream = ModelUtils.break_into_branches(
                self.policy.selected_actions * q1_streams[name], self.act_size)
            branched_q2_stream = ModelUtils.break_into_branches(
                self.policy.selected_actions * q2_streams[name], self.act_size)

            # Reduce each branch into scalar
            branched_q1_stream = [
                tf.reduce_sum(_branch, axis=1, keep_dims=True)
                for _branch in branched_q1_stream
            ]
            branched_q2_stream = [
                tf.reduce_sum(_branch, axis=1, keep_dims=True)
                for _branch in branched_q2_stream
            ]

            # Average the per-branch values.
            q1_stream = tf.reduce_mean(branched_q1_stream, axis=0)
            q2_stream = tf.reduce_mean(branched_q2_stream, axis=0)
        else:
            q1_stream = q1_streams[name]
            q2_stream = q2_streams[name]

        # Half the mean of the mask-weighted squared error against the backup.
        _q1_loss = 0.5 * tf.reduce_mean(
            tf.to_float(self.policy.mask)
            * tf.squared_difference(q_backup, q1_stream))
        _q2_loss = 0.5 * tf.reduce_mean(
            tf.to_float(self.policy.mask)
            * tf.squared_difference(q_backup, q2_stream))

        q1_losses.append(_q1_loss)
        q2_losses.append(_q2_loss)

    # Average the per-stream losses.
    self.q1_loss = tf.reduce_mean(q1_losses)
    self.q2_loss = tf.reduce_mean(q2_losses)

    # Learn entropy coefficient
    if discrete:
        # Create a log_ent_coef for each branch
        self.log_ent_coef = tf.get_variable(
            "log_ent_coef",
            dtype=tf.float32,
            initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(np.float32),
            trainable=True,
        )
    else:
        self.log_ent_coef = tf.get_variable(
            "log_ent_coef",
            dtype=tf.float32,
            initializer=np.log(self.init_entcoef).astype(np.float32),
            trainable=True,
        )

    # The coefficient is stored in log space and exponentiated for use.
    self.ent_coef = tf.exp(self.log_ent_coef)

    if discrete:
        # We also have to do a different entropy and target_entropy per branch.
        branched_per_action_ent = ModelUtils.break_into_branches(
            per_action_entropy, self.act_size)
        branched_ent_sums = tf.stack(
            [
                tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
                for _lp, _te in zip(branched_per_action_ent, self.target_entropy)
            ],
            axis=1,
        )
        # Gradients reach only log_ent_coef; the entropy sums are stopped.
        self.entropy_loss = -tf.reduce_mean(
            tf.to_float(self.policy.mask)
            * tf.reduce_mean(
                self.log_ent_coef
                * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
                axis=1,
            ))

        # Same with policy loss, we have to do the loss per branch and average them,
        # so that larger branches don't get more weight.
        # The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q
        branched_q_term = ModelUtils.break_into_branches(
            discrete_action_probs * self.policy_network.q1_p, self.act_size)
        branched_policy_loss = tf.stack([
            tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True)
            for i, (_lp, _qt) in enumerate(
                zip(branched_per_action_ent, branched_q_term))
        ])
        self.policy_loss = tf.reduce_mean(
            tf.to_float(self.policy.mask) * tf.squeeze(branched_policy_loss))

        # Do vbackup entropy bonus per branch as well.
        branched_ent_bonus = tf.stack([
            tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
            for i, _lp in enumerate(branched_per_action_ent)
        ])
        value_losses = []
        for name in stream_names:
            # Value target: min policy Q minus the branch-averaged entropy
            # term; no gradient flows through it.
            v_backup = tf.stop_gradient(
                self.min_policy_qs[name]
                - tf.reduce_mean(branched_ent_bonus, axis=0))
            value_losses.append(0.5 * tf.reduce_mean(
                tf.to_float(self.policy.mask)
                * tf.squared_difference(
                    self.policy_network.value_heads[name], v_backup)))
    else:
        # Gradients reach only log_ent_coef; the log-prob term is stopped.
        self.entropy_loss = -tf.reduce_mean(
            self.log_ent_coef
            * tf.to_float(self.policy.mask)
            * tf.stop_gradient(
                tf.reduce_sum(
                    self.policy.all_log_probs + self.target_entropy,
                    axis=1,
                    keep_dims=True,
                )))
        batch_policy_loss = tf.reduce_mean(
            self.ent_coef * self.policy.all_log_probs - self.policy_network.q1_p,
            axis=1,
        )
        self.policy_loss = tf.reduce_mean(
            tf.to_float(self.policy.mask) * batch_policy_loss)

        value_losses = []
        for name in stream_names:
            # NOTE(review): this reduce_sum drops axis 1 (no keep_dims) while
            # min_policy_qs comes straight from the Q heads - confirm the two
            # operands broadcast as intended.
            v_backup = tf.stop_gradient(
                self.min_policy_qs[name]
                - tf.reduce_sum(
                    self.ent_coef * self.policy.all_log_probs, axis=1))
            value_losses.append(0.5 * tf.reduce_mean(
                tf.to_float(self.policy.mask)
                * tf.squared_difference(
                    self.policy_network.value_heads[name], v_backup)))

    self.value_loss = tf.reduce_mean(value_losses)
    self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss
    self.entropy = self.policy_network.entropy
def __init__(
    self,
    brain,
    h_size=128,
    lr=1e-4,
    n_layers=2,
    m_size=128,
    normalize=False,
    use_recurrent=False,
    seed=0,
):
    """
    Build the graph of the model: encoder, optional recurrent layer,
    action head, imitation loss and Adam update op.

    :param brain: Brain description; forwarded to LearningModel.__init__,
        and its vector_action_space_type selects the discrete or the
        continuous action head.
    :param h_size: Passed to create_observation_streams.
    :param lr: Learning rate given to the Adam optimizer.
    :param n_layers: Passed to create_observation_streams.
    :param m_size: Forwarded to LearningModel.__init__.
    :param normalize: Forwarded to LearningModel.__init__.
    :param use_recurrent: Forwarded to LearningModel.__init__.
    :param seed: Forwarded to LearningModel.__init__.
    """
    LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)

    # A single observation stream is requested and used.
    encoded = self.create_observation_streams(1, h_size, n_layers)[0]
    self.dropout_rate = tf.placeholder(
        dtype=tf.float32, shape=[], name="dropout_rate"
    )
    features = tf.layers.dropout(encoded, self.dropout_rate)

    if self.use_recurrent:
        # Created only for its effect on the graph; the handle is not kept.
        tf.Variable(
            self.m_size, name="memory_size", trainable=False, dtype=tf.int32
        )
        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        features, recurrent_state = self.create_recurrent_encoder(
            features, self.memory_in, self.sequence_length
        )
        self.memory_out = tf.identity(recurrent_state, name="recurrent_out")

    if brain.vector_action_space_type == "discrete":
        # One bias-free linear head per entry of act_size.
        branch_logits = [
            tf.layers.dense(
                features,
                branch_size,
                activation=None,
                use_bias=False,
                kernel_initializer=tf.initializers.variance_scaling(0.01),
            )
            for branch_size in self.act_size
        ]
        branch_softmaxes = [tf.nn.softmax(logits) for logits in branch_logits]
        self.action_probs = tf.concat(
            branch_softmaxes, axis=1, name="action_probs"
        )
        self.action_masks = tf.placeholder(
            shape=[None, sum(self.act_size)],
            dtype=tf.float32,
            name="action_masks",
        )

        all_logits = tf.concat(branch_logits, axis=1)
        masking_outputs = self.create_discrete_action_masking_layer(
            all_logits, self.action_masks, self.act_size
        )
        self.sample_action_float, _, normalized_logits = masking_outputs
        # Exposes the normalized logits under the op name "action".
        tf.identity(normalized_logits, name="action")
        self.sample_action = tf.cast(self.sample_action_float, tf.int32)

        self.true_action = tf.placeholder(
            shape=[None, len(branch_logits)],
            dtype=tf.int32,
            name="teacher_action",
        )
        one_hot_branches = [
            tf.one_hot(self.true_action[:, index], branch_size)
            for index, branch_size in enumerate(self.act_size)
        ]
        self.action_oh = tf.concat(one_hot_branches, axis=1)

        # Summed cross-entropy against the one-hot teacher actions.
        self.loss = tf.reduce_sum(
            -tf.log(self.action_probs + 1e-10) * self.action_oh
        )

        # Mean of the elementwise equality between the argmax of
        # action_probs and sample_action.
        greedy_action = tf.cast(
            tf.argmax(self.action_probs, axis=1), tf.int32
        )
        matches = tf.equal(greedy_action, self.sample_action)
        self.action_percent = tf.reduce_mean(tf.cast(matches, tf.float32))
    else:
        self.policy = tf.layers.dense(
            features,
            self.act_size[0],
            activation=None,
            use_bias=False,
            name="pre_action",
            kernel_initializer=tf.initializers.variance_scaling(0.01),
        )
        self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
        self.sample_action = tf.identity(
            self.clipped_sample_action, name="action"
        )
        self.true_action = tf.placeholder(
            shape=[None, self.act_size[0]],
            dtype=tf.float32,
            name="teacher_action",
        )
        self.clipped_true_action = tf.clip_by_value(self.true_action, -1, 1)

        # Summed squared error between clipped teacher and sampled actions.
        action_error = tf.squared_difference(
            self.clipped_true_action, self.sample_action
        )
        self.loss = tf.reduce_sum(action_error)

    adam = tf.train.AdamOptimizer(learning_rate=lr)
    self.update = adam.minimize(self.loss)