def train(self, obs_n, act_n):
    with tf.GradientTape() as tape:
        # Forward pass of the policy network on this agent's observation.
        x = obs_n[self.agent_index]
        for idx in range(self.num_layers):
            x = self.hidden_layers[idx](x)
        x = self.output_layer(x)
        act_n = tf.unstack(act_n)
        if self.use_gumbel:
            # The network outputs the log-probabilities (logits) of the Gumbel-softmax distribution.
            logits = x
            act_n[self.agent_index] = self.gumbel_softmax_sample(logits)
            act_probs = tf.math.softmax(logits)
            entropy = -tf.math.reduce_sum(act_probs * tf.math.log(act_probs + self.numeric_eps), 1)
        elif self.use_gaussian:
            logits = x
            act_n[self.agent_index] = self.gaussian_sample(logits)
            entropy = -self.action_logprob(obs_n[self.agent_index], act_n[self.agent_index])
        # Maximize the expected Q-value plus an entropy bonus (minimize the negative).
        q_value = self.q_network._predict_internal(obs_n + act_n)
        loss = -tf.math.reduce_mean(q_value + self.entropy_coeff * entropy)
    gradients = tape.gradient(loss, self.model.trainable_variables)
    local_clipped = clip_by_local_norm(gradients, self.clip_norm)
    self.optimizer.apply_gradients(zip(local_clipped, self.model.trainable_variables))
    return loss

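# `gumbel_softmax_sample` is called above but not defined in this excerpt. Below is a
# minimal sketch of a standard Gumbel-softmax sampler, written as a standalone function
# for illustration and assuming the logits are unnormalized log-probabilities with a
# fixed temperature of 1.0; the project's own helper may use a different temperature
# or a straight-through estimator.
def gumbel_softmax_sample(logits, temperature=1.0, eps=1e-20):
    """Draw a differentiable, approximately one-hot sample from a categorical distribution."""
    uniform = tf.random.uniform(tf.shape(logits), minval=0.0, maxval=1.0)
    gumbel_noise = -tf.math.log(-tf.math.log(uniform + eps) + eps)
    return tf.math.softmax((logits + gumbel_noise) / temperature)
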
def _train_step_internal(self, concatenated_input, target_q, weights):
    """
    Internal function, because concatenation can not be done inside tf.function.
    """
    with tf.GradientTape() as tape:
        x = self.input_concat_layer(concatenated_input)
        for idx in range(self.num_layers):
            x = self.hidden_layers[idx](x)
        q_pred = self.output_layer(x)
        td_loss = tf.math.square(target_q - q_pred)
        loss = tf.reduce_mean(td_loss * weights)
    gradients = tape.gradient(loss, self.model.trainable_variables)
    local_clipped = clip_by_local_norm(gradients, self.clip_norm)
    self.optimizer.apply_gradients(zip(local_clipped, self.model.trainable_variables))
    return td_loss

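# `clip_by_local_norm` is used in every update in this excerpt but is not defined here.
# A minimal sketch of one possible implementation, assuming the helper clips each
# gradient tensor to `clip_norm` by its own (local) norm rather than by the global norm
# across all tensors; the actual project helper may differ.
def clip_by_local_norm(gradients, clip_norm):
    """Clip each gradient tensor independently by its own norm."""
    return [tf.clip_by_norm(g, clip_norm) if g is not None else None for g in gradients]
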
def train(self, obs_n, act_n):
    with tf.GradientTape() as tape:
        x = self.forward_pass(obs_n[self.agent_index])
        act_n = tf.unstack(act_n)
        if self.use_gumbel:
            # The network outputs the log-probabilities (logits) of the Gumbel-softmax distribution.
            logits = x
            act_n[self.agent_index] = self.gumbel_softmax_sample(logits)
        else:
            act_n[self.agent_index] = x
        q_value = self.q_network._predict_internal(obs_n + act_n)
        # Negative expected Q-value plus a small L2 regularization on the raw policy output.
        policy_regularization = tf.math.reduce_mean(tf.math.square(x))
        loss = -tf.math.reduce_mean(q_value) + 1e-3 * policy_regularization
    gradients = tape.gradient(loss, self.model.trainable_variables)
    # Global-norm clipping was considered here but is left disabled (untested):
    # gradients = tf.clip_by_global_norm(gradients, self.clip_norm)[0]
    local_clipped = clip_by_local_norm(gradients, self.clip_norm)
    self.optimizer.apply_gradients(zip(local_clipped, self.model.trainable_variables))
    return loss

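# For reference, the disabled global-norm alternative above would behave as in the sketch
# below: tf.clip_by_global_norm returns a (clipped_list, global_norm) tuple, so taking
# element [0] yields the clipped gradient list. Whether global or per-tensor (local)
# clipping works better here is a tuning choice this excerpt does not settle.
def clip_by_global_norm_example(gradients, clip_norm):
    """Illustrative helper (not part of the original code)."""
    clipped_gradients, _global_norm = tf.clip_by_global_norm(gradients, clip_norm)
    return clipped_gradients
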
def _train_step_internal(self, concatenated_input, target_prob, weights):
    """
    Internal function, because concatenation can not be done inside tf.function.
    """
    # Note: `weights` is accepted for interface consistency but is not used in this update.
    with tf.GradientTape(persistent=True) as tape:
        x = self.input_concat_layer(concatenated_input)
        for idx in range(self.num_layers):
            x = self.hidden_layers[idx](x)
        q_pred = self.output_layer(x)
        # Binary cross-entropy between the predicted and target probabilities.
        crossent_loss = tf.losses.binary_crossentropy(target_prob, q_pred)
        loss = crossent_loss
    gradients = tape.gradient(loss, self.model.trainable_variables)
    gradients = clip_by_local_norm(gradients, self.clip_norm)
    self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
    return crossent_loss

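# The docstring above notes that the list concatenation has to happen outside of
# tf.function. A hypothetical public wrapper (name and signature assumed, not taken
# from this excerpt) that performs the Python-level list concatenation before handing
# off to the internal step could look like this:
def train_step(self, obs_n, act_n, target_prob, weights):
    # Build the joint input as a plain Python list; the Concatenate layer itself is
    # applied inside _train_step_internal.
    return self._train_step_internal(obs_n + act_n, target_prob, weights)
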
def train_step(self, obs_n, target, weights):
    """
    Train the value function estimator, for one gradient step, with clipped gradients.
    Internal function, because concatenation can not be done inside tf.function.
    """
    with tf.GradientTape() as tape:
        x = self.input_concat_layer(obs_n) if len(obs_n) > 1 else obs_n[0]
        for idx in range(self.num_layers):
            x = self.hidden_layers[idx](x)
        v_pred = self.output_layer(x)
        td_loss = tf.math.square(target - v_pred)
        loss = tf.reduce_mean(td_loss * weights)
    gradients = tape.gradient(loss, self.model.trainable_variables)
    local_clipped = clip_by_local_norm(gradients, self.clip_norm)
    self.optimizer.apply_gradients(zip(local_clipped, self.model.trainable_variables))
    return td_loss

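# Minimal usage sketch for the value-function update above, with uniform sample weights
# (the names `value_net`, `obs_n`, and `target` are assumed for illustration). In a
# prioritized-replay setup, `weights` would instead hold the importance-sampling weights,
# and the returned per-sample `td_loss` could be fed back to update the priorities.
# td_loss = value_net.train_step(obs_n, target, weights=tf.ones_like(target))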