def train_step(self, o, r, d, a, sp_batch, weights):
    # Assumes: import tensorflow as tf; from tensorflow import math as tm,
    # with hp being the project's hyperparameter module.
    # Next Q values from t_model, used to evaluate the chosen action
    target_q = self.t_model(sp_batch, training=False)
    # Next Q values from the online model, used to select the action (Double DQN)
    another_q = self.model(sp_batch, training=False)
    idx = tf.math.argmax(another_q, axis=-1)
    # Then retrieve that action's Q value from the target network
    selected_q = tf.gather(target_q, idx, batch_dims=1)
    q_samp = r + tf.cast(tm.logical_not(d), tf.float32) * \
        hp.Q_discount * selected_q
    mask = tf.one_hot(a, self.action_n, dtype=tf.float32)
    with tf.GradientTape() as tape:
        q = self.model(o, training=True)
        q_sa = tf.math.reduce_sum(q * mask, axis=1)
        unweighted_loss = tf.math.square(q_samp - q_sa)
        # Importance-sampling weights come from the prioritized replay buffer
        loss = tf.math.reduce_mean(weights * unweighted_loss)
    if self.total_steps % hp.log_per_steps == 0:
        tf.summary.scalar('Loss', loss, self.total_steps)
    # New priorities for the sampled transitions (proportional prioritization)
    priority = (tf.math.abs(q_samp - q_sa) + hp.Buf.epsilon) ** hp.Buf.alpha
    trainable_vars = self.model.trainable_variables
    gradients = tape.gradient(loss, trainable_vars)
    self.model.optimizer.apply_gradients(zip(gradients, trainable_vars))
    return priority
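# Usage sketch (assumed, not from the source): the per-sample `priority`
# returned above is meant to refresh the prioritized replay buffer. The
# buffer API below (`sample`, `update_priorities`, `idxs`, `hp.Batch_size`)
# is hypothetical and only illustrates the data flow.
o, r, d, a, sp_batch, weights, idxs = buffer.sample(hp.Batch_size)
priority = agent.train_step(o, r, d, a, sp_batch, weights)
buffer.update_priorities(idxs, priority.numpy())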
def train_step(self, o, r, d, a, sp_batch):
    target_q = self.t_model(sp_batch, training=False)
    q_samp = r + tf.cast(tm.logical_not(d), tf.float32) * \
        hp.Q_discount * tm.reduce_max(target_q, axis=1)
    mask = tf.one_hot(a, self.action_n, dtype=tf.float32)
    with tf.GradientTape() as tape:
        q = self.model(o, training=True)
        q_sa = tf.math.reduce_sum(q * mask, axis=1)
        loss = keras.losses.MSE(q_samp, q_sa)
        # Mixed precision: scale the loss inside the tape so the scaled
        # gradients can be unscaled after tape.gradient
        scaled_loss = self.optimizer.get_scaled_loss(loss)
    trainable_vars = self.model.trainable_variables
    scaled_gradients = tape.gradient(scaled_loss, trainable_vars)
    gradients = self.optimizer.get_unscaled_gradients(scaled_gradients)
    self.optimizer.apply_gradients(zip(gradients, trainable_vars))
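# The get_scaled_loss / get_unscaled_gradients calls above are the
# tf.keras.mixed_precision.LossScaleOptimizer API. A minimal setup sketch,
# assuming the agent wraps a plain Adam optimizer (the learning rate is an
# illustrative value, not the source's):
import tensorflow as tf

tf.keras.mixed_precision.set_global_policy('mixed_float16')
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.Adam(learning_rate=1e-4))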
def train_step(self, data):
    o, r, d, a, target_q = data
    num_actions = target_q.shape[-1]
    q_samp = r + tf.cast(tm.logical_not(d), tf.float32) * \
        hp.Q_discount * tm.reduce_max(target_q, axis=1)
    mask = tf.one_hot(a, num_actions, dtype=tf.float32)
    with tf.GradientTape() as tape:
        q = self(o, training=True)
        q_sa = tf.math.reduce_sum(q * mask, axis=1)
        loss = keras.losses.MSE(q_samp, q_sa)
    trainable_vars = self.trainable_variables
    gradients = tape.gradient(loss, trainable_vars)
    self.optimizer.apply_gradients(zip(gradients, trainable_vars))
    # Metrics expect (y_true, y_pred): the TD target q_samp is the "truth"
    self.compiled_metrics.update_state(q_samp, q_sa)
    return {m.name: m.result() for m in self.metrics}
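# This variant overrides keras.Model.train_step, so Model.fit drives the
# update loop and passes each dataset element straight through as `data`.
# A wiring sketch (the dataset construction, batch size, and compile
# arguments are assumptions, not the source's code): the dataset yields
# (o, r, d, a, target_q) tuples, with target_q precomputed from t_model.
ds = tf.data.Dataset.from_tensor_slices((o, r, d, a, target_q)).batch(32)
model.compile(optimizer='adam', metrics=['mse'])  # loss is computed inside train_step
model.fit(ds, epochs=1, verbose=0)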
def train_step(self, o, r, d, a, sp_batch, total_step, weights):
    target_q = self.t_model(sp_batch, training=False)
    q_samp = r + tf.cast(tm.logical_not(d), tf.float32) * \
        hp.Q_discount * tm.reduce_max(target_q, axis=1)
    mask = tf.one_hot(a, self.action_n, dtype=tf.float32)
    with tf.GradientTape() as tape:
        q = self.model(o, training=True)
        q_sa = tf.math.reduce_sum(q * mask, axis=1)
        # loss = keras.losses.MSE(q_samp, q_sa)
        # Per-sample squared TD error, weighted by importance-sampling weights
        unweighted_loss = tf.math.square(q_samp - q_sa)
        loss = tf.math.reduce_mean(weights * unweighted_loss)
        tf.summary.scalar('Loss', loss, total_step)
        # Mixed precision: scale the loss inside the tape
        scaled_loss = self.model.optimizer.get_scaled_loss(loss)
    # New priorities for the sampled transitions (proportional prioritization)
    priority = (tf.math.abs(q_samp - q_sa) + hp.Buf.epsilon) ** hp.Buf.alpha
    trainable_vars = self.model.trainable_variables
    scaled_gradients = tape.gradient(scaled_loss, trainable_vars)
    gradients = self.model.optimizer.get_unscaled_gradients(scaled_gradients)
    self.model.optimizer.apply_gradients(zip(gradients, trainable_vars))
    return priority
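# All variants above read from a separate target network t_model, which has
# to be synced to the online network periodically. A common hard-update
# pattern (the interval name hp.target_update_steps is an assumption):
if total_step % hp.target_update_steps == 0:
    agent.t_model.set_weights(agent.model.get_weights())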