Code Example #1
    def train_step(self, o, r, d, a, sp_batch, weights):
        # next-state Q values from the target network (evaluation)
        target_q = self.t_model(sp_batch, training=False)
        # next-state Q values from the online network (action selection);
        # decoupling selection from evaluation is what makes this Double DQN
        another_q = self.model(sp_batch, training=False)
        idx = tf.math.argmax(another_q, axis=-1)
        # evaluate the selected actions with the target network
        selected_q = tf.gather(target_q, idx, batch_dims=1)

        # TD target: r + gamma * Q_target(s', argmax_a Q_online(s', a)),
        # zeroed out on terminal transitions
        q_samp = r + tf.cast(tf.math.logical_not(d), tf.float32) * \
                     hp.Q_discount * \
                     selected_q
        # one-hot mask to pick out Q(s, a) for the actions actually taken
        mask = tf.one_hot(a, self.action_n, dtype=tf.float32)
        with tf.GradientTape() as tape:
            q = self.model(o, training=True)
            q_sa = tf.math.reduce_sum(q * mask, axis=1)
            # per-sample squared TD error, weighted by the
            # importance-sampling weights from the prioritized buffer
            unweighted_loss = tf.math.square(q_samp - q_sa)
            loss = tf.math.reduce_mean(weights * unweighted_loss)
            if self.total_steps % hp.log_per_steps == 0:
                tf.summary.scalar('Loss', loss, self.total_steps)

        # new priorities for the sampled transitions: (|TD error| + eps)**alpha
        priority = (tf.math.abs(q_samp - q_sa) + hp.Buf.epsilon)**hp.Buf.alpha
        trainable_vars = self.model.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.model.optimizer.apply_gradients(zip(gradients, trainable_vars))
        return priority
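
This example combines Double DQN action selection with prioritized experience replay: `weights` are the importance-sampling corrections for the sampled batch, and the returned `priority` values are meant to be written back into the replay buffer. A minimal sketch of the surrounding update loop, assuming a hypothetical buffer with `sample` and `update_priorities` methods (none of these names are from the original project):

    # hypothetical outer loop; the buffer API here is an assumption
    for step in range(num_train_steps):
        # sample() is assumed to return a batch plus the sampled indices
        # and the importance-sampling weights for those indices
        o, r, d, a, sp_batch, idxs, weights = buffer.sample(batch_size)
        priority = agent.train_step(o, r, d, a, sp_batch, weights)
        # feed the new TD-error-based priorities back into the buffer
        buffer.update_priorities(idxs, priority.numpy())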
Code Example #2
    def train_step(self, o, r, d, a, sp_batch):
        # next-state Q values from the target network
        target_q = self.t_model(sp_batch, training=False)
        # TD target: r + gamma * max_a Q_target(s', a), zeroed on terminals
        q_samp = r + tf.cast(tf.math.logical_not(d), tf.float32) * \
                     hp.Q_discount * \
                     tf.math.reduce_max(target_q, axis=1)
        # one-hot mask to pick out Q(s, a) for the actions actually taken
        mask = tf.one_hot(a, self.action_n, dtype=tf.float32)
        with tf.GradientTape() as tape:
            q = self.model(o, training=True)
            q_sa = tf.math.reduce_sum(q * mask, axis=1)
            loss = keras.losses.MSE(q_samp, q_sa)
            # scale the loss so float16 gradients do not underflow
            # (mixed-precision training with a loss-scale optimizer)
            scaled_loss = self.optimizer.get_scaled_loss(loss)

        trainable_vars = self.model.trainable_variables
        scaled_gradients = tape.gradient(scaled_loss, trainable_vars)
        # undo the loss scaling before applying the update
        gradients = self.optimizer.get_unscaled_gradients(scaled_gradients)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
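
The `get_scaled_loss` and `get_unscaled_gradients` calls above are the `tf.keras.mixed_precision.LossScaleOptimizer` API, which suggests this model trains in mixed precision. A minimal sketch of the setup that implies (the inner optimizer and learning rate are placeholders, not from the original code):

    import tensorflow as tf
    from tensorflow import keras

    # compute in float16 while keeping variables in float32
    keras.mixed_precision.set_global_policy('mixed_float16')
    # wrap the inner optimizer so the loss can be scaled up before
    # backprop and the gradients unscaled before the update
    optimizer = keras.mixed_precision.LossScaleOptimizer(
        keras.optimizers.Adam(learning_rate=1e-4))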
Code Example #3
    def train_step(self, data):
        # each batch unpacks into observations, rewards, done flags,
        # actions, and precomputed target-network Q values
        o, r, d, a, target_q = data
        num_actions = target_q.shape[-1]
        # TD target: r + gamma * max_a Q_target(s', a), zeroed on terminals
        q_samp = r + tf.cast(tf.math.logical_not(d), tf.float32) * \
                     hp.Q_discount * \
                     tf.math.reduce_max(target_q, axis=1)
        mask = tf.one_hot(a, num_actions, dtype=tf.float32)

        with tf.GradientTape() as tape:
            q = self(o, training=True)
            q_sa = tf.math.reduce_sum(q * mask, axis=1)
            loss = keras.losses.MSE(q_samp, q_sa)

        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # metrics take (y_true, y_pred): the TD target plays the role of truth
        self.compiled_metrics.update_state(q_samp, q_sa)
        return {m.name: m.result() for m in self.metrics}
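
Unlike the other examples, this one overrides `keras.Model.train_step`, so the custom DQN update runs inside the standard `fit` loop. A minimal sketch of how such a model could be wired up, with an illustrative network body and dataset (none of these names come from the original code):

    import tensorflow as tf
    from tensorflow import keras

    class DQNModel(keras.Model):
        def __init__(self, num_actions):
            super().__init__()
            # illustrative Q-network body; the real architecture is unknown
            self.body = keras.Sequential([
                keras.layers.Dense(64, activation='relu'),
                keras.layers.Dense(num_actions)])

        def call(self, o, training=False):
            return self.body(o, training=training)

        # the train_step above would be defined here

    model = DQNModel(num_actions=4)
    model.compile(optimizer=keras.optimizers.Adam(1e-4), metrics=['mae'])
    # dataset: a tf.data.Dataset yielding (o, r, d, a, target_q) batches
    model.fit(dataset, epochs=1)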
Code Example #4
File: Agent.py Project: jaentrouble/mouse_test_7
    def train_step(self, o, r, d, a, sp_batch, total_step, weights):
        # next-state Q values from the target network
        target_q = self.t_model(sp_batch, training=False)
        # TD target: r + gamma * max_a Q_target(s', a), zeroed on terminals
        q_samp = r + tf.cast(tf.math.logical_not(d), tf.float32) * \
                     hp.Q_discount * \
                     tf.math.reduce_max(target_q, axis=1)
        mask = tf.one_hot(a, self.action_n, dtype=tf.float32)
        with tf.GradientTape() as tape:
            q = self.model(o, training=True)
            q_sa = tf.math.reduce_sum(q * mask, axis=1)
            # loss = keras.losses.MSE(q_samp, q_sa)
            # squared TD error weighted by the importance-sampling weights
            # from the prioritized replay buffer
            unweighted_loss = tf.math.square(q_samp - q_sa)
            loss = tf.math.reduce_mean(weights * unweighted_loss)
            tf.summary.scalar('Loss', loss, total_step)
            # scale the loss for mixed-precision training
            scaled_loss = self.model.optimizer.get_scaled_loss(loss)

        # new priorities for the sampled transitions: (|TD error| + eps)**alpha
        priority = (tf.math.abs(q_samp - q_sa) + hp.Buf.epsilon)**hp.Buf.alpha
        trainable_vars = self.model.trainable_variables
        scaled_gradients = tape.gradient(scaled_loss, trainable_vars)
        # undo the loss scaling before applying the update
        gradients = self.model.optimizer.get_unscaled_gradients(
            scaled_gradients)
        self.model.optimizer.apply_gradients(zip(gradients, trainable_vars))
        return priority
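
Both this example and Example #1 log the loss with `tf.summary.scalar`, which only records events while a default summary writer is active. A minimal sketch of that setup (the log directory and `agent` object are placeholders):

    import tensorflow as tf

    writer = tf.summary.create_file_writer('logs/dqn')
    with writer.as_default():
        # tf.summary.scalar calls inside train_step now write events
        priority = agent.train_step(o, r, d, a, sp_batch, total_step, weights)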