Example #1
    def build_NNs(self):
        with tf.variable_scope("Q"):
            self.Q = self.Q_network(self.state_ph, "DQN")
        with tf.variable_scope("QT"):
            self.QT = self.Q_network(self.stateT_ph, "DQNT")
            cops.build_scalar_summary(
                tf.reduce_max(self.QT, 1)[0], "DQNT", "main/next_Q_0")
            cops.build_scalar_summary(
                tf.reduce_max(self.QT), "DQNT", "main/next_Q_max")
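
A standard companion to this setup is an op that copies the online weights under the "Q" scope into the target scope "QT" every few thousand steps. A minimal sketch of such a sync op follows; the method name build_sync_op and the pairing of variables by sorted name are illustrative assumptions, not code from this project:

    def build_sync_op(self):
        # Hypothetical sketch: pair each variable under "Q/" with its counterpart
        # under "QT/" (the trailing slash keeps "Q" from also matching "QT").
        online_vars = sorted(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="Q/"),
            key=lambda v: v.name)
        target_vars = sorted(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="QT/"),
            key=lambda v: v.name)
        self.sync_target_op = tf.group(
            *[tf.assign(t, o) for o, t in zip(online_vars, target_vars)])
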
Example #2
    def build_NNs(self):
        with tf.variable_scope("Q") as scope:
            self.Q = self.Q_network(self.state_ph, "DQN")
            scope.reuse_variables()
            # reuse the online weights to build the network that selects actions for the Double DQN target
            self.DoubleQT = self.Q_network(self.stateT_ph, "DDQNT")

        with tf.variable_scope("QT"):
            self.QT = self.Q_network(self.stateT_ph, "DQNT")
            cops.build_scalar_summary(
                tf.reduce_max(self.QT, 1)[0], "DQNT", "main/next_Q_0")
            cops.build_scalar_summary(tf.reduce_max(self.QT), "DQNT",
                                      "main/next_Q_max")
Example #3
    def Q_network(self, input_state, Collection):
        conv_stack_shape = [(32, 8, 4),
                            (64, 4, 2),
                            (64, 3, 1)]
        head = tf.div(input_state, 256., name="normalized_input")
        cops.build_activation_summary(head, Collection)
        head = cops.conv_stack(head, conv_stack_shape, self.config, Collection)
        head = cops.flatten(head)
        head = cops.add_relu_layer(head, size=512, Collection=Collection)
        Q = cops.add_linear_layer(head, self.config.action_num, Collection, layer_name="Q")
        # DQN summary
        for i in range(self.config.action_num):
            cops.build_scalar_summary(Q[0, i], Collection, "Q/Q_0_"+str(i))
        return Q
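
At play time the online Q values are usually turned into a greedy action op. A minimal sketch, assuming self.Q is the [batch, action_num] output of Q_network above; the op and attribute names are illustrative choices:

    def build_action_op(self):
        # Hypothetical sketch: greedy action from the online network's Q values.
        # usage: action = session.run(self.greedy_action, {self.state_ph: [state]})[0]
        self.greedy_action = tf.argmax(self.Q, axis=1, name="greedy_action")

An epsilon-greedy policy would then mix this op's output with random actions on the Python side.
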
Example #4
    def Q_network(self, input_state, Collection=None):
        conv_stack_shape = [(32, 8, 4),
                            (64, 4, 2),
                            (64, 3, 1)]
        head = tf.div(input_state, 256., name="normalized_input")
        cops.build_activation_summary(head, Collection)
        head = cops.conv_stack(head, conv_stack_shape, self.config, Collection)
        head = cops.flatten(head)
        V_head = cops.add_relu_layer(head, size=512, Collection=Collection)
        V = cops.add_linear_layer(V_head, 1, Collection, layer_name="V")
        A_head = cops.add_relu_layer(head, size=512, Collection=Collection)
        A = cops.add_linear_layer(A_head, self.config.action_num, Collection, layer_name="A")
        # dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
        Q = tf.add(A, V - tf.expand_dims(tf.reduce_mean(A, axis=1), axis=1))

        cops.build_scalar_summary(V[0], Collection, "Q/V_0")
        for i in range(self.config.action_num):
            cops.build_scalar_summary(Q[0, i], Collection, "Q/Q_0_"+str(i))
            cops.build_scalar_summary(A[0, i], Collection, "Q/A_0_"+str(i))
        return Q
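
The cops module is this project's helper library for layers and summaries. For readers without it, hypothetical stand-ins for add_relu_layer and add_linear_layer built on tf.layers.dense might look like the sketch below; the real helpers also register activation and weight summaries under the given Collection, which this sketch omits:

import tensorflow as tf

def add_relu_layer(head, size, Collection=None, layer_name=None):
    # Fully connected layer with ReLU activation; with layer_name=None
    # TensorFlow picks a unique scope name automatically.
    return tf.layers.dense(head, size, activation=tf.nn.relu, name=layer_name)

def add_linear_layer(head, size, Collection=None, layer_name=None):
    # Fully connected layer with no activation (used for the V, A and Q heads).
    return tf.layers.dense(head, size, activation=None, name=layer_name)
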
Example #5
    def train_op(self, Collection):
        with tf.name_scope("loss"):
            # could be done more efficiently with gather_nd or transpose + gather
            action_one_hot = tf.one_hot(
                self.action_ph, self.config.action_num, 1., 0., name='action_one_hot')
            acted_Q = tf.reduce_sum(
                self.Q * action_one_hot, axis=1, name='DQN_acted')

            Y = self.Q_target()
            Y = tf.stop_gradient(Y)

            loss_batch = cops.clipped_l2(Y, acted_Q)
            loss = tf.reduce_sum(loss_batch, name="loss")

            cops.build_scalar_summary(loss, Collection, "losses/loss")
            cops.build_scalar_summary(loss_batch[0], Collection, "losses/loss_0")
            cops.build_scalar_summary(tf.reduce_max(loss_batch), Collection, "losses/loss_max")
            cops.build_scalar_summary(Y[0], Collection, "main/Y_0")
            cops.build_scalar_summary(tf.reduce_max(Y), Collection, "main/Y_max")
            cops.build_scalar_summary(acted_Q[0], Collection, "main/acted_Q_0")
            cops.build_scalar_summary(tf.reduce_max(acted_Q), Collection, "main/acted_Q_max")
            cops.build_scalar_summary(tf.reduce_max(self.reward_ph), Collection, "main/reward_max")

        train_op, grads = cops.graves_rmsprop_optimizer(
            loss, self.config.learning_rate, 0.95, 0.01, 1)

        for grad, var in grads:
            if grad is not None:
                cops.build_hist_summary(grad, Collection, var.op.name + '/gradients')
        return train_op
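
The comment inside the loss scope notes that the one-hot masking could be replaced by gather_nd. A minimal sketch of that variant, reusing self.Q and self.action_ph from this example and assuming action_ph holds integer action indices:

        with tf.name_scope("loss"):
            # Pick Q(s, a) for the action taken in each batch row with a single gather.
            actions = tf.cast(self.action_ph, tf.int32)
            indices = tf.stack([tf.range(tf.shape(actions)[0]), actions], axis=1)
            acted_Q = tf.gather_nd(self.Q, indices, name="DQN_acted")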