Example #1
    def __init__(self, s_size, a_size, scope, trainer):
        with tf.variable_scope(scope):
            # Input and visual encoding layers
            self.inputs = tf.placeholder(shape=[None, s_size],
                                         dtype=tf.float32)
            self.imageIn = tf.reshape(self.inputs, shape=[-1, 64, 64, 1])
            self.conv1 = slim.conv2d(activation_fn=tf.nn.elu,
                                     inputs=self.imageIn,
                                     num_outputs=16,
                                     kernel_size=[8, 8],
                                     stride=[4, 4],
                                     padding='VALID')
            self.conv2 = slim.conv2d(activation_fn=tf.nn.elu,
                                     inputs=self.conv1,
                                     num_outputs=32,
                                     kernel_size=[4, 4],
                                     stride=[2, 2],
                                     padding='VALID')
            hidden = slim.fully_connected(slim.flatten(self.conv2),
                                          256,
                                          activation_fn=tf.nn.elu)

            # Recurrent network for temporal dependencies
            lstm_cell = tf.contrib.rnn.BasicLSTMCell(256, state_is_tuple=True)
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
            self.state_in = (c_in, h_in)
            rnn_in = tf.expand_dims(hidden, [0])
            step_size = tf.shape(self.imageIn)[:1]
            state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm_cell,
                rnn_in,
                initial_state=state_in,
                sequence_length=step_size,
                time_major=False)
            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(lstm_outputs, [-1, 256])

            # Output layers for policy and value estimations
            self.policy = slim.fully_connected(
                rnn_out,
                a_size,
                # activation_fn=tf.nn.softmax,
                activation_fn=tf.nn.tanh,  # TODO: changed from softmax to tanh (continuous output head)
                weights_initializer=normalized_columns_initializer(0.1),
                biases_initializer=None)
            self.value = slim.fully_connected(
                rnn_out,
                1,
                activation_fn=None,
                weights_initializer=normalized_columns_initializer(1.0),
                biases_initializer=None)

            # Only the worker networks need ops for loss functions and gradient updating.
            if scope != 'global':
                self.actions = tf.placeholder(shape=[None], dtype=tf.float32)
                # self.actions_onehot = tf.one_hot(self.actions, a_size, dtype=tf.float32)
                #self.actions_steering = tf.placeholder(shape=[None], dtype=tf.float32)  # TODO Changed.
                self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None],
                                                 dtype=tf.float32)
                #steering=tf.Tensor.eval(self.actions)
                #self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])
                #self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_steering, [1])  # TODO Changed.

                # Loss functions
                self.value_loss = 0.5 * tf.reduce_sum(
                    tf.square(self.target_v - tf.reshape(self.value, [-1])))
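                # NOTE: with the tanh policy head above, self.policy can be zero or
                # negative, so tf.log(self.policy) below may produce NaNs; Examples #2
                # and #3 clip the policy before taking the log.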
                self.entropy = -tf.reduce_sum(
                    self.policy * tf.log(self.policy))
                #self.policy_loss = -tf.reduce_sum(tf.log(self.responsible_outputs) * self.advantages)
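                # NOTE: tf.log(self.actions) depends only on the action placeholder, so
                # this term carries no gradient back to the network weights; the
                # commented-out responsible_outputs form above is the usual A3C loss.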
                self.policy_loss = -tf.reduce_sum(
                    tf.log(self.actions) * self.advantages)
                self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

                # Get gradients from local network using local losses
                local_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads, self.grad_norms = tf.clip_by_global_norm(
                    self.gradients, 40.0)

                # Apply local gradients to global network
                global_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(
                    zip(grads, global_vars))
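
A minimal usage sketch for this constructor follows; net (an instance of the class above built with scope='worker_0'), sess, and obs are assumptions and are not part of the excerpt. The recurrent state is initialised from state_init and threaded through state_in / state_out on every step.

# Hypothetical rollout step; net, sess, and obs (a flat 64*64 frame, matching
# the reshape above) are assumed, not defined in the excerpt.
rnn_state = net.state_init  # [c_init, h_init]
policy_out, value_out, rnn_state = sess.run(
    [net.policy, net.value, net.state_out],
    feed_dict={net.inputs: [obs],
               net.state_in[0]: rnn_state[0],
               net.state_in[1]: rnn_state[1]})
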
Example #2
    def __init__(self,
                 s_size,
                 s_size_central,
                 number_of_agents,
                 a_size,
                 comm_size_input,
                 comm_size_output,
                 scope,
                 trainer,
                 critic_action=False,
                 critic_comm=False):
        with tf.variable_scope(scope):
            print("Scope", scope)
            if critic_action and critic_comm:
                central_input_size = [
                    s_size_central[0] + (number_of_agents - 1) * a_size +
                    comm_size_input
                ]
            elif critic_comm:
                central_input_size = [s_size_central[0] + comm_size_input]
            elif critic_action:
                central_input_size = [
                    s_size_central[0] + (number_of_agents - 1) * a_size
                ]
            else:
                central_input_size = s_size_central

            self.inputs = tf.placeholder(shape=[
                None,
            ] + s_size,
                                         dtype=tf.float32)
            self.inputs_central = tf.placeholder(shape=[
                None,
            ] + central_input_size,
                                                 dtype=tf.float32)
            self.inputs_comm = tf.placeholder(shape=[None, comm_size_input],
                                              dtype=tf.float32)

            flattened_inputs = tf.contrib.layers.flatten(self.inputs)
            self.flattened_inputs_with_comm = tf.concat(
                [flattened_inputs, self.inputs_comm], 1)

            hidden_comm = slim.fully_connected(
                flattened_inputs,
                40,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                biases_initializer=tf.contrib.layers.xavier_initializer(),
                activation_fn=tf.nn.relu)

            #hidden2_comm = slim.fully_connected(hidden_comm, 20,
            #                                   weights_initializer=tf.contrib.layers.xavier_initializer(),
            #                                   biases_initializer=tf.contrib.layers.xavier_initializer(),
            #                                   activation_fn=tf.nn.relu)

            hidden = slim.fully_connected(
                self.flattened_inputs_with_comm,
                80,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                biases_initializer=tf.contrib.layers.xavier_initializer(),
                activation_fn=tf.nn.relu)

            hidden2 = slim.fully_connected(
                hidden,
                40,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                biases_initializer=tf.contrib.layers.xavier_initializer(),
                activation_fn=tf.nn.relu)

            hidden_central = slim.fully_connected(
                self.inputs_central,
                80,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                biases_initializer=tf.contrib.layers.xavier_initializer(),
                activation_fn=tf.nn.relu)

            hidden2_central = slim.fully_connected(
                hidden_central,
                40,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                biases_initializer=tf.contrib.layers.xavier_initializer(),
                activation_fn=tf.nn.relu)

            self.value = slim.fully_connected(
                hidden2_central,
                1,
                activation_fn=None,
                weights_initializer=normalized_columns_initializer(1.0),
                biases_initializer=normalized_columns_initializer(1.0))
            self.policy = slim.fully_connected(
                hidden2,
                a_size,
                activation_fn=tf.nn.softmax,
                weights_initializer=normalized_columns_initializer(0.01),
                biases_initializer=normalized_columns_initializer(0.01))
            if comm_size_output != 0:
                self.message = slim.fully_connected(
                    hidden_comm,
                    comm_size_output,
                    activation_fn=tf.nn.tanh,
                    weights_initializer=tf.contrib.layers.xavier_initializer(),
                    biases_initializer=tf.contrib.layers.xavier_initializer())
            else:
                self.message = slim.fully_connected(hidden_comm,
                                                    comm_size_output)

            # Only the worker networks need ops for loss functions and gradient updating.
            if scope != 'global':
                self.actions = tf.placeholder(
                    shape=[None], dtype=tf.int32)  # Index of actions taken
                self.actions_onehot = tf.one_hot(
                    self.actions, a_size,
                    dtype=tf.float32)  # 1-hot tensor of actions taken
                self.target_v = tf.placeholder(
                    shape=[None], dtype=tf.float32)  # Target Value
                self.advantages = tf.placeholder(
                    shape=[None],
                    dtype=tf.float32)  # temporal difference (R - V)

                self.log_policy = tf.log(
                    tf.clip_by_value(self.policy, 1e-20, 1.0)
                )  # clip to avoid NaN when a policy entry reaches zero
                self.responsible_outputs = tf.reduce_sum(
                    self.log_policy * self.actions_onehot,
                    [1])  # log-probability of the actions actually taken
                self.r_minus_v = self.target_v - tf.reshape(
                    self.value,
                    [-1])  # difference between target value and actual value

                # Loss functions
                self.value_loss = 0.5 * tf.reduce_sum(tf.square(
                    self.r_minus_v))  # same as tf.nn.l2_loss(r_minus_v)
                self.entropy = -tf.reduce_sum(
                    self.policy * self.log_policy)  # policy entropy
                self.policy_loss = -tf.reduce_sum(
                    self.responsible_outputs * self.advantages)  # policy loss

                # loss of message
                self.target_message = tf.placeholder('float32',
                                                     [None, comm_size_output],
                                                     name='target_message')
                self.loss_m = tf.reduce_mean(tf.square(self.target_message -
                                                       self.message),
                                             name='loss_m')

                # The critic's effective learning rate is half the actor's, hence 0.5 * value_loss + policy_loss
                self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

                # Get gradients from local network using local losses
                self.local_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, self.local_vars)
                self.var_norms = tf.global_norm(self.local_vars)
                grads, self.grad_norms = tf.clip_by_global_norm(
                    self.gradients, 40.0)

                # gradients of "loss" wrt message
                self.gradients_q_message = tf.gradients(
                    self.policy_loss, self.inputs_comm)
                # gradients of "target message" wrt weights
                self.gradients_m_weights = tf.gradients(
                    self.loss_m, self.local_vars)
                grads_m, self.grad_norms_m = tf.clip_by_global_norm(
                    self.gradients_m_weights, 40.0)

                # Apply local gradients to global network
                self.global_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(
                    zip(grads, self.global_vars))
                self.apply_grads_m = trainer.apply_gradients(
                    zip(grads_m, self.global_vars))
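
A rough forward-pass sketch for this communicating variant; net, sess, obs, central_obs, and incoming_msg are assumptions, not shown in the excerpt. Policy, value, and the outgoing message are read in a single run, while apply_grads / apply_grads_m later push the clipped gradients onto the 'global' scope.

import numpy as np

# Hypothetical single-agent step; input shapes follow the placeholders above.
a_dist, v, out_msg = sess.run(
    [net.policy, net.value, net.message],
    feed_dict={net.inputs: [obs],
               net.inputs_central: [central_obs],
               net.inputs_comm: [incoming_msg]})
action = np.random.choice(len(a_dist[0]), p=a_dist[0])
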
Example #3
    def __init__(self,
                 s_size,
                 a_size,
                 scope,
                 trainer,
                 use_conv_layers=False,
                 use_lstm=False):
        with tf.variable_scope(scope):
            print("Scope", scope)

            # Input and visual encoding layers
            if use_conv_layers:
                self.inputs = tf.placeholder(shape=[None, s_size],
                                             dtype=tf.float32)
                self.imageIn = tf.reshape(self.inputs,
                                          shape=[-1, 1280, 800, 1])

                self.conv = slim.conv2d(
                    activation_fn=tf.nn.elu,
                    weights_initializer=tf.contrib.layers.xavier_initializer(),
                    # normalized_columns_initializer(0.01),
                    inputs=self.imageIn,
                    num_outputs=8,
                    kernel_size=[3, 3],
                    stride=[1, 1],
                    padding='VALID')
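                # NOTE: self.conv2 below reads from self.imageIn rather than
                # self.conv, so the first conv layer is never used downstream.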
                self.conv2 = slim.conv2d(
                    activation_fn=tf.nn.elu,
                    weights_initializer=tf.contrib.layers.xavier_initializer(),
                    # normalized_columns_initializer(0.01),
                    inputs=self.imageIn,
                    num_outputs=4,
                    kernel_size=[1, 1],
                    stride=[1, 1],
                    padding='VALID')
                hidden = slim.fully_connected(
                    slim.flatten(self.conv2),
                    150,
                    weights_initializer=tf.contrib.layers.xavier_initializer(),
                    activation_fn=tf.nn.elu)
                hidden2 = slim.fully_connected(
                    hidden,
                    150,
                    weights_initializer=tf.contrib.layers.xavier_initializer(),
                    activation_fn=tf.nn.elu)

            else:
                self.inputs = tf.placeholder(shape=[None, s_size],
                                             dtype=tf.float32)
                # hidden = slim.fully_connected(self.inputs, 150,
                #                              weights_initializer=tf.contrib.layers.xavier_initializer(),
                #                              activation_fn=tf.nn.elu)
                # hidden2 = slim.fully_connected(hidden, 150, weights_initializer=tf.contrib.layers.xavier_initializer(),
                #                               activation_fn=tf.nn.elu)

                hidden2 = slim.fully_connected(
                    self.inputs,
                    150,
                    weights_initializer=tf.contrib.layers.xavier_initializer(),
                    activation_fn=tf.nn.relu)

            if use_lstm:
                # Recurrent network for temporal dependencies
                lstm_cell = tf.contrib.rnn.BasicLSTMCell(256,
                                                         state_is_tuple=True)
                c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
                h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
                self.state_init = [c_init, h_init]
                c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
                h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
                self.state_in = (c_in, h_in)
                rnn_in = tf.expand_dims(
                    hidden2, [0])  # [steps, 150] -> [1, steps, 150] for dynamic_rnn
                if use_conv_layers:
                    step_size = tf.shape(self.imageIn)[:1]
                else:
                    step_size = tf.shape(self.inputs)[:1]
                state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
                lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                    lstm_cell,
                    rnn_in,
                    initial_state=state_in,
                    sequence_length=step_size,
                    time_major=False)
                lstm_c, lstm_h = lstm_state
                self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
                rnn_out = tf.reshape(lstm_outputs, [-1, 256])

                # Output layers for policy and value estimations
                self.policy = slim.fully_connected(
                    rnn_out,
                    a_size,
                    activation_fn=tf.nn.softmax,
                    weights_initializer=normalized_columns_initializer(0.01),
                    biases_initializer=None)
                self.value = slim.fully_connected(
                    rnn_out,
                    1,
                    activation_fn=None,
                    weights_initializer=normalized_columns_initializer(1.0),
                    biases_initializer=None)
            else:
                self.state_init = None

                self.policy = slim.fully_connected(
                    hidden2,
                    a_size,
                    activation_fn=tf.nn.softmax,
                    weights_initializer=normalized_columns_initializer(0.01),
                    biases_initializer=None)
                self.value = slim.fully_connected(
                    hidden2,
                    1,
                    activation_fn=None,
                    weights_initializer=normalized_columns_initializer(1.0),
                    biases_initializer=None)

            # Only the worker networks need ops for loss functions and gradient updating.
            if scope != 'global_square' and scope != 'global_circle':
                self.actions = tf.placeholder(
                    shape=[None], dtype=tf.int32)  # Index of actions taken
                self.actions_onehot = tf.one_hot(
                    self.actions, a_size,
                    dtype=tf.float32)  # 1-hot tensor of actions taken
                self.target_v = tf.placeholder(
                    shape=[None], dtype=tf.float32)  # Target Value
                self.advantages = tf.placeholder(
                    shape=[None],
                    dtype=tf.float32)  # temporal difference (R - V)

                self.log_policy = tf.log(
                    tf.clip_by_value(self.policy, 1e-20, 1.0)
                )  # clip to avoid NaN when a policy entry reaches zero
                self.responsible_outputs = tf.reduce_sum(
                    self.log_policy * self.actions_onehot,
                    [1])  # log-probability of the actions actually taken
                self.r_minus_v = self.target_v - tf.reshape(
                    self.value,
                    [-1])  # difference between target value and actual value

                # Loss functions
                self.value_loss = 0.5 * tf.reduce_sum(tf.square(
                    self.r_minus_v))  # same as tf.nn.l2_loss(r_minus_v)
                self.entropy = -tf.reduce_sum(
                    self.policy * self.log_policy)  # policy entropy
                self.policy_loss = -tf.reduce_sum(
                    self.responsible_outputs * self.advantages)  # policy loss

                # The critic's effective learning rate is half the actor's, hence 0.5 * value_loss + policy_loss
                self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

                # Get gradients from local network using local losses
                self.local_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, self.local_vars)
                self.var_norms = tf.global_norm(self.local_vars)
                grads, self.grad_norms = tf.clip_by_global_norm(
                    self.gradients, 40.0)

                # Apply local gradients to global network
                if "square" in scope:
                    global_vars = tf.get_collection(
                        tf.GraphKeys.TRAINABLE_VARIABLES, 'global_square')
                elif "circle" in scope:
                    global_vars = tf.get_collection(
                        tf.GraphKeys.TRAINABLE_VARIABLES, 'global_circle')
                else:
                    print("Error on scope build", scope)
                    exit()
                self.apply_grads = trainer.apply_gradients(
                    zip(grads, global_vars))
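
A hedged training-step sketch for this shape-specific variant, assuming use_lstm=False and a worker scope containing "square"; sess and the rollout buffers observations, actions, discounted_returns, and advantages are assumptions. Running apply_grads pushes the clipped local gradients onto the matching 'global_square' variables.

# Hypothetical update for one rollout; with use_lstm=True the state_in
# placeholders would also have to be fed.
feed = {net.inputs: observations,          # [batch, s_size]
        net.actions: actions,              # [batch] int action indices
        net.target_v: discounted_returns,  # [batch]
        net.advantages: advantages}        # [batch]
v_l, p_l, ent, _ = sess.run(
    [net.value_loss, net.policy_loss, net.entropy, net.apply_grads],
    feed_dict=feed)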