def __init__(self, s_size, a_size, scope, trainer):
    with tf.variable_scope(scope):
        # Input and visual encoding layers
        self.inputs = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        self.imageIn = tf.reshape(self.inputs, shape=[-1, 64, 64, 1])
        self.conv1 = slim.conv2d(activation_fn=tf.nn.elu,
                                 inputs=self.imageIn,
                                 num_outputs=16,
                                 kernel_size=[8, 8],
                                 stride=[4, 4],
                                 padding='VALID')
        self.conv2 = slim.conv2d(activation_fn=tf.nn.elu,
                                 inputs=self.conv1,
                                 num_outputs=32,
                                 kernel_size=[4, 4],
                                 stride=[2, 2],
                                 padding='VALID')
        hidden = slim.fully_connected(slim.flatten(self.conv2), 256,
                                      activation_fn=tf.nn.elu)

        # Recurrent network for temporal dependencies
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(256, state_is_tuple=True)
        c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
        h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
        self.state_init = [c_init, h_init]
        c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
        h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
        self.state_in = (c_in, h_in)
        rnn_in = tf.expand_dims(hidden, [0])
        step_size = tf.shape(self.imageIn)[:1]
        state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
        lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
            lstm_cell, rnn_in, initial_state=state_in,
            sequence_length=step_size, time_major=False)
        lstm_c, lstm_h = lstm_state
        self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
        rnn_out = tf.reshape(lstm_outputs, [-1, 256])

        # Output layers for policy and value estimations
        self.policy = slim.fully_connected(
            rnn_out, a_size,
            # activation_fn=tf.nn.softmax,
            activation_fn=tf.nn.tanh,  # TODO Changed.
            weights_initializer=normalized_columns_initializer(0.1),
            biases_initializer=None)
        self.value = slim.fully_connected(
            rnn_out, 1,
            activation_fn=None,
            weights_initializer=normalized_columns_initializer(1.0),
            biases_initializer=None)

        # Only the worker networks need ops for loss functions and gradient updating.
        if scope != 'global':
            self.actions = tf.placeholder(shape=[None], dtype=tf.float32)
            # self.actions_onehot = tf.one_hot(self.actions, a_size, dtype=tf.float32)
            # self.actions_steering = tf.placeholder(shape=[None], dtype=tf.float32)  # TODO Changed.
            self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
            self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)
            # steering = tf.Tensor.eval(self.actions)
            # self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])
            # self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_steering, [1])  # TODO Changed.

            # Loss functions
            self.value_loss = 0.5 * tf.reduce_sum(
                tf.square(self.target_v - tf.reshape(self.value, [-1])))
            self.entropy = -tf.reduce_sum(self.policy * tf.log(self.policy))
            # self.policy_loss = -tf.reduce_sum(tf.log(self.responsible_outputs) * self.advantages)
            self.policy_loss = -tf.reduce_sum(
                tf.log(self.actions) * self.advantages)
            self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

            # Get gradients from the local network using local losses
            local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
            self.gradients = tf.gradients(self.loss, local_vars)
            self.var_norms = tf.global_norm(local_vars)
            grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, 40.0)

            # Apply local gradients to the global network
            global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
            self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))
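# The layers above rely on a normalized_columns_initializer helper that is not
# defined in this listing (the code also assumes module-level imports of numpy
# as np, tensorflow as tf, and tensorflow.contrib.slim as slim). A minimal
# sketch of that helper, as it commonly appears in A3C implementations, is
# shown below; the exact definition used in this code base may differ.
def normalized_columns_initializer(std=1.0):
    def _initializer(shape, dtype=None, partition_info=None):
        # Sample a weight matrix and rescale each column to the requested norm.
        out = np.random.randn(*shape).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer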
def __init__(self, s_size, s_size_central, number_of_agents, a_size,
             comm_size_input, comm_size_output, scope, trainer,
             critic_action=False, critic_comm=False):
    with tf.variable_scope(scope):
        print("Scope", scope)

        if critic_action and critic_comm:
            central_input_size = [s_size_central[0] +
                                  (number_of_agents - 1) * a_size +
                                  comm_size_input]
        elif critic_comm:
            central_input_size = [s_size_central[0] + comm_size_input]
        elif critic_action:
            central_input_size = [s_size_central[0] +
                                  (number_of_agents - 1) * a_size]
        else:
            central_input_size = s_size_central

        self.inputs = tf.placeholder(shape=[None, ] + s_size, dtype=tf.float32)
        self.inputs_central = tf.placeholder(shape=[None, ] + central_input_size,
                                             dtype=tf.float32)
        self.inputs_comm = tf.placeholder(shape=[None, comm_size_input],
                                          dtype=tf.float32)

        flattened_inputs = tf.contrib.layers.flatten(self.inputs)
        self.flattened_inputs_with_comm = tf.concat(
            [flattened_inputs, self.inputs_comm], 1)

        hidden_comm = slim.fully_connected(
            flattened_inputs, 40,
            weights_initializer=tf.contrib.layers.xavier_initializer(),
            biases_initializer=tf.contrib.layers.xavier_initializer(),
            activation_fn=tf.nn.relu)
        # hidden2_comm = slim.fully_connected(hidden_comm, 20,
        #     weights_initializer=tf.contrib.layers.xavier_initializer(),
        #     biases_initializer=tf.contrib.layers.xavier_initializer(),
        #     activation_fn=tf.nn.relu)

        hidden = slim.fully_connected(
            self.flattened_inputs_with_comm, 80,
            weights_initializer=tf.contrib.layers.xavier_initializer(),
            biases_initializer=tf.contrib.layers.xavier_initializer(),
            activation_fn=tf.nn.relu)
        hidden2 = slim.fully_connected(
            hidden, 40,
            weights_initializer=tf.contrib.layers.xavier_initializer(),
            biases_initializer=tf.contrib.layers.xavier_initializer(),
            activation_fn=tf.nn.relu)

        hidden_central = slim.fully_connected(
            self.inputs_central, 80,
            weights_initializer=tf.contrib.layers.xavier_initializer(),
            biases_initializer=tf.contrib.layers.xavier_initializer(),
            activation_fn=tf.nn.relu)
        hidden2_central = slim.fully_connected(
            hidden_central, 40,
            weights_initializer=tf.contrib.layers.xavier_initializer(),
            biases_initializer=tf.contrib.layers.xavier_initializer(),
            activation_fn=tf.nn.relu)

        self.value = slim.fully_connected(
            hidden2_central, 1,
            activation_fn=None,
            weights_initializer=normalized_columns_initializer(1.0),
            biases_initializer=normalized_columns_initializer(1.0))
        self.policy = slim.fully_connected(
            hidden2, a_size,
            activation_fn=tf.nn.softmax,
            weights_initializer=normalized_columns_initializer(0.01),
            biases_initializer=normalized_columns_initializer(0.01))

        if comm_size_output != 0:
            self.message = slim.fully_connected(
                hidden_comm, comm_size_output,
                activation_fn=tf.nn.tanh,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                biases_initializer=tf.contrib.layers.xavier_initializer())
        else:
            self.message = slim.fully_connected(hidden_comm, comm_size_output)

        # Only the worker networks need ops for loss functions and gradient updating.
        if scope != 'global':
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32)  # Index of actions taken
            self.actions_onehot = tf.one_hot(self.actions, a_size,
                                             dtype=tf.float32)  # 1-hot tensor of actions taken
            self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)  # Target value
            self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)  # Temporal difference (R - V)

            self.log_policy = tf.log(
                tf.clip_by_value(self.policy, 1e-20, 1.0))  # Clip to avoid NaN when a policy entry reaches zero
            self.responsible_outputs = tf.reduce_sum(
                self.log_policy * self.actions_onehot, [1])  # Log-probability of the actions actually taken
            self.r_minus_v = self.target_v - tf.reshape(
                self.value, [-1])  # Difference between target value and predicted value

            # Loss functions
            self.value_loss = 0.5 * tf.reduce_sum(
                tf.square(self.r_minus_v))  # Same as tf.nn.l2_loss(r_minus_v)
            self.entropy = -tf.reduce_sum(self.policy * self.log_policy)  # Policy entropy
            self.policy_loss = -tf.reduce_sum(
                self.responsible_outputs * self.advantages)  # Policy loss

            # Loss of the communication message
            self.target_message = tf.placeholder('float32', [None, comm_size_output],
                                                 name='target_message')
            self.loss_m = tf.reduce_mean(tf.square(self.target_message - self.message),
                                         name='loss_m')

            # The critic's learning rate is half the actor's, hence 0.5 * value_loss + policy_loss
            self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

            # Get gradients from the local network using local losses
            self.local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
            self.gradients = tf.gradients(self.loss, self.local_vars)
            self.var_norms = tf.global_norm(self.local_vars)
            grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, 40.0)

            # Gradients of the policy loss w.r.t. the incoming message
            self.gradients_q_message = tf.gradients(self.policy_loss, self.inputs_comm)
            # Gradients of the message loss w.r.t. the local weights
            self.gradients_m_weights = tf.gradients(self.loss_m, self.local_vars)
            grads_m, self.grad_norms_m = tf.clip_by_global_norm(self.gradients_m_weights, 40.0)

            # Apply local gradients to the global network
            self.global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
            self.apply_grads = trainer.apply_gradients(zip(grads, self.global_vars))
            self.apply_grads_m = trainer.apply_gradients(zip(grads_m, self.global_vars))
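# A minimal training-step sketch for the communicating network above; this is
# not part of the original code. It assumes a tf.Session `sess`, a worker
# instance `net` of this class, and already-collected batch arrays; all of
# those names are hypothetical.
feed_dict = {net.inputs: batch_obs,
             net.inputs_central: batch_obs_central,
             net.inputs_comm: batch_msgs_in,
             net.actions: batch_actions,
             net.target_v: batch_returns,
             net.advantages: batch_advantages}
# Update the shared policy/value weights from this worker's A3C loss.
value_loss, policy_loss, _ = sess.run(
    [net.value_loss, net.policy_loss, net.apply_grads], feed_dict=feed_dict)
# Gradient of the policy loss w.r.t. the received message, which can be used
# to construct target messages for the sending agents.
message_grads = sess.run(net.gradients_q_message, feed_dict=feed_dict)
# Separately train the message head towards externally supplied targets.
sess.run(net.apply_grads_m,
         feed_dict={net.inputs: batch_obs,
                    net.target_message: batch_target_msgs})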
def __init__(self, s_size, a_size, scope, trainer, use_conv_layers=False, use_lstm=False):
    with tf.variable_scope(scope):
        print("Scope", scope)

        # Input and visual encoding layers
        if use_conv_layers:
            self.inputs = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
            self.imageIn = tf.reshape(self.inputs, shape=[-1, 1280, 800, 1])
            self.conv = slim.conv2d(
                activation_fn=tf.nn.elu,
                weights_initializer=tf.contrib.layers.xavier_initializer(),  # normalized_columns_initializer(0.01),
                inputs=self.imageIn, num_outputs=8,
                kernel_size=[3, 3], stride=[1, 1], padding='VALID')
            self.conv2 = slim.conv2d(
                activation_fn=tf.nn.elu,
                weights_initializer=tf.contrib.layers.xavier_initializer(),  # normalized_columns_initializer(0.01),
                inputs=self.imageIn, num_outputs=4,
                kernel_size=[1, 1], stride=[1, 1], padding='VALID')
            hidden = slim.fully_connected(
                slim.flatten(self.conv2), 150,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                activation_fn=tf.nn.elu)
            hidden2 = slim.fully_connected(
                hidden, 150,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                activation_fn=tf.nn.elu)
        else:
            self.inputs = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
            # hidden = slim.fully_connected(self.inputs, 150,
            #     weights_initializer=tf.contrib.layers.xavier_initializer(),
            #     activation_fn=tf.nn.elu)
            # hidden2 = slim.fully_connected(hidden, 150,
            #     weights_initializer=tf.contrib.layers.xavier_initializer(),
            #     activation_fn=tf.nn.elu)
            hidden2 = slim.fully_connected(
                self.inputs, 150,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                activation_fn=tf.nn.relu)

        if use_lstm:
            # Recurrent network for temporal dependencies
            lstm_cell = tf.contrib.rnn.BasicLSTMCell(256, state_is_tuple=True)
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
            self.state_in = (c_in, h_in)
            rnn_in = tf.expand_dims(hidden2, [0])  # converts the hidden layer [256] to [1, 256]
            if use_conv_layers:
                step_size = tf.shape(self.imageIn)[:1]
            else:
                step_size = tf.shape(self.inputs)[:1]
            state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm_cell, rnn_in, initial_state=state_in,
                sequence_length=step_size, time_major=False)
            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(lstm_outputs, [-1, 256])

            # Output layers for policy and value estimations
            self.policy = slim.fully_connected(
                rnn_out, a_size,
                activation_fn=tf.nn.softmax,
                weights_initializer=normalized_columns_initializer(0.01),
                biases_initializer=None)
            self.value = slim.fully_connected(
                rnn_out, 1,
                activation_fn=None,
                weights_initializer=normalized_columns_initializer(1.0),
                biases_initializer=None)
        else:
            self.state_init = None
            self.policy = slim.fully_connected(
                hidden2, a_size,
                activation_fn=tf.nn.softmax,
                weights_initializer=normalized_columns_initializer(0.01),
                biases_initializer=None)
            self.value = slim.fully_connected(
                hidden2, 1,
                activation_fn=None,
                weights_initializer=normalized_columns_initializer(1.0),
                biases_initializer=None)

        # Only the worker networks need ops for loss functions and gradient updating.
        if scope != 'global_square' and scope != 'global_circle':
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32)  # Index of actions taken
            self.actions_onehot = tf.one_hot(self.actions, a_size,
                                             dtype=tf.float32)  # 1-hot tensor of actions taken
            self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)  # Target value
            self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)  # Temporal difference (R - V)

            self.log_policy = tf.log(
                tf.clip_by_value(self.policy, 1e-20, 1.0))  # Clip to avoid NaN when a policy entry reaches zero
            self.responsible_outputs = tf.reduce_sum(
                self.log_policy * self.actions_onehot, [1])  # Log-probability of the actions actually taken
            self.r_minus_v = self.target_v - tf.reshape(
                self.value, [-1])  # Difference between target value and predicted value

            # Loss functions
            self.value_loss = 0.5 * tf.reduce_sum(
                tf.square(self.r_minus_v))  # Same as tf.nn.l2_loss(r_minus_v)
            self.entropy = -tf.reduce_sum(self.policy * self.log_policy)  # Policy entropy
            self.policy_loss = -tf.reduce_sum(
                self.responsible_outputs * self.advantages)  # Policy loss

            # The critic's learning rate is half the actor's, hence 0.5 * value_loss + policy_loss
            self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

            # Get gradients from the local network using local losses
            self.local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
            self.gradients = tf.gradients(self.loss, self.local_vars)
            self.var_norms = tf.global_norm(self.local_vars)
            grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, 40.0)

            # Apply local gradients to the matching global network (square or circle team)
            if "square" in scope:
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global_square')
            elif "circle" in scope:
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global_circle')
            else:
                print("Error on scope build", scope)
                exit()
            self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))
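# A minimal acting-loop sketch for the recurrent variant (use_lstm=True) of the
# network above; this is not part of the original code. It assumes a tf.Session
# `sess`, a worker instance `net` of this class, and an environment object
# `env`; those names, and the env.reset()/env.step() signature, are
# hypothetical. The point is how the LSTM state is threaded from one step to
# the next via state_init, state_in and state_out.
obs = env.reset()                      # hypothetical environment API
rnn_state = net.state_init             # zeroed (c, h) state at episode start
episode_done = False
while not episode_done:
    action_dist, value, rnn_state = sess.run(
        [net.policy, net.value, net.state_out],
        feed_dict={net.inputs: [obs],
                   net.state_in[0]: rnn_state[0],
                   net.state_in[1]: rnn_state[1]})
    # Sample an action from the softmax policy of the single observation.
    action = np.random.choice(len(action_dist[0]), p=action_dist[0])
    obs, reward, episode_done = env.step(action)   # hypothetical signature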