def __init__(self, no_neighbors, num_hidden_layers, units_per_layer, lr,
             obs_n_shape, act_shape_n, act_type, wd, agent_index):
    """Build the graph-convolutional critic and compile its Keras model.

    The model has two inputs: a per-agent matrix holding each agent's
    observation and action side by side, and an adjacency matrix.  One
    ``GCNConv`` layer is applied, its output is concatenated with the raw
    node features, the result is flattened and fed through two dense ReLU
    layers and a single linear output unit.

    Args:
        no_neighbors: Stored on the instance and used to derive ``k_lst``.
        num_hidden_layers: Stored as ``num_layers``.  NOTE: the network
            built here always has exactly two dense hidden layers.
        units_per_layer: Width of the GCN layer and of each dense layer.
        lr: Learning rate handed to the AdamW optimizer.
        obs_n_shape: Per-agent observation shapes; the first entry fixes
            the feature count used for every agent.
        act_shape_n: Per-agent action shapes; the first entry fixes the
            action count used for every agent.
        act_type: Stored on the instance; not used while building.
        wd: Weight decay handed to the AdamW optimizer.
        agent_index: Not used by this constructor.
    """
    self.num_layers = num_hidden_layers
    self.lr = lr
    self.obs_shape_n = obs_n_shape
    self.act_shape_n = act_shape_n
    self.act_type = act_type
    self.clip_norm = 0.5
    self.optimizer = AdamW(learning_rate=lr, weight_decay=wd)

    self.no_neighbors = no_neighbors
    self.no_agents = len(self.obs_shape_n)
    self.no_features = self.obs_shape_n[0][0]
    self.no_actions = self.act_shape_n[0][0]
    self.k_lst = list(range(2, self.no_neighbors + 2))

    keras_layers = tf.keras.layers
    node_width = self.no_features + self.no_actions

    # Symbolic inputs: node features and the agent adjacency matrix.
    self.graph_input = keras_layers.Input((self.no_agents, node_width),
                                          name="graph_input")
    self.adj = keras_layers.Input(shape=(self.no_agents, self.no_agents),
                                  name="adj")

    graph_conv = GCNConv(
        units_per_layer,
        kernel_initializer=tf.keras.initializers.he_uniform(),
        activation=keras_layers.LeakyReLU(alpha=0.1),
        use_bias=False)
    self.gcn = graph_conv([self.graph_input, self.adj])

    self.hidden_layers = [
        keras_layers.Dense(units_per_layer, activation='relu')
        for _ in range(2)
    ]
    self.output_layer = keras_layers.Dense(1, activation='linear')

    # Skip connection: keep the raw node features next to the GCN output.
    self.concat = keras_layers.Concatenate(axis=2)(
        [self.graph_input, self.gcn])
    self.flatten = keras_layers.Flatten()(self.concat)

    # Wire the dense head on top of the flattened graph representation.
    x = self.flatten
    for dense in self.hidden_layers:
        x = dense(x)
    x = self.output_layer(x)

    self.model = tf.keras.Model(inputs=[self.graph_input, self.adj],
                                outputs=[x])
    self.model.compile(self.optimizer, loss='mse')
def __init__(self, num_hidden_layers, units_per_layer, lr, obs_n_shape,
             act_shape_n, act_type, agent_index):
    """Build the fully-connected critic and compile its Keras model.

    Every agent contributes one observation input and one action input.
    All inputs are concatenated, passed through ``num_hidden_layers`` dense
    ReLU layers and reduced to a single linear output unit.

    Args:
        num_hidden_layers: Number of dense hidden layers.
        units_per_layer: Width of each hidden layer.
        lr: Learning rate handed to the AdamW optimizer.
        obs_n_shape: Per-agent observation shapes.
        act_shape_n: Per-agent action shapes.
        act_type: Stored on the instance; not used while building.
        agent_index: Embedded in the names of the dense layers.
    """
    self.num_layers = num_hidden_layers
    self.lr = lr
    self.obs_shape_n = obs_n_shape
    self.act_shape_n = act_shape_n
    self.act_type = act_type
    self.clip_norm = 0.5
    self.wd = 1e-5
    self.optimizer = AdamW(learning_rate=lr, weight_decay=self.wd)

    keras_layers = tf.keras.layers

    # One named input per agent observation, then one per agent action.
    self.obs_input_n = []
    for idx, shape in enumerate(self.obs_shape_n):
        obs_in = keras_layers.Input(shape=shape, name='obs_in' + str(idx))
        self.obs_input_n.append(obs_in)

    self.act_input_n = []
    for idx, shape in enumerate(self.act_shape_n):
        act_in = keras_layers.Input(shape=shape, name='act_in' + str(idx))
        self.act_input_n.append(act_in)

    self.input_concat_layer = keras_layers.Concatenate()

    self.hidden_layers = []
    for idx in range(num_hidden_layers):
        hidden_name = 'ag{}crit_hid{}'.format(agent_index, idx)
        self.hidden_layers.append(
            keras_layers.Dense(units_per_layer, activation='relu',
                               name=hidden_name))

    # NOTE: ``idx`` is deliberately read after the loops.  The output layer
    # name ends with whatever value the last executed loop above left in
    # it (the last hidden-layer index when there is at least one hidden
    # layer).  Kept as-is so existing layer names do not change.
    output_name = 'ag{}crit_out{}'.format(agent_index, idx)
    self.output_layer = keras_layers.Dense(1, activation='linear',
                                           name=output_name)

    # Connect the layers: observations first, then actions.
    all_inputs = self.obs_input_n + self.act_input_n
    x = self.input_concat_layer(all_inputs)
    for hidden in self.hidden_layers:
        x = hidden(x)
    x = self.output_layer(x)

    self.model = tf.keras.Model(inputs=all_inputs, outputs=[x])
    self.model.compile(self.optimizer, loss='mse')
def __init__(self, no_neighbors, num_hidden_layers, units_per_layer, lr,
             obs_n_shape, act_shape_n, act_type, agent_index):
    """Build the graph-attention critic and compile its Keras model.

    The model has two inputs: a per-agent matrix holding each agent's
    observation and action side by side, and an adjacency matrix.  One
    two-head ``GATConv`` layer is applied, its output is flattened and fed
    through two dense ReLU layers and a single linear output unit.

    Args:
        no_neighbors: Stored on the instance and used to derive ``k_lst``.
        num_hidden_layers: Stored as ``num_layers``.  NOTE: the network
            built here always has exactly two dense hidden layers.
        units_per_layer: Channels of the GAT layer and width of each dense
            layer.
        lr: Learning rate handed to the AdamW optimizer.
        obs_n_shape: Per-agent observation shapes; the first entry fixes
            the feature count used for every agent.
        act_shape_n: Per-agent action shapes; the first entry fixes the
            action count used for every agent.
        act_type: Stored on the instance; not used while building.
        agent_index: Not used by this constructor.
    """
    self.num_layers = num_hidden_layers
    self.lr = lr
    self.obs_shape_n = obs_n_shape
    self.act_shape_n = act_shape_n
    self.act_type = act_type
    self.clip_norm = 0.5
    self.wd = 1e-5
    self.optimizer = AdamW(learning_rate=lr, weight_decay=self.wd)

    self.no_neighbors = no_neighbors
    self.no_agents = len(self.obs_shape_n)
    self.no_features = self.obs_shape_n[0][0]
    self.no_actions = self.act_shape_n[0][0]
    self.k_lst = list(range(2, self.no_neighbors + 2))

    keras_layers = tf.keras.layers
    node_width = self.no_features + self.no_actions

    # Symbolic inputs: node features and the agent adjacency matrix.
    self.graph_input = keras_layers.Input((self.no_agents, node_width),
                                          name="graph_input")
    self.adj = keras_layers.Input(shape=(self.no_agents, self.no_agents),
                                  name="adj")

    attention = GATConv(
        units_per_layer,
        activation='elu',
        attn_heads=2,
        concat_heads=True,
    )
    self.gat = attention([self.graph_input, self.adj])

    self.hidden_layers = [
        keras_layers.Dense(units_per_layer, activation='relu')
        for _ in range(2)
    ]
    self.output_layer = keras_layers.Dense(1, activation='linear')

    self.flatten = keras_layers.Flatten()(self.gat)

    # Wire the dense head on top of the flattened attention output.
    x = self.flatten
    for dense in self.hidden_layers:
        x = dense(x)
    x = self.output_layer(x)

    self.model = tf.keras.Model(inputs=[self.graph_input, self.adj],
                                outputs=[x])
    self.model.compile(self.optimizer, loss='mse')
def main(arglist):
    """Run the act / store / train loop for the graph-based Q-network.

    Builds the environment, the online and target models (via
    ``__build_conf``), an AdamW optimizer and a replay buffer, then loops
    forever (``while True`` with no ``break``): pick actions, step the
    environment, store the transition and, unless ``arglist.display`` is
    set, train the online model and refresh the target model.

    Attributes read from ``arglist``: ``scenario``, ``no_agents``,
    ``batch_size``, ``no_neighbors``, ``seed``, ``lr``, ``exp_name``,
    ``max_buffer_size``, ``max_episode_len``, ``display``, ``update_rate``,
    ``update_times``, ``gamma``, ``loss_type``, ``save_rate``,
    ``soft_update`` and ``tau``.

    Side effects: rebinds the module globals ``no_actions``,
    ``no_features`` and ``no_agents``; creates ``results/<exp_name>`` and
    appends tab-separated statistics to a CSV file there; saves the model
    into the same directory.

    NOTE(review): the indentation of this function was reconstructed from
    a whitespace-collapsed source.  The nesting of the model-saving call,
    the logging block and the target-update block should be confirmed
    against the original file.
    """
    global no_actions, no_features, no_agents
    env = u.make_env(arglist.scenario, arglist.no_agents)

    obs_shape_n = env.observation_space
    act_shape_n = env.action_space
    act_shape_n = u.space_n_to_shape_n(act_shape_n)
    no_agents = env.n
    batch_size = arglist.batch_size
    no_neighbors = arglist.no_neighbors
    # Starts at 2 and has no_neighbors entries, e.g. [2, 3] for no_neighbors == 2.
    k_lst = list(range(no_neighbors + 2))[2:]  # [2,3]
    u.create_seed(arglist.seed)

    noise_mode = OUNoise(act_shape_n[0], scale=1.0)
    noise = 0.1
    reduction_noise = 0.999  # multiplicative decay applied once per gradient update
    # Velocity.x Velocity.y Pos.x Pos.y {Land.Pos.x Land.Pos.y}*10 {Ent.Pos.x Ent.Pos.y}*9
    no_features = obs_shape_n[0].shape[0]
    no_actions = act_shape_n[0][0]

    model, model_t = __build_conf()
    optimizer = AdamW(learning_rate=arglist.lr, weight_decay=1e-5)

    # Results
    episode_rewards = [0.0]  # sum of rewards for all agents
    result_path = os.path.join("results", arglist.exp_name)
    # NOTE(review): the format string starts with a space, so the CSV file
    # name begins with a space character -- confirm this is intended.
    res = os.path.join(result_path, " %s.csv" % arglist.exp_name)
    if not os.path.exists(result_path):
        os.makedirs(result_path)

    replay_buffer = ReplayBuffer(arglist.max_buffer_size)  # Init Buffer
    episode_step = 0
    train_step = 0

    t_start = time.time()
    obs_n = env.reset()
    adj = u.get_adj(obs_n, k_lst, no_agents, is_gcn=True)

    print('Starting iterations...')
    while True:
        episode_step += 1
        terminal = (episode_step >= arglist.max_episode_len)
        # The adjacency is only recomputed every third step of an episode;
        # in between the previous one is reused.
        if episode_step % 3 == 0:
            adj = u.get_adj(obs_n, k_lst, no_agents, is_gcn=True)

        predictions = get_predictions(u.to_tensor(np.array(obs_n)), adj, model)
        actions = get_actions(predictions, noise, noise_mode)
        # Observe next state, reward and done value
        new_obs_n, rew_n, done_n, _ = env.step(actions)
        done = all(done_n) or terminal
        # Only the first agent's reward is used as the team reward.
        cooperative_reward = rew_n[0]
        # Store the data in the replay memory
        replay_buffer.add(obs_n, adj, actions, cooperative_reward, new_obs_n, done)
        obs_n = new_obs_n

        episode_rewards[-1] += cooperative_reward

        # `done` already includes `terminal`, so this is equivalent to `if done`.
        if done or terminal:
            obs_n = env.reset()
            episode_step = 0
            episode_rewards.append(0)

        # increment global step counter
        train_step += 1

        # for displaying learned policies
        if arglist.display:
            time.sleep(0.1)
            env.render()
            continue

        # Train the models
        # Always True at this point: the display branch above `continue`s.
        train_cond = not arglist.display

        if train_cond and len(replay_buffer) > arglist.batch_size:
            # The condition depends on the episode count only, so it holds on
            # every step of each episode whose index is a multiple of
            # `update_rate`.
            if len(episode_rewards) % arglist.update_rate == 0:
                for _ in range(arglist.update_times):
                    state, adj_n, actions, rewards, new_state, dones = replay_buffer.sample(
                        batch_size)
                    noise *= reduction_noise

                    # Calculate TD-target
                    with tf.GradientTape() as tape:
                        target_q_values = model_t([new_state, adj_n])
                        # Apply max(Q) to obtain the TD-target
                        target_q_tot = tf.reduce_max(target_q_values, axis=-1)
                        # Apply VDN to reduce the agent-dimension
                        max_q_tot = tf.reduce_sum(target_q_tot, axis=-1)
                        y = rewards + (1. - dones) * arglist.gamma * max_q_tot

                        # Predictions
                        action_one_hot = tf.one_hot(
                            tf.argmax(actions, axis=2, name='action_one_hot'),
                            no_actions)
                        q_values = model([state, adj_n])
                        # Keep only the Q-value of the action actually taken.
                        q_tot = tf.reduce_sum(q_values * action_one_hot,
                                              axis=-1,
                                              name='q_acted')
                        pred = tf.reduce_sum(q_tot, axis=1)

                        # The target is wrapped in stop_gradient so only
                        # `pred` contributes to the gradient.
                        if "huber" in arglist.loss_type:
                            loss = tf.reduce_sum(
                                u.huber_loss(pred, tf.stop_gradient(y)))
                        elif "mse" in arglist.loss_type:
                            loss = tf.losses.mean_squared_error(
                                pred, tf.stop_gradient(y))
                        else:
                            raise RuntimeError(
                                "Loss function should be either Huber or MSE. %s found!"
                                % arglist.loss_type)

                    gradients = tape.gradient(loss, model.trainable_variables)
                    local_clipped = u.clip_by_local_norm(gradients, 0.1)
                    optimizer.apply_gradients(
                        zip(local_clipped, model.trainable_variables))
                    # NOTE(review): placed inside the update loop, i.e. the
                    # model is exported after every gradient step -- confirm
                    # the intended nesting.
                    tf.saved_model.save(model, result_path)

        # display training output
        # NOTE(review): assumed to sit at loop level (runs whether or not a
        # training update happened) -- confirm against the original layout.
        if train_step % arglist.save_rate == 0:
            # eval_reward = get_eval_reward(env, model)
            with open(res, "a+") as f:
                mes_dict = {
                    "steps": train_step,
                    "episodes": len(episode_rewards),
                    "train_episode_reward": np.round(np.mean(episode_rewards[-arglist.save_rate:]), 3),
                    # "eval_episode_reward": np.round(np.mean(eval_reward), 3),
                    "time": round(time.time() - t_start, 3)
                }
                print(mes_dict)
                # One tab-separated row per report; values only, no header.
                for item in list(mes_dict.values()):
                    f.write("%s\t" % item)
                f.write("\n")
                # Redundant: the `with` block closes the file on exit anyway.
                f.close()
            t_start = time.time()

        # train target model
        # NOTE(review): assumed to sit at loop level as well -- confirm.
        if arglist.soft_update:
            # Polyak averaging: target <- tau * online + (1 - tau) * target.
            weights = model.get_weights()
            target_weights = model_t.get_weights()

            for w in range(len(weights)):
                target_weights[w] = arglist.tau * weights[w] + (
                    1 - arglist.tau) * target_weights[w]
            model_t.set_weights(target_weights)
        elif terminal and train_step % 200 == 0:
            # Hard update: copy the online weights into the target model.
            model_t.set_weights(model.get_weights())
class MADDPGCriticNetwork(object):
    """Critic that scores joint observations/actions with a GCN layer.

    The Keras model takes a per-agent feature matrix and an adjacency
    matrix and returns one value per sample.
    """

    def __init__(self, no_neighbors, num_hidden_layers, units_per_layer, lr,
                 obs_n_shape, act_shape_n, act_type, wd, agent_index):
        """Build the graph-convolutional critic and compile its Keras model.

        One ``GCNConv`` layer is applied to the node features, its output
        is concatenated with the raw node features, flattened and fed
        through two dense ReLU layers and a single linear output unit.

        Args:
            no_neighbors: Stored on the instance and used to derive
                ``k_lst``.
            num_hidden_layers: Stored as ``num_layers``.  NOTE: the network
                built here always has exactly two dense hidden layers.
            units_per_layer: Width of the GCN layer and of each dense layer.
            lr: Learning rate handed to the AdamW optimizer.
            obs_n_shape: Per-agent observation shapes; the first entry
                fixes the feature count used for every agent.
            act_shape_n: Per-agent action shapes; the first entry fixes the
                action count used for every agent.
            act_type: Stored on the instance; not used while building.
            wd: Weight decay handed to the AdamW optimizer.
            agent_index: Not used by this constructor.
        """
        self.num_layers = num_hidden_layers
        self.lr = lr
        self.obs_shape_n = obs_n_shape
        self.act_shape_n = act_shape_n
        self.act_type = act_type
        self.clip_norm = 0.5
        self.optimizer = AdamW(learning_rate=lr, weight_decay=wd)

        self.no_neighbors = no_neighbors
        self.no_agents = len(self.obs_shape_n)
        self.no_features = self.obs_shape_n[0][0]
        self.no_actions = self.act_shape_n[0][0]
        self.k_lst = list(range(2, self.no_neighbors + 2))

        keras_layers = tf.keras.layers
        node_width = self.no_features + self.no_actions

        # Symbolic inputs: node features and the agent adjacency matrix.
        self.graph_input = keras_layers.Input(
            (self.no_agents, node_width), name="graph_input")
        self.adj = keras_layers.Input(
            shape=(self.no_agents, self.no_agents), name="adj")

        graph_conv = GCNConv(
            units_per_layer,
            kernel_initializer=tf.keras.initializers.he_uniform(),
            activation=keras_layers.LeakyReLU(alpha=0.1),
            use_bias=False)
        self.gcn = graph_conv([self.graph_input, self.adj])

        self.hidden_layers = [
            keras_layers.Dense(units_per_layer, activation='relu')
            for _ in range(2)
        ]
        self.output_layer = keras_layers.Dense(1, activation='linear')

        # Skip connection: keep the raw node features next to the GCN output.
        self.concat = keras_layers.Concatenate(axis=2)(
            [self.graph_input, self.gcn])
        self.flatten = keras_layers.Flatten()(self.concat)

        # Wire the dense head on top of the flattened graph representation.
        x = self.flatten
        for dense in self.hidden_layers:
            x = dense(x)
        x = self.output_layer(x)

        self.model = tf.keras.Model(
            inputs=[self.graph_input, self.adj], outputs=[x])
        self.model.compile(self.optimizer, loss='mse')

    def predict(self, obs_n, act_n, adjacency):
        """Return the critic's value for the given observations and actions.

        ``obs_n`` and ``act_n`` are joined along their last axis and the
        first two axes are swapped before the model is evaluated.  Per the
        original notes they are expected agent-major: ``obs_n`` as
        ``no_agents`` arrays of ``(batch_size, no_features)`` and ``act_n``
        as ``no_agents`` tensors of ``(batch_size, no_actions)``.
        """
        batch_major = tf.transpose(
            tf.concat([obs_n, act_n], axis=-1), [1, 0, 2])
        return self._predict_internal(batch_major, adjacency)

    def _predict_internal(self, concatenated_input, adjacency):
        """Evaluate the compiled Keras model via ``Model.predict``."""
        return self.model.predict([concatenated_input, adjacency])

    def train_step(self, obs_n, act_n, adjacency, target_q):
        """Run one gradient step towards ``target_q``.

        The inputs are joined along the last axis and their first two axes
        are swapped with NumPy before entering the ``tf.function``.

        Returns:
            The element-wise squared TD error.
        """
        batch_major = np.swapaxes(
            np.concatenate([obs_n, act_n], axis=-1), 1, 0)
        return self._train_step_internal(batch_major, adjacency, target_q)

    @tf.function
    def _train_step_internal(self, concatenated_input, adjacency, target_q):
        """Compute the squared TD loss and apply clipped gradients.

        Gradients are clipped with ``clip_by_local_norm`` using
        ``self.clip_norm`` before the optimizer applies them.
        """
        variables = self.model.trainable_variables
        with tf.GradientTape() as tape:
            q_pred = self.model([concatenated_input, adjacency],
                                training=True)
            td_loss = tf.math.square(target_q - q_pred)
            loss = tf.reduce_mean(td_loss)

        grads = tape.gradient(loss, variables)
        clipped = clip_by_local_norm(grads, self.clip_norm)
        self.optimizer.apply_gradients(zip(clipped, variables))
        return td_loss
class MADDPGCriticNetwork(object):
    """Fully-connected critic over all agents' observations and actions."""

    def __init__(self, no_layers, units_per_layer, lr, obs_shape_n,
                 act_shape_n, wd):
        """Build the dense critic and compile its Keras model.

        Every agent contributes one observation input and one action input.
        All inputs are concatenated, passed through ``no_layers`` dense ReLU
        layers and reduced to a single linear output unit.

        Args:
            no_layers: Number of dense hidden layers.
            units_per_layer: Width of each hidden layer.
            lr: Learning rate handed to the AdamW optimizer.
            obs_shape_n: Per-agent observation shapes.
            act_shape_n: Per-agent action shapes.
            wd: Weight decay handed to the AdamW optimizer.
        """
        self.lr = lr
        self.clip_norm = 0.5
        self.optimizer = AdamW(learning_rate=lr, weight_decay=wd)
        self.no_layers = no_layers

        keras_layers = tf.keras.layers

        # One named input per agent observation, then one per agent action.
        self.obs_input_n = [
            keras_layers.Input(shape=shape, name='obs_in' + str(idx))
            for idx, shape in enumerate(obs_shape_n)
        ]
        self.act_input_n = [
            keras_layers.Input(shape=shape, name='act_in' + str(idx))
            for idx, shape in enumerate(act_shape_n)
        ]

        self.input_concat_layer = keras_layers.Concatenate()
        self.hidden_layers = [
            keras_layers.Dense(units_per_layer, activation='relu')
            for _ in range(self.no_layers)
        ]
        self.output_layer = keras_layers.Dense(1, activation='linear')

        # Connect the layers: observations first, then actions.
        all_inputs = self.obs_input_n + self.act_input_n
        x = self.input_concat_layer(all_inputs)
        for dense in self.hidden_layers:
            x = dense(x)
        x = self.output_layer(x)

        self.model = tf.keras.Model(inputs=all_inputs, outputs=[x])
        self.model.compile(self.optimizer, loss='mse')

    def _forward(self, concatenated_input):
        """Apply concat, hidden and output layers to a list of inputs."""
        x = self.input_concat_layer(concatenated_input)
        for idx in range(self.no_layers):
            x = self.hidden_layers[idx](x)
        return self.output_layer(x)

    def predict(self, obs_n, act_n):
        """Return the critic's value for the given observations and actions.

        ``obs_n`` and ``act_n`` are joined with ``+`` (list concatenation)
        before entering the ``tf.function``.
        """
        return self._predict_internal(obs_n + act_n)

    @tf.function
    def _predict_internal(self, concatenated_input):
        """Run the forward pass on the already-joined input list."""
        return self._forward(concatenated_input)

    def train_step(self, obs_n, act_n, target_q):
        """Run one gradient step towards ``target_q``.

        Returns:
            A ``(loss, td_loss)`` pair: the mean squared TD error and the
            element-wise squared TD error.
        """
        return self._train_step_internal(obs_n + act_n, target_q)

    @tf.function
    def _train_step_internal(self, concatenated_input, target_q):
        """Compute the squared TD loss and apply clipped gradients.

        Gradients are clipped with ``u.clip_by_local_norm`` using
        ``self.clip_norm`` before the optimizer applies them.
        """
        variables = self.model.trainable_variables
        with tf.GradientTape() as tape:
            q_pred = self._forward(concatenated_input)
            td_loss = tf.math.square(target_q - q_pred)
            loss = tf.reduce_mean(td_loss)

        grads = tape.gradient(loss, variables)
        clipped = u.clip_by_local_norm(grads, self.clip_norm)
        self.optimizer.apply_gradients(zip(clipped, variables))
        return loss, td_loss

    def save(self, fp):
        """Write the model weights to ``fp``."""
        self.model.save_weights(fp)

    def load(self, fp):
        """Restore the model weights from ``fp``."""
        self.model.load_weights(fp)
class MADDPGCriticNetwork(object):
    """Critic that scores joint observations/actions with a GAT layer.

    The Keras model takes a per-agent feature matrix and an adjacency
    matrix and returns one value per sample.
    """

    def __init__(self, no_layers, units_per_layer, lr, obs_shape_n,
                 act_shape_n, no_neighbors=2, wd=0.0):
        """Build the graph-attention critic and compile its Keras model.

        One two-head ``GATConv`` layer is applied to the node features, its
        output is flattened and fed through ``no_layers`` dense ReLU layers
        and a single linear output unit.

        Args:
            no_layers: Number of dense hidden layers after the GAT layer.
            units_per_layer: Channels of the GAT layer and width of each
                dense layer.
            lr: Learning rate handed to the AdamW optimizer.
            obs_shape_n: Per-agent observation shapes; the first entry
                fixes the feature count used for every agent.
            act_shape_n: Per-agent action shapes; the first entry fixes the
                action count used for every agent.
            no_neighbors: Used to derive ``k_lst``.
            wd: Weight decay handed to the AdamW optimizer.
        """
        self.lr = lr
        self.clip_norm = 0.5
        self.optimizer = AdamW(learning_rate=lr, weight_decay=wd)
        self.no_layers = no_layers

        self.obs_shape_n = obs_shape_n  # one observation shape per agent
        self.act_shape_n = act_shape_n  # one action shape per agent
        self.no_agents = len(self.obs_shape_n)
        self.no_features = self.obs_shape_n[0][0]
        self.no_actions = self.act_shape_n[0][0]
        self.k_lst = list(range(2, no_neighbors + 2))

        # Symbolic inputs: node features and the agent adjacency matrix.
        self.graph_input = tf.keras.layers.Input(
            (self.no_agents, self.no_features + self.no_actions),
            name="graph_input")
        self.adj = tf.keras.layers.Input(
            shape=(self.no_agents, self.no_agents), name="adj")

        self.gat = GATConv(
            units_per_layer,
            activation='elu',
            attn_heads=2,
            concat_heads=True,
        )([self.graph_input, self.adj])

        self.hidden_layers = [
            tf.keras.layers.Dense(units_per_layer, activation='relu')
            for _ in range(self.no_layers)
        ]
        self.output_layer = tf.keras.layers.Dense(1, activation='linear')

        # Use tf.keras here like every other layer in this class, so the
        # Flatten layer does not depend on a separate bare `keras` name and
        # comes from the same Keras implementation as the rest of the graph.
        self.flatten = tf.keras.layers.Flatten()(self.gat)

        # Wire the dense head on top of the flattened attention output.
        x = self.flatten
        for dense in self.hidden_layers:
            x = dense(x)
        x = self.output_layer(x)

        self.model = tf.keras.Model(
            inputs=[self.graph_input, self.adj], outputs=[x])
        self.model.compile(self.optimizer, loss='mse')

    def predict(self, obs_n, act_n, adjacency):
        """Return the critic's value for the given observations and actions.

        ``obs_n`` and ``act_n`` are joined along their last axis and the
        first two axes are swapped before the model is evaluated.  Per the
        original notes the inputs arrive agent-major,
        ``(no_agents, batch_size, ...)``, the model expects
        ``(batch_size, no_agents, no_features + no_actions)`` and
        ``adjacency`` is ``(batch_size, no_agents, no_agents)``.
        """
        concatenated_input = tf.concat([obs_n, act_n], axis=-1)
        concatenated_input = tf.transpose(concatenated_input, [1, 0, 2])
        return self._predict_internal(concatenated_input, adjacency)

    def _predict_internal(self, concatenated_input, adjacency):
        """Evaluate the compiled Keras model via ``Model.predict``."""
        return self.model.predict([concatenated_input, adjacency])

    def train_step(self, obs_n, act_n, adjacency, target_q):
        """Run one gradient step towards ``target_q``.

        The inputs are joined along the last axis and their first two axes
        are swapped with NumPy before entering the ``tf.function``.

        Returns:
            A ``(loss, td_loss)`` pair: the mean squared TD error and the
            element-wise squared TD error.
        """
        concatenated_input = np.concatenate([obs_n, act_n], axis=-1)
        concatenated_input = np.swapaxes(concatenated_input, 1, 0)
        return self._train_step_internal(concatenated_input, adjacency,
                                         target_q)

    @tf.function
    def _train_step_internal(self, concatenated_input, adjacency, target_q):
        """Compute the squared TD loss and apply clipped gradients.

        Gradients are clipped with ``u.clip_by_local_norm`` using
        ``self.clip_norm`` before the optimizer applies them.
        """
        with tf.GradientTape() as tape:
            q_pred = self.model([concatenated_input, adjacency],
                                training=True)
            td_loss = tf.math.square(target_q - q_pred)
            loss = tf.reduce_mean(td_loss)

        gradients = tape.gradient(loss, self.model.trainable_variables)
        local_clipped = u.clip_by_local_norm(gradients, self.clip_norm)
        self.optimizer.apply_gradients(
            zip(local_clipped, self.model.trainable_variables))
        return loss, td_loss

    def save(self, fp):
        """Write the model weights to ``fp``."""
        self.model.save_weights(fp)

    def load(self, fp):
        """Restore the model weights from ``fp``."""
        self.model.load_weights(fp)