def __init__(self, no_neighbors, num_hidden_layers, units_per_layer, lr,
                 obs_n_shape, act_shape_n, act_type, wd, agent_index):
        """
        Implementation of a critic to represent the Q-Values. Basically just a fully-connected
        regression ANN.
        """
        self.num_layers = num_hidden_layers
        self.lr = lr
        self.obs_shape_n = obs_n_shape
        self.act_shape_n = act_shape_n
        self.act_type = act_type

        self.clip_norm = 0.5
        # self.optimizer = tf.keras.optimizers.Adam(lr=self.lr)
        self.optimizer = AdamW(learning_rate=lr, weight_decay=wd)
        self.no_neighbors = no_neighbors
        self.no_agents = len(self.obs_shape_n)
        self.no_features = self.obs_shape_n[0][0]
        self.no_actions = self.act_shape_n[0][0]
        # graph setup, e.g. no_neighbors=2 -> k_lst = [2, 3]
        self.k_lst = list(range(self.no_neighbors + 2))[2:]

        self.graph_input = tf.keras.layers.Input(
            (self.no_agents, self.no_features + self.no_actions),
            name="graph_input")
        self.adj = tf.keras.layers.Input(shape=(self.no_agents,
                                                self.no_agents),
                                         name="adj")

        self.gcn = GCNConv(
            units_per_layer,
            kernel_initializer=tf.keras.initializers.he_uniform(),
            activation=tf.keras.layers.LeakyReLU(alpha=0.1),
            use_bias=False)([self.graph_input, self.adj])

        self.hidden_layers = []
        for idx in range(2):
            layer = tf.keras.layers.Dense(units_per_layer, activation='relu')
            self.hidden_layers.append(layer)

        self.output_layer = tf.keras.layers.Dense(1, activation='linear')

        # Try ResNet Alternative
        # self.flatten = tf.keras.layers.Flatten()(self.gat)
        self.concat = tf.keras.layers.Concatenate(axis=2)(
            [self.graph_input, self.gcn])
        self.flatten = tf.keras.layers.Flatten()(self.concat)

        x = self.flatten
        for idx in range(2):
            x = self.hidden_layers[idx](x)
        x = self.output_layer(x)

        # connect layers
        self.model = tf.keras.Model(
            inputs=[self.graph_input, self.adj],
            outputs=[x])

        # tf.keras.utils.plot_model(self.model, show_shapes=True)
        self.model.compile(self.optimizer, loss='mse')
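A minimal shape-flow sketch of the graph branch built above, assuming GCNConv comes from spektral.layers (the import is not shown in this snippet); the agent count, feature, action and unit sizes are illustrative assumptions, not values from the original code.

import tensorflow as tf
from spektral.layers import GCNConv  # assumed source of GCNConv

# Illustrative sizes: 3 agents, 12 observation features, 5 actions, 64 GCN units.
no_agents, no_features, no_actions, units = 3, 12, 5, 64
graph_in = tf.keras.layers.Input((no_agents, no_features + no_actions))
adj_in = tf.keras.layers.Input((no_agents, no_agents))
h = GCNConv(units, use_bias=False)([graph_in, adj_in])    # (None, 3, 64)
h = tf.keras.layers.Concatenate(axis=2)([graph_in, h])    # (None, 3, 17 + 64)
h = tf.keras.layers.Flatten()(h)                          # (None, 3 * 81)
q = tf.keras.layers.Dense(1)(h)                           # (None, 1): one Q-value per joint state
demo = tf.keras.Model([graph_in, adj_in], q)
print(demo.output_shape)                                  # expected: (None, 1)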
Example #2
    def __init__(self, num_hidden_layers, units_per_layer, lr, obs_n_shape,
                 act_shape_n, act_type, agent_index):
        """
        Implementation of a critic to represent the Q-Values. Basically just a fully-connected
        regression ANN.
        """
        self.num_layers = num_hidden_layers
        self.lr = lr
        self.obs_shape_n = obs_n_shape
        self.act_shape_n = act_shape_n
        self.act_type = act_type

        self.clip_norm = 0.5
        self.wd = 1e-5
        self.optimizer = AdamW(learning_rate=lr, weight_decay=self.wd)

        # set up layers
        # each agent's action and obs are treated as separate inputs
        self.obs_input_n = []
        for idx, shape in enumerate(self.obs_shape_n):
            self.obs_input_n.append(
                tf.keras.layers.Input(shape=shape, name='obs_in' + str(idx)))

        self.act_input_n = []
        for idx, shape in enumerate(self.act_shape_n):
            self.act_input_n.append(
                tf.keras.layers.Input(shape=shape, name='act_in' + str(idx)))

        self.input_concat_layer = tf.keras.layers.Concatenate()

        self.hidden_layers = []
        for idx in range(num_hidden_layers):
            layer = tf.keras.layers.Dense(units_per_layer,
                                          activation='relu',
                                          name='ag{}crit_hid{}'.format(
                                              agent_index, idx))
            self.hidden_layers.append(layer)

        self.output_layer = tf.keras.layers.Dense(1,
                                                  activation='linear',
                                                  name='ag{}crit_out'.format(agent_index))

        # connect layers
        x = self.input_concat_layer(self.obs_input_n + self.act_input_n)
        for idx in range(self.num_layers):
            x = self.hidden_layers[idx](x)
        x = self.output_layer(x)

        self.model = tf.keras.Model(
            inputs=self.obs_input_n + self.act_input_n,  # list concatenation
            outputs=[x])
        # tf.keras.utils.plot_model(self.model, show_shapes=True)
        self.model.compile(self.optimizer, loss='mse')
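A hedged usage sketch for the per-agent-input critic above, assuming the surrounding class is named MADDPGCriticNetwork as in the later examples; the agent count, shapes and the act_type value are illustrative assumptions.

import numpy as np

# Illustrative setup: 3 agents, 10 observation features and 5 actions each.
obs_n_shape = [(10,)] * 3
act_shape_n = [(5,)] * 3
critic = MADDPGCriticNetwork(num_hidden_layers=2, units_per_layer=64, lr=1e-3,
                             obs_n_shape=obs_n_shape, act_shape_n=act_shape_n,
                             act_type='continuous', agent_index=0)  # act_type value is an assumption
batch = 32
obs_n = [np.random.rand(batch, 10).astype(np.float32) for _ in range(3)]
act_n = [np.random.rand(batch, 5).astype(np.float32) for _ in range(3)]
q = critic.model.predict(obs_n + act_n)   # 6 inputs via list concatenation -> (batch, 1)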
    def __init__(self, no_neighbors, num_hidden_layers, units_per_layer, lr, obs_n_shape, act_shape_n, act_type,
                 agent_index):
        """
        Implementation of a critic to represent the Q-Values. Basically just a fully-connected
        regression ANN.
        """
        self.num_layers = num_hidden_layers
        self.lr = lr
        self.obs_shape_n = obs_n_shape
        self.act_shape_n = act_shape_n
        self.act_type = act_type

        self.clip_norm = 0.5
        self.wd = 1e-5
        self.optimizer = AdamW(learning_rate=lr, weight_decay=self.wd)

        self.no_neighbors = no_neighbors
        self.no_agents = len(self.obs_shape_n)
        self.no_features = self.obs_shape_n[0][0]
        self.no_actions = self.act_shape_n[0][0]
        # GAT
        self.k_lst = list(range(self.no_neighbors + 2))[2:]

        self.graph_input = tf.keras.layers.Input((self.no_agents, self.no_features + self.no_actions),
                                                 name="graph_input")
        self.adj = tf.keras.layers.Input(shape=(self.no_agents, self.no_agents), name="adj")
        # two inputs: graph (None, no_agents, no_features + no_actions) and adj (None, no_agents, no_agents)

        self.gat = GATConv(
            units_per_layer,
            activation='elu',
            attn_heads=2,
            concat_heads=True,
        )([self.graph_input, self.adj])

        self.hidden_layers = []
        for idx in range(2):
            layer = tf.keras.layers.Dense(units_per_layer, activation='relu')
            self.hidden_layers.append(layer)

        self.output_layer = tf.keras.layers.Dense(1, activation='linear')
        self.flatten = tf.keras.layers.Flatten()(self.gat)
        x = self.flatten
        for idx in range(2):
            x = self.hidden_layers[idx](x)
        x = self.output_layer(x)

        # connect layers
        self.model = tf.keras.Model(inputs=[self.graph_input, self.adj],
                                    outputs=[x])

        # tf.keras.utils.plot_model(self.model, show_shapes=True)
        self.model.compile(self.optimizer, loss='mse')
Example #4
def main(arglist):
    global no_actions, no_features, no_agents
    env = u.make_env(arglist.scenario, arglist.no_agents)

    obs_shape_n = env.observation_space
    act_shape_n = env.action_space
    act_shape_n = u.space_n_to_shape_n(act_shape_n)
    no_agents = env.n
    batch_size = arglist.batch_size
    no_neighbors = arglist.no_neighbors
    k_lst = list(range(no_neighbors + 2))[2:]  # [2,3]
    u.create_seed(arglist.seed)

    noise_mode = OUNoise(act_shape_n[0], scale=1.0)
    noise = 0.1
    reduction_noise = 0.999
    # Velocity.x Velocity.y Pos.x Pos.y {Land.Pos.x Land.Pos.y}*10 {Ent.Pos.x Ent.Pos.y}*9
    no_features = obs_shape_n[0].shape[0]
    no_actions = act_shape_n[0][0]

    model, model_t = __build_conf()
    optimizer = AdamW(learning_rate=arglist.lr, weight_decay=1e-5)

    # Results
    episode_rewards = [0.0]  # sum of rewards for all agents
    result_path = os.path.join("results", arglist.exp_name)
    res = os.path.join(result_path, "%s.csv" % arglist.exp_name)
    if not os.path.exists(result_path):
        os.makedirs(result_path)

    replay_buffer = ReplayBuffer(arglist.max_buffer_size)  # Init Buffer
    episode_step = 0
    train_step = 0

    t_start = time.time()
    obs_n = env.reset()
    adj = u.get_adj(obs_n, k_lst, no_agents, is_gcn=True)

    print('Starting iterations...')
    while True:
        episode_step += 1
        terminal = (episode_step >= arglist.max_episode_len)
        if episode_step % 3 == 0:
            adj = u.get_adj(obs_n, k_lst, no_agents, is_gcn=True)

        predictions = get_predictions(u.to_tensor(np.array(obs_n)), adj, model)
        actions = get_actions(predictions, noise, noise_mode)
        # Observe next state, reward and done value
        new_obs_n, rew_n, done_n, _ = env.step(actions)
        done = all(done_n) or terminal
        cooperative_reward = rew_n[0]
        # Store the data in the replay memory
        replay_buffer.add(obs_n, adj, actions, cooperative_reward, new_obs_n,
                          done)
        obs_n = new_obs_n

        episode_rewards[-1] += cooperative_reward

        if done or terminal:
            obs_n = env.reset()
            episode_step = 0
            episode_rewards.append(0)

        # increment global step counter
        train_step += 1

        # for displaying learned policies
        if arglist.display:
            time.sleep(0.1)
            env.render()
            continue

        # Train the models
        train_cond = not arglist.display
        if train_cond and len(replay_buffer) > arglist.batch_size:
            if len(episode_rewards) % arglist.update_rate == 0:  # only update every 30 episodes
                for _ in range(arglist.update_times):
                    state, adj_n, actions, rewards, new_state, dones = replay_buffer.sample(
                        batch_size)
                    noise *= reduction_noise

                    # Calculate TD-target
                    with tf.GradientTape() as tape:
                        target_q_values = model_t([new_state, adj_n])
                        # Apply max(Q) to obtain the TD-target
                        target_q_tot = tf.reduce_max(target_q_values, axis=-1)
                        # Apply VDN to reduce the agent-dimension
                        max_q_tot = tf.reduce_sum(target_q_tot, axis=-1)
                        y = rewards + (1. - dones) * arglist.gamma * max_q_tot

                        # Predictions
                        action_one_hot = tf.one_hot(
                            tf.argmax(actions, axis=2, name='action_one_hot'),
                            no_actions)
                        q_values = model([state, adj_n])
                        q_tot = tf.reduce_sum(q_values * action_one_hot,
                                              axis=-1,
                                              name='q_acted')
                        pred = tf.reduce_sum(q_tot, axis=1)
                        if "huber" in arglist.loss_type:
                            loss = tf.reduce_sum(
                                u.huber_loss(pred, tf.stop_gradient(y)))
                        elif "mse" in arglist.loss_type:
                            loss = tf.losses.mean_squared_error(
                                pred, tf.stop_gradient(y))
                        else:
                            raise RuntimeError(
                                "Loss function should be either Huber or MSE. %s found!"
                                % arglist.loss_type)

                    gradients = tape.gradient(loss, model.trainable_variables)
                    local_clipped = u.clip_by_local_norm(gradients, 0.1)
                    optimizer.apply_gradients(
                        zip(local_clipped, model.trainable_variables))
                    tf.saved_model.save(model, result_path)

            # display training output
            if train_step % arglist.save_rate == 0:
                # eval_reward = get_eval_reward(env, model)
                with open(res, "a+") as f:
                    mes_dict = {
                        "steps": train_step,
                        "episodes": len(episode_rewards),
                        "train_episode_reward": np.round(
                            np.mean(episode_rewards[-arglist.save_rate:]), 3),
                        # "eval_episode_reward": np.round(np.mean(eval_reward), 3),
                        "time": round(time.time() - t_start, 3)
                    }
                    print(mes_dict)
                    for item in mes_dict.values():
                        f.write("%s\t" % item)
                    f.write("\n")
                t_start = time.time()

        # train target model
        if arglist.soft_update:
            weights = model.get_weights()
            target_weights = model_t.get_weights()

            for w in range(len(weights)):
                target_weights[w] = arglist.tau * weights[w] + (
                    1 - arglist.tau) * target_weights[w]
            model_t.set_weights(target_weights)
        elif terminal and train_step % 200 == 0:
            model_t.set_weights(model.get_weights())
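u.get_adj is not shown in this snippet. The following is a hypothetical illustration of what a k-nearest-neighbour adjacency builder could look like, assuming agent positions sit at observation indices 2:4 (per the Velocity/Pos layout comment in main()); the real helper presumably also applies GCN normalisation when is_gcn=True, which is omitted here.

import numpy as np

def knn_adjacency(obs_n, k, pos_slice=slice(2, 4)):
    """Hypothetical stand-in for u.get_adj: connect each agent to its k nearest neighbours."""
    pos = np.array([obs[pos_slice] for obs in obs_n])              # (no_agents, 2)
    dist = np.linalg.norm(pos[:, None] - pos[None, :], axis=-1)    # pairwise distances
    adj = np.eye(len(obs_n), dtype=np.float32)                     # self-loops
    for i in range(len(obs_n)):
        nearest = np.argsort(dist[i])[1:k + 1]                     # skip the agent itself
        adj[i, nearest] = 1.0
    return adj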
class MADDPGCriticNetwork(object):
    def __init__(self, no_neighbors, num_hidden_layers, units_per_layer, lr,
                 obs_n_shape, act_shape_n, act_type, wd, agent_index):
        """
        Implementation of a critic to represent the Q-Values. Basically just a fully-connected
        regression ANN.
        """
        self.num_layers = num_hidden_layers
        self.lr = lr
        self.obs_shape_n = obs_n_shape
        self.act_shape_n = act_shape_n
        self.act_type = act_type

        self.clip_norm = 0.5
        # self.optimizer = tf.keras.optimizers.Adam(lr=self.lr)
        self.optimizer = AdamW(learning_rate=lr, weight_decay=wd)
        self.no_neighbors = no_neighbors
        self.no_agents = len(self.obs_shape_n)
        self.no_features = self.obs_shape_n[0][0]
        self.no_actions = self.act_shape_n[0][0]
        # graph setup, e.g. no_neighbors=2 -> k_lst = [2, 3]
        self.k_lst = list(range(self.no_neighbors + 2))[2:]

        self.graph_input = tf.keras.layers.Input(
            (self.no_agents, self.no_features + self.no_actions),
            name="graph_input")
        self.adj = tf.keras.layers.Input(shape=(self.no_agents,
                                                self.no_agents),
                                         name="adj")

        self.gcn = GCNConv(
            units_per_layer,
            kernel_initializer=tf.keras.initializers.he_uniform(),
            activation=tf.keras.layers.LeakyReLU(alpha=0.1),
            use_bias=False)([self.graph_input, self.adj])

        self.hidden_layers = []
        for idx in range(2):
            layer = tf.keras.layers.Dense(units_per_layer, activation='relu')
            self.hidden_layers.append(layer)

        self.output_layer = tf.keras.layers.Dense(1, activation='linear')

        # Try ResNet Alternative
        # self.flatten = tf.keras.layers.Flatten()(self.gat)
        self.concat = tf.keras.layers.Concatenate(axis=2)(
            [self.graph_input, self.gcn])
        self.flatten = tf.keras.layers.Flatten()(self.concat)

        x = self.flatten
        for idx in range(2):
            x = self.hidden_layers[idx](x)
        x = self.output_layer(x)

        # connect layers
        self.model = tf.keras.Model(
            inputs=[self.graph_input, self.adj],
            outputs=[x])

        # tf.keras.utils.plot_model(self.model, show_shapes=True)
        self.model.compile(self.optimizer, loss='mse')

    def predict(self, obs_n, act_n, adjacency):
        """
        Predict the value of the input.
        Shapes:
            obs_n: list of length no_agents, each an ndarray of shape (batch_size, no_features)
            act_n: list of length no_agents, each a tensor of shape (batch_size, no_actions)
        """
        concatenated_input = tf.concat([obs_n, act_n], axis=-1)
        concatenated_input = tf.transpose(concatenated_input, [1, 0, 2])
        return self._predict_internal(concatenated_input, adjacency)
        # return self._predict_internal(obs_n + act_n)

    def _predict_internal(self, concatenated_input, adjacency):
        """
        Internal helper; the concatenation cannot be done inside a tf.function.
        """
        # x = self.input_concat_layer(concatenated_input)
        # for idx in range(self.num_layers):
        #     x = self.hidden_layers[idx](x)
        # x = self.output_layer(x)
        # return x
        x = self.model.predict([concatenated_input, adjacency])
        return x

    def train_step(self, obs_n, act_n, adjacency, target_q):
        """
        Train the critic network on a batch of joint observations, actions, adjacency matrices and TD targets.
        """
        # return self._train_step_internal(obs_n + act_n, target_q, weights)
        concatenated_input = np.concatenate([obs_n, act_n], axis=-1)
        concatenated_input = np.swapaxes(concatenated_input, 1, 0)
        return self._train_step_internal(concatenated_input, adjacency,
                                         target_q)

    @tf.function
    def _train_step_internal(self, concatenated_input, adjacency, target_q):
        """
        Internal helper; the concatenation cannot be done inside a tf.function.
        """
        with tf.GradientTape() as tape:
            q_pred = self.model([concatenated_input, adjacency], training=True)
            td_loss = tf.math.square(target_q - q_pred)
            loss = tf.reduce_mean(td_loss)
        gradients = tape.gradient(loss, self.model.trainable_variables)
        local_clipped = clip_by_local_norm(gradients, self.clip_norm)
        self.optimizer.apply_gradients(
            zip(local_clipped, self.model.trainable_variables))
        return td_loss
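A hedged smoke-test sketch for the class above; the agent count, feature/action sizes and act_type value are illustrative assumptions, and an identity adjacency stands in for a pre-normalised GCN adjacency.

import numpy as np

# Illustrative setup: 3 agents, 12 observation features and 5 actions each.
obs_n_shape = [(12,)] * 3
act_shape_n = [(5,)] * 3
critic = MADDPGCriticNetwork(no_neighbors=2, num_hidden_layers=2, units_per_layer=64,
                             lr=1e-3, obs_n_shape=obs_n_shape, act_shape_n=act_shape_n,
                             act_type='continuous', wd=1e-5, agent_index=0)
batch = 16
obs_n = np.random.rand(3, batch, 12).astype(np.float32)    # (no_agents, batch, no_features)
act_n = np.random.rand(3, batch, 5).astype(np.float32)     # (no_agents, batch, no_actions)
adj = np.tile(np.eye(3, dtype=np.float32), (batch, 1, 1))  # assumed pre-normalised adjacency
target_q = np.random.rand(batch, 1).astype(np.float32)

q = critic.predict(obs_n, act_n, adj)                       # (batch, 1)
td_loss = critic.train_step(obs_n, act_n, adj, target_q)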
class MADDPGCriticNetwork(object):
    def __init__(self, no_layers, units_per_layer, lr, obs_shape_n,
                 act_shape_n, wd):
        """
        Implementation of a critic to represent the Q-Values. Basically just a fully-connected
        regression ANN.
        """
        self.lr = lr
        self.clip_norm = 0.5
        self.optimizer = AdamW(learning_rate=lr, weight_decay=wd)
        self.no_layers = no_layers
        # set up layers
        # each agent's action and obs are treated as separate inputs
        self.obs_input_n = []
        for idx, shape in enumerate(obs_shape_n):
            self.obs_input_n.append(
                tf.keras.layers.Input(shape=shape, name='obs_in' + str(idx)))

        self.act_input_n = []
        for idx, shape in enumerate(act_shape_n):
            self.act_input_n.append(
                tf.keras.layers.Input(shape=shape, name='act_in' + str(idx)))

        self.input_concat_layer = tf.keras.layers.Concatenate()

        self.hidden_layers = []
        for idx in range(self.no_layers):
            layer = tf.keras.layers.Dense(units_per_layer, activation='relu')
            self.hidden_layers.append(layer)

        self.output_layer = tf.keras.layers.Dense(1, activation='linear')

        x = self.input_concat_layer(self.obs_input_n + self.act_input_n)
        for idx in range(self.no_layers):
            x = self.hidden_layers[idx](x)
        x = self.output_layer(x)

        # connect layers
        self.model = tf.keras.Model(
            inputs=self.obs_input_n + self.act_input_n,  # list concatenation
            outputs=[x])

        # tf.keras.utils.plot_model(self.model, show_shapes=True)
        self.model.compile(self.optimizer, loss='mse')

    def predict(self, obs_n, act_n):
        """
        Predict the value of the input.
        """
        return self._predict_internal(obs_n + act_n)

    @tf.function
    def _predict_internal(self, concatenated_input):
        x = self.input_concat_layer(concatenated_input)
        for idx in range(self.no_layers):
            x = self.hidden_layers[idx](x)
        x = self.output_layer(x)
        return x

    def train_step(self, obs_n, act_n, target_q):
        """
        Train the critic network on a batch of joint observations, actions and TD targets.
        """
        return self._train_step_internal(obs_n + act_n, target_q)

    @tf.function
    def _train_step_internal(self, concatenated_input, target_q):
        """
        Internal helper; the concatenation cannot be done inside a tf.function.
        """
        with tf.GradientTape() as tape:
            x = self.input_concat_layer(concatenated_input)
            for idx in range(self.no_layers):
                x = self.hidden_layers[idx](x)
            q_pred = self.output_layer(x)
            td_loss = tf.math.square(target_q - q_pred)
            loss = tf.reduce_mean(td_loss)

        gradients = tape.gradient(loss, self.model.trainable_variables)

        local_clipped = u.clip_by_local_norm(gradients, self.clip_norm)
        self.optimizer.apply_gradients(
            zip(local_clipped, self.model.trainable_variables))

        return loss, td_loss

    def save(self, fp):
        self.model.save_weights(fp)

    def load(self, fp):
        self.model.load_weights(fp)
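A hypothetical usage sketch for the fully-connected critic above, including the save/load round-trip; the shapes, agent count and the checkpoint path are illustrative assumptions.

import numpy as np
import tensorflow as tf

# Illustrative setup: 3 agents, 10 observation features and 5 actions each.
obs_shape_n = [(10,)] * 3
act_shape_n = [(5,)] * 3
critic = MADDPGCriticNetwork(no_layers=2, units_per_layer=64, lr=1e-3,
                             obs_shape_n=obs_shape_n, act_shape_n=act_shape_n, wd=1e-5)
batch = 32
obs_n = [tf.random.uniform((batch, 10)) for _ in range(3)]
act_n = [tf.random.uniform((batch, 5)) for _ in range(3)]
target_q = tf.random.uniform((batch, 1))

q = critic.predict(obs_n, act_n)                          # (batch, 1)
loss, td_loss = critic.train_step(obs_n, act_n, target_q)
critic.save("critic_agent0_weights")                      # hypothetical checkpoint prefix
critic.load("critic_agent0_weights")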
Example #7
class MADDPGCriticNetwork(object):
    def __init__(self, no_layers, units_per_layer, lr, obs_shape_n, act_shape_n, no_neighbors=2, wd=0.0):
        """
        Implementation of a critic to represent the Q-Values. Basically just a fully-connected
        regression ANN.
        """
        self.lr = lr
        self.clip_norm = 0.5
        self.optimizer = AdamW(learning_rate=lr, weight_decay=wd)
        self.no_layers = no_layers
        self.obs_shape_n = obs_shape_n  # nd.array(no_agents --> no_features)
        self.act_shape_n = act_shape_n  # nd.array(no_agents --> no_actions)
        self.no_agents = len(self.obs_shape_n)
        self.no_features = self.obs_shape_n[0][0]
        self.no_actions = self.act_shape_n[0][0]
        # GAT
        self.k_lst = list(range(no_neighbors + 2))[2:]

        self.graph_input = tf.keras.layers.Input((self.no_agents, self.no_features + self.no_actions),
                                                 name="graph_input")
        self.adj = tf.keras.layers.Input(shape=(self.no_agents, self.no_agents), name="adj")
        # two inputs: graph (None, no_agents, no_features + no_actions) and adj (None, no_agents, no_agents)
        self.gat = GATConv(
            units_per_layer,
            activation='elu',
            attn_heads=2,
            concat_heads=True,
        )([self.graph_input, self.adj])

        self.hidden_layers = []
        for idx in range(self.no_layers):
            layer = tf.keras.layers.Dense(units_per_layer, activation='relu')
            self.hidden_layers.append(layer)

        self.output_layer = tf.keras.layers.Dense(1, activation='linear')
        self.flatten = tf.keras.layers.Flatten()(self.gat)
        x = self.flatten
        for idx in range(self.no_layers):
            x = self.hidden_layers[idx](x)
        x = self.output_layer(x)

        # connect layers
        self.model = tf.keras.Model(inputs=[self.graph_input, self.adj],
                                    outputs=[x])

        # tf.keras.utils.plot_model(self.model, show_shapes=True)
        self.model.compile(self.optimizer, loss='mse')

    def predict(self, obs_n, act_n, adjacency):
        """
        Predict the value of the input.
        obs_n and act_n arrive as (no_agents, batch_size, ...) and are concatenated
        and transposed so the model receives:
            graph: (batch_size, no_agents, no_features + no_actions)
            adj:   (batch_size, no_agents, no_agents)
        """
        concatenated_input = tf.concat([obs_n, act_n], axis=-1)
        concatenated_input = tf.transpose(concatenated_input, [1, 0, 2])
        return self._predict_internal(concatenated_input, adjacency)

    def _predict_internal(self, concatenated_input, adjacency):
        # x = self.gat()[concatenated_input, adjacency] # NOT WORKING
        # x = self.flatten(x)
        # for idx in range(self.no_layers):
        #     x = self.hidden_layers[idx](x)
        # x = self.output_layer(x)
        x = self.model.predict([concatenated_input, adjacency])
        return x

    def train_step(self, obs_n, act_n, adjacency, target_q):
        """
        Train the critic network on a batch of joint observations, actions, adjacency matrices and TD targets.
        """
        concatenated_input = np.concatenate([obs_n, act_n], axis=-1)
        concatenated_input = np.swapaxes(concatenated_input, 1, 0)
        return self._train_step_internal(concatenated_input, adjacency, target_q)

    @tf.function
    def _train_step_internal(self, concatenated_input, adjacency, target_q):
        """
        Internal helper; the concatenation cannot be done inside a tf.function.
        """
        with tf.GradientTape() as tape:
            q_pred = self.model([concatenated_input, adjacency], training=True)
            td_loss = tf.math.square(target_q - q_pred)
            loss = tf.reduce_mean(td_loss)

        gradients = tape.gradient(loss, self.model.trainable_variables)
        local_clipped = u.clip_by_local_norm(gradients, self.clip_norm)
        self.optimizer.apply_gradients(zip(local_clipped, self.model.trainable_variables))

        return loss, td_loss

    def save(self, fp):
        self.model.save_weights(fp)

    def load(self, fp):
        self.model.load_weights(fp)
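A hedged smoke-test sketch for the GAT-based critic above; the agent count and feature/action sizes are illustrative assumptions, and a binary adjacency with self-loops stands in for the output of u.get_adj.

import numpy as np

# Illustrative setup: 3 agents, 12 observation features and 5 actions each.
obs_shape_n = [(12,)] * 3
act_shape_n = [(5,)] * 3
critic = MADDPGCriticNetwork(no_layers=2, units_per_layer=64, lr=1e-3,
                             obs_shape_n=obs_shape_n, act_shape_n=act_shape_n,
                             no_neighbors=2, wd=1e-5)
batch = 16
obs_n = np.random.rand(3, batch, 12).astype(np.float32)    # (no_agents, batch, no_features)
act_n = np.random.rand(3, batch, 5).astype(np.float32)     # (no_agents, batch, no_actions)
adj = np.tile(np.eye(3, dtype=np.float32), (batch, 1, 1))  # binary adjacency with self-loops
target_q = np.random.rand(batch, 1).astype(np.float32)

q = critic.predict(obs_n, act_n, adj)                       # (batch, 1)
loss, td_loss = critic.train_step(obs_n, act_n, adj, target_q)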