def __init__(self,
                 obs_space_n,
                 act_space_n,
                 agent_index,
                 lr,
                 no_layers,
                 num_units,
                 tau,
                 noise=0.0,
                 use_ounoise=False,
                 logger=None):
        """
        Build the policy network of one agent together with its target copy.

        Args:
            obs_space_n: sequence of observation spaces; the first entry must
                be a ``Space`` instance.
            act_space_n: sequence of action spaces; the type of its first
                entry is forwarded to the networks as ``act_type``.
            agent_index: index used to pick this agent's action shape; also
                forwarded to the networks and stored on the instance.
            lr, no_layers, num_units: forwarded unchanged to
                ``MADDPGPolicyNetwork``.
            tau: stored on the instance as ``self.tau``.
            noise, use_ounoise: forwarded unchanged to both policy networks.
            logger: stored on the instance as ``self.logger``.
        """
        self.logger = logger
        assert isinstance(obs_space_n[0], Space)
        obs_shape_n = u.space_n_to_shape_n(obs_space_n)
        act_shape_n = u.space_n_to_shape_n(act_space_n)
        act_type = type(act_space_n[0])

        # Both networks are built from one argument tuple so the target is
        # always configured exactly like the online policy (in particular it
        # receives `noise`, not `use_ounoise`, in the noise position).
        policy_args = (no_layers, num_units, lr, obs_shape_n,
                       act_shape_n[agent_index], act_type, agent_index,
                       noise, use_ounoise)
        self.policy = MADDPGPolicyNetwork(*policy_args)
        self.policy_target = MADDPGPolicyNetwork(*policy_args)
        # Start the target network from the online network's weights.
        self.policy_target.model.set_weights(self.policy.model.get_weights())

        self.agent_index = agent_index
        self.tau = tau
    def __init__(self, obs_space_n, act_space_n, agent_index, batch_size,
                 buff_size, lr, num_layer, num_units, gamma, tau,
                 prioritized_replay=False, alpha=0.6, max_step=None,
                 initial_beta=0.6, prioritized_replay_eps=1e-6, _run=None):
        """
        Create the critic, the actor and their target copies for one
        Multi-Agent DDPG agent, after initialising the parent class with the
        buffer / prioritised-replay settings.
        """
        self._run = _run

        assert isinstance(obs_space_n[0], Space)
        obs_shapes = space_n_to_shape_n(obs_space_n)
        act_shapes = space_n_to_shape_n(act_space_n)
        super().__init__(buff_size, obs_shapes, act_shapes, batch_size,
                         prioritized_replay, alpha, max_step, initial_beta,
                         prioritized_replay_eps=prioritized_replay_eps)

        space_cls = type(act_space_n[0])

        # Critic and target critic share one set of constructor arguments;
        # the target starts from the critic's weights.
        critic_args = (num_layer, num_units, lr, obs_shapes, act_shapes,
                       space_cls, agent_index)
        self.critic = MADDPGCriticNetwork(*critic_args)
        self.critic_target = MADDPGCriticNetwork(*critic_args)
        critic_weights = self.critic.model.get_weights()
        self.critic_target.model.set_weights(critic_weights)

        # Same pattern for the actor; both actors receive the online critic.
        policy_args = (num_layer, num_units, lr, obs_shapes,
                       act_shapes[agent_index], space_cls, 1, self.critic,
                       agent_index)
        self.policy = MADDPGPolicyNetwork(*policy_args)
        self.policy_target = MADDPGPolicyNetwork(*policy_args)
        policy_weights = self.policy.model.get_weights()
        self.policy_target.model.set_weights(policy_weights)

        self.batch_size = batch_size
        self.agent_index = agent_index
        self.decay = gamma
        self.tau = tau
    def __init__(self, no_neighbors, obs_space_n, act_space_n, agent_index, batch_size, buff_size, lr, num_layer,
                 num_units, gamma,
                 tau, prioritized_replay=False, alpha=0.6, max_step=None, initial_beta=0.6, prioritized_replay_eps=1e-6,
                 logger=None, history_size=0, noise=0.0, use_ounoise=False, temporal_mode='rnn'):
        """
        An object containing critic, actor and training functions for Multi-Agent DDPG.

        Args:
            no_neighbors: stored on the instance, used to build ``self.k_lst``
                (the integers ``2 .. no_neighbors + 1``) and forwarded to the
                critic networks.
            obs_space_n: sequence of observation spaces; the first entry must
                be a ``Space`` instance.
            act_space_n: sequence of action spaces; the type of its first
                entry is forwarded to the networks as ``act_type``.
            agent_index: index used to pick this agent's action shape; also
                forwarded to the networks and stored on the instance.
            batch_size, buff_size, prioritized_replay, alpha, max_step,
            initial_beta, prioritized_replay_eps, history_size: forwarded to
                the parent class initialiser.
            lr, num_layer, num_units: forwarded to the critic and policy
                networks.
            gamma: stored as ``self.decay``.
            tau: stored as ``self.tau``.
            logger: stored as ``self.logger``.
            noise, use_ounoise, temporal_mode: forwarded to both policy
                networks.
        """
        self.logger = logger

        assert isinstance(obs_space_n[0], Space)
        obs_shape_n = space_n_to_shape_n(obs_space_n)
        act_shape_n = space_n_to_shape_n(act_space_n)

        self.no_neighbors = no_neighbors
        self.no_agents = len(obs_shape_n)
        self.no_features = obs_shape_n[0][0]
        # The action count comes from the action shapes, not the observation
        # shapes (which would merely duplicate `no_features`).
        self.no_actions = act_shape_n[0][0]
        self.k_lst = list(range(2, self.no_neighbors + 2))
        super().__init__(buff_size, obs_shape_n, act_shape_n, batch_size, prioritized_replay, alpha, max_step,
                         initial_beta, prioritized_replay_eps=prioritized_replay_eps, history_size=history_size)

        act_type = type(act_space_n[0])

        # Online and target critic are built from identical arguments; the
        # target then starts from the online critic's weights.
        critic_args = (no_neighbors, num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
        self.critic = MADDPGCriticNetwork(*critic_args)
        self.critic_target = MADDPGCriticNetwork(*critic_args)
        self.critic_target.model.set_weights(self.critic.model.get_weights())

        # Same for the actor; both actors are handed the online critic.
        policy_args = (history_size, num_layer, num_units, lr, obs_shape_n, act_shape_n[agent_index], act_type, 1,
                       self.critic, agent_index, noise, use_ounoise, temporal_mode)
        self.policy = MADDPGPolicyNetwork(*policy_args)
        self.policy_target = MADDPGPolicyNetwork(*policy_args)
        self.policy_target.model.set_weights(self.policy.model.get_weights())

        self.batch_size = batch_size
        self.agent_index = agent_index
        self.decay = gamma
        self.tau = tau
def main():
    """
    Run the training / display loop with one shared critic and per-agent
    policies.

    Reads its configuration from the module-level ``arglist``. Every step the
    agents act in the environment and the transition (using the first agent's
    reward as the shared reward) is stored in the replay buffer. Unless
    ``arglist.display`` is set, the critic and the agents are updated
    ``arglist.update_times`` times whenever the number of recorded episodes is
    a multiple of ``arglist.update_rate``.

    Returns:
        The value of ``logger.get_sacred_results()`` once more than
        ``arglist.no_episodes`` episodes have been recorded.
    """
    no_agents = arglist.no_agents
    u.create_seed(arglist.seed)
    env = make_env(arglist.scenario)
    logger = RLLogger(arglist.exp_name, env.n, env.n_adversaries,
                      arglist.save_rate, arglist)

    obs_shape_n = u.space_n_to_shape_n(env.observation_space)
    act_shape_n = u.space_n_to_shape_n(env.action_space)
    # Result paths
    model_path = os.path.join("results", arglist.exp_name, 'models')
    os.makedirs(model_path, exist_ok=True)
    critic_fp = os.path.join(model_path, 'critic.h5')
    critic_target_fp = os.path.join(model_path, 'critic_target.h5')

    critic = MADDPGCriticNetwork(arglist.no_layers,
                                 arglist.no_critic_neurons,
                                 arglist.lr,
                                 obs_shape_n,
                                 act_shape_n,
                                 wd=1e-5)
    critic_target = MADDPGCriticNetwork(arglist.no_layers,
                                        arglist.no_critic_neurons,
                                        arglist.lr,
                                        obs_shape_n,
                                        act_shape_n,
                                        wd=1e-5)
    critic_target.model.set_weights(critic.model.get_weights())

    agents = get_agents(env, arglist.lr, arglist.no_layers,
                        arglist.no_actor_neurons, arglist.tau, arglist.noise,
                        arglist.use_ounoise, logger)

    obs_n = env.reset()
    replay_buffer = EfficientReplayBuffer(int(arglist.max_buffer_size),
                                          no_agents, obs_shape_n,
                                          act_shape_n)  # Init Buffer
    # Load previous results if necessary
    if arglist.restore_fp:
        print('Loading previous state...')
        for ag_idx, agent in enumerate(agents):
            fp = os.path.join(model_path, 'agent_{}'.format(ag_idx))
            agent.load(fp)
        critic.load(critic_fp)
        critic_target.load(critic_target_fp)

    print('Starting iterations...')
    while True:
        logger.episode_step += 1
        action_n = [
            agent.action(obs.astype(np.float32)).numpy()
            for agent, obs in zip(agents, obs_n)
        ]
        new_obs_n, rew_n, done_n, _ = env.step(action_n)
        # All agents are credited with the first agent's reward.
        cooperative_reward = rew_n[0]
        terminal = (logger.episode_step >= arglist.max_episode_len)
        done = all(done_n) or terminal
        # collect experience
        replay_buffer.add(obs_n, action_n, cooperative_reward, new_obs_n, done)
        obs_n = new_obs_n

        if done:
            obs_n = env.reset()
            logger.record_episode_end(agents, arglist.display)
            # The per-episode step counter lives on the logger (see the
            # increment above), so it is reset there. If record_episode_end
            # already resets it, this assignment is a harmless no-op.
            logger.episode_step = 0

        # NOTE(review): on a terminal step this runs after
        # record_episode_end, so the last reward is presumably booked on the
        # episode that has just started - confirm this is intended.
        for ag_idx, _ in enumerate(rew_n):
            logger.cur_episode_reward += cooperative_reward
            logger.agent_rewards[ag_idx][-1] += cooperative_reward

        logger.train_step += 1
        train_cond = not arglist.display

        if train_cond and len(replay_buffer) > arglist.batch_size:
            # only update every `arglist.update_rate` episodes
            if len(logger.episode_rewards) % arglist.update_rate == 0:
                for _ in range(arglist.update_times):
                    # Sample: Shapes --> (no-agents, batch_size, features)
                    state, actions, rewards, new_state, dones = replay_buffer.sample(
                        arglist.batch_size)
                    target_act_next = [
                        a.target_action(obs)
                        for a, obs in zip(agents, new_state)
                    ]
                    target_q_next = critic_target.predict(
                        new_state, target_act_next)
                    # One-step TD target; terminal transitions do not
                    # bootstrap from the next state.
                    q_train_target = rewards + (
                        1. - dones) * arglist.gamma * target_q_next

                    loss, td_loss = critic.train_step(state, actions,
                                                      q_train_target)
                    logger.save_logger("critic_loss", np.mean(td_loss),
                                       logger.train_step, 0)
                    update_target_networks(critic, critic_target)
                    critic.save(critic_fp)
                    critic_target.save(critic_target_fp)
                    for agent in agents:
                        agent.update(state, actions, critic,
                                     logger.train_step)

        # for displaying learned policies
        if arglist.display:
            time.sleep(0.1)
            env.render()

        # saves logger outputs to a file similar to the way in the original MADDPG implementation
        if len(logger.episode_rewards) > arglist.no_episodes:
            logger.experiment_end()
            return logger.get_sacred_results()
# Example #5
# 0
def main(arglist):
    """
    Run the training / display loop for a single Q-network shared by all
    agents, with a target copy of that network.

    Each step the model's predictions for all agents are turned into actions,
    the transition is stored together with the adjacency structure returned by
    ``u.get_adj``, and - unless ``arglist.display`` is set - the model is
    trained on sampled batches. Per-agent Q-values are summed over the agent
    dimension (VDN-style) before the TD loss is computed. The loop has no exit
    condition of its own.

    Args:
        arglist: configuration object; the attributes read here are
            ``scenario``, ``no_agents``, ``batch_size``, ``no_neighbors``,
            ``seed``, ``lr``, ``exp_name``, ``max_buffer_size``,
            ``max_episode_len``, ``display``, ``update_rate``,
            ``update_times``, ``gamma``, ``loss_type``, ``save_rate``,
            ``soft_update`` and ``tau``.

    Raises:
        RuntimeError: if ``arglist.loss_type`` contains neither "huber" nor
            "mse".
    """
    # These globals are assigned here before `__build_conf()` is called.
    global no_actions, no_features, no_agents
    env = u.make_env(arglist.scenario, arglist.no_agents)

    obs_shape_n = env.observation_space
    act_shape_n = env.action_space
    act_shape_n = u.space_n_to_shape_n(act_shape_n)
    no_agents = env.n
    batch_size = arglist.batch_size
    no_neighbors = arglist.no_neighbors
    k_lst = list(range(no_neighbors + 2))[2:]  # [2,3]
    u.create_seed(arglist.seed)

    noise_mode = OUNoise(act_shape_n[0], scale=1.0)
    noise = 0.1
    reduction_noise = 0.999
    # Velocity.x Velocity.y Pos.x Pos.y {Land.Pos.x Land.Pos.y}*10 {Ent.Pos.x Ent.Pos.y}*9
    no_features = obs_shape_n[0].shape[0]
    no_actions = act_shape_n[0][0]

    model, model_t = __build_conf()
    optimizer = AdamW(learning_rate=arglist.lr, weight_decay=1e-5)

    # Results
    episode_rewards = [0.0]  # sum of rewards for all agents
    result_path = os.path.join("results", arglist.exp_name)
    # NOTE(review): the file name starts with a space (" <exp_name>.csv");
    # kept as-is because existing tooling may read that exact name - confirm.
    res = os.path.join(result_path, " %s.csv" % arglist.exp_name)
    os.makedirs(result_path, exist_ok=True)

    replay_buffer = ReplayBuffer(arglist.max_buffer_size)  # Init Buffer
    episode_step = 0
    train_step = 0

    t_start = time.time()
    obs_n = env.reset()
    adj = u.get_adj(obs_n, k_lst, no_agents, is_gcn=True)

    print('Starting iterations...')
    while True:
        episode_step += 1
        terminal = (episode_step >= arglist.max_episode_len)
        # The adjacency structure is only refreshed every third step.
        if episode_step % 3 == 0:
            adj = u.get_adj(obs_n, k_lst, no_agents, is_gcn=True)

        predictions = get_predictions(u.to_tensor(np.array(obs_n)), adj, model)
        actions = get_actions(predictions, noise, noise_mode)
        # Observe next state, reward and done value
        new_obs_n, rew_n, done_n, _ = env.step(actions)
        done = all(done_n) or terminal
        # All agents are credited with the first agent's reward.
        cooperative_reward = rew_n[0]
        # Store the data in the replay memory
        replay_buffer.add(obs_n, adj, actions, cooperative_reward, new_obs_n,
                          done)
        obs_n = new_obs_n

        episode_rewards[-1] += cooperative_reward

        # `done` already includes the episode-length cut-off.
        if done:
            obs_n = env.reset()
            episode_step = 0
            episode_rewards.append(0)

        # increment global step counter
        train_step += 1

        # for displaying learned policies; nothing below runs in display mode
        if arglist.display:
            time.sleep(0.1)
            env.render()
            continue

        # Train the models
        if len(replay_buffer) > arglist.batch_size:
            # only update every `arglist.update_rate` episodes
            if len(episode_rewards) % arglist.update_rate == 0:
                for _ in range(arglist.update_times):
                    state, adj_n, act_batch, rewards, new_state, dones = replay_buffer.sample(
                        batch_size)
                    noise *= reduction_noise

                    # Calculate TD-target. This needs no gradients, so it is
                    # computed outside the tape.
                    target_q_values = model_t([new_state, adj_n])
                    # Apply max(Q) to obtain the TD-target
                    target_q_tot = tf.reduce_max(target_q_values, axis=-1)
                    # Apply VDN to reduce the agent-dimension
                    max_q_tot = tf.reduce_sum(target_q_tot, axis=-1)
                    y = tf.stop_gradient(
                        rewards + (1. - dones) * arglist.gamma * max_q_tot)

                    # Mask selecting the Q-value of the action actually taken.
                    action_one_hot = tf.one_hot(
                        tf.argmax(act_batch, axis=2, name='action_one_hot'),
                        no_actions)

                    with tf.GradientTape() as tape:
                        # Predictions
                        q_values = model([state, adj_n])
                        q_tot = tf.reduce_sum(q_values * action_one_hot,
                                              axis=-1,
                                              name='q_acted')
                        pred = tf.reduce_sum(q_tot, axis=1)
                        if "huber" in arglist.loss_type:
                            loss = tf.reduce_sum(u.huber_loss(pred, y))
                        elif "mse" in arglist.loss_type:
                            loss = tf.losses.mean_squared_error(pred, y)
                        else:
                            raise RuntimeError(
                                "Loss function should be either Huber or MSE. %s found!"
                                % arglist.loss_type)

                    # Gradients are taken after leaving the tape context so
                    # the backward pass itself is not recorded.
                    gradients = tape.gradient(loss, model.trainable_variables)
                    local_clipped = u.clip_by_local_norm(gradients, 0.1)
                    optimizer.apply_gradients(
                        zip(local_clipped, model.trainable_variables))
                    tf.saved_model.save(model, result_path)

            # display training output
            if train_step % arglist.save_rate == 0:
                # eval_reward = get_eval_reward(env, model)
                mes_dict = {
                    "steps":
                    train_step,
                    "episodes":
                    len(episode_rewards),
                    "train_episode_reward":
                    np.round(np.mean(episode_rewards[-arglist.save_rate:]),
                             3),
                    # "eval_episode_reward": np.round(np.mean(eval_reward), 3),
                    "time":
                    round(time.time() - t_start, 3)
                }
                print(mes_dict)
                # One tab-terminated field per value, one line per record;
                # the `with` block closes the file.
                with open(res, "a+") as f:
                    for item in mes_dict.values():
                        f.write("%s\t" % item)
                    f.write("\n")
                t_start = time.time()

        # train target model
        if arglist.soft_update:
            # Soft update: blend the target weights towards the online ones.
            blended = [
                arglist.tau * w + (1 - arglist.tau) * w_t
                for w, w_t in zip(model.get_weights(), model_t.get_weights())
            ]
            model_t.set_weights(blended)
        elif terminal and train_step % 200 == 0:
            # Hard update: copy the online weights into the target model.
            model_t.set_weights(model.get_weights())