Exemple #1
0
def get_eval_reward(env, model):
    """Roll out three evaluation episodes and return their accumulated rewards.

    Every episode begins with ``env.reset()`` and runs for at most
    ``arglist.max_episode_len`` steps; it ends early as soon as all entries of
    the done flags returned by ``env.step`` are truthy.  The output of
    ``get_predictions`` has its leading axis squeezed away and is handed to
    the environment as a NumPy array.

    Args:
        env: Environment providing ``reset()`` and ``step(actions)``.
        model: Model forwarded to ``get_predictions``.

    Returns:
        list: Three values, one per episode, each the sum of ``rew_n[0]``
        over the steps taken in that episode.
    """

    def _run_episode():
        # Play a single episode and hand back its accumulated reward.
        observations = env.reset()
        total = 0
        for _step in range(arglist.max_episode_len):
            obs_tensor = u.to_tensor(np.array(observations))
            action_tensor = tf.squeeze(get_predictions(obs_tensor, model),
                                       axis=0)
            # The observation returned by the step becomes the next input.
            observations, rewards, dones, _info = env.step(
                action_tensor.numpy())
            # Only the first reward entry is tracked.
            total += rewards[0]
            if all(dones):
                return total
        return total

    return [_run_episode() for _episode in range(3)]
Exemple #2
0
def get_eval_reward(env, model):
    """Run three evaluation episodes with adjacency input and return rewards.

    The adjacency passed to ``get_predictions`` is built by ``u.get_adj`` from
    the current observations, first after ``env.reset()`` and then after every
    environment step.  An episode lasts at most ``arglist.max_episode_len``
    steps and now stops as soon as every entry of ``done_n`` is truthy, so a
    finished environment is no longer stepped.

    Args:
        env: Environment providing ``reset()`` and ``step(actions)``.
        model: Model forwarded to ``get_predictions``.

    Returns:
        list: One value per episode, each the sum of ``rew_n[0]`` over the
        steps taken in that episode.
    """
    # Values 2 .. no_neighbors + 1, e.g. [2, 3] when no_neighbors == 2.
    k_lst = list(range(2, arglist.no_neighbors + 2))
    reward_total = []
    for _episode in range(3):
        obs_n = env.reset()
        adj = u.get_adj(obs_n, k_lst, no_agents, is_gcn=True)
        reward = 0
        for _step in range(arglist.max_episode_len):
            predictions = get_predictions(u.to_tensor(np.array(obs_n)), adj,
                                          model)
            predictions = tf.squeeze(predictions, axis=0)

            # Observe next state, reward and done value
            obs_n, rew_n, done_n, _info = env.step(predictions.numpy())
            reward += rew_n[0]
            if all(done_n):
                # Episode finished: do not keep stepping the environment.
                break
            # Refresh the adjacency so it matches the observations that will
            # be fed to the model on the next step.
            adj = u.get_adj(obs_n, k_lst, no_agents, is_gcn=True)
        reward_total.append(reward)
    return reward_total
Exemple #3
0
def get_eval_reward(env, model):
    """Run three evaluation episodes on history-stacked observations.

    Observations returned by ``env.reset()`` are reshaped with
    ``u.reshape_state`` using ``arglist.history_size``; after each step the
    history is updated through ``u.refresh_history``.  Actions are the argmax
    over the last axis of each entry of the model output.  An episode lasts at
    most ``arglist.max_episode_len`` steps and now stops as soon as every
    entry of ``done_n`` is truthy, so a finished environment is no longer
    stepped.

    Args:
        env: Environment providing ``reset()`` and ``step(actions)``.
        model: Model forwarded to ``get_predictions``.

    Returns:
        list: One value per episode, each the sum of ``rew_n[0]`` over the
        steps taken in that episode.
    """
    reward_total = []
    for _episode in range(3):
        obs_n = u.reshape_state(env.reset(), arglist.history_size)
        reward = 0
        for _step in range(arglist.max_episode_len):
            predictions = get_predictions(u.to_tensor(np.array(obs_n)), model)
            predictions = tf.squeeze(predictions, axis=0)
            actions = [
                tf.argmax(prediction, axis=-1).numpy()
                for prediction in predictions
            ]

            # Observe next state, reward and done value
            new_obs_n, rew_n, done_n, _info = env.step(actions)
            reward += rew_n[0]
            if all(done_n):
                # Episode finished: do not keep stepping the environment.
                break
            # Work on a copy so the previous history object is not passed to
            # refresh_history directly.
            obs_n = u.refresh_history(np.copy(obs_n), new_obs_n)
        reward_total.append(reward)
    return reward_total
def main(arglist):
    """Load a saved model and evaluate it episode after episode, forever.

    The model is loaded from ``results/mixing/<exp_name>``.  After every
    episode one tab-separated row (episode count, rounded episode reward) is
    appended to ``asymptotic/<exp_name>/testing_rewards.csv`` and printed.
    The loop has no exit condition; the process must be stopped externally.

    Args:
        arglist: Parsed options.  Attributes read here: ``scenario``,
            ``no_agents``, ``num_neighbors``, ``exp_name``, ``use_gat``,
            ``use_gcn``, ``use_rnn``, ``history_size``, ``max_episode_len``
            and ``display``.
    """
    # Published as module globals; set here once the environment is known.
    global num_actions, feature_dim, no_agents

    # Create environment
    env = u.make_env(arglist.scenario, no_agents=arglist.no_agents)
    env.discrete_action_input = True

    obs_shape_n = env.observation_space
    no_agents = env.n
    no_neighbors = arglist.num_neighbors
    u.create_seed(3)
    k_lst = list(range(no_neighbors + 2))[2:]  # [2,3]

    # Velocity.x Velocity.y Pos.x Pos.y {Land.Pos.x Land.Pos.y}*10 {Ent.Pos.x Ent.Pos.y}*9
    num_features = obs_shape_n[0].shape[0]
    num_actions = env.action_space[0].n
    feature_dim = num_features  # the size of node features

    result_path = "results/mixing/" + arglist.exp_name
    os.makedirs(result_path, exist_ok=True)
    model = keras.models.load_model(result_path)

    res_dir = "asymptotic/" + arglist.exp_name
    os.makedirs(res_dir, exist_ok=True)
    res = "asymptotic/" + arglist.exp_name + "/testing_rewards.csv"

    uses_graph = arglist.use_gat or arglist.use_gcn

    def build_adj(observations):
        # Adjacency for the given (non history-stacked) observations.
        return u.get_adj(observations,
                         k_lst,
                         no_agents,
                         is_gat=arglist.use_gat,
                         is_gcn=arglist.use_gcn)

    episode_rewards = []

    while True:
        obs_n = env.reset()
        adj = build_adj(obs_n) if uses_graph else None
        if arglist.use_rnn:
            obs_n = u.reshape_state(obs_n, arglist.history_size)
        episode_rewards.append(0)

        for _step in range(arglist.max_episode_len):
            predictions = get_predictions(u.to_tensor(np.array(obs_n)), adj,
                                          model)
            predictions = tf.squeeze(predictions, axis=0)

            actions = [
                tf.argmax(prediction, axis=-1).numpy()
                for prediction in predictions
            ]

            # Observe next state, reward and done value
            new_obs_n, rew_n, done_n, _info = env.step(actions)
            if uses_graph:
                # Build the adjacency from the observations just returned by
                # the environment (previously the stale pre-step observations
                # were used), and do so before they are folded into the
                # history so the input matches what is used after reset().
                adj = build_adj(new_obs_n)
            if arglist.use_rnn:
                new_obs_n = u.refresh_history(np.copy(obs_n), new_obs_n)
            obs_n = new_obs_n
            episode_rewards[-1] += sum(rew_n)

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.5)
                env.render()

        # The context manager closes the file; no explicit close() needed.
        with open(res, "a+") as f:
            mes_dict = {
                "episodes": len(episode_rewards),
                "train_episode_reward": np.round(np.mean(episode_rewards[-1]),
                                                 3)
            }
            print(mes_dict)
            for item in mes_dict.values():
                f.write("%s\t" % item)
            f.write("\n")
Exemple #5
0
def main(arglist):
    """Train the graph-input value network with a VDN-style summed Q target.

    Runs an endless interaction loop: act in the environment, store the
    transition (including the adjacency used) in the replay buffer, run
    gradient updates when the update condition holds, periodically append a
    tab-separated progress row to the results file, and update the target
    network.  The loop has no exit condition; stop the process externally.

    Args:
        arglist: Parsed options.  Attributes read here: ``scenario``,
            ``no_agents``, ``batch_size``, ``no_neighbors``, ``seed``, ``lr``,
            ``exp_name``, ``max_buffer_size``, ``max_episode_len``,
            ``display``, ``update_rate``, ``update_times``, ``gamma``,
            ``loss_type``, ``save_rate``, ``soft_update`` and ``tau``.

    Raises:
        RuntimeError: If ``arglist.loss_type`` contains neither ``"huber"``
            nor ``"mse"`` when a gradient update is attempted.
    """
    # Published as module globals; set here once the environment is known.
    global no_actions, no_features, no_agents
    env = u.make_env(arglist.scenario, arglist.no_agents)

    obs_shape_n = env.observation_space
    act_shape_n = u.space_n_to_shape_n(env.action_space)
    no_agents = env.n
    batch_size = arglist.batch_size
    no_neighbors = arglist.no_neighbors
    k_lst = list(range(no_neighbors + 2))[2:]  # [2,3]
    u.create_seed(arglist.seed)

    noise_mode = OUNoise(act_shape_n[0], scale=1.0)
    noise = 0.1
    reduction_noise = 0.999
    # Velocity.x Velocity.y Pos.x Pos.y {Land.Pos.x Land.Pos.y}*10 {Ent.Pos.x Ent.Pos.y}*9
    no_features = obs_shape_n[0].shape[0]
    no_actions = act_shape_n[0][0]

    model, model_t = __build_conf()
    optimizer = AdamW(learning_rate=arglist.lr, weight_decay=1e-5)

    # Results
    episode_rewards = [0.0]  # sum of rewards for all agents
    result_path = os.path.join("results", arglist.exp_name)
    # NOTE(review): the file name starts with a space (" <exp_name>.csv");
    # kept as-is so existing result files keep their name - confirm intent.
    res = os.path.join(result_path, " %s.csv" % arglist.exp_name)
    os.makedirs(result_path, exist_ok=True)

    replay_buffer = ReplayBuffer(arglist.max_buffer_size)  # Init Buffer
    episode_step = 0
    train_step = 0

    t_start = time.time()
    obs_n = env.reset()
    adj = u.get_adj(obs_n, k_lst, no_agents, is_gcn=True)

    print('Starting iterations...')
    while True:
        episode_step += 1
        terminal = (episode_step >= arglist.max_episode_len)
        # Within an episode the adjacency is only refreshed every third step.
        if episode_step % 3 == 0:
            adj = u.get_adj(obs_n, k_lst, no_agents, is_gcn=True)

        predictions = get_predictions(u.to_tensor(np.array(obs_n)), adj, model)
        actions = get_actions(predictions, noise, noise_mode)
        # Observe next state, reward and done value
        new_obs_n, rew_n, done_n, _info = env.step(actions)
        done = all(done_n) or terminal
        cooperative_reward = rew_n[0]
        # Store the data in the replay memory
        replay_buffer.add(obs_n, adj, actions, cooperative_reward, new_obs_n,
                          done)
        obs_n = new_obs_n

        episode_rewards[-1] += cooperative_reward

        if done:  # `done` already includes `terminal`
            obs_n = env.reset()
            # A new episode needs an adjacency built from its own initial
            # observations; otherwise its first steps would reuse the graph
            # of the previous episode.
            adj = u.get_adj(obs_n, k_lst, no_agents, is_gcn=True)
            episode_step = 0
            episode_rewards.append(0)

        # increment global step counter
        train_step += 1

        # for displaying learned policies
        if arglist.display:
            time.sleep(0.1)
            env.render()
            continue

        # Train the models (display mode never reaches this point)
        if len(replay_buffer) > arglist.batch_size:
            # NOTE(review): this holds on every step while the episode count
            # is a multiple of update_rate, not once per such episode -
            # confirm that is intended.
            if len(episode_rewards) % arglist.update_rate == 0:
                for _ in range(arglist.update_times):
                    (state, adj_n, batch_actions, rewards, new_state,
                     dones) = replay_buffer.sample(batch_size)
                    noise *= reduction_noise

                    # Calculate TD-target outside the tape: it is a constant
                    # for the optimisation and need not be recorded.
                    target_q_values = model_t([new_state, adj_n])
                    # Apply max(Q) to obtain the TD-target
                    target_q_tot = tf.reduce_max(target_q_values, axis=-1)
                    # Apply VDN to reduce the agent-dimension
                    max_q_tot = tf.reduce_sum(target_q_tot, axis=-1)
                    y = rewards + (1. - dones) * arglist.gamma * max_q_tot

                    with tf.GradientTape() as tape:
                        # Predictions
                        action_one_hot = tf.one_hot(
                            tf.argmax(batch_actions,
                                      axis=2,
                                      name='action_one_hot'), no_actions)
                        q_values = model([state, adj_n])
                        q_tot = tf.reduce_sum(q_values * action_one_hot,
                                              axis=-1,
                                              name='q_acted')
                        pred = tf.reduce_sum(q_tot, axis=1)
                        if "huber" in arglist.loss_type:
                            loss = tf.reduce_sum(
                                u.huber_loss(pred, tf.stop_gradient(y)))
                        elif "mse" in arglist.loss_type:
                            loss = tf.losses.mean_squared_error(
                                pred, tf.stop_gradient(y))
                        else:
                            raise RuntimeError(
                                "Loss function should be either Huber or MSE. %s found!"
                                % arglist.loss_type)

                    # Gradients are taken after leaving the tape context so
                    # the gradient computation itself is not recorded.
                    gradients = tape.gradient(loss, model.trainable_variables)
                    local_clipped = u.clip_by_local_norm(gradients, 0.1)
                    optimizer.apply_gradients(
                        zip(local_clipped, model.trainable_variables))
                # Save once per update round instead of after every single
                # gradient step; the saved weights at this point are the same.
                tf.saved_model.save(model, result_path)

            # display training output
            if train_step % arglist.save_rate == 0:
                # The context manager closes the file; no close() needed.
                with open(res, "a+") as f:
                    mes_dict = {
                        "steps":
                        train_step,
                        "episodes":
                        len(episode_rewards),
                        "train_episode_reward":
                        np.round(np.mean(episode_rewards[-arglist.save_rate:]),
                                 3),
                        "time":
                        round(time.time() - t_start, 3)
                    }
                    print(mes_dict)
                    for item in mes_dict.values():
                        f.write("%s\t" % item)
                    f.write("\n")
                t_start = time.time()

        # train target model
        if arglist.soft_update:
            # Blend: target <- tau * online + (1 - tau) * target
            target_weights = model_t.get_weights()
            for idx, online_w in enumerate(model.get_weights()):
                target_weights[idx] = arglist.tau * online_w + (
                    1 - arglist.tau) * target_weights[idx]
            model_t.set_weights(target_weights)
        elif terminal and train_step % 200 == 0:
            model_t.set_weights(model.get_weights())
Exemple #6
0
def main(arglist):
    """Train the history-input value network with a VDN-style summed Q target.

    Runs an endless interaction loop on history-stacked observations: choose
    actions through ``get_actions`` with the current ``epsilon``, store the
    transition in the replay buffer, recompute ``epsilon`` at each episode
    end, run a gradient update every 100 steps once the buffer can provide a
    sample, periodically append a tab-separated progress row to the results
    file, and call ``update_target_networks``.  The loop has no exit
    condition; stop the process externally.

    Args:
        arglist: Parsed options.  Attributes read here: ``scenario``,
            ``no_agents``, ``batch_size``, ``epsilon``, ``epsilon_decay``,
            ``min_epsilon``, ``max_epsilon``, ``seed``, ``lr``, ``exp_name``,
            ``max_buffer_size``, ``history_size``, ``max_episode_len``,
            ``decay_mode``, ``e_lin_decay``, ``display``, ``gamma``,
            ``loss_type`` and ``save_rate``.

    Raises:
        RuntimeError: If ``env.step`` fails for the chosen actions, or if
            ``arglist.loss_type`` contains neither ``"huber"`` nor ``"mse"``
            when a gradient update is attempted.
    """
    # Published as module globals; set here once the environment is known.
    global no_actions, no_features, no_agents
    env = u.make_env(arglist.scenario, arglist.no_agents)
    env.discrete_action_input = True

    obs_shape_n = env.observation_space
    no_agents = env.n
    batch_size = arglist.batch_size
    epsilon = arglist.epsilon
    epsilon_decay = arglist.epsilon_decay
    min_epsilon = arglist.min_epsilon
    max_epsilon = arglist.max_epsilon
    u.create_seed(arglist.seed)

    # Velocity.x Velocity.y Pos.x Pos.y {Land.Pos.x Land.Pos.y}*10 {Ent.Pos.x Ent.Pos.y}*9
    no_features = obs_shape_n[0].shape[0]
    no_actions = env.action_space[0].n
    model, model_t = __build_conf()
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer = tf.keras.optimizers.Adam(learning_rate=arglist.lr)
    # Results
    episode_rewards = [0.0]  # sum of rewards for all agents
    result_path = os.path.join("results", arglist.exp_name)
    res = os.path.join(result_path, "%s.csv" % arglist.exp_name)
    os.makedirs(result_path, exist_ok=True)
    replay_buffer = ReplayBuffer(arglist.max_buffer_size)  # Init Buffer
    episode_step = 0
    train_step = 0

    t_start = time.time()
    obs_n = env.reset()
    obs_n = u.reshape_state(obs_n, arglist.history_size)

    print('Starting iterations...')
    while True:
        episode_step += 1
        terminal = (episode_step >= arglist.max_episode_len)
        predictions = get_predictions(u.to_tensor(np.array(obs_n)), model)
        actions = get_actions(predictions, epsilon)

        # Observe next state, reward and done value
        try:
            new_obs_n, rew_n, done_n, _info = env.step(actions)
        except Exception as err:
            # Show the offending actions, then fail loudly.  Previously the
            # error object was created but never raised, so execution went on
            # with undefined or stale step results.
            print(actions)
            raise RuntimeError('Actions error!') from err
        new_obs_n = u.refresh_history(np.copy(obs_n), new_obs_n)
        done = all(done_n) or terminal
        cooperative_reward = rew_n[0]
        # Store the data in the replay memory
        replay_buffer.add(obs_n, actions, cooperative_reward, new_obs_n, done)
        obs_n = np.copy(new_obs_n)
        episode_rewards[-1] += cooperative_reward

        if done:  # `done` already includes `terminal`
            obs_n = env.reset()
            obs_n = u.reshape_state(obs_n, arglist.history_size)
            decay_mode = arglist.decay_mode.lower()
            if decay_mode == "linear":
                # straight line equation wrapper by max operation -> max(min_value,(-mx + b))
                epsilon = np.amax(
                    (min_epsilon,
                     -((max_epsilon - min_epsilon) * train_step /
                       arglist.max_episode_len) / arglist.e_lin_decay + 1.0))
            elif decay_mode == "exp":
                # exponential's function Const(e^-t) wrapped by a min function
                epsilon = np.amin(
                    (1, (min_epsilon + (max_epsilon - min_epsilon) *
                         np.exp(-(train_step / arglist.max_episode_len - 1) /
                                epsilon_decay))))
            else:
                epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(
                    -epsilon_decay * train_step / arglist.max_episode_len)
            episode_step = 0
            episode_rewards.append(0)

        # increment global step counter
        train_step += 1

        # for displaying learned policies
        if arglist.display:
            time.sleep(0.1)
            env.render()
            continue

        # Train the models
        if replay_buffer.can_provide_sample(
                batch_size, arglist.max_episode_len) and train_step % 100 == 0:
            state, batch_actions, rewards, new_state, dones = replay_buffer.sample(
                batch_size)

            # Calculate TD-target outside the tape: it is a constant for the
            # optimisation and need not be recorded.
            target_q_values = model_t(reformat_input(new_state))
            # Apply max(Q) to obtain the TD-target
            target_q_tot = tf.reduce_max(target_q_values, axis=-1)
            # Apply VDN to reduce the agent-dimension
            max_q_tot = tf.reduce_sum(target_q_tot, axis=-1)
            y = rewards + (1. - dones) * arglist.gamma * max_q_tot
            with tf.GradientTape() as tape:
                # Predictions
                action_one_hot = tf.one_hot(batch_actions,
                                            no_actions,
                                            name='action_one_hot')
                q_values = model(reformat_input(state))
                q_tot = tf.reduce_sum(q_values * action_one_hot,
                                      axis=-1,
                                      name='q_acted')
                pred = tf.reduce_sum(q_tot, axis=1)
                if "huber" in arglist.loss_type:
                    # Computing the Huber Loss
                    loss = tf.reduce_sum(
                        u.huber_loss(pred, tf.stop_gradient(y)))
                elif "mse" in arglist.loss_type:
                    # Computing the MSE loss
                    loss = tf.losses.mean_squared_error(
                        pred, tf.stop_gradient(y))
                else:
                    # Without this branch `loss` would be undefined below.
                    raise RuntimeError(
                        "Loss function should be either Huber or MSE. %s found!"
                        % arglist.loss_type)

            # Gradients are taken after leaving the tape context so the
            # gradient computation itself is not recorded.
            gradients = tape.gradient(loss, model.trainable_variables)
            local_clipped = u.clip_by_local_norm(gradients, 0.1)
            optimizer.apply_gradients(
                zip(local_clipped, model.trainable_variables))
            tf.saved_model.save(model, result_path)

            # display training output
            if train_step % arglist.save_rate == 0:
                # NOTE(review): evaluation is given the training env; if it
                # resets/steps that env, `obs_n` here no longer matches the
                # env state afterwards - confirm.
                eval_reward = get_eval_reward(env, model)
                # The context manager closes the file; no close() needed.
                with open(res, "a+") as f:
                    mes_dict = {
                        "steps":
                        train_step,
                        "episodes":
                        len(episode_rewards),
                        "train_episode_reward":
                        np.round(np.mean(episode_rewards[-arglist.save_rate:]),
                                 3),
                        "eval_episode_reward":
                        np.round(np.mean(eval_reward), 3),
                        "loss":
                        round(loss.numpy(), 3),
                        "time":
                        round(time.time() - t_start, 3)
                    }
                    print(mes_dict)
                    for item in mes_dict.values():
                        f.write("%s\t" % item)
                    f.write("\n")
                t_start = time.time()

        # train target model
        update_target_networks(model, model_t)