Beispiel #1
0
def train(arglist, restore_model_number):
    """Run multi-task (transfer) training inside a single-threaded TF session.

    A reference environment is created to obtain observation shapes and the
    agent count, and one set of trainers (``actor_0``, name prefix
    ``"actor_"``) is built from it. Then, for each of
    ``arglist.num_task_transfer`` tasks, a separate environment is created
    together with two trainer lists obtained from ``get_trainers``: one with
    ``type=1`` (stored in ``critic_list``) and one with ``type=2`` (stored in
    ``actor_list``).

    The main loop visits the tasks round-robin. On each visit it takes one
    environment step using actions from ``actor_0``, hands the transition to
    the task's critics, calls ``preupdate``/``update`` on the critics and
    ``update`` on the task's actors, and records metrics. When an episode ends
    it draws per-episode plots through ``draw_util``, appends a line to a
    per-task text log and resets the environment. The loop stops once the
    episode number exceeds ``arglist.num_train_episodes``.

    Args:
        arglist: Options object. Attributes read here: ``mp``,
            ``num_task_transfer``, ``save_dir``, ``scenario``,
            ``reward_type``, ``test_data_dir``, ``test_data_name``,
            ``num_adversaries``, ``good_policy``, ``adv_policy``,
            ``num_train_episodes``, ``save_rate``, ``load_dir``,
            ``transfer_restore``, ``history_length``, ``max_episode_len``,
            ``pictures_dir_transfer_train``, ``draw_picture_train``,
            ``plots_dir`` and ``exp_name``. ``arglist.load_dir`` is
            overwritten in place when it is the empty string.
        restore_model_number: Converted with ``str()`` and used as the
            sub-directory of ``arglist.save_dir`` when building the default
            checkpoint path.

    Returns:
        None.

    Side effects:
        Creates ``arglist.save_dir`` if missing, prints progress, appends to
        ``task_<i>_train_info.txt`` files, calls the ``draw_util`` plotting
        helpers and writes ``*_rewards.pkl`` / ``*_agrewards.pkl`` files under
        ``arglist.plots_dir``.
    """
    # Set to True to print timings via the time_begin/time_end helpers
    # (defined outside this function).
    debug = False
    # NOTE(review): multi_process is read but never used in this function.
    multi_process = arglist.mp
    num_tasks = arglist.num_task_transfer  # total number of tasks
    list_of_taskenv = []  # env list
    save_path = arglist.save_dir
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    with U.single_threaded_session():
        sess = tf.get_default_session()
        if debug:
            begin = time_begin()
        # 1.1 Create a reference env (map file with suffix "_1.h5"); it
        # supplies observation shapes and the agent count (env.n) below.
        env = make_env(arglist.scenario, reward_type=arglist.reward_type)
        env.set_map(
            sample_map(arglist.test_data_dir + arglist.test_data_name +
                       "_1.h5"))

        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        # Trainers built with type=0 under the "actor_" prefix; these are the
        # ones queried for actions in the training loop.
        actor_0 = get_trainers(env,
                               "actor_",
                               num_adversaries,
                               obs_shape_n,
                               arglist,
                               type=0,
                               session=sess)

        # 1.2 Containers for the per-task critic and actor trainers
        critic_list = []  # critics of all tasks
        actor_list = []

        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # 1.2 Initialize variables that persist across episodes
        episodes_rewards = [[0.0] for _ in range(num_tasks)
                            ]  # each element: sum of all agents' rewards in one episode
        # agent_rewards[task][agent] holds, per episode, the sum of that
        # single agent's rewards
        agent_rewards = [[[0.0] for _ in range(env.n)]
                         for _ in range(num_tasks)]
        final_ep_rewards = [[] for _ in range(num_tasks)
                            ]  # sum of rewards for training curve
        final_ep_ag_rewards = [[] for _ in range(num_tasks)
                               ]  # agent rewards for training curve

        # Per-task lists that receive one value at the end of every episode.
        energy_consumptions_for_test = [[] for _ in range(num_tasks)]
        j_index = [[] for _ in range(num_tasks)]
        aver_cover = [[] for _ in range(num_tasks)]
        instantaneous_dis = [[] for _ in range(num_tasks)]
        instantaneous_out_the_map = [[] for _ in range(num_tasks)]
        energy_efficiency = [[] for _ in range(num_tasks)]
        instantaneous_accmulated_reward = [[] for _ in range(num_tasks)]

        model_number = int(arglist.num_train_episodes / arglist.save_rate)
        # NOTE(review): this Saver is constructed before the per-task trainers
        # (section 1.4) exist, and it is never used later in this function
        # (the save call in the loop is commented out). Confirm whether it is
        # still needed and which variables it is meant to cover.
        saver = tf.train.Saver(max_to_keep=model_number)

        # 1.3 Initialize per-episode (local) variables
        global_steps = np.zeros(num_tasks)
        local_steps = np.zeros(num_tasks)  # local timesteps for each env
        energy_one_episode = [[] for _ in range(num_tasks)]
        j_index_one_episode = [[] for _ in range(num_tasks)]
        aver_cover_one_episode = [[] for _ in range(num_tasks)]
        over_map_counter = np.zeros(num_tasks)
        over_map_one_episode = [[] for _ in range(num_tasks)]
        disconnected_number_counter = np.zeros(num_tasks)
        disconnected_number_one_episode = [[] for _ in range(num_tasks)]
        episode_reward_step = np.zeros(
            num_tasks)  # running sum over an episode of the per-step mean reward across agents
        accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
        # NOTE(review): route_one_episode is appended to but never read here.
        route_one_episode = [[] for _ in range(num_tasks)]

        if debug:
            print(time_end(begin, "step3"))
            begin = time_begin()

        # 1.4 Load checkpoints
        # Default checkpoint path: <save_path>/<restore_model_number>/model.ckpt
        if arglist.load_dir == "":
            arglist.load_dir = os.path.join(save_path,
                                            str(restore_model_number),
                                            "model.ckpt")
        # NOTE(review): the state is loaded before the per-task trainers are
        # created and before U.initialize() below. Whether the restored values
        # survive U.initialize() depends on U's implementation - verify.
        if arglist.transfer_restore:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        # Build one environment plus two trainer lists per task. Both lists
        # share the name prefix "task_<i+1>_"; type=1 trainers receive actor_0
        # via `actors`, type=2 trainers receive actor_env_name="actor_".
        for i in range(num_tasks):
            list_of_taskenv.append(
                make_env(arglist.scenario, reward_type=arglist.reward_type))
            critic_trainers = get_trainers(list_of_taskenv[i],
                                           "task_" + str(i + 1) + "_",
                                           num_adversaries,
                                           obs_shape_n,
                                           arglist,
                                           actors=actor_0,
                                           type=1,
                                           session=sess)
            actor_trainers = get_trainers(list_of_taskenv[i],
                                          "task_" + str(i + 1) + "_",
                                          num_adversaries,
                                          obs_shape_n,
                                          arglist,
                                          actor_env_name="actor_",
                                          type=2,
                                          session=sess)
            actor_list.append(actor_trainers)
            critic_list.append(critic_trainers)

        # Initialize
        U.initialize()
        # Print every trainable variable in the graph (diagnostic output).
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            print(var)

        # 1.5 Initialize the environments
        # Note the order: reset() is called first, then the task's map file
        # ("_<i+1>.h5") is set; obs_n comes from the reset() call.
        obs_n_list = []
        for i in range(num_tasks):
            obs_n = list_of_taskenv[i].reset()
            list_of_taskenv[i].set_map(
                sample_map(arglist.test_data_dir + arglist.test_data_name +
                           "_" + str(i + 1) + ".h5"))
            obs_n_list.append(obs_n)

        if debug:
            print(time_end(begin, "initialize"))
            begin = time_begin()
        # 2. Training
        t_start = time.time()
        print('Starting iterations...')
        episode_start_time = time.time()
        # NOTE(review): state_dim is assigned but never used in this function.
        state_dim = obs_shape_n[0][0]

        # history_n[task][agent] is a bounded FIFO queue of the most recent
        # arglist.history_length observations; it is pre-filled with copies of
        # the initial observation so it is full from the first step.
        history_n = [[
            queue.Queue(arglist.history_length) for _ in range(env.n)
        ] for _ in range(num_tasks)]
        for i in range(num_tasks):
            for j in range(env.n):
                for _ in range(arglist.history_length):
                    history_n[i][j].put(obs_n_list[i][j])

        while True:
            # One pass = one environment step for every task, in order.
            for task_index in range(num_tasks):
                # 2.1 Step the environment and collect samples
                current_env = list_of_taskenv[task_index]
                # get action
                # action_n = [agent.action(obs) for agent, obs in zip(actor_0, obs_n_list[task_index])]
                # NOTE(review): each `obs` below is a queue.Queue object, not
                # an observation array; presumably agent.action knows how to
                # read it - verify against the trainer implementation.
                action_n = [
                    agent.action(obs)
                    for agent, obs in zip(actor_0, history_n[task_index])
                ]
                # environment step
                new_obs_n, rew_n, done_n, info_n = current_env.step(action_n)
                current_critics = critic_list[task_index]
                current_actors = actor_list[task_index]
                if debug:
                    print(time_end(begin, "env.step"))
                    begin = time_begin()
                local_steps[task_index] += 1  # update the local step counter
                global_steps[task_index] += 1  # update the global step counter
                done = all(done_n)
                terminal = (local_steps[task_index] >= arglist.max_episode_len)
                # Collect experience (stores the single latest observation
                # from obs_n_list, not the history queue)
                for i in range(env.n):
                    current_critics[i].experience(obs_n_list[task_index][i],
                                                  action_n[i], rew_n[i],
                                                  new_obs_n[i], done_n[i],
                                                  terminal)

                # Update observations: replace the current observation and
                # slide each agent's history window by one (drop oldest, push
                # newest).
                obs_n_list[task_index] = new_obs_n
                for i in range(env.n):
                    history_n[task_index][i].get()
                    history_n[task_index][i].put(new_obs_n[i])
                # Update rewards: accumulate into the current (last) episode
                # entry, both summed over agents and per agent.
                for i, rew in enumerate(rew_n):
                    episodes_rewards[task_index][-1] += rew
                    agent_rewards[task_index][i][-1] += rew

                # 2.2 Optimize this task's critics and actors
                for critic in current_critics:
                    critic.preupdate()

                for critic in current_critics:
                    critic.update(current_critics, global_steps[task_index])

                for index, actor in enumerate(current_actors):
                    actor.update(current_actors, current_critics,
                                 global_steps[task_index], index)

                if debug:
                    print(time_end(begin, "update actor"))
                    begin = time_begin()

                # 2.4 Record and update training information
                # energy
                energy_one_episode[task_index].append(current_env.get_energy())
                # fair index
                j_index_one_episode[task_index].append(
                    current_env.get_jain_index())
                # coverage
                aver_cover_one_episode[task_index].append(
                    current_env.get_aver_cover())
                # over map counter (the list stores the running total)
                over_map_counter[task_index] += current_env.get_over_map()
                over_map_one_episode[task_index].append(
                    over_map_counter[task_index])
                # disconnected counter (the list stores the running total)
                disconnected_number_counter[task_index] += current_env.get_dis(
                )
                disconnected_number_one_episode[task_index].append(
                    disconnected_number_counter[task_index])
                # reward (running sum of the per-step mean over agents)
                episode_reward_step[task_index] += np.mean(rew_n)
                accmulated_reward_one_episode[task_index].append(
                    episode_reward_step[task_index])
                route = current_env.get_agent_pos()
                route_one_episode[task_index].append(route)
                if debug:
                    print(time_end(begin, "others"))
                    begin = time_begin()

                # The episode number is derived from the task's total step
                # count, i.e. it assumes max_episode_len steps per episode.
                # NOTE(review): if an episode ends early on `done`, this value
                # no longer matches the number of completed episodes - confirm
                # whether early termination can occur.
                episode_number = math.ceil(global_steps[task_index] /
                                           arglist.max_episode_len)
                if done or terminal:
                    # Second-to-last '/'-separated component of save_path plus
                    # '/'. Presumably save_path ends with '/', so this is the
                    # model directory name - verify against callers.
                    model_name = save_path.split('/')[-2] + '/'
                    # Element-wise per-step efficiency:
                    # coverage * fairness index / energy.
                    temp_efficiency = np.array(
                        aver_cover_one_episode[task_index]) * np.array(
                            j_index_one_episode[task_index]) / np.array(
                                energy_one_episode[task_index])
                    draw_util.draw_single_episode(
                        arglist.pictures_dir_transfer_train + model_name +
                        "single_episode_task_" + str(task_index) + "/",
                        episode_number, temp_efficiency,
                        aver_cover_one_episode[task_index],
                        j_index_one_episode[task_index],
                        energy_one_episode[task_index],
                        disconnected_number_one_episode[task_index],
                        over_map_one_episode[task_index],
                        accmulated_reward_one_episode[task_index])
                    # Record the end-of-episode value of each metric (the last
                    # element of the per-step lists)
                    energy_consumptions_for_test[task_index].append(
                        energy_one_episode[task_index][-1])  # energy
                    j_index[task_index].append(
                        j_index_one_episode[task_index][-1])  # fairness index
                    aver_cover[task_index].append(
                        aver_cover_one_episode[task_index][-1])  # coverage
                    instantaneous_dis[task_index].append(
                        disconnected_number_one_episode[task_index]
                        [-1])  # disconnected
                    instantaneous_out_the_map[task_index].append(
                        over_map_one_episode[task_index][-1])  # out of the map
                    instantaneous_accmulated_reward[task_index].append(
                        accmulated_reward_one_episode[task_index]
                        [-1])  # reward
                    energy_efficiency[task_index].append(
                        aver_cover_one_episode[task_index][-1] *
                        j_index_one_episode[task_index][-1] /
                        energy_one_episode[task_index][-1])  # efficiency

                    # Wall-clock time since the previous episode end (of any
                    # task), since episode_start_time is shared across tasks.
                    episode_end_time = time.time()
                    episode_time = episode_end_time - episode_start_time
                    episode_start_time = episode_end_time
                    # Append one summary line to this task's text log.
                    with open(
                            arglist.pictures_dir_transfer_train + model_name +
                            "task_" + str(task_index) + '_train_info' + '.txt',
                            'a+') as f:
                        info = "Task index: %d, Episode number %d, energy consumption: %s, efficiency: %s, time: %s" % (
                            task_index, episode_number,
                            str(current_env.get_energy_origin()),
                            str(energy_efficiency[task_index][-1]),
                            str(round(episode_time, 3)))
                        f.write(info + "\n")
                    print(info)

                    # Per-episode local variables should be reset each episode --------------------------------------------
                    # NOTE(review): the buffers of ALL tasks are reset, but
                    # only when the last task finishes an episode. If tasks
                    # can end episodes on different steps (e.g. via `done`),
                    # the other tasks' buffers are either not reset or reset
                    # mid-episode - confirm tasks always finish in lockstep.
                    if task_index == num_tasks - 1:
                        energy_one_episode = [[] for _ in range(num_tasks)]
                        j_index_one_episode = [[] for _ in range(num_tasks)]
                        aver_cover_one_episode = [[] for _ in range(num_tasks)]
                        over_map_counter = np.zeros(num_tasks)
                        over_map_one_episode = [[] for _ in range(num_tasks)]
                        disconnected_number_counter = np.zeros(num_tasks)
                        disconnected_number_one_episode = [
                            [] for _ in range(num_tasks)
                        ]
                        episode_reward_step = np.zeros(num_tasks)
                        accmulated_reward_one_episode = [
                            [] for _ in range(num_tasks)
                        ]
                        route_one_episode = [[] for _ in range(num_tasks)]

                    # Reset local variables
                    # NOTE(review): history_n[task_index] is not refilled
                    # after reset(), so the first steps of the next episode
                    # still see observations from the previous one - confirm
                    # this is intended.
                    obs_n_list[task_index] = current_env.reset()  # reset env
                    current_env.set_map(
                        sample_map(arglist.test_data_dir +
                                   arglist.test_data_name + "_" +
                                   str(task_index + 1) + ".h5"))
                    local_steps[task_index] = 0  # reset the local step counter

                    # Update global variables
                    episodes_rewards[task_index].append(0)  # start a new episode entry
                    for reward in agent_rewards[task_index]:
                        reward.append(0)

                # save model, display training output
                # `terminal` implies the `done or terminal` branch above ran in
                # this iteration, so model_name is defined here.
                if terminal and (episode_number % arglist.save_rate == 0):
                    # tf.get_default_session().run(global_steps_assign_op, feed_dict={global_steps_ph: global_steps})
                    # save_dir_custom = os.path.join(save_path, str(episode_number), 'model.ckpt')
                    # U.save_state(save_dir_custom, saver=saver)
                    # print statement depends on whether or not there are adversaries
                    # Mean over the latest save_rate entries of the episode
                    # reward list. The slice includes the fresh 0 entry
                    # appended for the next episode just above.
                    save_rate_mean_reward = np.mean(
                        episodes_rewards[task_index][-arglist.save_rate:])
                    if num_adversaries == 0:
                        print(
                            "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                            .format(global_steps[task_index], episode_number,
                                    save_rate_mean_reward,
                                    round(time.time() - t_start, 3)))
                    else:
                        print(
                            "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                            .format(global_steps[task_index], episode_number,
                                    save_rate_mean_reward, [
                                        np.mean(rew[-arglist.save_rate:])
                                        for rew in agent_rewards[task_index]
                                    ], round(time.time() - t_start, 3)))

                    t_start = time.time()

                    final_ep_rewards[task_index].append(save_rate_mean_reward)
                    for rew in agent_rewards[task_index]:
                        final_ep_ag_rewards[task_index].append(
                            np.mean(rew[-arglist.save_rate:]))

                    # Save training curves
                    if arglist.draw_picture_train:
                        # model_name = save_path.split('/')[-2] + '/'
                        draw_util.draw_episodes(
                            episode_number,
                            arglist.pictures_dir_transfer_train + model_name +
                            "all_episodes_task_" + str(task_index) + "/",
                            aver_cover[task_index], j_index[task_index],
                            energy_consumptions_for_test[task_index],
                            instantaneous_dis[task_index],
                            instantaneous_out_the_map[task_index],
                            energy_efficiency[task_index],
                            instantaneous_accmulated_reward[task_index],
                            len(aver_cover[task_index]))
                # saves final episode reward for plotting training curve later
                # NOTE(review): the file name is per task, but the pickled
                # objects are the full lists covering all tasks.
                if episode_number > arglist.num_train_episodes:
                    mkdir(arglist.plots_dir)
                    rew_file_name = arglist.plots_dir + arglist.exp_name + str(
                        task_index) + '_rewards.pkl'
                    with open(rew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_rewards, fp)
                    agrew_file_name = arglist.plots_dir + arglist.exp_name + str(
                        task_index) + '_agrewards.pkl'
                    with open(agrew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_ag_rewards, fp)
                        print('...Finished total of {} episodes.'.format(
                            episode_number))
            # Exit condition, checked once per pass using the episode number
            # of the last task visited in the for-loop above.
            if episode_number > arglist.num_train_episodes:
                break
Beispiel #2
0
def train(arglist):
    debug = False
    num_tasks = arglist.num_task  # 总共有多少个任务
    list_of_taskenv = []  # env list
    save_path = arglist.save_dir
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    print("ok")
    with U.single_threaded_session():
        if debug:
            begin = time_begin()
        # 1.初始化
        # 1.1创建一个actor
        env = make_env(arglist.scenario, arglist)
        env.set_map(sample_map(arglist.data_path + "_1.h5"))
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        policy = get_trainers(env,
                              "pi_0_",
                              num_adversaries,
                              obs_shape_n,
                              arglist,
                              is_actor=True,
                              acotr=None)

        # 1.2创建每个任务的critic
        model_list = []  # 所有任务critic的list
        for i in range(num_tasks):
            # 创建每个任务的env
            list_of_taskenv.append(make_env(arglist.scenario, arglist))
            trainers = get_trainers(list_of_taskenv[i],
                                    "task_" + str(i + 1) + "_",
                                    num_adversaries,
                                    obs_shape_n,
                                    arglist,
                                    is_actor=False,
                                    acotr=policy)
            model_list.append(trainers)

        # 1.3 create p_train
        for task_index in range(num_tasks):
            for actor, critic in zip(policy, model_list[task_index]):
                actor.add_p(critic.name)
                critic.p = actor.p_train
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # 1.4 全局变量初始化
        episodes_rewards = [[0.0] for _ in range(num_tasks)
                            ]  # 每个元素为在一个episode中所有agents rewards的和
        # agent_rewards[i]中的每个元素记录单个agent在一个episode中所有rewards的和
        agent_rewards = [[[0.0] for _ in range(env.n)]
                         for _ in range(num_tasks)]
        final_ep_rewards = [[] for _ in range(num_tasks)
                            ]  # sum of rewards for training curve
        final_ep_ag_rewards = [[] for _ in range(num_tasks)
                               ]  # agent rewards for training curve

        energy_consumptions_for_test = [[] for _ in range(num_tasks)]
        j_index = [[] for _ in range(num_tasks)]
        aver_cover = [[] for _ in range(num_tasks)]
        instantaneous_dis = [[] for _ in range(num_tasks)]
        instantaneous_out_the_map = [[] for _ in range(num_tasks)]
        energy_efficiency = [[] for _ in range(num_tasks)]
        instantaneous_accmulated_reward = [[] for _ in range(num_tasks)]

        global_steps_tensor = tf.Variable(
            tf.zeros(num_tasks),
            trainable=False)  # global timesteps for each env
        global_steps_ph = tf.placeholder(tf.float32, [num_tasks])
        global_steps_assign_op = tf.assign(global_steps_tensor,
                                           global_steps_ph)
        model_number = int(arglist.num_episodes / arglist.save_rate)
        saver = tf.train.Saver(max_to_keep=model_number)

        efficiency_list = []
        for i in range(num_tasks):
            efficiency_list.append(
                tf.placeholder(tf.float32,
                               shape=None,
                               name="efficiency_placeholder" + str(i)))
        efficiency_summary_list = []
        for i in range(num_tasks):
            efficiency_summary_list.append(
                tf.summary.scalar("efficiency_%s" % i, efficiency_list[i]))
        writer = tf.summary.FileWriter("../summary/efficiency")

        # 1.5 episode局部变量初始化
        local_steps = np.zeros(num_tasks)  # local timesteps for each env
        t_start = time.time()

        energy_one_episode = [[] for _ in range(num_tasks)]
        j_index_one_episode = [[] for _ in range(num_tasks)]
        aver_cover_one_episode = [[] for _ in range(num_tasks)]
        over_map_counter = np.zeros(num_tasks)
        over_map_one_episode = [[] for _ in range(num_tasks)]
        disconnected_number_counter = np.zeros(num_tasks)
        disconnected_number_one_episode = [[] for _ in range(num_tasks)]
        episode_reward_step = np.zeros(
            num_tasks)  # 累加一个episode里每一步的所有智能体的平均reward
        accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
        route_one_episode = [[] for _ in range(num_tasks)]

        U.initialize()

        # 1.6 生成模型保存或者恢复文件夹目录
        if arglist.load_dir == "":
            arglist.load_dir = save_path
        if arglist.display or arglist.restore or arglist.benchmark:
            file_list = []
            for f in os.listdir(arglist.load_dir):
                if os.path.isdir(os.path.join(arglist.save_dir, f)):
                    file_list.append(f)
            file_list.sort(
                key=lambda fn: os.path.getmtime(arglist.load_dir + "/" + fn))
            if len(file_list) > num_tasks:
                load_dir = os.path.join(arglist.load_dir, file_list[-1],
                                        "model.ckpt")
                U.load_state(load_dir)
            print('Loading previous state...')

        global_steps = tf.get_default_session().run(global_steps_tensor)

        # 1.7 初始化ENV
        obs_n_list = []
        for i in range(num_tasks):
            obs_n = list_of_taskenv[i].reset()
            list_of_taskenv[i].set_map(
                sample_map(arglist.data_path + "_" + str(i + 1) + ".h5"))
            obs_n_list.append(obs_n)

        # 1.8 生成maddpg 加上rnn之后的输入seq,
        history_n = [[] for _ in range(num_tasks)]
        for i in range(num_tasks):
            for j in range(len(obs_n_list[i])):  # 生成每个智能体长度为history_length的观测
                history = History(arglist, [obs_shape_n[j][0]])
                history_n[i].append(history)
                for _ in range(arglist.history_length):
                    history_n[i][j].add(obs_n_list[i][j])
        if debug:
            print(time_end(begin, "initialize"))
            begin = time_begin()

        # 2.训练
        print('Starting iterations...')
        episode_start_time = time.time()
        state_dim = obs_shape_n[0][0]

        while True:
            for task_index in range(num_tasks):
                current_env = list_of_taskenv[task_index]
                action_n = []
                # 用critic获得state,用critic给出action,
                for agent, his in zip(policy, history_n[task_index]):
                    hiss = his.obtain().reshape(
                        1, state_dim,
                        arglist.history_length)  # [1, state_dim, length]
                    action = agent.action([hiss], [1])
                    action_n.append(action[0])
                if debug:
                    print(time_end(begin, "action2"))
                    begin = time_begin()
                new_obs_n, rew_n, done_n = current_env.step(action_n)
                current_critics = model_list[task_index]
                if debug:
                    print(time_end(begin, "env.step"))
                    begin = time_begin()

                local_steps[task_index] += 1  # 更新局部计数器
                global_steps[task_index] += 1  # 更新全局计数器
                done = all(done_n)
                terminal = (local_steps[task_index] >= arglist.max_episode_len)
                # 收集experience
                for i in range(env.n):
                    current_critics[i].experience(obs_n_list[task_index][i],
                                                  action_n[i], rew_n[i],
                                                  done_n[i], terminal)
                    policy[i].experience(obs_n_list[task_index][i],
                                         action_n[i], rew_n[i], done_n[i],
                                         terminal)

                # 更新obs
                obs_n_list[task_index] = new_obs_n
                if debug:
                    print(time_end(begin, "experience"))
                    begin = time_begin()
                # 2.2,优化每一个任务的critic
                for i, rew in enumerate(rew_n):
                    episodes_rewards[task_index][-1] += rew
                    agent_rewards[task_index][i][-1] += rew

                for critic in current_critics:
                    critic.preupdate()
                for critic in current_critics:
                    critic.update(current_critics, global_steps[task_index])

                if debug:
                    print(time_end(begin, "update critic"))
                    begin = time_begin()
                # 2.3,优化actor
                # policy_step += 1
                # print("policy steps: ", policy_step)
                for actor, critic in zip(policy, current_critics):
                    actor.change_p(critic.p)
                    actor.update(policy, global_steps[task_index])
                if debug:
                    print(time_end(begin, "update actor"))
                    begin = time_begin()
                # 2.4 记录和更新train过程
                # energy
                energy_one_episode[task_index].append(current_env.get_energy())
                # fair index
                j_index_one_episode[task_index].append(
                    current_env.get_jain_index())
                # coverage
                aver_cover_one_episode[task_index].append(
                    current_env.get_aver_cover())
                # over map counter
                over_map_counter[task_index] += current_env.get_over_map()
                over_map_one_episode[task_index].append(
                    over_map_counter[task_index])
                # disconnected counter
                disconnected_number_counter[task_index] += current_env.get_dis(
                )
                disconnected_number_one_episode[task_index].append(
                    disconnected_number_counter[task_index])
                # reward
                episode_reward_step[task_index] += np.mean(rew_n)
                accmulated_reward_one_episode[task_index].append(
                    episode_reward_step[task_index])
                route = current_env.get_agent_pos()
                route_one_episode[task_index].append(route)

                if debug:
                    print(time_end(begin, "others"))
                    begin = time_begin()

                episode_number = math.ceil(global_steps[task_index] /
                                           arglist.max_episode_len)
                if done or terminal:
                    model_name = save_path.split('/')[-2] + '/'
                    temp_efficiency = np.array(
                        aver_cover_one_episode[task_index]) * np.array(
                            j_index_one_episode[task_index]) / np.array(
                                energy_one_episode[task_index])
                    draw_util.draw_single_episode(
                        arglist.pictures_dir_train + model_name +
                        "single_episode_task_" + str(task_index) + "/",
                        episode_number, temp_efficiency,
                        aver_cover_one_episode[task_index],
                        j_index_one_episode[task_index],
                        energy_one_episode[task_index],
                        disconnected_number_one_episode[task_index],
                        over_map_one_episode[task_index],
                        accmulated_reward_one_episode[task_index])
                    # 记录每个episode的变量
                    energy_consumptions_for_test[task_index].append(
                        energy_one_episode[task_index][-1])  # energy
                    j_index[task_index].append(
                        j_index_one_episode[task_index][-1])  # fairness index
                    aver_cover[task_index].append(
                        aver_cover_one_episode[task_index][-1])  # coverage
                    instantaneous_dis[task_index].append(
                        disconnected_number_one_episode[task_index]
                        [-1])  # disconnected
                    instantaneous_out_the_map[task_index].append(
                        over_map_one_episode[task_index][-1])  # out of the map
                    instantaneous_accmulated_reward[task_index].append(
                        accmulated_reward_one_episode[task_index]
                        [-1])  # reward
                    energy_efficiency[task_index].append(
                        aver_cover_one_episode[task_index][-1] *
                        j_index_one_episode[task_index][-1] /
                        energy_one_episode[task_index][-1])  # efficiency

                    episode_end_time = time.time()
                    episode_time = episode_end_time - episode_start_time
                    episode_start_time = episode_end_time
                    print(
                        'Task %d, Episode: %d - energy_consumptions: %s, efficiency: %s, time %s'
                        % (task_index, episode_number,
                           str(current_env.get_energy_origin()),
                           str(energy_efficiency[task_index][-1]),
                           str(round(episode_time, 3))))

                    # 绘制reward曲线
                    efficiency_s = tf.get_default_session().run(
                        efficiency_summary_list[task_index],
                        feed_dict={
                            efficiency_list[task_index]:
                            energy_efficiency[task_index][-1]
                        })
                    writer.add_summary(efficiency_s,
                                       global_step=episode_number)

                    # 应该在每个重置每个episode中的局部变量--------------------------------------------
                    if task_index == num_tasks - 1:
                        energy_one_episode = [[] for _ in range(num_tasks)]
                        j_index_one_episode = [[] for _ in range(num_tasks)]
                        aver_cover_one_episode = [[] for _ in range(num_tasks)]
                        over_map_counter = np.zeros(num_tasks)
                        over_map_one_episode = [[] for _ in range(num_tasks)]
                        disconnected_number_counter = np.zeros(num_tasks)
                        disconnected_number_one_episode = [
                            [] for _ in range(num_tasks)
                        ]
                        episode_reward_step = np.zeros(num_tasks)
                        accmulated_reward_one_episode = [
                            [] for _ in range(num_tasks)
                        ]
                        route_one_episode = [[] for _ in range(num_tasks)]

                    # 重置局部变量
                    obs_n_list[task_index] = current_env.reset()  # 重置env
                    current_env.set_map(
                        sample_map(arglist.data_path + "_" +
                                   str(task_index + 1) + ".h5"))
                    local_steps[task_index] = 0  # 重置局部计数器

                    # 更新全局变量
                    episodes_rewards[task_index].append(0)  # 添加新的元素
                    for reward in agent_rewards[task_index]:
                        reward.append(0)

                # save model, display training output
                if terminal and (episode_number % arglist.save_rate == 0):
                    tf.get_default_session().run(
                        global_steps_assign_op,
                        feed_dict={global_steps_ph: global_steps})
                    save_dir_custom = save_path + str(
                        episode_number) + '/model.ckpt'
                    U.save_state(save_dir_custom, saver=saver)
                    # print statement depends on whether or not there are adversaries
                    # 最新save_rate个episode的平均reward
                    save_rate_mean_reward = np.mean(
                        episodes_rewards[task_index][-arglist.save_rate:])
                    if num_adversaries == 0:
                        print(
                            "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                            .format(global_steps[task_index], episode_number,
                                    save_rate_mean_reward,
                                    round(time.time() - t_start, 3)))
                    else:
                        print(
                            "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                            .format(global_steps[task_index], episode_number,
                                    save_rate_mean_reward, [
                                        np.mean(rew[-arglist.save_rate:])
                                        for rew in agent_rewards[task_index]
                                    ], round(time.time() - t_start, 3)))

                    t_start = time.time()

                    final_ep_rewards[task_index].append(save_rate_mean_reward)
                    for rew in agent_rewards[task_index]:
                        final_ep_ag_rewards[task_index].append(
                            np.mean(rew[-arglist.save_rate:]))

                    # 保存train曲线
                    if arglist.draw_picture_train:
                        # model_name = save_path.split('/')[-2] + '/'
                        draw_util.draw_episodes(
                            episode_number,
                            arglist.pictures_dir_train + model_name +
                            "all_episodes_task_" + str(task_index) + "/",
                            aver_cover[task_index], j_index[task_index],
                            energy_consumptions_for_test[task_index],
                            instantaneous_dis[task_index],
                            instantaneous_out_the_map[task_index],
                            energy_efficiency[task_index],
                            instantaneous_accmulated_reward[task_index],
                            len(aver_cover[task_index]))
                # saves final episode reward for plotting training curve later
                if episode_number > arglist.num_episodes:
                    rew_file_name = arglist.plots_dir + arglist.exp_name + str(
                        task_index) + '_rewards.pkl'
                    with open(rew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_rewards, fp)
                    agrew_file_name = arglist.plots_dir + arglist.exp_name + str(
                        task_index) + '_agrewards.pkl'
                    with open(agrew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_ag_rewards, fp)
                        print('...Finished total of {} episodes.'.format(
                            episode_number))
            if episode_number > arglist.num_episodes:
                break
Beispiel #3
0
def train(arglist):
    """Run the single-task training / evaluation loop described by *arglist*.

    The function builds the environment and one trainer per agent, then steps
    the environment in an endless loop, collecting per-step statistics,
    writing TensorBoard summaries, drawing pictures through ``draw_util`` and
    periodically saving checkpoints.  The loop is left through one of the
    ``break`` statements (benchmark finished, test-drawing finished, or
    ``arglist.num_episodes`` reached).

    Args:
        arglist: Parsed argument namespace.  The attributes read here are
            ``save_dir``, ``load_dir``, ``batch_size``, ``buffer_size``,
            ``scenario``, ``benchmark``, ``benchmark_iters``,
            ``benchmark_dir``, ``num_adversaries``, ``good_policy``,
            ``adv_policy``, ``display``, ``restore``, ``num_episodes``,
            ``save_rate``, ``max_episode_len``, ``pictures_dir_train``,
            ``pictures_dir_test``, ``draw_picture_train``,
            ``draw_picture_test``, ``plots_dir`` and ``exp_name``.

    Side effects:
        * ``arglist.save_dir`` is extended in place with the batch and buffer
          size; ``arglist.load_dir`` is set to it when empty.
        * Checkpoints, pickled reward curves, pictures and summary event files
          are written to disk.

    Returns:
        None.
    """
    debug = False
    # The save directory encodes the two hyper-parameters so that runs with
    # different settings do not overwrite each other.
    arglist.save_dir = arglist.save_dir + "_batch_size_" + str(
        arglist.batch_size) + "_buffer_size_" + str(arglist.buffer_size)
    with U.single_threaded_session():
        if debug:
            begin = time_begin()
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        if debug:
            print(time_end(begin, "step 0"))
            begin = time_begin()
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        if debug:
            print(time_end(begin, "step 1"))
            begin = time_begin()
        trainers = get_trainers(env, "task_", num_adversaries, obs_shape_n,
                                arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))
        if debug:
            print(time_end(begin, "step2"))
            begin = time_begin()

        # Summary ops: one scalar for the per-episode efficiency and one
        # merged histogram pair for the per-agent policy / critic losses.
        efficiency = tf.placeholder(tf.float32,
                                    shape=None,
                                    name="efficiency_placeholder")
        efficiency_summary = tf.summary.scalar("efficiency", efficiency)
        p_losses_ph = tf.placeholder(tf.float32, shape=[env.n], name="p_loss")
        # NOTE(review): both histograms are created with the name "loss";
        # confirm that sharing the name is intended.
        p_losses_summary = tf.summary.histogram("loss", p_losses_ph)
        q_losses_ph = tf.placeholder(tf.float32, shape=[env.n], name="q_loss")
        q_losses_summary = tf.summary.histogram("loss", q_losses_ph)
        loss_summary = tf.summary.merge([q_losses_summary, p_losses_summary],
                                        name="loss")
        writer = tf.summary.FileWriter("../summary/efficiency")
        writer2 = tf.summary.FileWriter("../summary/loss")

        # Initialize
        U.initialize()
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            print(var)
        if debug:
            print(time_end(begin, "step3"))
            begin = time_begin()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        if not os.path.exists(arglist.save_dir):
            os.makedirs(arglist.save_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        # Keep one checkpoint per save interval over the whole run.
        model_number = int(arglist.num_episodes / arglist.save_rate)
        saver = tf.train.Saver(max_to_keep=model_number)
        episode_step = 0
        train_step = 0
        t_start = time.time()
        # custom statistics variable------------------------------------------------------------------------------------
        # Per-episode histories: one value is appended at each episode end.
        loss_all = []
        aver_cover = []
        j_index = []
        instantaneous_accmulated_reward = []
        instantaneous_dis = []
        instantaneous_out_the_map = []
        # q_value = []
        energy_consumptions_for_test = []
        # Thresholds an episode must satisfy before its test picture is drawn.
        bl_coverage = 0.8
        bl_jainindex = 0.8
        bl_loss = 100
        energy_efficiency = []

        # Per-step traces of the episode in progress; reset at episode end.
        over_map_counter = 0
        over_map_one_episode = []
        aver_cover_one_episode = []
        j_index_one_episode = []
        disconnected_number_counter = 0
        disconnected_number_one_episode = []
        accmulated_reward_one_episode = []
        actions = []
        energy_one_episode = []
        route = []
        obs_n = env.reset()

        episode_reward_step = 0

        model_name = arglist.load_dir.split(
            '/')[-3] + '/' + arglist.load_dir.split('/')[-2] + '/'
        if FLAGS.greedy_action:
            model_name = model_name + 'greedy/'
        elif FLAGS.random_action:
            model_name = model_name + 'random/'

        # if debug:
        #     print(time_end(begin, "initialize"))
        #     begin = time_begin()
        print('Starting iterations...')
        episode_begin_time = time.time()
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)

            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            # increment custom statistics variables in the epoch--------------------------------------------------------
            episode_reward_step += np.mean(rew_n)
            j_index_one_episode.append(env.get_jain_index())
            over_map_counter += env.get_over_map()
            over_map_one_episode.append(over_map_counter)
            disconnected_number_counter += env.get_dis()
            disconnected_number_one_episode.append(disconnected_number_counter)
            aver_cover_one_episode.append(env.get_aver_cover())
            energy_one_episode.append(env.get_energy())
            s_route = env.get_state()
            # Consecutive pairs of the state vector are stored as one route
            # point per UAV (presumably x/y coordinates -- TODO confirm).
            for route_i in range(0, FLAGS.num_uav * 2, 2):
                tmp = [s_route[route_i], s_route[route_i + 1]]
                route.append(tmp)
            accmulated_reward_one_episode.append(episode_reward_step)
            # if debug:
            #     print(time_end(begin, "others"))
            #     begin = time_begin()
            if done or terminal:
                # model_name is re-derived from the save directory here, so
                # the drawing calls below use this value.
                model_name = arglist.save_dir.split('/')[-1] + '/'
                # train_step is incremented only after this block, so for
                # full-length episodes this is the zero-based episode index.
                episode_number = int(train_step / arglist.max_episode_len)
                temp_efficiency = np.array(aver_cover_one_episode) * np.array(
                    j_index_one_episode) / np.array(energy_one_episode)
                draw_util.draw_single_episode(
                    arglist.pictures_dir_train + model_name +
                    "single_episode/", episode_number, temp_efficiency,
                    aver_cover_one_episode, j_index_one_episode,
                    energy_one_episode, disconnected_number_one_episode,
                    over_map_one_episode, accmulated_reward_one_episode)

                # record the final value of every per-step trace as this episode's statistic-----------------------------
                instantaneous_accmulated_reward.append(
                    accmulated_reward_one_episode[-1])
                j_index.append(j_index_one_episode[-1])
                instantaneous_dis.append(disconnected_number_one_episode[-1])
                instantaneous_out_the_map.append(over_map_one_episode[-1])
                aver_cover.append(aver_cover_one_episode[-1])
                energy_consumptions_for_test.append(energy_one_episode[-1])
                energy_efficiency.append(aver_cover_one_episode[-1] *
                                         j_index_one_episode[-1] /
                                         energy_one_episode[-1])
                episode_end_time = time.time()

                # plot fig
                # Log the value just appended.  Indexing by episode_number
                # would pick a stale entry whenever an episode ends early via
                # ``done``, because episode_number is derived from train_step.
                efficiency_s = tf.get_default_session().run(
                    efficiency_summary,
                    feed_dict={efficiency: energy_efficiency[-1]})
                writer.add_summary(efficiency_s, global_step=episode_number)
                # plt fig
                print(
                    'Episode: %d - energy_consumptions: %s, efficiency: %s, time %s'
                    %
                    (train_step / arglist.max_episode_len,
                     str(env.get_energy_origin()), str(energy_efficiency[-1]),
                     str(round(episode_end_time - episode_begin_time, 3))))
                episode_begin_time = episode_end_time
                # draw picture of this episode
                if arglist.draw_picture_test and aver_cover[-1] >= bl_coverage and j_index[-1] >= bl_jainindex \
                        and instantaneous_dis[-1] <= bl_loss:
                    episode_number_name = 'episode_' + str(episode_number)
                    draw_util.draw(episode_number_name,
                                   arglist.pictures_dir_test + model_name,
                                   energy_one_episode, route, actions,
                                   aver_cover_one_episode, j_index_one_episode,
                                   accmulated_reward_one_episode,
                                   disconnected_number_one_episode,
                                   over_map_one_episode,
                                   arglist.max_episode_len)

                # reset the per-step traces for the next episode
                j_index_one_episode = []
                over_map_counter = 0
                over_map_one_episode = []
                disconnected_number_counter = 0
                disconnected_number_one_episode = []
                aver_cover_one_episode = []
                energy_one_episode = []
                route = []
                episode_reward_step = 0
                accmulated_reward_one_episode = []

                if arglist.draw_picture_test:
                    if len(episode_rewards) % arglist.save_rate == 0:
                        episode_number_name = train_step / arglist.max_episode_len
                        draw_util.drawTest(
                            episode_number_name,
                            arglist.pictures_dir_train + model_name,
                            energy_consumptions_for_test, aver_cover, j_index,
                            instantaneous_accmulated_reward,
                            instantaneous_dis, instantaneous_out_the_map,
                            len(aver_cover), bl_coverage, bl_jainindex,
                            bl_loss, energy_efficiency, False)
                # reset custom statistics variables between episode and epoch--------------------------------------------

                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                # NOTE(review): this iterates over info_n but appends
                # info_n['n'] each time -- presumably info_n is a dict with
                # the single key 'n'; verify against the environment.
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.draw_picture_test:
                if len(episode_rewards) > arglist.num_episodes:
                    break
                continue
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            p_loss_list = []
            q_loss_list = []
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                temp = agent.update(trainers, train_step)
                if temp is not None:
                    # temp[0] is logged as the critic (q) loss and temp[1] as
                    # the policy (p) loss.
                    p_loss_list.append(temp[1])
                    q_loss_list.append(temp[0])
            # The loss placeholders have shape [env.n], so a summary is only
            # written when every agent returned a loss on this step.
            if len(p_loss_list) == env.n:
                loss_s = tf.get_default_session().run(loss_summary,
                                                      feed_dict={
                                                          p_losses_ph:
                                                          p_loss_list,
                                                          q_losses_ph:
                                                          q_loss_list
                                                      })
                writer2.add_summary(loss_s, global_step=train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                episode_number_name = train_step / arglist.max_episode_len
                save_dir_custom = arglist.save_dir + "/" + str(
                    episode_number_name) + '/'
                # save_dir
                U.save_state(save_dir_custom, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))
                # draw custom statistics picture when save the model----------------------------------------------------
                if arglist.draw_picture_train:
                    episode_number_name = train_step / arglist.max_episode_len
                    model_name = arglist.save_dir.split('/')[-1] + '/'
                    draw_util.draw_episodes(
                        episode_number_name, arglist.pictures_dir_train +
                        model_name + "all_episodes/", aver_cover, j_index,
                        energy_consumptions_for_test, instantaneous_dis,
                        instantaneous_out_the_map, energy_efficiency,
                        instantaneous_accmulated_reward, len(aver_cover))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break

        # Every exit from the loop above is a plain ``break``, so closing the
        # writers here flushes any buffered summary events on each normal
        # termination path instead of leaving them to interpreter shutdown.
        writer.close()
        writer2.close()