Esempio n. 1
0
def test(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        model_name = arglist.load_dir.split('/')[-2] + '/'
        if FLAGS.greedy_action:
            model_name = model_name + 'greedy/'
        elif FLAGS.random_action:
            model_name = model_name + 'random/'

        model_index_step = 0
        model_number_total = arglist.train_num_episodes / arglist.save_rate
        max_model_index = 0
        max_average_energy_efficiency = 0
        draw_util.mkdir(arglist.pictures_dir_test + model_name)
        while True:
            # Initialize
            U.initialize()
            if model_index_step >= model_number_total:
                with open(
                        arglist.pictures_dir_test + model_name +
                        'test_report' + '.txt', 'a+') as file:
                    report = '\nModel ' + str(max_model_index) + ' attained max average energy efficiency' + \
                             '\nMax average energy efficiency:' + str(max_average_energy_efficiency)
                    file.write(report)
                break
            else:
                model_index_step += 1
            # Load previous results, if necessary
            if arglist.load_dir == "":
                arglist.load_dir = arglist.save_dir
            if arglist.display or arglist.restore or arglist.benchmark:
                print('Loading previous state...')
                #model_load_dir = arglist.load_dir + str(model_index_step * arglist.save_rate - 1) + '/'
                model_load_dir = arglist.load_dir + str(3299) + '/'
                U.load_state(model_load_dir)

            episode_rewards = [0.0]  # sum of rewards for all agents
            agent_rewards = [[0.0]
                             for _ in range(env.n)]  # individual agent reward
            final_ep_rewards = []  # sum of rewards for training curve
            final_ep_ag_rewards = []  # agent rewards for training curve
            agent_info = [[[]]]  # placeholder for benchmarking info
            saver = tf.train.Saver()
            obs_n = env.reset()
            episode_step = 0
            train_step = 0
            t_start = time.time()
            # custom statistics variable--------------------------------------------------------------------------------
            loss_all = []
            aver_cover = []
            j_index = []
            instantaneous_accmulated_reward = []
            instantaneous_dis = []
            instantaneous_out_the_map = []
            # q_value = []
            energy_consumptions_for_test = []
            bl_coverage = 0.8
            bl_jainindex = 0.8
            bl_loss = 100
            energy_efficiency = []

            over_map_counter = 0
            over_map_one_episode = []
            aver_cover_one_episode = []
            j_index_one_episode = []
            disconnected_number_counter = 0
            disconnected_number_one_episode = []
            accmulated_reward_one_episode = []
            actions = []
            energy_one_episode = []
            route = []

            episode_reward_step = 0
            print('Starting iterations...')
            route_dict = {}
            for i in range(FLAGS.num_uav):
                key_temp = "UAV" + str(i + 1)
                route_dict[key_temp] = []
            while True:
                # get action
                action_n = [
                    agent.action(obs) for agent, obs in zip(trainers, obs_n)
                ]
                # environment step
                new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                episode_step += 1
                done = all(done_n)
                terminal = (episode_step >= arglist.max_episode_len)
                # collect experience
                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                     new_obs_n[i], done_n[i], terminal)
                obs_n = new_obs_n

                for i, rew in enumerate(rew_n):
                    episode_rewards[-1] += rew
                    agent_rewards[i][-1] += rew

                # increment custom statistics variables in the epoch--------------------------------------------------------
                episode_reward_step += np.mean(rew_n)
                j_index_one_episode.append(env._get_jain_index())
                over_map_counter += env._get_over_map()
                over_map_one_episode.append(over_map_counter)
                disconnected_number_counter += env._get_dis()
                disconnected_number_one_episode.append(
                    disconnected_number_counter)
                aver_cover_one_episode.append(env._get_aver_cover())
                energy_one_episode.append(env._get_energy())

                s_route = env._get_state()
                for index, route_i in enumerate(range(0, FLAGS.num_uav * 2,
                                                      2)):
                    # for piao zong
                    route_dict["UAV" + str(index + 1)].append(
                        [s_route[route_i], s_route[route_i + 1]])
                    # for piao zong

                accmulated_reward_one_episode.append(episode_reward_step)

                if done or terminal:
                    ### for piaozong
                    uav_poss_file = "~/UAVNumber_" + str(
                        FLAGS.num_uav) + ".json"
                    route_str = json.dumps(route_dict)
                    with open(uav_poss_file, "w+") as f:
                        f.write(route_str)
                    ### for piaozong

                    # reset custom statistics variabl between episode and epoch---------------------------------------------
                    instantaneous_accmulated_reward.append(
                        accmulated_reward_one_episode[-1])
                    j_index.append(j_index_one_episode[-1])
                    instantaneous_dis.append(
                        disconnected_number_one_episode[-1])
                    instantaneous_out_the_map.append(over_map_one_episode[-1])
                    aver_cover.append(aver_cover_one_episode[-1])
                    energy_consumptions_for_test.append(energy_one_episode[-1])
                    energy_efficiency.append(aver_cover_one_episode[-1] *
                                             j_index_one_episode[-1] /
                                             energy_one_episode[-1])
                    print('Episode: %d - energy_consumptions: %s ' %
                          (train_step / arglist.max_episode_len,
                           str(env._get_energy_origin())))

                    j_index_one_episode = []
                    over_map_counter = 0
                    over_map_one_episode = []
                    disconnected_number_counter = 0
                    disconnected_number_one_episode = []
                    aver_cover_one_episode = []
                    energy_one_episode = []
                    route = []
                    episode_reward_step = 0
                    accmulated_reward_one_episode = []

                    if arglist.draw_picture_test:
                        if len(episode_rewards) % arglist.save_rate == 0:
                            if np.mean(energy_efficiency
                                       ) > max_average_energy_efficiency:
                                max_model_index = model_index_step * arglist.save_rate - 1
                                max_average_energy_efficiency = np.mean(
                                    energy_efficiency)
                            with open(
                                    arglist.pictures_dir_test + model_name +
                                    'test_report' + '.txt', 'a+') as file:
                                report = '\nModel-' + str(model_index_step * arglist.save_rate - 1) + \
                                         '-testing ' + str(arglist.num_episodes) + ' episodes\'s result:' + \
                                         '\nAverage average attained coverage: ' + str(np.mean(aver_cover)) + \
                                         '\nAverage Jaint\'s fairness index: ' + str(np.mean(j_index)) + \
                                         '\nAverage normalized average energy consumptions:' + str(np.mean(energy_consumptions_for_test)) + \
                                         '\nAverage energy efficiency:' + str(np.mean(energy_efficiency)) + '\n'
                                file.write(report)
                            draw_util.drawTest(
                                model_index_step * arglist.save_rate - 1,
                                arglist.pictures_dir_test + model_name,
                                energy_consumptions_for_test, aver_cover,
                                j_index, instantaneous_accmulated_reward,
                                instantaneous_dis, instantaneous_out_the_map,
                                len(aver_cover), bl_coverage, bl_jainindex,
                                bl_loss, energy_efficiency, False)
                    # reset custom statistics variabl between episode and epoch-----------------------------------------

                    obs_n = env.reset()
                    episode_step = 0
                    episode_rewards.append(0)
                    for a in agent_rewards:
                        a.append(0)
                    agent_info.append([[]])

                # increment global step counter
                train_step += 1

                # for benchmarking learned policies
                if arglist.benchmark:
                    for i, info in enumerate(info_n):
                        agent_info[-1][i].append(info_n['n'])
                    if train_step > arglist.benchmark_iters and (done
                                                                 or terminal):
                        file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                        print('Finished benchmarking, now saving...')
                        with open(file_name, 'wb') as fp:
                            pickle.dump(agent_info[:-1], fp)
                        break
                    continue

                # for displaying learned policies
                if arglist.draw_picture_test:
                    if len(episode_rewards) > arglist.num_episodes:
                        break
                    continue
                if arglist.display:
                    time.sleep(0.1)
                    env.render()
                    continue

                # update all trainers, if not in display or benchmark mode
                loss = None
                for agent in trainers:
                    agent.preupdate()
                for agent in trainers:
                    loss = agent.update(trainers, train_step)

                # save model, display training output
                if terminal and (len(episode_rewards) % arglist.save_rate
                                 == 0):
                    episode_number_name = train_step / arglist.max_episode_len
                    save_dir_custom = arglist.save_dir + str(
                        episode_number_name) + '/'
                    U.save_state(save_dir_custom, saver=saver)
                    # print statement depends on whether or not there are adversaries
                    if num_adversaries == 0:
                        print(
                            "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                            .format(
                                train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                    else:
                        print(
                            "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                            .format(
                                train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                    t_start = time.time()
                    # Keep track of final episode reward
                    final_ep_rewards.append(
                        np.mean(episode_rewards[-arglist.save_rate:]))
                    for rew in agent_rewards:
                        final_ep_ag_rewards.append(
                            np.mean(rew[-arglist.save_rate:]))
                    # draw custom statistics picture when save the model----------------------------------------------------
                    if arglist.draw_picture_train:
                        episode_number_name = train_step / arglist.max_episode_len
                        model_name = arglist.save_dir.split('/')[-2] + '/'
                        draw_util.draw_episode(
                            episode_number_name,
                            arglist.pictures_dir_train + model_name,
                            aver_cover, j_index,
                            instantaneous_accmulated_reward, instantaneous_dis,
                            instantaneous_out_the_map, loss_all,
                            len(aver_cover))

                # saves final episode reward for plotting training curve later
                if len(episode_rewards) > arglist.num_episodes:
                    rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                    with open(rew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_rewards, fp)
                    agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                    with open(agrew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_ag_rewards, fp)
                    print('...Finished total of {} episodes.'.format(
                        len(episode_rewards)))
                    break
Esempio n. 2
0
def train(arglist):
    debug = False
    arglist.save_dir = arglist.save_dir + "batch_size" + str(arglist.batch_size) + "buffer_size" + str(arglist.buffer_size)
    with U.single_threaded_session():
        if debug:
            begin = time_begin()
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        if debug:
            print(time_end(begin, "step 0"))
            begin = time_begin()
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        if debug:
            print(time_end(begin, "step 1"))
            begin = time_begin()
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))
        if debug:
            print(time_end(begin, "step2"))
            begin = time_begin()
        
        efficiency = tf.placeholder(tf.float32, shape=None, name="efficiency_placeholder")
        efficiency_summary = tf.summary.scalar("efficiency", efficiency)
        p_losses_ph = tf.placeholder(tf.float32, shape=[env.n], name="p_loss")
        p_losses_summary = tf.summary.histogram("loss", p_losses_ph)
        q_losses_ph = tf.placeholder(tf.float32, shape=[env.n], name="q_loss")
        q_losses_summary = tf.summary.histogram("loss", q_losses_ph)
        loss_summary = tf.summary.merge([q_losses_summary, p_losses_summary], name="loss")
        writer = tf.summary.FileWriter("../summary/efficiency")
        writer2 = tf.summary.FileWriter("../summary/loss")

        # Initialize
        U.initialize()
        if debug:
            print(time_end(begin, "step3"))
            begin = time_begin()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        if not os.path.exists(arglist.save_dir):
            os.makedirs(arglist.save_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        model_number = int(arglist.num_episodes / arglist.save_rate)
        saver = tf.train.Saver(max_to_keep=model_number)
        episode_step = 0
        train_step = 0
        t_start = time.time()
        # custom statistics variable------------------------------------------------------------------------------------
        loss_all = []
        aver_cover = []
        j_index = []
        instantaneous_accmulated_reward = []
        instantaneous_dis = []
        instantaneous_out_the_map = []
        # q_value = []
        energy_consumptions_for_test = []
        bl_coverage = 0.8
        bl_jainindex = 0.8
        bl_loss = 100
        energy_efficiency = []

        over_map_counter = 0
        over_map_one_episode = []
        aver_cover_one_episode = []
        j_index_one_episode = []
        disconnected_number_counter = 0
        disconnected_number_one_episode = []
        accmulated_reward_one_episode = []
        actions = []
        energy_one_episode = []
        route = []
        obs_n = env.reset()
        
        episode_reward_step = 0

        model_name = arglist.load_dir.split('/')[-3] + '/' + arglist.load_dir.split('/')[-2] + '/'
        if FLAGS.greedy_action:
            model_name = model_name + 'greedy/'
        elif FLAGS.random_action:
            model_name = model_name + 'random/'
        
        # if debug:
        #     print(time_end(begin, "initialize"))
        #     begin = time_begin()
        print('Starting iterations...')
        episode_begin_time = time.time()
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)

            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            # increment custom statistics variables in the epoch--------------------------------------------------------
            episode_reward_step += np.mean(rew_n)
            j_index_one_episode.append(env._get_jain_index())
            over_map_counter += env._get_over_map()
            over_map_one_episode.append(over_map_counter)
            disconnected_number_counter += env._get_dis()
            disconnected_number_one_episode.append(disconnected_number_counter)
            aver_cover_one_episode.append(env._get_aver_cover())
            energy_one_episode.append(env._get_energy())
            s_route = env._get_state()
            for route_i in range(0, FLAGS.num_uav * 2, 2):
                tmp = [s_route[route_i], s_route[route_i + 1]]
                route.append(tmp)
            accmulated_reward_one_episode.append(episode_reward_step)
            # if debug:
            #     print(time_end(begin, "others"))
            #     begin = time_begin()
            if done or terminal:
                model_name = arglist.save_dir.split('/')[-1] + '/'
                episode_number = int(train_step / arglist.max_episode_len)
                temp_efficiency = np.array(aver_cover_one_episode) * np.array(
                    j_index_one_episode) / np.array(energy_one_episode)
                draw_util.draw_single_episode(arglist.pictures_dir_train + model_name + "single_episode/",
                                              episode_number,
                                              temp_efficiency,
                                              aver_cover_one_episode,
                                              j_index_one_episode,
                                              energy_one_episode,
                                              disconnected_number_one_episode,
                                              over_map_one_episode,
                                              accmulated_reward_one_episode)

                # reset custom statistics variabl between episode and epoch---------------------------------------------
                instantaneous_accmulated_reward.append(accmulated_reward_one_episode[-1])
                j_index.append(j_index_one_episode[-1])
                instantaneous_dis.append(disconnected_number_one_episode[-1])
                instantaneous_out_the_map.append(over_map_one_episode[-1])
                aver_cover.append(aver_cover_one_episode[-1])
                energy_consumptions_for_test.append(energy_one_episode[-1])
                energy_efficiency.append(aver_cover_one_episode[-1] * j_index_one_episode[-1] / energy_one_episode[-1])
                episode_end_time = time.time()
                
                # plot fig
                efficiency_s = tf.get_default_session().run(efficiency_summary, feed_dict={efficiency: energy_efficiency[episode_number]})
                writer.add_summary(efficiency_s, global_step=episode_number)
                # plt fig
                print('Episode: %d - energy_consumptions: %s, efficiency: %s, time %s' % (train_step / arglist.max_episode_len,
                                                            str(env._get_energy_origin()),
                                                            str(energy_efficiency[-1]),
                                                            str(round(episode_end_time - episode_begin_time, 3))))
                episode_begin_time = episode_end_time
                # draw picture of this episode
                if arglist.draw_picture_test and aver_cover[-1] >= bl_coverage and j_index[-1] >= bl_jainindex \
                        and instantaneous_dis[-1] <= bl_loss:
                    episode_number_name = 'episode_' + str(episode_number)
                    draw_util.draw(episode_number_name, arglist.pictures_dir_test + model_name, energy_one_episode,
                                   route, actions, aver_cover_one_episode, j_index_one_episode,
                                   accmulated_reward_one_episode, disconnected_number_one_episode, over_map_one_episode,
                                   arglist.max_episode_len)

                j_index_one_episode = []
                over_map_counter = 0
                over_map_one_episode = []
                disconnected_number_counter = 0
                disconnected_number_one_episode = []
                aver_cover_one_episode = []
                energy_one_episode = []
                route = []
                episode_reward_step = 0
                accmulated_reward_one_episode = []

                if arglist.draw_picture_test:
                    if len(episode_rewards) % arglist.save_rate == 0:
                        episode_number_name = train_step / arglist.max_episode_len
                        draw_util.drawTest(episode_number_name,
                                           arglist.pictures_dir_train + model_name,
                                           energy_consumptions_for_test,
                                           aver_cover, j_index,
                                           instantaneous_accmulated_reward, instantaneous_dis, instantaneous_out_the_map
                                           , len(aver_cover), bl_coverage, bl_jainindex, bl_loss, energy_efficiency, False)
                # reset custom statistics variabl between episode and epoch---------------------------------------------

                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.draw_picture_test:
                if len(episode_rewards) > arglist.num_episodes:
                    break
                continue
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            p_loss_list = []
            q_loss_list = []
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                temp = agent.update(trainers, train_step)
                if temp is not None:
                    p_loss_list.append(temp[1])
                    q_loss_list.append(temp[0])
            if len(p_loss_list) == env.n:
                loss_s = tf.get_default_session().run(loss_summary,
                                                      feed_dict={p_losses_ph: p_loss_list,
                                                                 q_losses_ph: q_loss_list})
                writer2.add_summary(loss_s, global_step=train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                episode_number_name = train_step / arglist.max_episode_len
                save_dir_custom = arglist.save_dir + "/" + str(episode_number_name) + '/'
                # save_dir
                U.save_state(save_dir_custom, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
                # draw custom statistics picture when save the model----------------------------------------------------
                if arglist.draw_picture_train:
                    episode_number_name = train_step / arglist.max_episode_len
                    model_name = arglist.save_dir.split('/')[-1] + '/'
                    draw_util.draw_episodes(episode_number_name,
                                            arglist.pictures_dir_train + model_name + "all_episodes/",
                                            aver_cover,
                                            j_index,
                                            energy_consumptions_for_test,
                                            instantaneous_dis,
                                            instantaneous_out_the_map,
                                            energy_efficiency,
                                            instantaneous_accmulated_reward,
                                            len(aver_cover))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
Esempio n. 3
0
def train(arglist):
    debug = False
    multi_process = arglist.mp
    num_tasks = arglist.num_task  # 总共有多少个任务
    list_of_taskenv = []  # env list
    save_path = arglist.save_dir
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    with U.single_threaded_session():
        sess = tf.get_default_session()
        if debug:
            begin = time_begin()
        # 1.1创建每个任务的actor trainer和critic trainer
        env = make_env(arglist.scenario, reward_type=arglist.reward_type)
        env.set_map(
            sample_map(arglist.train_data_dir + arglist.train_data_name +
                       "_4.h5"))

        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        actor_0 = get_trainers(env,
                               "actor_",
                               num_adversaries,
                               obs_shape_n,
                               arglist,
                               type=0,
                               session=sess)

        # 1.2创建每个任务的actor trainer和critic trainer
        critic_list = []  # 所有任务critic的list
        actor_list = []
        for i in range(num_tasks):
            list_of_taskenv.append(
                make_env(arglist.scenario, reward_type=arglist.reward_type))
            if arglist.shared_lstm:
                critic_trainers = get_trainers(list_of_taskenv[i],
                                               "task_" + str(i + 1) + "_",
                                               num_adversaries,
                                               obs_shape_n,
                                               arglist,
                                               lstm_scope="actor_",
                                               actors=actor_0,
                                               type=1,
                                               session=sess)
            else:
                critic_trainers = get_trainers(list_of_taskenv[i],
                                               "task_" + str(i + 1) + "_",
                                               num_adversaries,
                                               obs_shape_n,
                                               arglist,
                                               actors=actor_0,
                                               type=1,
                                               session=sess)

            actor_trainers = get_trainers(list_of_taskenv[i],
                                          "task_" + str(i + 1) + "_",
                                          num_adversaries,
                                          obs_shape_n,
                                          arglist,
                                          actor_env_name="actor_",
                                          type=2,
                                          session=sess)
            actor_list.append(actor_trainers)
            critic_list.append(critic_trainers)

        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # 1.2 全局变量初始化
        episodes_rewards = [[0.0] for _ in range(num_tasks)
                            ]  # 每个元素为在一个episode中所有agents rewards的和
        # agent_rewards[i]中的每个元素记录单个agent在一个episode中所有rewards的和
        agent_rewards = [[[0.0] for _ in range(env.n)]
                         for _ in range(num_tasks)]
        final_ep_rewards = [[] for _ in range(num_tasks)
                            ]  # sum of rewards for training curve
        final_ep_ag_rewards = [[] for _ in range(num_tasks)
                               ]  # agent rewards for training curve

        energy_consumptions_for_test = [[] for _ in range(num_tasks)]
        j_index = [[] for _ in range(num_tasks)]
        aver_cover = [[] for _ in range(num_tasks)]
        instantaneous_dis = [[] for _ in range(num_tasks)]
        instantaneous_out_the_map = [[] for _ in range(num_tasks)]
        energy_efficiency = [[] for _ in range(num_tasks)]
        instantaneous_accmulated_reward = [[] for _ in range(num_tasks)]

        global_steps_tensor = tf.Variable(
            tf.zeros(num_tasks),
            trainable=False)  # global timesteps for each env
        global_steps_ph = tf.placeholder(tf.float32, [num_tasks])
        global_steps_assign_op = tf.assign(global_steps_tensor,
                                           global_steps_ph)
        model_number = int(arglist.num_train_episodes / arglist.save_rate)
        saver = tf.train.Saver(max_to_keep=model_number)

        efficiency_list = []
        for i in range(num_tasks):
            efficiency_list.append(
                tf.placeholder(tf.float32,
                               shape=None,
                               name="efficiency_placeholder" + str(i)))
        efficiency_summary_list = []
        for i in range(num_tasks):
            efficiency_summary_list.append(
                tf.summary.scalar("efficiency_%s" % i, efficiency_list[i]))
        writer = tf.summary.FileWriter("../summary/efficiency")

        # 1.3 局部变量初始化
        local_steps = np.zeros(num_tasks)  # local timesteps for each env
        energy_one_episode = [[] for _ in range(num_tasks)]
        j_index_one_episode = [[] for _ in range(num_tasks)]
        aver_cover_one_episode = [[] for _ in range(num_tasks)]
        over_map_counter = np.zeros(num_tasks)
        over_map_one_episode = [[] for _ in range(num_tasks)]
        disconnected_number_counter = np.zeros(num_tasks)
        disconnected_number_one_episode = [[] for _ in range(num_tasks)]
        episode_reward_step = np.zeros(
            num_tasks)  # 累加一个episode里每一步的所有智能体的平均reward
        accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
        route_one_episode = [[] for _ in range(num_tasks)]

        # Initialize
        U.initialize()
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            print(var)
        if debug:
            print(time_end(begin, "step3"))
            begin = time_begin()

        # 1.4 加载checkpoints
        if arglist.load_dir == "":
            arglist.load_dir = save_path
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        global_steps = tf.get_default_session().run(global_steps_tensor)

        # 1.5 初始化ENV
        obs_n_list = []
        for i in range(num_tasks):
            obs_n = list_of_taskenv[i].reset()
            list_of_taskenv[i].set_map(
                sample_map(arglist.train_data_dir + arglist.train_data_name +
                           "_4.h5"))
            obs_n_list.append(obs_n)

        if debug:
            print(time_end(begin, "initialize"))
            begin = time_begin()
        # 2.训练
        t_start = time.time()
        print('Starting iterations...')
        episode_start_time = time.time()
        state_dim = obs_shape_n[0][0]

        history_n = [[
            queue.Queue(arglist.history_length) for _ in range(env.n)
        ] for _ in range(num_tasks)]
        for i in range(num_tasks):
            for j in range(env.n):
                for _ in range(arglist.history_length):
                    history_n[i][j].put(obs_n_list[i][j])

        while True:
            for task_index in range(num_tasks):
                # 2.1更新环境,采集样本
                current_env = list_of_taskenv[task_index]
                # get action
                # action_n = [agent.action(obs) for agent, obs in zip(actor_0, obs_n_list[task_index])]
                action_n = [
                    agent.action(obs)
                    for agent, obs in zip(actor_0, history_n[task_index])
                ]
                # environment step
                new_obs_n, rew_n, done_n, info_n = current_env.step(action_n)
                current_critics = critic_list[task_index]
                current_actors = actor_list[task_index]
                if debug:
                    print(time_end(begin, "env.step"))
                    begin = time_begin()
                local_steps[task_index] += 1  # 更新局部计数器
                global_steps[task_index] += 1  # 更新全局计数器
                done = all(done_n)
                terminal = (local_steps[task_index] >= arglist.max_episode_len)
                # 收集experience
                for i in range(env.n):
                    current_critics[i].experience(obs_n_list[task_index][i],
                                                  action_n[i], rew_n[i],
                                                  new_obs_n[i], done_n[i],
                                                  terminal)

                # 更新obs
                obs_n_list[task_index] = new_obs_n
                for i in range(env.n):
                    history_n[task_index][i].get()
                    history_n[task_index][i].put(new_obs_n[i])
                # 更新reward
                for i, rew in enumerate(rew_n):
                    episodes_rewards[task_index][-1] += rew
                    agent_rewards[task_index][i][-1] += rew

                # 2.2,优化每一个任务的critic and acotr
                for critic in current_critics:
                    critic.preupdate()

                if multi_process:
                    jobs = []
                    # coord = tf.train.Coordinator()
                    for critic in current_critics:
                        #jobs.append(threading.Thread(target=critic.update, args=(current_critics, global_steps[task_index])))
                        critic.update(current_critics,
                                      global_steps[task_index])
                    # for j in jobs:
                    #     j.start()
                    # coord.join(jobs)

                    coord2 = tf.train.Coordinator()
                    jobs2 = []
                    for index, actor in enumerate(current_actors):
                        jobs2.append(
                            threading.Thread(target=actor.update,
                                             args=(
                                                 current_actors,
                                                 current_critics,
                                                 global_steps[task_index],
                                                 index,
                                             )))
                    for j in jobs2:
                        j.start()
                    coord2.join(jobs2)
                else:
                    for critic in current_critics:
                        critic.update(current_critics,
                                      global_steps[task_index])

                    for index, actor in enumerate(current_actors):
                        actor.update(current_actors, current_critics,
                                     global_steps[task_index], index)

                if debug:
                    print(time_end(begin, "update actor"))
                    begin = time_begin()

                # 2.4 记录和更新train信息
                # energy
                energy_one_episode[task_index].append(current_env.get_energy())
                # fair index
                j_index_one_episode[task_index].append(
                    current_env.get_jain_index())
                # coverage
                aver_cover_one_episode[task_index].append(
                    current_env.get_aver_cover())
                # over map counter
                over_map_counter[task_index] += current_env.get_over_map()
                over_map_one_episode[task_index].append(
                    over_map_counter[task_index])
                # disconnected counter
                disconnected_number_counter[task_index] += current_env.get_dis(
                )
                disconnected_number_one_episode[task_index].append(
                    disconnected_number_counter[task_index])
                # reward
                episode_reward_step[task_index] += np.mean(rew_n)
                accmulated_reward_one_episode[task_index].append(
                    episode_reward_step[task_index])
                route = current_env.get_agent_pos()
                route_one_episode[task_index].append(route)
                if debug:
                    print(time_end(begin, "others"))
                    begin = time_begin()

                episode_number = math.ceil(global_steps[task_index] /
                                           arglist.max_episode_len)
                if done or terminal:
                    model_name = save_path.split('/')[-2] + '/'
                    temp_efficiency = np.array(
                        aver_cover_one_episode[task_index]) * np.array(
                            j_index_one_episode[task_index]) / np.array(
                                energy_one_episode[task_index])
                    draw_util.draw_single_episode(
                        arglist.pictures_dir_train + model_name +
                        "single_episode_task_" + str(task_index) + "/",
                        episode_number, temp_efficiency,
                        aver_cover_one_episode[task_index],
                        j_index_one_episode[task_index],
                        energy_one_episode[task_index],
                        disconnected_number_one_episode[task_index],
                        over_map_one_episode[task_index],
                        accmulated_reward_one_episode[task_index])
                    # 记录每个episode的变量
                    energy_consumptions_for_test[task_index].append(
                        energy_one_episode[task_index][-1])  # energy
                    j_index[task_index].append(
                        j_index_one_episode[task_index][-1])  # fairness index
                    aver_cover[task_index].append(
                        aver_cover_one_episode[task_index][-1])  # coverage
                    instantaneous_dis[task_index].append(
                        disconnected_number_one_episode[task_index]
                        [-1])  # disconnected
                    instantaneous_out_the_map[task_index].append(
                        over_map_one_episode[task_index][-1])  # out of the map
                    instantaneous_accmulated_reward[task_index].append(
                        accmulated_reward_one_episode[task_index]
                        [-1])  # reward
                    energy_efficiency[task_index].append(
                        aver_cover_one_episode[task_index][-1] *
                        j_index_one_episode[task_index][-1] /
                        energy_one_episode[task_index][-1])  # efficiency

                    episode_end_time = time.time()
                    episode_time = episode_end_time - episode_start_time
                    episode_start_time = episode_end_time
                    with open(
                            arglist.pictures_dir_train + model_name + "task_" +
                            str(task_index) + '_train_info' + '.txt',
                            'a+') as f:
                        info = "Task index: %d, Episode number %d, energy consumption: %s, efficiency: %s, time: %s" % (
                            task_index, episode_number,
                            str(current_env.get_energy_origin()),
                            str(energy_efficiency[task_index][-1]),
                            str(round(episode_time, 3)))
                        f.write(info + "\n")
                    print(info)
                    # print('Task %d, Episode: %d - energy_consumptions: %s, efficiency: %s, time %s' % (
                    #     task_index,
                    #     episode_number,
                    #     str(current_env.get_energy_origin()),
                    #     str(energy_efficiency[task_index][-1]),
                    #     str(round(episode_time, 3))))

                    # 绘制reward曲线
                    efficiency_s = tf.get_default_session().run(
                        efficiency_summary_list[task_index],
                        feed_dict={
                            efficiency_list[task_index]:
                            energy_efficiency[task_index][-1]
                        })
                    writer.add_summary(efficiency_s,
                                       global_step=episode_number)

                    # 应该在每个重置每个episode中的局部变量--------------------------------------------
                    if task_index == num_tasks - 1:
                        energy_one_episode = [[] for _ in range(num_tasks)]
                        j_index_one_episode = [[] for _ in range(num_tasks)]
                        aver_cover_one_episode = [[] for _ in range(num_tasks)]
                        over_map_counter = np.zeros(num_tasks)
                        over_map_one_episode = [[] for _ in range(num_tasks)]
                        disconnected_number_counter = np.zeros(num_tasks)
                        disconnected_number_one_episode = [
                            [] for _ in range(num_tasks)
                        ]
                        episode_reward_step = np.zeros(num_tasks)
                        accmulated_reward_one_episode = [
                            [] for _ in range(num_tasks)
                        ]
                        route_one_episode = [[] for _ in range(num_tasks)]

                    # 重置局部变量
                    obs_n_list[task_index] = current_env.reset()  # 重置env
                    current_env.set_map(
                        sample_map(arglist.train_data_dir +
                                   arglist.train_data_name + "_4.h5"))
                    local_steps[task_index] = 0  # 重置局部计数器

                    # 更新全局变量
                    episodes_rewards[task_index].append(0)  # 添加新的元素
                    for reward in agent_rewards[task_index]:
                        reward.append(0)

                # save model, display training output
                if terminal and (episode_number % arglist.save_rate == 0):
                    tf.get_default_session().run(
                        global_steps_assign_op,
                        feed_dict={global_steps_ph: global_steps})
                    save_dir_custom = os.path.join(save_path,
                                                   str(episode_number),
                                                   'model.ckpt')
                    U.save_state(save_dir_custom, saver=saver)
                    # print statement depends on whether or not there are adversaries
                    # 最新save_rate个episode的平均reward
                    save_rate_mean_reward = np.mean(
                        episodes_rewards[task_index][-arglist.save_rate:])
                    if num_adversaries == 0:
                        print(
                            "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                            .format(global_steps[task_index], episode_number,
                                    save_rate_mean_reward,
                                    round(time.time() - t_start, 3)))
                    else:
                        print(
                            "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                            .format(global_steps[task_index], episode_number,
                                    save_rate_mean_reward, [
                                        np.mean(rew[-arglist.save_rate:])
                                        for rew in agent_rewards[task_index]
                                    ], round(time.time() - t_start, 3)))

                    t_start = time.time()

                    final_ep_rewards[task_index].append(save_rate_mean_reward)
                    for rew in agent_rewards[task_index]:
                        final_ep_ag_rewards[task_index].append(
                            np.mean(rew[-arglist.save_rate:]))

                    # 保存train曲线
                    if arglist.draw_picture_train:
                        # model_name = save_path.split('/')[-2] + '/'
                        draw_util.draw_episodes(
                            episode_number,
                            arglist.pictures_dir_train + model_name +
                            "all_episodes_task_" + str(task_index) + "/",
                            aver_cover[task_index], j_index[task_index],
                            energy_consumptions_for_test[task_index],
                            instantaneous_dis[task_index],
                            instantaneous_out_the_map[task_index],
                            energy_efficiency[task_index],
                            instantaneous_accmulated_reward[task_index],
                            len(aver_cover[task_index]))
                # saves final episode reward for plotting training curve later
                if episode_number > arglist.num_train_episodes:
                    mkdir(arglist.plots_dir)
                    rew_file_name = arglist.plots_dir + arglist.exp_name + str(
                        task_index) + '_rewards.pkl'
                    with open(rew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_rewards, fp)
                    agrew_file_name = arglist.plots_dir + arglist.exp_name + str(
                        task_index) + '_agrewards.pkl'
                    with open(agrew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_ag_rewards, fp)
                        print('...Finished total of {} episodes.'.format(
                            episode_number))
            if episode_number > arglist.num_train_episodes:
                break