def draw(i, path, energy, route, actions, ob_, sqrt_, r_, discon_, over_map, final_steps, Run=False):
    mkdir(path)
    label = 'epoch:' + str(FLAGS.max_epoch) + '\nUAV: ' + str(FLAGS.num_uav) + '\n map size: ' + str(FLAGS.size_map) \
            + '\n sensing range:' + str(FLAGS.radius) + '\n constraint:' + str(FLAGS.constrain)
    Fig = plt.figure(figsize=(18, 10))  # Create a `Figure` instance

    Ax = Fig.add_subplot(321)
    plt.xlabel('No. of epochs')
    plt.ylabel('Average attained coverage')
    Ax.plot(range(final_steps), ob_)

    Bx = Fig.add_subplot(322)
    plt.xlabel('No. of epochs')
    plt.ylabel('Jain\'s fairness index')
    Bx.plot(range(final_steps), sqrt_)

    Cx = Fig.add_subplot(323)
    plt.xlabel('No. of epochs')
    plt.ylabel('Accumulated reward')
    Cx.plot(range(final_steps), r_)

    Dx = Fig.add_subplot(324)
    plt.xlabel('No. of epochs')
    plt.ylabel('Accumulated times \nof disconnection')
    Dx.plot(range(final_steps), discon_, color='blue')

    Gx = Fig.add_subplot(326)
    plt.xlabel('No. of epochs')
    plt.ylabel('Accumulated times \nto fly outside the map')
    line_ob, = Gx.plot(range(final_steps), over_map, color='green')
    plt.legend([line_ob], [label])

    Hx = Fig.add_subplot(325)
    plt.xlabel('No. of epochs')
    plt.ylabel('Energy consumption')
    Hx.plot(range(final_steps), energy, color='green')

    Fig.subplots_adjust(hspace=0.4)
    Fig.savefig(path + '/pic_' + str(i) + '.png')
    plt.close()
def draw_episodes(i, path, coverage, j_index, energy, A_discon, A_over_map, A_efficiency, A_reward, final_steps):
    mkdir(path)
    steps = final_steps
    plt.figure(figsize=(18, 10))

    plt.subplot(4, 2, 1)
    plt.xlabel("No. of step")
    plt.ylabel("Energy efficiency")
    plt.plot(range(steps), A_efficiency, color='b')

    plt.subplot(4, 2, 2)
    plt.xlabel("No. of step")
    plt.ylabel("Fairness")
    plt.plot(range(steps), j_index, color='r')

    plt.subplot(4, 2, 3)
    plt.xlabel("No. of step")
    plt.ylabel("Coverage")
    plt.plot(range(steps), coverage, color='g')

    plt.subplot(4, 2, 4)
    plt.xlabel("No. of step")
    plt.ylabel("Energy")
    plt.plot(range(steps), energy, color='c')

    plt.subplot(4, 2, 5)
    plt.xlabel("No. of step")
    plt.ylabel("Disconnect")
    plt.plot(range(steps), A_discon, color='m')

    plt.subplot(4, 2, 6)
    plt.xlabel("No. of step")
    plt.ylabel("Over map counter")
    plt.plot(range(steps), A_over_map, color='y')

    plt.subplot(4, 2, 7)
    plt.xlabel("No. of step")
    plt.ylabel("Reward")
    plt.plot(range(steps), A_reward, color='k')

    plt.savefig(path + "episodes_" + str(i) + '.png')
    plt.close()
def draw_single_episode(path, episode_number, efficiency, coverage, fairness, energy, disconnect, over_map, reward):
    mkdir(path)
    steps = len(efficiency)
    plt.figure(figsize=(40, 20))

    plt.subplot(4, 2, 1)
    plt.xlabel("No. of step")
    plt.ylabel("Energy efficiency")
    plt.plot(range(steps), efficiency, color='b')

    plt.subplot(4, 2, 2)
    plt.xlabel("No. of step")
    plt.ylabel("Coverage")
    plt.plot(range(steps), coverage, color='g')

    plt.subplot(4, 2, 3)
    plt.xlabel("No. of step")
    plt.ylabel("Fairness")
    plt.plot(range(steps), fairness, color='r')

    plt.subplot(4, 2, 4)
    plt.xlabel("No. of step")
    plt.ylabel("Energy")
    plt.plot(range(steps), energy, color='c')

    plt.subplot(4, 2, 5)
    plt.xlabel("No. of step")
    plt.ylabel("Disconnect")
    plt.plot(range(steps), disconnect, color='m')

    plt.subplot(4, 2, 6)
    plt.xlabel("No. of step")
    plt.ylabel("Over map counter")
    plt.plot(range(steps), over_map, color='y')

    plt.subplot(4, 2, 7)
    plt.xlabel("No. of step")
    plt.ylabel("Reward")
    plt.plot(range(steps), reward, color='k')

    plt.savefig(path + "episode_" + str(episode_number) + '_info.png')
    plt.close()
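# A minimal usage sketch for the plotting helpers above, assuming this module's
# `mkdir` helper and `matplotlib.pyplot as plt` are available as elsewhere in the
# file. The output directory and the random series are illustrative only; this
# function is not called by the training or testing code.
def _demo_draw_single_episode():
    import numpy as np
    steps = 250
    rng = np.random.default_rng(0)
    coverage = rng.uniform(0.0, 1.0, steps)            # fake per-step coverage
    fairness = rng.uniform(0.0, 1.0, steps)            # fake Jain's fairness index
    energy = np.cumsum(rng.uniform(0.1, 0.2, steps))   # fake accumulated energy
    efficiency = coverage * fairness / energy          # same metric as train()/test()
    disconnect = np.cumsum(rng.integers(0, 2, steps))  # fake accumulated disconnections
    over_map = np.cumsum(rng.integers(0, 2, steps))    # fake accumulated out-of-map counts
    reward = np.cumsum(rng.uniform(-1.0, 1.0, steps))  # fake accumulated reward
    draw_single_episode("./pictures/demo/", 0, efficiency, coverage, fairness,
                        energy, disconnect, over_map, reward)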
def test(arglist):
    debug = False
    num_tasks = arglist.num_task  # total number of tasks
    list_of_taskenv = []  # env list
    load_path = arglist.load_dir
    with U.single_threaded_session():
        if debug:
            begin = time_begin()
        # 1.1 create the actor and critic trainers for each task
        trainers_list = []
        env = make_env(arglist.scenario, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        for i in range(num_tasks):
            list_of_taskenv.append(make_env(arglist.scenario))
            trainers = get_trainers(list_of_taskenv[i], "task_" + str(i + 1) + "_",
                                    num_adversaries, obs_shape_n, arglist)
            trainers_list.append(trainers)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))
        global_steps_tensor = tf.Variable(tf.zeros(num_tasks), trainable=False)  # global timesteps for each env
        global_steps_ph = tf.placeholder(tf.float32, [num_tasks])
        global_steps_assign_op = tf.assign(global_steps_tensor, global_steps_ph)
        model_number = int(arglist.num_episodes / arglist.save_rate)
        saver = tf.train.Saver(max_to_keep=model_number)
        efficiency_list = []
        for i in range(num_tasks):
            efficiency_list.append(tf.placeholder(tf.float32, shape=None, name="efficiency_placeholder" + str(i)))
        efficiency_summary_list = []
        for i in range(num_tasks):
            efficiency_summary_list.append(tf.summary.scalar("efficiency_%s" % i, efficiency_list[i]))
        writer = tf.summary.FileWriter("../summary/efficiency")
        # Initialize
        U.initialize()
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            print(var)
        if debug:
            print(time_end(begin, "initialize"))
            begin = time_begin()
        model_name = arglist.load_dir.split('/')[-2] + '/'
        mkdir(arglist.pictures_dir_test + model_name)
        model_index_step = 0
        model_number_total = arglist.train_num_episodes / arglist.save_rate
        max_model_index = 0
        max_average_energy_efficiency = 0
        while True:
            if model_index_step >= model_number_total:
                with open(arglist.pictures_dir_test + model_name + 'test_report' + '.txt', 'a+') as file:
                    report = '\nModel ' + str(max_model_index) + ' attained max average energy efficiency' + \
                             '\nMax average energy efficiency:' + str(max_average_energy_efficiency)
                    file.write(report)
                break
            else:
                model_index_step += 1
            # 1.4 load checkpoints
            if arglist.load_dir == "":
                arglist.load_dir = arglist.save_dir
            if arglist.display or arglist.restore or arglist.benchmark:
                print('Loading previous state...')
                model_load_dir = arglist.load_dir + str(model_index_step * arglist.save_rate - 1) + '/'
                U.load_state(model_load_dir)
                # global_steps = tf.get_default_session().run(global_steps_tensor)
            # 1.5 initialize the environments
            obs_n_list = []
            for i in range(num_tasks):
                obs_n = list_of_taskenv[i].reset()
                obs_n_list.append(obs_n)
            # 1.2 initialize global statistics
            episodes_rewards = [[0.0] for _ in range(num_tasks)]  # each element is the sum of all agents' rewards in one episode
            # each element of agent_rewards[i] records a single agent's total reward in one episode
            agent_rewards = [[[0.0] for _ in range(env.n)] for _ in range(num_tasks)]
            final_ep_rewards = [[] for _ in range(num_tasks)]  # sum of rewards for training curve
            final_ep_ag_rewards = [[] for _ in range(num_tasks)]  # agent rewards for training curve
            energy_consumptions_for_test = [[] for _ in range(num_tasks)]
            j_index = [[] for _ in range(num_tasks)]
            aver_cover = [[] for _ in range(num_tasks)]
            instantaneous_dis = [[] for _ in range(num_tasks)]
            instantaneous_out_the_map = [[] for _ in range(num_tasks)]
            energy_efficiency = [[] for _ in range(num_tasks)]
            instantaneous_accmulated_reward = [[] for _ in range(num_tasks)]
            # 1.3 initialize per-episode (local) statistics
            global_steps = np.zeros(num_tasks)  # global timesteps for each env
            local_steps = np.zeros(num_tasks)  # local timesteps for each env
            energy_one_episode = [[] for _ in range(num_tasks)]
            j_index_one_episode = [[] for _ in range(num_tasks)]
            aver_cover_one_episode = [[] for _ in range(num_tasks)]
            over_map_counter = np.zeros(num_tasks)
            over_map_one_episode = [[] for _ in range(num_tasks)]
            disconnected_number_counter = np.zeros(num_tasks)
            disconnected_number_one_episode = [[] for _ in range(num_tasks)]
            episode_reward_step = np.zeros(num_tasks)  # accumulates the per-step mean reward of all agents within an episode
            accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
            route_one_episode = [[] for _ in range(num_tasks)]
            bl_coverage = 0.8
            bl_jainindex = 0.8
            bl_loss = 100
            print('Starting iterations...')
            while True:
                for task_index in range(num_tasks):
                    # 2.1 step the environment and collect samples
                    current_env = list_of_taskenv[task_index]
                    current_trainers = trainers_list[task_index]
                    # get action from this task's trainers and observations
                    action_n = [agent.action(obs) for agent, obs in zip(current_trainers, obs_n_list[task_index])]
                    # environment step
                    new_obs_n, rew_n, done_n, info_n = current_env.step(action_n)
                    if debug:
                        print(time_end(begin, "env.step"))
                        begin = time_begin()
                    local_steps[task_index] += 1  # update the local step counter
                    global_steps[task_index] += 1  # update the global step counter
                    done = all(done_n)
                    terminal = (local_steps[task_index] >= arglist.max_episode_len)
                    # store experience
                    for i in range(env.n):
                        current_trainers[i].experience(obs_n_list[task_index][i], action_n[i], rew_n[i],
                                                       new_obs_n[i], done_n[i], terminal)
                    # update observations
                    obs_n_list[task_index] = new_obs_n
                    # update rewards
                    for i, rew in enumerate(rew_n):
                        episodes_rewards[task_index][-1] += rew
                        agent_rewards[task_index][i][-1] += rew
                    # energy
                    energy_one_episode[task_index].append(current_env.get_energy())
                    # fairness index
                    j_index_one_episode[task_index].append(current_env.get_jain_index())
                    # coverage
                    aver_cover_one_episode[task_index].append(current_env.get_aver_cover())
                    # over map counter
                    over_map_counter[task_index] += current_env.get_over_map()
                    over_map_one_episode[task_index].append(over_map_counter[task_index])
                    # disconnected counter
                    disconnected_number_counter[task_index] += current_env.get_dis()
                    disconnected_number_one_episode[task_index].append(disconnected_number_counter[task_index])
                    # reward
                    episode_reward_step[task_index] += np.mean(rew_n)
                    accmulated_reward_one_episode[task_index].append(episode_reward_step[task_index])
                    route = current_env.get_agent_pos()
                    route_one_episode[task_index].append(route)
                    if done or terminal:
                        # record per-episode statistics ------------------------------------------------
                        instantaneous_accmulated_reward[task_index].append(accmulated_reward_one_episode[task_index][-1])
                        j_index[task_index].append(j_index_one_episode[task_index][-1])
                        instantaneous_dis[task_index].append(disconnected_number_one_episode[task_index][-1])
                        instantaneous_out_the_map[task_index].append(over_map_one_episode[task_index][-1])
                        aver_cover[task_index].append(aver_cover_one_episode[task_index][-1])
                        energy_consumptions_for_test[task_index].append(energy_one_episode[task_index][-1])
                        energy_efficiency[task_index].append(
                            aver_cover_one_episode[task_index][-1] * j_index_one_episode[task_index][-1]
                            / energy_one_episode[task_index][-1])
                        episode_number = int(global_steps[task_index] / arglist.max_episode_len)
                        print('Episode: %d - energy_consumptions: %s ' % (episode_number,
                                                                          str(current_env.get_energy_origin())))
                        if task_index == num_tasks - 1:
                            # reset the per-episode statistics once every task has finished its episode
                            energy_one_episode = [[] for _ in range(num_tasks)]
                            j_index_one_episode = [[] for _ in range(num_tasks)]
                            aver_cover_one_episode = [[] for _ in range(num_tasks)]
                            over_map_counter = np.zeros(num_tasks)
                            over_map_one_episode = [[] for _ in range(num_tasks)]
                            disconnected_number_counter = np.zeros(num_tasks)
                            disconnected_number_one_episode = [[] for _ in range(num_tasks)]
                            episode_reward_step = np.zeros(num_tasks)
                            accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
                            route_one_episode = [[] for _ in range(num_tasks)]
                        # reset this task's environment and per-episode counters
                        obs_n_list[task_index] = current_env.reset()
                        local_steps[task_index] = 0
                        episodes_rewards[task_index].append(0)
                        for reward in agent_rewards[task_index]:
                            reward.append(0)
                        if arglist.draw_picture_test:
                            if len(episodes_rewards[task_index]) % arglist.save_rate == 0:
                                if np.mean(energy_efficiency[task_index]) > max_average_energy_efficiency:
                                    max_model_index = model_index_step * arglist.save_rate - 1
                                    max_average_energy_efficiency = np.mean(energy_efficiency[task_index])
                                with open(arglist.pictures_dir_test + model_name + 'test_report' + '.txt', 'a+') as file:
                                    report = '\nModel-' + str(model_index_step * arglist.save_rate - 1) + \
                                             '-testing ' + str(arglist.num_episodes) + " episodes' result:" + \
                                             '\nAverage average attained coverage: ' + str(np.mean(aver_cover[task_index])) + \
                                             "\nAverage Jain's fairness index: " + str(np.mean(j_index[task_index])) + \
                                             '\nAverage normalized average energy consumptions:' + \
                                             str(np.mean(energy_consumptions_for_test[task_index])) + \
                                             '\nAverage energy efficiency:' + str(np.mean(energy_efficiency[task_index])) + '\n'
                                    file.write(report)
                                # argument order follows the drawTest signature below
                                draw_util.drawTest(model_index_step * arglist.save_rate - 1,
                                                   arglist.pictures_dir_test + model_name,
                                                   energy_efficiency[task_index],
                                                   energy_consumptions_for_test[task_index],
                                                   aver_cover[task_index], j_index[task_index],
                                                   instantaneous_accmulated_reward[task_index],
                                                   instantaneous_dis[task_index],
                                                   instantaneous_out_the_map[task_index],
                                                   len(aver_cover[task_index]),
                                                   bl_coverage, bl_jainindex, bl_loss, False)
                        # for displaying learned policies
                        if arglist.draw_picture_test:
                            if len(episodes_rewards[task_index]) > arglist.num_episodes:
                                break
                            continue
                # saves final episode reward for plotting training curve later
                if len(episodes_rewards[task_index]) > arglist.num_episodes:
                    rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                    with open(rew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_rewards, fp)
                    agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                    with open(agrew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_ag_rewards, fp)
                    print('...Finished total of {} episodes.'.format(len(episodes_rewards[task_index])))
                    break
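# Hedged helper sketch: test(), train(), and random_maddpg_test() all score an
# episode as coverage * Jain's fairness / accumulated energy, taking the last
# per-step value of each series. This standalone version only restates that
# arithmetic; the parameter names are illustrative and nothing in this file
# calls it.
def _energy_efficiency_sketch(coverage_series, fairness_series, energy_series):
    # Guard against an empty episode or zero energy to avoid a ZeroDivisionError.
    if len(energy_series) == 0 or energy_series[-1] == 0:
        return 0.0
    return coverage_series[-1] * fairness_series[-1] / energy_series[-1]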
def train(arglist, restore_model_number):
    debug = False
    multi_process = arglist.mp
    num_tasks = arglist.num_task_transfer  # total number of tasks
    list_of_taskenv = []  # env list
    save_path = arglist.save_dir
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    with U.single_threaded_session():
        sess = tf.get_default_session()
        if debug:
            begin = time_begin()
        # 1.1 create the shared actor trainers
        env = make_env(arglist.scenario, reward_type=arglist.reward_type)
        env.set_map(sample_map(arglist.test_data_dir + arglist.test_data_name + "_1.h5"))
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        actor_0 = get_trainers(env, "actor_", num_adversaries, obs_shape_n, arglist, type=0, session=sess)
        # 1.2 create the critic and actor trainers for each task
        critic_list = []  # critics of every task
        actor_list = []
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))
        # 1.2 initialize global statistics
        episodes_rewards = [[0.0] for _ in range(num_tasks)]  # each element is the sum of all agents' rewards in one episode
        # each element of agent_rewards[i] records a single agent's total reward in one episode
        agent_rewards = [[[0.0] for _ in range(env.n)] for _ in range(num_tasks)]
        final_ep_rewards = [[] for _ in range(num_tasks)]  # sum of rewards for training curve
        final_ep_ag_rewards = [[] for _ in range(num_tasks)]  # agent rewards for training curve
        energy_consumptions_for_test = [[] for _ in range(num_tasks)]
        j_index = [[] for _ in range(num_tasks)]
        aver_cover = [[] for _ in range(num_tasks)]
        instantaneous_dis = [[] for _ in range(num_tasks)]
        instantaneous_out_the_map = [[] for _ in range(num_tasks)]
        energy_efficiency = [[] for _ in range(num_tasks)]
        instantaneous_accmulated_reward = [[] for _ in range(num_tasks)]
        model_number = int(arglist.num_train_episodes / arglist.save_rate)
        saver = tf.train.Saver(max_to_keep=model_number)
        # 1.3 initialize per-episode (local) statistics
        global_steps = np.zeros(num_tasks)  # global timesteps for each env
        local_steps = np.zeros(num_tasks)  # local timesteps for each env
        energy_one_episode = [[] for _ in range(num_tasks)]
        j_index_one_episode = [[] for _ in range(num_tasks)]
        aver_cover_one_episode = [[] for _ in range(num_tasks)]
        over_map_counter = np.zeros(num_tasks)
        over_map_one_episode = [[] for _ in range(num_tasks)]
        disconnected_number_counter = np.zeros(num_tasks)
        disconnected_number_one_episode = [[] for _ in range(num_tasks)]
        episode_reward_step = np.zeros(num_tasks)  # accumulates the per-step mean reward of all agents within an episode
        accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
        route_one_episode = [[] for _ in range(num_tasks)]
        if debug:
            print(time_end(begin, "step3"))
            begin = time_begin()
        # 1.4 load checkpoints
        if arglist.load_dir == "":
            arglist.load_dir = os.path.join(save_path, str(restore_model_number), "model.ckpt")
        if arglist.transfer_restore:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        for i in range(num_tasks):
            list_of_taskenv.append(make_env(arglist.scenario, reward_type=arglist.reward_type))
            critic_trainers = get_trainers(list_of_taskenv[i], "task_" + str(i + 1) + "_", num_adversaries,
                                           obs_shape_n, arglist, actors=actor_0, type=1, session=sess)
            actor_trainers = get_trainers(list_of_taskenv[i], "task_" + str(i + 1) + "_", num_adversaries,
                                          obs_shape_n, arglist, actor_env_name="actor_", type=2, session=sess)
            actor_list.append(actor_trainers)
            critic_list.append(critic_trainers)
        # Initialize
        U.initialize()
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            print(var)
        # 1.5 initialize the environments
        obs_n_list = []
        for i in range(num_tasks):
            obs_n = list_of_taskenv[i].reset()
            list_of_taskenv[i].set_map(
                sample_map(arglist.test_data_dir + arglist.test_data_name + "_" + str(i + 1) + ".h5"))
            obs_n_list.append(obs_n)
        if debug:
            print(time_end(begin, "initialize"))
            begin = time_begin()
        # 2. training
        t_start = time.time()
        print('Starting iterations...')
        episode_start_time = time.time()
        state_dim = obs_shape_n[0][0]
        history_n = [[queue.Queue(arglist.history_length) for _ in range(env.n)] for _ in range(num_tasks)]
        for i in range(num_tasks):
            for j in range(env.n):
                for _ in range(arglist.history_length):
                    history_n[i][j].put(obs_n_list[i][j])
        while True:
            for task_index in range(num_tasks):
                # 2.1 step the environment and collect samples
                current_env = list_of_taskenv[task_index]
                # get action
                # action_n = [agent.action(obs) for agent, obs in zip(actor_0, obs_n_list[task_index])]
                action_n = [agent.action(obs) for agent, obs in zip(actor_0, history_n[task_index])]
                # environment step
                new_obs_n, rew_n, done_n, info_n = current_env.step(action_n)
                current_critics = critic_list[task_index]
                current_actors = actor_list[task_index]
                if debug:
                    print(time_end(begin, "env.step"))
                    begin = time_begin()
                local_steps[task_index] += 1  # update the local step counter
                global_steps[task_index] += 1  # update the global step counter
                done = all(done_n)
                terminal = (local_steps[task_index] >= arglist.max_episode_len)
                # store experience
                for i in range(env.n):
                    current_critics[i].experience(obs_n_list[task_index][i], action_n[i], rew_n[i],
                                                  new_obs_n[i], done_n[i], terminal)
                # update observations
                obs_n_list[task_index] = new_obs_n
                for i in range(env.n):
                    history_n[task_index][i].get()
                    history_n[task_index][i].put(new_obs_n[i])
                # update rewards
                for i, rew in enumerate(rew_n):
                    episodes_rewards[task_index][-1] += rew
                    agent_rewards[task_index][i][-1] += rew
                # 2.2 optimize each task's critics and actors
                for critic in current_critics:
                    critic.preupdate()
                for critic in current_critics:
                    critic.update(current_critics, global_steps[task_index])
                for index, actor in enumerate(current_actors):
                    actor.update(current_actors, current_critics, global_steps[task_index], index)
                if debug:
                    print(time_end(begin, "update actor"))
                    begin = time_begin()
                # 2.4 record and update training statistics
                # energy
                energy_one_episode[task_index].append(current_env.get_energy())
                # fairness index
                j_index_one_episode[task_index].append(current_env.get_jain_index())
                # coverage
                aver_cover_one_episode[task_index].append(current_env.get_aver_cover())
                # over map counter
                over_map_counter[task_index] += current_env.get_over_map()
                over_map_one_episode[task_index].append(over_map_counter[task_index])
                # disconnected counter
                disconnected_number_counter[task_index] += current_env.get_dis()
                disconnected_number_one_episode[task_index].append(disconnected_number_counter[task_index])
                # reward
                episode_reward_step[task_index] += np.mean(rew_n)
                accmulated_reward_one_episode[task_index].append(episode_reward_step[task_index])
                route = current_env.get_agent_pos()
                route_one_episode[task_index].append(route)
                if debug:
                    print(time_end(begin, "others"))
                    begin = time_begin()
                episode_number = math.ceil(global_steps[task_index] / arglist.max_episode_len)
                if done or terminal:
                    model_name = save_path.split('/')[-2] + '/'
                    temp_efficiency = np.array(aver_cover_one_episode[task_index]) * \
                                      np.array(j_index_one_episode[task_index]) / \
                                      np.array(energy_one_episode[task_index])
                    draw_util.draw_single_episode(
                        arglist.pictures_dir_transfer_train + model_name + "single_episode_task_" + str(task_index) + "/",
                        episode_number,
                        temp_efficiency,
                        aver_cover_one_episode[task_index],
                        j_index_one_episode[task_index],
                        energy_one_episode[task_index],
                        disconnected_number_one_episode[task_index],
                        over_map_one_episode[task_index],
                        accmulated_reward_one_episode[task_index])
                    # record per-episode statistics
                    energy_consumptions_for_test[task_index].append(energy_one_episode[task_index][-1])  # energy
                    j_index[task_index].append(j_index_one_episode[task_index][-1])  # fairness index
                    aver_cover[task_index].append(aver_cover_one_episode[task_index][-1])  # coverage
                    instantaneous_dis[task_index].append(disconnected_number_one_episode[task_index][-1])  # disconnected
                    instantaneous_out_the_map[task_index].append(over_map_one_episode[task_index][-1])  # out of the map
                    instantaneous_accmulated_reward[task_index].append(accmulated_reward_one_episode[task_index][-1])  # reward
                    energy_efficiency[task_index].append(
                        aver_cover_one_episode[task_index][-1] * j_index_one_episode[task_index][-1]
                        / energy_one_episode[task_index][-1])  # efficiency
                    episode_end_time = time.time()
                    episode_time = episode_end_time - episode_start_time
                    episode_start_time = episode_end_time
                    with open(arglist.pictures_dir_transfer_train + model_name + "task_" + str(task_index) +
                              '_train_info' + '.txt', 'a+') as f:
                        info = "Task index: %d, Episode number %d, energy consumption: %s, efficiency: %s, time: %s" % (
                            task_index, episode_number,
                            str(current_env.get_energy_origin()),
                            str(energy_efficiency[task_index][-1]),
                            str(round(episode_time, 3)))
                        f.write(info + "\n")
                    print(info)
                    # reset the per-episode local variables ----------------------------------------
                    if task_index == num_tasks - 1:
                        energy_one_episode = [[] for _ in range(num_tasks)]
                        j_index_one_episode = [[] for _ in range(num_tasks)]
                        aver_cover_one_episode = [[] for _ in range(num_tasks)]
                        over_map_counter = np.zeros(num_tasks)
                        over_map_one_episode = [[] for _ in range(num_tasks)]
                        disconnected_number_counter = np.zeros(num_tasks)
                        disconnected_number_one_episode = [[] for _ in range(num_tasks)]
                        episode_reward_step = np.zeros(num_tasks)
                        accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
                        route_one_episode = [[] for _ in range(num_tasks)]
                    # reset this task's environment and per-episode counters
                    obs_n_list[task_index] = current_env.reset()  # reset the environment
                    current_env.set_map(
                        sample_map(arglist.test_data_dir + arglist.test_data_name + "_" + str(task_index + 1) + ".h5"))
                    local_steps[task_index] = 0  # reset the local step counter
                    # update global statistics
                    episodes_rewards[task_index].append(0)  # start a new element for the next episode
                    for reward in agent_rewards[task_index]:
                        reward.append(0)
                # save model, display training output
                if terminal and (episode_number % arglist.save_rate == 0):
                    # tf.get_default_session().run(global_steps_assign_op, feed_dict={global_steps_ph: global_steps})
                    # save_dir_custom = os.path.join(save_path, str(episode_number), 'model.ckpt')
                    # U.save_state(save_dir_custom, saver=saver)
                    # print statement depends on whether or not there are adversaries
                    # mean reward over the last save_rate episodes
                    save_rate_mean_reward = np.mean(episodes_rewards[task_index][-arglist.save_rate:])
                    if num_adversaries == 0:
                        print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                            global_steps[task_index], episode_number, save_rate_mean_reward,
                            round(time.time() - t_start, 3)))
                    else:
                        print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                            global_steps[task_index], episode_number, save_rate_mean_reward,
                            [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards[task_index]],
                            round(time.time() - t_start, 3)))
                    t_start = time.time()
                    final_ep_rewards[task_index].append(save_rate_mean_reward)
                    for rew in agent_rewards[task_index]:
                        final_ep_ag_rewards[task_index].append(np.mean(rew[-arglist.save_rate:]))
                    # save training curves
                    if arglist.draw_picture_train:
                        # model_name = save_path.split('/')[-2] + '/'
                        draw_util.draw_episodes(
                            episode_number,
                            arglist.pictures_dir_transfer_train + model_name + "all_episodes_task_" + str(task_index) + "/",
                            aver_cover[task_index],
                            j_index[task_index],
                            energy_consumptions_for_test[task_index],
                            instantaneous_dis[task_index],
                            instantaneous_out_the_map[task_index],
                            energy_efficiency[task_index],
                            instantaneous_accmulated_reward[task_index],
                            len(aver_cover[task_index]))
                # saves final episode reward for plotting training curve later
                if episode_number > arglist.num_train_episodes:
                    mkdir(arglist.plots_dir)
                    rew_file_name = arglist.plots_dir + arglist.exp_name + str(task_index) + '_rewards.pkl'
                    with open(rew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_rewards, fp)
                    agrew_file_name = arglist.plots_dir + arglist.exp_name + str(task_index) + '_agrewards.pkl'
                    with open(agrew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_ag_rewards, fp)
                    print('...Finished total of {} episodes.'.format(episode_number))
            if episode_number > arglist.num_train_episodes:
                break
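# Hedged sketch (not used by train()): the rolling observation history above is a
# fixed-length FIFO per task and per agent -- a queue.Queue pre-filled with the
# first observation, then one get()/put() per step. collections.deque(maxlen=...)
# expresses the same idea more compactly; the helper name and arguments below are
# illustrative only.
def _history_buffer_sketch(first_obs, new_obs_stream, history_length=4):
    from collections import deque
    import numpy as np
    history = deque([first_obs] * history_length, maxlen=history_length)  # pre-fill, as in train()
    stacked = []
    for obs in new_obs_stream:
        history.append(obs)  # a full deque drops its oldest element automatically
        stacked.append(np.asarray(history))  # shape (history_length, obs_dim), fed to the actor
    return stacked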
def drawTest(i, path, energy_efficiency, energy, coverage, jainindex, r_, discon_, over_map, final_steps,
             BL_coverage, BL_jain, BL_loss, Run=False):
    mkdir(path)
    label = 'epoch:' + str(FLAGS.max_epoch) + '\nUAV: ' + str(FLAGS.num_uav) + '\n map size: ' + str(FLAGS.size_map) \
            + '\n sensing range:' + str(FLAGS.radius) + '\n constraint:' + str(FLAGS.constrain) \
            + '\n average energy efficiency:' + str(np.mean(energy_efficiency)) \
            + '\n max energy efficiency:' + str(np.max(energy_efficiency))
    Fig = plt.figure(figsize=(18, 10))  # Create a `Figure` instance

    Ax = Fig.add_subplot(421)
    plt.xlabel('No. of episodes')
    plt.ylabel('Average attained coverage')
    Ax.plot(range(final_steps), coverage)
    Ax.plot([BL_coverage] * final_steps)

    Bx = Fig.add_subplot(422)
    plt.xlabel('No. of episodes')
    plt.ylabel('Jain\'s fairness index')
    Bx.plot(range(final_steps), jainindex)
    Bx.plot([BL_jain] * final_steps)

    Cx = Fig.add_subplot(423)
    plt.xlabel('No. of episodes')
    plt.ylabel('Instantaneous reward')
    Cx.plot(range(final_steps), r_)

    Dx = Fig.add_subplot(424)
    plt.xlabel('No. of episodes')
    plt.ylabel('Instantaneous times \nof disconnection')
    Dx.plot(range(final_steps), discon_, color='blue')
    Dx.plot([BL_loss] * final_steps)

    Gx = Fig.add_subplot(426)
    plt.xlabel('No. of episodes')
    plt.ylabel('Accumulated times \nto fly outside the map')
    line_ob, = Gx.plot(range(final_steps), over_map, color='green')
    plt.legend([line_ob], [label])

    Hx = Fig.add_subplot(425)
    plt.xlabel('No. of episodes')
    plt.ylabel('Average energy consumption')
    Hx.plot(range(final_steps), energy, color='green')

    Ix = Fig.add_subplot(427)
    plt.xlabel('No. of episodes')
    plt.ylabel('Energy efficiency')
    Ix.plot(range(final_steps), energy_efficiency, color='magenta')

    Fig.subplots_adjust(hspace=0.4)
    Fig.savefig(path + '/pic_' + str(i) + '.png')
    plt.close()
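# The fairness curves plotted above come from the environment's get_jain_index().
# For reference, a standalone sketch of the standard Jain's fairness index,
# J(x) = (sum x)^2 / (n * sum x^2), which ranges from 1/n (all coverage on one
# cell) to 1 (perfectly even coverage). Illustrative only; nothing in this file
# calls it.
def _jain_fairness_sketch(coverage_per_cell):
    import numpy as np
    x = np.asarray(coverage_per_cell, dtype=np.float64)
    denom = len(x) * np.sum(x ** 2)
    return float(np.sum(x) ** 2 / denom) if denom > 0 else 0.0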
def random_maddpg_test(arglist):
    debug = False
    num_tasks = arglist.num_task_transfer  # total number of tasks
    list_of_taskenv = []  # env list
    graph = tf.Graph()
    with graph.as_default():
        with U.single_threaded_session():
            if debug:
                begin = time_begin()
            # 1.1 create the shared (common) actors
            env = make_env(arglist.scenario, reward_type=arglist.reward_type)
            env.set_map(sample_map(arglist.test_data_dir + arglist.test_data_name + "_1.h5"))
            # Create agent trainers
            obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
            num_adversaries = min(env.n, arglist.num_adversaries)
            actors = get_trainers(env, "actor_", num_adversaries, obs_shape_n, arglist, type=0)
            for i in range(num_tasks):
                list_of_taskenv.append(make_env(arglist.scenario, reward_type=arglist.reward_type))
            print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))
            # 1.2 Initialize
            U.initialize()
            model_name = arglist.load_dir.split('/')[-2] + '/'
            path = arglist.pictures_dir_transfer_test + model_name
            mkdir(path)
            for i in range(num_tasks):
                mkdir(os.path.join(path, "task_" + str(i)))
            # 2.1 load checkpoints
            # model_load_dir = os.path.join(arglist.load_dir, str(model_number * arglist.save_rate), 'model.ckpt')
            # print('From ', model_load_dir, ' Loading previous state...')
            # U.load_state(model_load_dir)
            # 3.1 initialize global statistics
            global_steps = np.zeros(num_tasks)  # global timesteps for each env
            episodes_rewards = [[0.0] for _ in range(num_tasks)]  # each element is the sum of all agents' rewards in one episode
            # each element of agent_rewards[i] records a single agent's total reward in one episode
            agent_rewards = [[[0.0] for _ in range(env.n)] for _ in range(num_tasks)]
            energy_consumptions_for_test = [[] for _ in range(num_tasks)]
            j_index = [[] for _ in range(num_tasks)]
            aver_cover = [[] for _ in range(num_tasks)]
            instantaneous_dis = [[] for _ in range(num_tasks)]
            instantaneous_out_the_map = [[] for _ in range(num_tasks)]
            energy_efficiency = [[] for _ in range(num_tasks)]
            instantaneous_accmulated_reward = [[] for _ in range(num_tasks)]
            # 3.2 initialize per-episode (local) statistics
            local_steps = np.zeros(num_tasks)  # local timesteps for each env
            energy_one_episode = [[] for _ in range(num_tasks)]
            j_index_one_episode = [[] for _ in range(num_tasks)]
            aver_cover_one_episode = [[] for _ in range(num_tasks)]
            over_map_counter = np.zeros(num_tasks)
            over_map_one_episode = [[] for _ in range(num_tasks)]
            disconnected_number_counter = np.zeros(num_tasks)
            disconnected_number_one_episode = [[] for _ in range(num_tasks)]
            episode_reward_step = np.zeros(num_tasks)  # accumulates the per-step mean reward of all agents within an episode
            accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
            route_one_episode = [[] for _ in range(num_tasks)]
            bl_coverage = 0.8
            bl_jainindex = 0.8
            bl_loss = 100
            # 3.3 initialize the environments
            obs_n_list = []
            for i in range(num_tasks):
                obs_n = list_of_taskenv[i].reset()
                list_of_taskenv[i].set_map(
                    sample_map(arglist.test_data_dir + arglist.test_data_name + "_" + str(i + 1) + ".h5", random=False))
                obs_n_list.append(obs_n)
            # 3.4 observation history buffers
            history_n = [[queue.Queue(arglist.history_length) for _ in range(env.n)] for _ in range(num_tasks)]
            for i in range(num_tasks):
                for j in range(env.n):
                    for _ in range(arglist.history_length):
                        history_n[i][j].put(obs_n_list[i][j])
            # 4. test
            episode_start_time = time.time()
            print('Starting iterations...')
            episode_number = 0
            while True:
                for task_index in range(num_tasks):
                    # 3.1 step the environment
                    current_env = list_of_taskenv[task_index]
                    # get action
                    action_n = [agent.action(obs) for agent, obs in zip(actors, history_n[task_index])]
                    # environment step
                    new_obs_n, rew_n, done_n, info_n = current_env.step(action_n)
                    local_steps[task_index] += 1  # update the local step counter
                    global_steps[task_index] += 1  # update the global step counter
                    done = all(done_n)
                    terminal = (local_steps[task_index] >= arglist.max_episode_len)
                    # update observations
                    obs_n_list[task_index] = new_obs_n
                    # update rewards
                    for i, rew in enumerate(rew_n):
                        episodes_rewards[task_index][-1] += rew
                        agent_rewards[task_index][i][-1] += rew
                    # energy
                    energy_one_episode[task_index].append(current_env.get_energy())
                    # fairness index
                    j_index_one_episode[task_index].append(current_env.get_jain_index())
                    # coverage
                    aver_cover_one_episode[task_index].append(current_env.get_aver_cover())
                    # over map counter
                    over_map_counter[task_index] += current_env.get_over_map()
                    over_map_one_episode[task_index].append(over_map_counter[task_index])
                    # disconnected counter
                    disconnected_number_counter[task_index] += current_env.get_dis()
                    disconnected_number_one_episode[task_index].append(disconnected_number_counter[task_index])
                    # reward
                    episode_reward_step[task_index] += np.mean(rew_n)
                    accmulated_reward_one_episode[task_index].append(episode_reward_step[task_index])
                    route = current_env.get_agent_pos()
                    route_one_episode[task_index].append(route)
                    episode_number = math.ceil(global_steps[task_index] / arglist.max_episode_len)
                    if done or terminal:
                        # record per-episode statistics
                        energy_consumptions_for_test[task_index].append(energy_one_episode[task_index][-1])  # energy
                        j_index[task_index].append(j_index_one_episode[task_index][-1])  # fairness index
                        aver_cover[task_index].append(aver_cover_one_episode[task_index][-1])  # coverage
                        instantaneous_dis[task_index].append(disconnected_number_one_episode[task_index][-1])  # disconnected
                        instantaneous_out_the_map[task_index].append(over_map_one_episode[task_index][-1])  # out of the map
                        instantaneous_accmulated_reward[task_index].append(accmulated_reward_one_episode[task_index][-1])  # reward
                        energy_efficiency[task_index].append(
                            aver_cover_one_episode[task_index][-1] * j_index_one_episode[task_index][-1]
                            / energy_one_episode[task_index][-1])  # efficiency
                        episode_end_time = time.time()
                        episode_time = episode_end_time - episode_start_time
                        episode_start_time = episode_end_time
                        print('Task %d, Episode: %d - energy_consumptions: %s, efficiency: %s, time %s' % (
                            task_index, episode_number, str(current_env.get_energy_origin()),
                            str(energy_efficiency[task_index][-1]), str(round(episode_time, 3))))
                        current_path = os.path.join(path, "task_" + str(task_index))
                        if arglist.draw_picture_test:
                            file_path = os.path.join(current_path, "random_model_test.log")
                            if episode_number == arglist.num_test_episodes:
                                report = '\nOK===============report=====================' \
                                         '\nRandom maddpg Model-testing ' + str(arglist.num_test_episodes) + \
                                         " episodes' result:" \
                                         '\n!!!Max energy efficiency: ' + str(np.max(energy_efficiency[task_index])) + \
                                         '\n!!!Average energy efficiency:' + str(np.mean(energy_efficiency[task_index])) + \
                                         '\nAverage average attained coverage: ' + str(np.mean(aver_cover[task_index])) + \
                                         "\nAverage Jain's fairness index: " + str(np.mean(j_index[task_index])) + \
                                         '\nAverage normalized average energy consumptions:' + \
                                         str(np.mean(energy_consumptions_for_test[task_index])) + \
                                         "\n" + "==========================end=============================\n"
                                draw_util.drawTest("random", current_path + "random_maddpg",
                                                   energy_efficiency[task_index],
                                                   energy_consumptions_for_test[task_index],
                                                   aver_cover[task_index], j_index[task_index],
                                                   instantaneous_accmulated_reward[task_index],
                                                   instantaneous_dis[task_index],
                                                   instantaneous_out_the_map[task_index],
                                                   len(aver_cover[task_index]),
                                                   bl_coverage, bl_jainindex, bl_loss, False)
                            else:
                                report = '\nRandom maddpg Model-episode ' + str(episode_number) + ' result:' + \
                                         '\n!!!Energy efficiency: ' + str(energy_efficiency[task_index][-1]) + \
                                         '\nAverage attained coverage: ' + str(aver_cover[task_index][-1]) + \
                                         "\nJain's fairness index: " + str(j_index[task_index][-1]) + \
                                         '\nnormalized average energy consumptions: ' + \
                                         str(energy_consumptions_for_test[task_index][-1]) + "\n"
                            with open(file_path, 'a+') as file:
                                file.write(report)
                        # reset custom statistics variables between episode and epoch ------------------
                        if task_index == num_tasks - 1:
                            energy_one_episode = [[] for _ in range(num_tasks)]
                            j_index_one_episode = [[] for _ in range(num_tasks)]
                            aver_cover_one_episode = [[] for _ in range(num_tasks)]
                            over_map_counter = np.zeros(num_tasks)
                            over_map_one_episode = [[] for _ in range(num_tasks)]
                            disconnected_number_counter = np.zeros(num_tasks)
                            disconnected_number_one_episode = [[] for _ in range(num_tasks)]
                            episode_reward_step = np.zeros(num_tasks)
                            accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
                            route_one_episode = [[] for _ in range(num_tasks)]
                        # reset this task's environment and per-episode counters
                        obs_n_list[task_index] = current_env.reset()  # reset the environment
                        current_env.set_map(
                            sample_map(arglist.test_data_dir + arglist.test_data_name + "_" + str(task_index + 1) + ".h5",
                                       random=False))
                        local_steps[task_index] = 0  # reset the local step counter
                        # update global statistics
                        episodes_rewards[task_index].append(0)  # start a new element for the next episode
                        for reward in agent_rewards[task_index]:
                            reward.append(0)
                if episode_number > arglist.num_test_episodes:
                    break
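# Purely illustrative sketch of the kind of loader sample_map() is assumed to be:
# it reads one 2-D map array out of an HDF5 file such as "<test_data_name>_1.h5",
# optionally picking a dataset at random (the random=False calls above always take
# the same one). The dataset layout and key handling here are assumptions, not the
# actual sample_map() implementation used by this project.
def _sample_map_sketch(h5_path, random=True):
    import h5py
    import numpy as np
    with h5py.File(h5_path, 'r') as f:
        keys = sorted(f.keys())                              # dataset names stored in the file
        key = np.random.choice(keys) if random else keys[0]  # random draw or deterministic pick
        return np.array(f[key])                              # load the map as a numpy array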