def train(arglist):
    with U.make_session(8):
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [[29] for i in range(env.n)]
        obs_map_shape_n = [[56 * 86] for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, obs_map_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env._reset()
        episode_step = 13000  # NOTE: starts at 13000 (appears to be a restore offset); reset to 0 at episode end
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # Get actions, with random swaps injected for exploration
            # action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            action_n = []
            for agent, obs in zip(trainers, obs_n):
                t = agent.action(obs)
                d = np.argmax(t)
                if d % 5 == 4:
                    # With probability 4/21, swap the greedy entry with one of
                    # the four entries just before it
                    rt = random.randint(0, 20)
                    if rt < 4:
                        swap = t[d]
                        t[d] = t[d - rt - 1]
                        t[d - rt - 1] = swap
                else:
                    # With probability 4/81, swap the greedy entry with one of
                    # the first four entries of its 5-wide block
                    rt = random.randint(0, 80)
                    if rt < 4:
                        swap = t[d]
                        t[d] = t[d // 5 * 5 + rt]
                        t[d // 5 * 5 + rt] = swap
                action_n.append(t)

            # environment step
            new_obs_n, rew_n, done_n, info_n = env._step(action_n)
            episode_step += 1
            env.training_episode = episode_step
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)

            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env._reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.online_display or arglist.display:
                time.sleep(0.01)
                env._render(close=False)
                print(rew_n)
                if arglist.display:
                    continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}".format(
                              train_step, len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                              round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
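# For context: get_trainers is not defined in this file. A minimal sketch,
# modeled on the upstream openai/maddpg get_trainers; the obs_map_shape_n
# pass-through is this fork's addition, and how a map-aware trainer consumes
# it is an assumption, not shown by the source.
import tensorflow as tf
import tensorflow.contrib.layers as layers
from maddpg.trainer.maddpg import MADDPGAgentTrainer

def mlp_model(input, num_outputs, scope, reuse=False, num_units=64, rnn_cell=None):
    # Two hidden ReLU layers with a linear output head (upstream default)
    with tf.variable_scope(scope, reuse=reuse):
        out = input
        out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_outputs, activation_fn=None)
        return out

def get_trainers(env, num_adversaries, obs_shape_n, obs_map_shape_n, arglist):
    # obs_map_shape_n is unused in this sketch; the upstream trainer has no
    # map input, so a fork-specific trainer would need to accept it.
    trainers = []
    for i in range(num_adversaries):
        trainers.append(MADDPGAgentTrainer(
            "agent_%d" % i, mlp_model, obs_shape_n, env.action_space, i, arglist,
            local_q_func=(arglist.adv_policy == 'ddpg')))
    for i in range(num_adversaries, env.n):
        trainers.append(MADDPGAgentTrainer(
            "agent_%d" % i, mlp_model, obs_shape_n, env.action_space, i, arglist,
            local_q_func=(arglist.good_policy == 'ddpg')))
    return trainers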
def train(arglist):
    """
    Run MADDPG algorithm using passed in commandline arguments

    Args:
        arglist (argparse.Namespace): Parsed commandline arguments object
    """
    tf.reset_default_graph()
    if arglist.seed is not None:
        np.random.seed(arglist.seed)
        tf.set_random_seed(arglist.seed)

    with tf_util.make_session(config=None, num_cpu=1, make_default=False, graph=None):
        # with tf_util.single_threaded_session():
        ###########################################
        #          Create environment             #
        ###########################################
        env = make_env(arglist.scenario,
                       arglist=arglist,
                       done=arglist.done_callback,
                       logging=arglist.logging,
                       benchmark=arglist.benchmark)

        ###########################################
        #         Create agent trainers           #
        ###########################################
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print("Number of Adversaries: {}".format(num_adversaries))
        print('Experiment: {}. Using good policy {} and adv policy {}'.format(
            arglist.exp_name, arglist.good_policy, arglist.adv_policy))

        ###########################################
        #               Initialize                #
        ###########################################
        tf_util.initialize()

        ###########################################
        #   Load previous results, if necessary   #
        ###########################################
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        # if arglist.display or arglist.restore or arglist.benchmark or arglist.load_dir is not None:
        if arglist.restore or arglist.benchmark or arglist.load_dir is not None:
            print('Loading previous state...')
            # Set model file
            if arglist.model_file == "":
                arglist.model_file = arglist.exp_name
            print("Model File: " + arglist.load_dir + arglist.model_file)
            tf_util.load_state(arglist.load_dir + arglist.model_file)

        ###########################################
        #       Create the save directory         #
        ###########################################
        if not os.path.exists(arglist.save_dir):
            os.makedirs(arglist.save_dir, exist_ok=True)
        if not os.path.exists(arglist.plots_dir):
            os.makedirs(arglist.plots_dir, exist_ok=True)

        ###########################################
        #             Set parameters              #
        ###########################################
        # Sum of rewards for all agents
        episode_rewards = [0.0]
        # Individual agent reward. This was changed so that a reward can be
        # tracked for fixed-policy agents as well as learning agents.
        # agent_rewards = [[0.0] for _ in range(env.n)]
        agent_rewards = [[0.0] for _ in range(len(env.world.agents))]
        # Retrieve previous episode count
        try:
            prev_ep_ct = int(arglist.model_file.split("_")[-1])
        except ValueError:
            print("Starting from untrained network...")
            prev_ep_ct = 0
        ep_ct = prev_ep_ct + arglist.num_episodes
        # Sum of rewards for training curve
        final_ep_rewards = []
        # Agent rewards for training curve
        final_ep_ag_rewards = []
        # Placeholder for benchmarking info
        agent_info = [[[]]]
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        progress = False

        # Save more often if you have fewer episodes
        arglist.save_rate = min(arglist.save_rate, arglist.num_episodes)

        # Initialize loss file for each agent
        if arglist.log_loss:
            for i in range(len(env.world.agents)):
                log_loss(arglist, ep_ct, "agent_{}".format(i), initialize=True)

        ###########################################
        #                 Start                   #
        ###########################################
        print('Starting iterations...')
        while True:
            # TODO: Switch to isinstance()
            # if type(env.world.scripted_agents[0].action) == type(None):
            #     print("Error")

            # Get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]

            # Environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)

            # Logging step
            if arglist.logging:
                env.log(len(episode_rewards) + prev_ep_ct, episode_step,
                        new_obs_n, rew_n, done_n, info_n)

            # Update information
            episode_step += 1
            # Check if any agents are done (was: done = all(done_n))
            done = any(done_n)
            terminal = (episode_step >= arglist.max_episode_len)

            # Collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            # For displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                if done or terminal:
                    print('Episode Reward: {}'.format([rew[-1] for rew in agent_rewards]))
                    time.sleep(0.5)
                    obs_n = env.reset()
                    episode_step = 0
                    episode_rewards.append(0)
                    for a in agent_rewards:
                        a.append(0)
                    agent_info.append([[]])
                continue

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # Increment global step counter
            train_step += 1

            # For benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # In testing mode, don't perform model updates
            if arglist.testing:
                if len(episode_rewards) > arglist.num_episodes:
                    print("episodes: {}, mean episode reward: {}, time: {}".format(
                        len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                    env.logger.save("State", arglist.save_dir,
                                    filename=arglist.exp_name + '_state' + '_' +
                                    str(prev_ep_ct) + arglist.log_append)
                    break
                continue

            # Update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for i, agent in enumerate(trainers):
                loss = agent.update(trainers, train_step)
                if arglist.log_loss and loss is not None:
                    log_loss(arglist, ep_ct, "agent_{}".format(i), loss=loss[1])

            if len(episode_rewards) % 100 == 0 and progress:
                print("Episode {} Reached. Time: {}".format(len(episode_rewards), time.time() - t_start))
                progress = False
            elif len(episode_rewards) % 100 != 0 and not progress:
                progress = True

            # Save model, display training output
            if (terminal or done) and (len(episode_rewards) % arglist.save_rate == 0):
                # TODO: Implement some checks so that we don't overwrite old networks unintentionally?
                # Save model state
                tf_util.save_state(arglist.save_dir + arglist.exp_name + '_' +
                                   str(len(episode_rewards) + prev_ep_ct), saver=saver)
                # Print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards) + prev_ep_ct,
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}".format(
                              train_step, len(episode_rewards) + prev_ep_ct,
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(reward[-arglist.save_rate:]) for reward in agent_rewards],
                              round(time.time() - t_start, 3)))
                # Reset start time to current time
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for reward in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(reward[-arglist.save_rate:]))

            # Saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                # Log agent data for run
                env.logger.save("State", arglist.save_dir,
                                filename=arglist.exp_name + '_state' + '_' +
                                str(len(episode_rewards) + prev_ep_ct))
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
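# For reference: make_env is defined elsewhere in the repo. A minimal sketch
# following the upstream openai/maddpg pattern; the done/logging keyword
# arguments are this fork's additions, and how MultiAgentEnv consumes them is
# an assumption, not shown by the source.
def make_env(scenario_name, arglist=None, done=None, logging=False, benchmark=False):
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # Load the scenario module and build its world
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    world = scenario.make_world()
    if benchmark:
        # Benchmark mode additionally wires in scenario.benchmark_data
        return MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                             scenario.observation, scenario.benchmark_data)
    return MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                         scenario.observation)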
    # (fragment: tail of an observation builder; the enclosing function
    # definition is not shown)
    # Mark the agent's own cell on the occupancy grid; see the pos_to_cell
    # sketch below for the indexing rule
    current_map[(my_pos[0] / 0.075 + 3).astype(int)][(my_pos[1] / 0.075 + 3).astype(int)] = 1
    tttt = np.concatenate([my_velocity] + [my_pos / 6] + other_pos + other_vel)
    tt = np.array([0, 0, 0, 0, 0, 0, 0, 0])
    tt[my_shooting_angle] = 1
    bonus = np.array([0, 0, 0, 0, 0])
    bonus[my_bonus_status - 1] = 1
    ob = np.concatenate((tttt, tt, bonus))  # ob length = 22
    current_map = np.reshape(current_map, -1)
    result = np.array([ob, current_map])
    return result


arglist = parse_args()
with U.make_session(1):
    # Create environment
    env = make_env(arglist.scenario, arglist, arglist.benchmark)
    map_world = np.copy(env.world.map_world)
    # Create agent trainers
    obs_shape_n = [[25] for i in range(env.n)]
    obs_map_shape_n = [[56 * 86] for i in range(env.n)]
    num_adversaries = min(env.n, arglist.num_adversaries)
    trainers = get_trainers(env, num_adversaries, obs_shape_n, obs_map_shape_n, arglist)
    my_agent = trainers[2]
    print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))
    # Initialize
    U.initialize()
    # Load previous results, if necessary
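# To make the grid indexing above concrete: a continuous position is binned
# into the occupancy map by dividing by the cell size (0.075 here) and
# shifting by a 3-cell border offset. A small self-contained illustration;
# the helper name pos_to_cell is ours, not from the repository:
import numpy as np

def pos_to_cell(pos, cell_size=0.075, offset=3):
    """Map a continuous (x, y) position to integer grid indices."""
    return (np.asarray(pos) / cell_size + offset).astype(int)

grid = np.zeros((56, 86))  # matches obs_map_shape_n above
row, col = pos_to_cell([0.15, 0.3])
grid[row][col] = 1  # marks cell (5, 7)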
def train(arglist):
    """
    Run MADDPG algorithm using passed in commandline arguments

    Args:
        arglist (argparse.Namespace): Parsed commandline arguments object
    """
    # Assign roles
    training_role = arglist.training_role[0]
    if arglist.training_role[0] == "defender":
        opponent_role = "attacker"
        opponent_index = 0
    elif arglist.training_role[0] == "attacker":
        opponent_role = "defender"
        opponent_index = 1
    else:
        raise Exception("training role error!")

    # Suppress tensorflow warnings
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
    tf.reset_default_graph()
    if arglist.seed is not None:
        np.random.seed(arglist.seed)
        tf.set_random_seed(arglist.seed)

    with tf_util.make_session(config=None, num_cpu=1, make_default=False, graph=None):
        # with tf_util.single_threaded_session():
        ###########################################
        #          Create environment             #
        ###########################################
        env = make_env(arglist.scenario,
                       arglist=arglist,
                       done=arglist.done_callback,
                       logging=arglist.logging,
                       benchmark=arglist.benchmark)

        ###########################################
        #         Create agent trainers           #
        ###########################################
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print("Training super {} against level 0 to {} opponent."
              .format(arglist.training_role[0], arglist.level))
        print("Number of Adversaries: {}".format(num_adversaries))
        print('Experiment: {}. Using good policy {} and adv policy {}'.format(
            arglist.exp_name, arglist.good_policy, arglist.adv_policy))

        ###########################################
        #               Initialize                #
        ###########################################
        tf_util.initialize()

        ###########################################
        #   Load previous results, if necessary   #
        ###########################################
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        # if arglist.display or arglist.restore or arglist.benchmark or arglist.load_dir is not None:
        if ((arglist.restore or arglist.load_dir is not None) and arglist.level != 0) or arglist.benchmark:
            print('Loading previous state...')
            print("Level-k folder: " + arglist.load_dir)
            for opp_level in range(0, arglist.level + 1):
                opp_model_file = "level_{}_{}".format(opp_level, opponent_role)
                tf_util.load_state(fname=arglist.load_dir + opp_model_file,
                                   var_prefix="level_{}_{}_{}".format(opp_level, opponent_role, opponent_index))

        ###########################################
        #       Create the save directory         #
        ###########################################
        if not os.path.exists(arglist.save_dir):
            os.makedirs(arglist.save_dir, exist_ok=True)
        if not os.path.exists(arglist.plots_dir):
            os.makedirs(arglist.plots_dir, exist_ok=True)

        ###########################################
        #             Set parameters              #
        ###########################################
        # Sum of rewards for all agents
        episode_rewards = [0.0]
        # Individual agent reward. This was changed so that a reward can be
        # tracked for fixed-policy agents as well as learning agents.
        # agent_rewards = [[0.0] for _ in range(env.n)]
        agent_rewards = [[0.0] for _ in range(len(env.world.agents))]
        # Retrieve previous episode count
        try:
            prev_ep_ct = int(arglist.model_file.split("_")[-1])
        except ValueError:
            print("Starting from untrained network...")
            prev_ep_ct = 0
        ep_ct = prev_ep_ct + arglist.num_episodes
        # Sum of rewards for training curve
        final_ep_rewards = []
        # Agent rewards for training curve
        final_ep_ag_rewards = []
        # Placeholder for benchmarking info
        agent_info = [[[]]]
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        progress = False

        # Save more often if you have fewer episodes
        arglist.save_rate = min(arglist.save_rate, arglist.num_episodes)

        # Initialize loss file for each agent
        if arglist.log_loss:
            for i in range(len(env.world.agents)):
                log_loss(arglist, ep_ct, "agent_{}".format(i), initialize=True)

        ###########################################
        #                 Start                   #
        ###########################################
        print('Starting iterations...')

        # Initialize opponent selection distribution to uniform
        p_opponent_selection = np.ones(arglist.level + 1) / (arglist.level + 1)
        # Initialize evaluation flags
        evaluate_flag = False
        evaluation_done = False
        # Initialize worst-performing-level list
        worst_performing_levels = []

        while True:
            # TODO: Switch to isinstance()
            # if type(env.world.scripted_agents[0].action) == type(None):
            #     print("Error")

            # Get opponent and training agents' indices
            good_update_index, opponent_select_index, opponent_select_level = get_update_indices(
                training_role, opponent_role, p_opponent_selection, arglist)
            updating_indices = [good_update_index]

            # Get action
            good_trainer = trainers[good_update_index]
            opp_trainer = trainers[opponent_select_index]
            if good_update_index > opponent_select_index:
                selected_trainers = [opp_trainer, good_trainer]
            elif good_update_index < opponent_select_index:
                selected_trainers = [good_trainer, opp_trainer]
            else:
                raise Exception("Trainer index selection error!")
            action_n = [agent.action(obs) for agent, obs in zip(selected_trainers, obs_n)]

            # Environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)

            # Logging step
            if arglist.logging:
                env.log(len(episode_rewards) + prev_ep_ct, episode_step,
                        new_obs_n, rew_n, done_n, info_n)

            # Update information
            episode_step += 1
            # Check if any agents are done (was: done = all(done_n))
            done = any(done_n)
            terminal = (episode_step >= arglist.max_episode_len)

            # Collect experience
            for i, agent in enumerate(selected_trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            # For displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                if done or terminal:
                    print('Episode Reward: {}'.format([rew[-1] for rew in agent_rewards]))
                    time.sleep(0.5)
                    obs_n = env.reset()
                    episode_step = 0
                    episode_rewards.append(0)
                    for a in agent_rewards:
                        a.append(0)
                    agent_info.append([[]])
                continue

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # Increment global step counter
            train_step += 1

            # For benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # In testing mode, don't perform model updates
            if arglist.testing:
                if len(episode_rewards) > arglist.num_episodes:
                    print("episodes: {}, mean episode reward: {}, time: {}".format(
                        len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                    env.logger.save("State", arglist.save_dir,
                                    filename=arglist.exp_name + '_state' + '_' +
                                    str(prev_ep_ct) + arglist.log_append)
                    break
                continue

            # Test current super agent's performance against all other agents.
            # Check whether every level has been evaluated, then update p_select.
            if (terminal or done) and evaluate_flag and evaluation_done:
                evaluate_flag = False
                level_performances = level_performances / arglist.evaluate_length
                np.set_printoptions(precision=2)
                print("Evaluation complete, against level 0 to {} performances: {}".format(
                    arglist.level, level_performances))
                worst_level = np.argmin(level_performances)
                worst_performing_levels.append(worst_level)
                print("Worst performing level is {}".format(worst_level))
                # Update p_select: reset to a valid one-hot distribution
                # focused on the worst-performing level
                # TODO: check some other distributions
                # p_opponent_selection = np.ones(arglist.level + 1) * 0.6 / arglist.level
                p_opponent_selection = np.zeros(arglist.level + 1)
                p_opponent_selection[worst_level] = 1
                print("Opponent selection probability set to: {}".format(p_opponent_selection))

            # Pop evaluation list and update current evaluate level
            if (terminal or done) and evaluate_flag:
                last_episode_agent_reward = agent_rewards[get_role_index(arglist.training_role[0])][-2]
                level_performances[evaluate_level] += last_episode_agent_reward
                if len(evaluate_levels) == 0:
                    evaluation_done = True
                else:
                    # Get the level to evaluate next
                    evaluate_level = evaluate_levels.pop(0)
                    # Set up p_selection distribution
                    p_opponent_selection = np.zeros(arglist.level + 1)
                    p_opponent_selection[evaluate_level] = 1

            # Set up evaluate schedules
            if (terminal or done) and (len(episode_rewards) % arglist.evaluate_rate == 0):
                print("Freezing current super-agent's network and performing evaluation.")
                evaluate_flag = True
                evaluation_done = False
                eval_len = arglist.evaluate_length
                evaluate_levels = []
                level_performances = np.zeros(arglist.level + 1)
                for level in range(arglist.level + 1):
                    for i in range(eval_len):
                        evaluate_levels.append(level)
                evaluate_level = evaluate_levels.pop(0)

            # In evaluate mode, don't perform model updates
            if evaluate_flag:
                continue

            # If not in display or benchmark mode, update trainers with index
            # in updating_indices
            loss = None
            for i, agent in enumerate(trainers):
                if i in updating_indices:
                    agent.preupdate()
            for i, agent in enumerate(trainers):
                if i in updating_indices:
                    loss = agent.update(selected_trainers, train_step)
                    if arglist.log_loss and loss is not None:
                        log_loss(arglist, ep_ct, "agent_{}".format(i), loss=loss[1])

            if len(episode_rewards) % 100 == 0 and progress:
                print("Episode {} Reached. Time: {:.2f} s".format(len(episode_rewards), time.time() - t_start))
                progress = False
            elif len(episode_rewards) % 100 != 0 and not progress:
                progress = True

            # Save model, display training output
            if (terminal or done) and (len(episode_rewards) % arglist.save_rate == 0):
                # TODO: Implement some checks so that we don't overwrite old networks unintentionally?
                # Save model state
                tf_util.save_state(arglist.save_dir + arglist.exp_name + '_' +
                                   str(len(episode_rewards) + prev_ep_ct), saver=saver)
                # Print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards) + prev_ep_ct,
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}".format(
                              train_step, len(episode_rewards) + prev_ep_ct,
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(reward[-arglist.save_rate:]) for reward in agent_rewards],
                              round(time.time() - t_start, 3)))
                if arglist.level_k_select_print:
                    print("Opponent selection probability: {}".format(p_opponent_selection))
                # Reset start time to current time
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for reward in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(reward[-arglist.save_rate:]))
                # Pickle dump training curve info
                if not os.path.exists(arglist.plots_dir):
                    os.makedirs(arglist.plots_dir)
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                worst_level_file_name = arglist.plots_dir + arglist.exp_name + '_worst_performing_level.pkl'
                with open(worst_level_file_name, 'wb') as fp:
                    pickle.dump(worst_performing_levels, fp)

            # Saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                if not os.path.exists(arglist.plots_dir):
                    os.makedirs(arglist.plots_dir)
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                # Log agent data for run
                env.logger.save("State", arglist.save_dir,
                                filename=arglist.exp_name + '_state' + '_' +
                                str(len(episode_rewards) + prev_ep_ct))
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                print('...Worst performing history: {}'.format(worst_performing_levels))
                break
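# The level-k loop above relies on get_update_indices, which is defined
# elsewhere. A hypothetical sketch of one consistent implementation: sample
# the opponent's level from p_opponent_selection, then map roles to trainer
# indices. The trainer-list layout assumed here (level-l opponents at
# l * 2 + role_offset, super-agent at the end) is illustrative only and is
# not confirmed by the source.
import numpy as np

def get_update_indices(training_role, opponent_role, p_opponent_selection, arglist):
    """Pick the trainer index to update and the opponent trainer to play."""
    # Sample the opponent's level from the current selection distribution
    opponent_select_level = int(np.random.choice(len(p_opponent_selection),
                                                 p=p_opponent_selection))
    # Assumed layout: attacker/defender offsets match opponent_index above
    role_offset = 0 if opponent_role == "attacker" else 1
    opponent_select_index = opponent_select_level * 2 + role_offset
    good_update_index = len(p_opponent_selection) * 2 + (1 - role_offset)
    return good_update_index, opponent_select_index, opponent_select_level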
def train(arglist):
    with U.make_session(2):
        # Create environment
        env = make_env(arglist.exp_name)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        trainers = get_trainers(env, obs_shape_n, arglist)
        print('Using policy {}'.format(arglist.policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary (reuse or not)
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.reuse:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
            print('Loading successfully!')

        # Generate log path, if necessary
        if not os.path.exists(arglist.plots_dir + arglist.exp_name):
            os.makedirs(arglist.plots_dir + arglist.exp_name)

        if arglist.is_train:
            episode_rewards = [0.0]  # sum of rewards for all agents
            agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
            final_ep_rewards = []  # sum of rewards for training curve
            final_ep_ag_rewards = []  # agent rewards for training curve
            agent_info = [[[]]]  # placeholder for benchmarking info
            saver = tf.train.Saver()
            obs_n = env.reset()
            episode_step = 0
            train_step = 0
            t_start = time.time()

            print('Starting iterations...')
            while True:
                # Get action to choose in environment
                action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
                # Environment step
                new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                episode_step += 1
                done = all(done_n)
                terminal = (episode_step >= arglist.max_episode_len)

                # Collect experience
                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
                obs_n = new_obs_n

                for i, rew in enumerate(rew_n):
                    episode_rewards[-1] += rew
                    agent_rewards[i][-1] += rew

                if done or terminal:
                    obs_n = env.reset()
                    episode_step = 0
                    episode_rewards.append(0)
                    for a in agent_rewards:
                        a.append(0)
                    agent_info.append([[]])

                # Increment global step counter
                train_step += 1

                # Update all trainers, if not in display or benchmark mode
                loss = None
                for agent in trainers:
                    agent.preupdate()
                for agent in trainers:
                    loss = agent.update(trainers, train_step)

                # Save model, display training output
                if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                    U.save_state(arglist.save_dir, len(episode_rewards) - 1, saver=saver)
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards) - 1,
                        np.mean(episode_rewards[-arglist.save_rate - 1:-1]) / arglist.max_episode_len,
                        round(time.time() - t_start, 3)))
                    print('save model!')
                    t_start = time.time()
                    # Keep track of final episode reward
                    final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                    for rew in agent_rewards:
                        final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

                # Saves final episode reward for plotting training curve later
                if len(episode_rewards) > arglist.num_episodes:
                    rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                    with open(rew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_rewards, fp)
                    agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                    with open(agrew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_ag_rewards, fp)
                    print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                    break
        else:
            episode_rewards = []
            obs_n = env.reset()
            print('Starting test...')
            for i in range(arglist.num_test_episodes):
                episode_step = 0
                tem_rew = 0
                t_start = time.time()
                while True:
                    # Get action to choose in environment
                    action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
                    # Environment step
                    # Do greedy choice in test mode? (see the to_greedy sketch
                    # after this function)
                    # greedy_action_n = [[0] * len(act) for act in action_n]
                    # for j, act in enumerate(action_n):
                    #     greedy_action_n[j][np.argmax(act)] = 1
                    new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                    episode_step += 1
                    done = all(done_n)
                    terminal = (episode_step >= arglist.max_episode_len)
                    tem_rew += sum(rew_n)
                    if done or terminal:
                        obs_n = env.reset()
                        episode_rewards.append(tem_rew / episode_step)
                        print("episodes: {}, mean episode reward: {}, time: {}".format(
                            i, episode_rewards[-1], round(time.time() - t_start, 3)))
                        break
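# The commented-out greedy block above can be written more compactly with
# NumPy; an equivalent sketch of the same idea (argmax -> one-hot), not code
# from the repository:
import numpy as np

def to_greedy(action_n):
    """Convert each stochastic action vector to a greedy one-hot action."""
    greedy_action_n = []
    for act in action_n:
        one_hot = np.zeros_like(act)
        one_hot[np.argmax(act)] = 1
        greedy_action_n.append(one_hot)
    return greedy_action_n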
def train(arglist):
    with U.make_session(4):
        # Create environment
        env = make_env(arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, obs_shape_n, env.action_number, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of reward for episodes
        # Cumulative latency of current episode for different agents
        cur_cum_latency = [0.0 for _ in range(env.n)]
        # Cumulative QoE of current episode for different agents
        cur_cum_qoe = [0.0 for _ in range(env.n)]
        final_ep_rewards = []  # sum of rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        csv_logger = [CSVLogger(agent.name + "_result.csv") for agent in env.agents]
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # Get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # Environment step
            new_obs_n, rew_n, done_n, info_n, qoe_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)

            # Collect experience and log
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
                # Accumulated latency is used for getting average latency
                cur_cum_latency[i] += info_n[i]
            obs_n = new_obs_n
            # Add reward of current step; since the reward of all users is
            # consistent, taking rew_n[0] is sufficient
            episode_rewards[-1] += rew_n[0]
            for i, qoe in enumerate(qoe_n):
                cur_cum_qoe[i] += qoe

            if done or terminal:
                print("episode %d: " % len(episode_rewards), rew_n)
                # Log to csv
                for i in range(env.n):
                    csv_logger[i].log_dict({
                        "agent id": i,
                        "episode": len(episode_rewards),
                        "reward": episode_rewards[-1],
                        "average latency": cur_cum_latency[i] / episode_step,
                        "average accuracy": new_obs_n[i][4],
                        "base station util rate": env.world.bs.cum_utilization_rate / episode_step,
                        "qoe": cur_cum_qoe[i]
                    })
                obs_n = env.reset()
                cur_cum_latency = [0.0 for _ in range(env.n)]
                cur_cum_qoe = [0.0 for _ in range(env.n)]
                episode_step = 0
                episode_rewards.append(0)
                agent_info.append([[]])

            # Increment global step counter
            train_step += 1

            # For benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # Update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # Save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                for logger in csv_logger:
                    logger.flush()
                U.save_state(arglist.save_dir, saver=saver)
                print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                    train_step, len(episode_rewards),
                    np.mean(episode_rewards[-arglist.save_rate:]),
                    round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))

            # TODO: save final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                # rew_file_name = os.path.join('summary', 'rewards.pkl')
                # with open(rew_file_name, 'wb') as fp:
                #     pickle.dump(final_ep_rewards, fp)
                # print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                # Close csv files
                for logger in csv_logger:
                    logger.close()
                break
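# CSVLogger is imported from elsewhere in the repo. A minimal sketch of the
# interface this loop relies on (log_dict / flush / close); the lazy-header
# behavior is an assumption based solely on how the class is called above:
import csv

class CSVLogger:
    def __init__(self, filename):
        self._file = open(filename, 'w', newline='')
        self._writer = None

    def log_dict(self, row):
        # Lazily create the header from the first row's keys
        if self._writer is None:
            self._writer = csv.DictWriter(self._file, fieldnames=list(row.keys()))
            self._writer.writeheader()
        self._writer.writerow(row)

    def flush(self):
        self._file.flush()

    def close(self):
        self._file.close()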